parser.add_argument('--seed', type=int, default=4, metavar='N',
                    help='random seed (default: 4)')
parser.add_argument('--batch_size', type=int, default=128, metavar='N',
                    help='batch size (default: 128)')
parser.add_argument('--num_steps', type=int, default=100000, metavar='N',
                    help='max episode length (default: 100000)')
parser.add_argument('--num_episodes', type=int, default=300, metavar='N',
                    help='number of episodes (default: 300)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='hidden layer size (default: 128)')
parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                    help='model updates per simulator step (default: 5)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()

env = VehicleFollowingENV()
print("""
Environment Initializing...
The initial head car velocity is {}
The initial safe distance is {}
The Nash Eq* Factor RC is {}
""".format(env.v_head, env.d0, env.RC))
# writer = SummaryWriter()

# Probability of playing the best-response (RL) policy rather than the
# average (SL) policy on a given step.
ETA = 0.5


def fit_nash():
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
# Imports assumed at the top of the file:
# from copy import deepcopy
# import numpy as np
# from keras.losses import categorical_crossentropy, mse

def fit_nash(self, env: VehicleFollowingENV, num_iterations, episode,
             total_step, max_episode_length=None):
    """Fit both players towards a Nash equilibrium via self-play."""
    # RL (best-response) networks: LSTM, with target copies
    self.p1_net = self.net
    self.p1_net2 = self.net2            # target network for player 1
    self.p2_net = deepcopy(self.net)
    self.p2_net2 = deepcopy(self.net2)  # target network for player 2

    # SL (average-policy) networks: plain feed-forward NNs
    self.p1_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
    self.p2_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
    self.p1_policy.compile('Adam', categorical_crossentropy)
    self.p2_policy.compile('Adam', mse)

    # Replay memories: RL buffers store full transitions, SL buffers
    # store (state, action) pairs for cloning the average policy.
    self.p1_RL_mem = ReplayMemory(max_size=100000)
    self.p2_RL_mem = ReplayMemory(max_size=100000)
    self.p1_SL_mem = ReplayMemory(max_size=100000)
    self.p2_SL_mem = ReplayMemory(max_size=100000)

    # Main loop
    state = env.reset()
    total_reward = 0
    done = False
    for i in range(num_iterations):
        total_step += 1
        # if self.render:
        #     env.render()
        if max_episode_length and i > max_episode_length:
            break

        # With probability ETA act via the best-response (RL) nets,
        # otherwise via the average (SL) policies.
        best_response = np.random.random() < ETA
        if best_response:
            p1_action = self.select_action(state, net=self.p1_net)
            p2_action = self.select_action(state, net=self.p2_net)
        else:
            p1_action = self.select_action(state, net=self.p1_policy)
            p2_action = self.select_action(state, net=self.p2_policy)

        next_state, reward, done = env.step(action_weight=p1_action,
                                            action_attacker=p2_action)

        # Zero-sum split of the reward between the two players.
        self.p1_RL_mem.append((state, p1_action, RC - reward, next_state, done))
        self.p2_RL_mem.append((state, p2_action, reward, next_state, done))
        self.p1_SL_mem.append((state, p1_action))
        self.p2_SL_mem.append((state, p2_action))

        total_reward += reward
        if done:
            with open(self.algorithm + 'total_reward.txt', 'a') as f:
                f.write('Episode ({}), reward: ({})\n'.format(episode, total_reward))
            print("Episode finished after {} time steps, total_reward is {}..."
                  .format(i, total_reward))
            break

        # Periodically refresh the target networks (copy, not alias,
        # so the targets lag the online nets).
        if total_step % self.renew == 0 and total_step != 0:
            self.p1_net2 = deepcopy(self.p1_net)
            self.p2_net2 = deepcopy(self.p2_net)

        # if total_step % 100000 == 0:
        #     self.save(total_step)

        if total_step >= self.burn_in and total_step % self.train_freq == 0:
            batches = min(self.batch_size, len(self.p1_RL_mem))
            p1_states, p1_actions, p1_q_values = self.sample_from_Replay_Memory(
                batches, self.p1_RL_mem, self.p1_net)
            p2_states, p2_actions, p2_q_values = self.sample_from_Replay_Memory(
                batches, self.p2_RL_mem, self.p2_net)
            # RL nets regress towards the sampled Q targets; SL nets
            # clone the actions actually taken (average-policy update).
            self.p1_net.fit(p1_states, p1_q_values)
            self.p2_net.fit(p2_states, p2_q_values)
            self.p1_policy.fit(p1_states, p1_actions)
            self.p2_policy.fit(p2_states, p2_actions)

        state = next_state
    return total_step, done
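# The loop above depends on a ReplayMemory class that is not shown in this
# section. Below is a minimal deque-based sketch consistent with the interface
# used here (append and len); the sample method name is an assumption, since
# only sample_from_Replay_Memory appears above and its internals are not shown.

import random
from collections import deque


class ReplayMemory:
    """Fixed-size FIFO buffer; oldest entries are evicted past max_size."""

    def __init__(self, max_size=100000):
        self.buffer = deque(maxlen=max_size)

    def append(self, item):
        # item is a full transition (state, action, reward, next_state, done)
        # for the RL buffers, or a (state, action) pair for the SL buffers.
        self.buffer.append(item)

    def sample(self, batch_size):
        # Uniform random minibatch, as typically used when building
        # Q-learning targets (assumed helper; not shown in the section).
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)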