Code example #1
import argparse

# Assumed import paths for the project-local classes used below.
from naf import NAF
from vehicle_env import VehicleFollowingENV

parser = argparse.ArgumentParser()
# Assumption: --gamma and --tau exist in the full script, since fit_nash() reads
# args.gamma and args.tau below; the defaults here are placeholders.
parser.add_argument('--gamma', type=float, default=0.99, metavar='G',
                    help='discount factor (default: 0.99)')
parser.add_argument('--tau', type=float, default=0.001, metavar='G',
                    help='soft update rate for the target network (default: 0.001)')
parser.add_argument('--seed', type=int, default=4, metavar='N',
                    help='random seed (default: 4)')
parser.add_argument('--batch_size', type=int, default=128, metavar='N',
                    help='batch size (default: 128)')
parser.add_argument('--num_steps', type=int, default=100000, metavar='N',
                    help='max episode length (default: 100000)')
parser.add_argument('--num_episodes', type=int, default=300, metavar='N',
                    help='number of episodes (default: 300)')
parser.add_argument('--hidden_size', type=int, default=128, metavar='N',
                    help='hidden layer size (default: 128)')
parser.add_argument('--updates_per_step', type=int, default=5, metavar='N',
                    help='model updates per simulator step (default: 5)')
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
                    help='size of replay buffer (default: 1000000)')
args = parser.parse_args()
env = VehicleFollowingENV()
print("""
Environment Initializing...
The initial head car velocity is {}
The initial safe distance is     {}
The Nash Eq* Factor RC is        {}
""".format(env.v_head, env.d0, env.RC))
# writer = SummaryWriter()


# Anticipatory parameter: with probability ETA an action is drawn from the RL
# best-response network rather than the supervised average-policy network.
ETA = 0.5


def fit_nash():
    # NAF agent controlling the vehicle player.
    agent_vehicle = NAF(args.gamma, args.tau, args.hidden_size,
                        env.observation_space, env.vehicle_action_space)
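
The fragment above stops inside fit_nash(). For context, here is a minimal sketch of how the parsed arguments could drive the per-episode fit_nash method from code example #2 below; the agent object is an assumption and is not constructed in either fragment.

# Sketch only: "agent" is assumed to expose the fit_nash() method from code
# example #2; its construction is not part of the original fragments.
total_step = 0
for episode in range(args.num_episodes):
    total_step, done = agent.fit_nash(env, num_iterations=args.num_steps,
                                      episode=episode, total_step=total_step)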
Code example #2
    def fit_nash(self, env: VehicleFollowingENV, num_iterations, episode, total_step, max_episode_length=None):
        """
        Fit both players toward a Nash equilibrium: each player keeps an RL
        best-response network and a supervised average-policy network.
        """
        # RL (best-response) networks: LSTM-based
        self.p1_net = self.net    # online network
        self.p1_net2 = self.net2  # target network

        # Player 2 (the attacker) starts from an independent copy of player 1's networks.
        self.p2_net = deepcopy(self.net)
        self.p2_net2 = deepcopy(self.net2)

        # SL (average-policy) networks: plain NNs trained to imitate each player's past actions.
        self.p1_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)
        self.p2_policy = self.create_SL_model(DIM_STATES, NUM_ACTIONS)

        # Both policy networks classify over NUM_ACTIONS, so both use categorical cross-entropy.
        self.p1_policy.compile('Adam', categorical_crossentropy)
        self.p2_policy.compile('Adam', categorical_crossentropy)

        # ReplayMemory
        self.p1_RL_mem = ReplayMemory(max_size=100000)
        self.p2_RL_mem = ReplayMemory(max_size=100000)
        self.p1_SL_mem = ReplayMemory(max_size=100000)
        self.p2_SL_mem = ReplayMemory(max_size=100000)

        # Main loop over environment steps.
        state = env.reset()
        total_reward = 0
        done = False

        for i in range(num_iterations):
            total_step += 1

            # if self.render:
            #     env.render()
            if max_episode_length and i > max_episode_length:
                break

            # With probability ETA act from the RL best-response networks,
            # otherwise act from the supervised average-policy networks.
            best_response = np.random.random() < ETA

            if best_response:
                p1_action = self.select_action(state, net=self.p1_net)
                p2_action = self.select_action(state, net=self.p2_net)
            else:
                p1_action = self.select_action(state, net=self.p1_policy)
                p2_action = self.select_action(state, net=self.p2_policy)

            next_state, reward, done = env.step(action_weight=p1_action, action_attacker=p2_action)

            # RL memories store full transitions; rewards are constant-sum (they add up to RC),
            # with player 1 (vehicle) receiving RC - reward and player 2 (attacker) receiving reward.
            self.p1_RL_mem.append((state, p1_action, RC - reward, next_state, done))
            self.p2_RL_mem.append((state, p2_action, reward, next_state, done))
            # SL memories store each player's own (state, action) pairs.
            self.p1_SL_mem.append((state, p1_action))
            self.p2_SL_mem.append((state, p2_action))

            total_reward += reward

            if done:
                with open(self.algorithm + 'total_reward.txt', 'a') as f:
                    f.write('Episode ({}), reward: ({})\n'.format(episode, total_reward))
                print("Episode finished after {} time steps, total_reward is {}...".format(i, total_reward))
                break

            # Periodically refresh the target networks with a snapshot of the online networks.
            if total_step % self.renew == 0 and total_step != 0:
                self.p1_net2 = deepcopy(self.p1_net)
                self.p2_net2 = deepcopy(self.p2_net)

            # if total_step % 100000 == 0:
            #     self.save(total_step)

            if total_step >= self.burn_in and total_step % self.train_freq == 0:
                batches = min(self.batch_size, len(self.p1_RL_mem))
                # Sample minibatches and the corresponding Q-value targets from the RL memories.
                p1_states, p1_actions, p1_q_values = self.sample_from_Replay_Memory(batches, self.p1_RL_mem,
                                                                                    self.p1_net)
                p2_states, p2_actions, p2_q_values = self.sample_from_Replay_Memory(batches, self.p2_RL_mem,
                                                                                    self.p2_net)

                # RL networks regress onto the Q-value targets; SL (average-policy) networks
                # imitate the actions drawn in the same minibatch.
                self.p1_net.fit(p1_states, p1_q_values)
                self.p2_net.fit(p2_states, p2_q_values)
                self.p1_policy.fit(p1_states, p1_actions)
                self.p2_policy.fit(p2_states, p2_actions)

            state = next_state
        return total_step, done
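
The ReplayMemory class used above is not shown. A minimal sketch consistent with how it is called here (a max_size constructor argument, append(), and len()) might look like this; the sample() method is an assumption about what sample_from_Replay_Memory builds on.

import random
from collections import deque


class ReplayMemory:
    """Bounded FIFO buffer (a sketch, not the original implementation)."""

    def __init__(self, max_size):
        self.buffer = deque(maxlen=max_size)  # oldest entries are evicted automatically

    def append(self, item):
        self.buffer.append(item)

    def sample(self, batch_size):
        # Uniform random minibatch; assumed to be what sample_from_Replay_Memory
        # uses before computing Q-value targets.
        return random.sample(list(self.buffer), batch_size)

    def __len__(self):
        return len(self.buffer)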