Example #1
class Agent:
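    """DQN-style agent: wraps the game environment, an annealed action chooser,
    a replay memory, and online (eval) / target Q-networks, and trains them
    with a Double DQN style target (see _learn)."""
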
    def __init__(self,
                 env,
                 lr=0.00025,
                 batch_size=32,
                 gamma=0.99,
                 n_frames=3000000,
                 start_frame=50000,
                 anneal_frame=10**6,
                 update_freq=5000,
                 hist_len=4,
                 num_reward=200,
                 experience_size=10**6,
                 check_point_path=r'../checkpoints',
                 save_freq=1000,
                 no_ops=10,
                 eval_times=10,
                 restore=False):

        # Environment setup
        self.env = GameEnv(env, hist_len)
        # Training hyperparameters
        self.lr = lr
        self.batch_size = batch_size
        self.gamma = gamma
        self.hist_len = hist_len
        self.experience_size = experience_size
        # Parameters that are easy to mix up
        self.n_frames = n_frames  # total number of training frames
        self.start_frame = start_frame  # frame at which epsilon annealing starts
        self.anneal_frame = anneal_frame  # frame by which epsilon annealing is complete
        self.update_freq = update_freq  # copy params to target_dqn every update_freq frames
        self.num_reward = num_reward  # number of recent rewards to keep
        self.no_ops = no_ops  # upper bound on the number of no-op steps at episode start
        self.eval_times = eval_times  # number of evaluation episodes (default 10)

        self.sess = tf.Session()
        self.save_freq = save_freq
        self.check_point_path = check_point_path

        n_actions = self.env.get_num_actions()
        self.action_chooser = ChooseAction(n_actions=n_actions,
                                           start_frame=self.start_frame,
                                           annealing_frame=self.anneal_frame)

        self.eval_dqn = DeepQNetwork(n_actions,
                                     batch_size=self.batch_size,
                                     lr=self.lr,
                                     name='eval_dqn')

        self.target_dqn = DeepQNetwork(n_actions,
                                       batch_size=self.batch_size,
                                       name='target_dqn')

        self.replay_memory = ReplayMemory(size=self.experience_size,
                                          batch_size=self.batch_size)

        self.sess.run(tf.global_variables_initializer())

        self.saver = tf.train.Saver(name="model")
        self.restore = restore
        self.step = 0

    # Restore a previous checkpoint
    def _restore(self):
        if self.restore:
            print("Checkpoint Path: ", self.check_point_path)
            print("Checkpoint to be Restored:",
                  tf.train.latest_checkpoint(self.check_point_path))
            self.saver.restore(
                self.sess, tf.train.latest_checkpoint(self.check_point_path))

    def _save(self):
        self.saver.save(self.sess,
                        self.check_point_path + '/model.ckpt',
                        global_step=self.step)

    # Evaluate the current policy (as described in the paper)
    def _eval(self):
        print("Evaluating...")
        # Save the current emulator state
        internal_state, system_state = self.env.clone_state()
        eval_rewards = []
        for eval_episodes in range(self.eval_times):
            self.env.reset()
            start_ep = True
            eval_reward = 0
            no_op = 0
            # draw at least one no-op so start_ep is eventually cleared
            no_ops = np.random.randint(1, self.no_ops + 1)
            while True:
                self.env.render()
                if start_ep:
                    no_op += 1
                    action = 0
                else:
                    # 3,000,000 is past the annealing window, so the action
                    # chooser presumably uses its final (near-greedy) epsilon here
                    action = self.action_chooser.choose_action(
                        self.sess, 3000000, self.env.state, self.eval_dqn)
                _, _, reward, is_done, _ = self.env.step(action)
                eval_reward += reward
                if no_op == no_ops:
                    start_ep = False
                if is_done:
                    eval_rewards.append(eval_reward)
                    break
        avg_eval_rewards = np.mean(eval_rewards)
        print("Evaluation average reward: {}".format(avg_eval_rewards))
        # Restore the saved state and resume training
        self.env.restore_state(internal_state, system_state)

    def _learn(self):
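        # Double DQN target: the online (eval) network selects the greedy action
        # for the next state, and the target network evaluates it:
        #   target = r + gamma * Q_target(s', argmax_a Q_eval(s', a))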
        # Sample a minibatch from the replay memory
        states, score_lost, actions, rewards, new_states = \
            self.replay_memory.get_minibatch()
        # shapes: states [batch, 84, 84, 4], score_lost boolean [batch],
        #         actions [batch], rewards [batch], new_states [batch, 84, 84, 4]
        # Greedy action for each next state, chosen by the online (eval) network
        best_action = self.sess.run(
            self.eval_dqn.best_action,
            feed_dict={self.eval_dqn.input: new_states})

        # Q-values of each next state in the minibatch, from the target network
        target_q_val = self.sess.run(
            self.target_dqn.q_value,
            feed_dict={self.target_dqn.input: new_states})

        # Q-value of the chosen best action for each next state: shape [batch_size]
        target_q_val = target_q_val[range(self.batch_size), best_action]

        # Losing a point resets the whole game, so every lost point is treated
        # as a terminal transition.
        # Following the paper, terminal transitions use the raw reward as target_q.
        target_q = rewards + self.gamma * target_q_val * (1 - score_lost)

        # Build the full target tensor needed for backpropagation
        pred_q = self.sess.run(self.eval_dqn.q_value,
                               feed_dict={self.eval_dqn.input: states})
        target_q_transition = pred_q.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = actions
        target_q_transition[batch_index, eval_act_index] = target_q
        target_q = target_q_transition
        # print(target_q - pred_q)
        loss, _ = self.sess.run([self.eval_dqn.loss, self.eval_dqn.update],
                                feed_dict={
                                    self.eval_dqn.input: states,
                                    self.eval_dqn.target_q: target_q
                                })
        return pred_q, target_q, loss

    def _update_target_q_network(self):
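        # Note: new tf.assign ops are created on every call, which slowly grows
        # the TF graph; a common refactor is to build the copy op once in
        # __init__ and reuse it here.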
        eval_vars = self.eval_dqn.variables()
        target_vars = self.target_dqn.variables()
        update_operation = []
        for eval_var, target_var in zip(eval_vars, target_vars):
            update_operation.append(
                tf.assign(target_var, tf.identity(eval_var)))
        copy_operation = tf.group(*update_operation)  # returns a single op, not a list
        self.sess.run(copy_operation)
        # Sanity check that the parameters were actually copied
        check_before = self.sess.run(eval_vars[0])
        check_after = self.sess.run(target_vars[0])
        assert (check_before == check_after).all(), "Parameters not updated"

    def train(self):
        self._restore()
        self.env.reset()
        start_ep = True
        no_op = 0
        no_ops = np.random.randint(0, self.no_ops)
        train_rewards = []
        train_reward = 0
        num_dones = 0
        # current_reward = 0
        # last_reward = 0

        print("Training for {} frames...".format(self.n_frames))

        # Training loop
        for elapsed_frame in range(0, self.n_frames):
            self.env.render()
            # Pick a new action every 4 frames
            if elapsed_frame % 4 == 0:
                # TODO matrix dimension issue (done)
                action = self.action_chooser.choose_action(
                    self.sess, elapsed_frame, self.env.state, self.eval_dqn)
            # print(action)
            score_lost, observation, reward, is_done, _ = self.env.step(action)
            train_reward += reward

            # Has an episode finished (one side reached the winning score)?
            if is_done:
                num_dones += 1
                if len(train_rewards) < self.num_reward:
                    train_rewards.append(train_reward)
                else:
                    train_rewards[num_dones % self.num_reward] = train_reward
                last_reward = sum(train_rewards) / len(train_rewards)
                current_reward = train_reward
                train_reward = 0
                print("Training Reward(average):", last_reward)
                print("Training Reward(current):", current_reward)

            # Only a single frame of the stacked observation is stored; the
            # replay memory presumably rebuilds the frame history when sampling
            self.replay_memory.add_experience(action, observation[:, :, 0],
                                              reward, score_lost)

            # Before start_frame the agent just wanders randomly, purely to
            # fill the replay memory with diverse experience
            if elapsed_frame > self.start_frame:
                _, _, loss = self._learn()
                print('loss:', loss)
                self.step += 1

            if (elapsed_frame % self.update_freq == 0
                    and elapsed_frame > self.start_frame):
                print("Updating target network params", end=', ')
                print("Current number of frames elapsed: {}".format(
                    elapsed_frame))
                self._update_target_q_network()

            if (elapsed_frame % self.save_freq == 0
                    and elapsed_frame > self.start_frame):
                # Save parameters and evaluate the current policy
                self._save()
                self._eval()

        print("Training finished")
        self.env.close()
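
A minimal usage sketch for the Agent class above; the environment id and the reduced frame budget are assumptions, not part of the original example:

# usage sketch: the env id and n_frames below are assumptions
if __name__ == "__main__":
    agent = Agent("PongDeterministic-v4",
                  n_frames=1000000,
                  restore=False)
    agent.train()
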
Example #2
def run_maze():
    step = 0
    # the outer episode loop is not part of the original fragment; the episode
    # count below is an assumption
    for episode in range(300):
        # initial observation
        observation = env.reset()
        while True:
            # fresh env
            env.render()

            # RL chooses an action
            action = RL.choose_action(observation)

            # RL takes the action and gets the next observation and reward
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break out of the while loop when this episode ends
            if done:
                break
            step += 1
    
    print("gamen over")
    env.destroy()

if __name__ == "__main__":
    env = Maze()
    RL = DeepQNetwork(env.n_actions,
                      env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      replace_target_iter=200,
                      memory_size=4000,
                      output_graph=True)
    env.after(100, run_maze)
    env.mainloop()
    RL.plot_cost()
Example #3
import gym
from DQN_model import DeepQNetwork

env = gym.make("MountainCar-v0")
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

RL = DeepQNetwork(n_actions=3,
                  n_features=2,
                  learning_rate=0.001,
                  e_greedy=0.9,
                  replace_target_iter=300,
                  memory_size=3000,
                  e_greedy_increment=0.0001)
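
# Assumption: e_greedy_increment grows the exploitation probability by 0.0001
# per learning step up to e_greedy=0.9, the usual convention in DQN_model
# implementations of this kind.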

total_steps = 0

for i_episode in range(50):
    observation = env.reset()
    ep_r = 0
    while True:
        env.render()

        action = RL.choose_action(observation)

        observation_, reward, done, info = env.step(action)