Example #1
def dqnTrain(self, double=True):
    step = 0
    memory = ReplayMemory(self.MEMORY_CAPACITY_N)
    eval_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
    target_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
    eval_Qnetwork.set_weights(self.Qnetwork.get_weights())
    target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
    reward_list = self.reward_list
    time_start = time.time()
    for episode in range(1, self.episode_M + 1):
        episode_reward = 0
        state = self.env.reset()
        while True:
            step += 1
            action = self.selectAction(eval_Qnetwork, state)
            next_state, reward, done, _ = self.env.step(action)
            episode_reward += reward
            memory.add((state, action, reward, next_state, done))
            state = next_state
            if len(memory) > self.BATCH_SIZE:
                sample_batch = memory.sample(self.BATCH_SIZE)
                self.updateQNetwork(eval_Qnetwork, target_Qnetwork,
                                    sample_batch, double)
                # Linear epsilon schedule: anneal eps from eps_init to
                # eps_final over schedule_timesteps steps, then hold.
                eps_fraction = min(float(step) / self.schedule_timesteps, 1.0)
                self.eps = self.eps_init + eps_fraction * (
                    self.eps_final - self.eps_init)
            if step % self.TARGET_UPDATE_C == 0:
                target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
            if done:
                break
        reward_list.append(episode_reward)
        print(
            "episode: {}, reward: {}, tot_step: {}, {:.1f} min, eps: {:.3f}".format(
                episode, episode_reward, step,
                (time.time() - time_start) / 60, self.eps))
        if episode % 5 == 0:
            print(
                "episode {}: last 5 episode rewards: {}, {:.1f} min, total steps: {}".
                format(episode, self.reward_list[-5:],
                       (time.time() - time_start) / 60, step))
        if episode % 50 == 0:
            self.save(target_Qnetwork, reward_list)
    self.Qnetwork.set_weights(target_Qnetwork.get_weights())
    self.reward_list = reward_list
    return target_Qnetwork, reward_list
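
Example #1 above (and Example #2 below) relies on a ReplayMemory helper that is
not shown. A minimal sketch of such a buffer, assuming a fixed capacity and
uniform random sampling (the class name matches the usage above, but the
implementation is an assumption, not the original project's code):

import random


class ReplayMemory:
    """Fixed-capacity ring buffer of (state, action, reward, next_state, done) tuples."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.buffer = []
        self.position = 0

    def add(self, transition):
        if len(self.buffer) < self.capacity:
            self.buffer.append(transition)
        else:
            self.buffer[self.position] = transition  # overwrite the oldest entry
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        # Uniform random sampling without replacement within a single batch.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)
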
Example #2
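Both examples also instantiate a QNetwork helper that is not shown. The code
below assumes it builds a two-input Keras model (a per-action mask plus a stack
of four 84x84 frames) whose Q-value outputs are multiplied elementwise by the
mask; that is why predictions pass an all-ones mask and fit() is called on
mask * target. A minimal sketch under that assumption (layer sizes and the loss
are guesses, not the project's actual architecture):

from tensorflow import keras


def QNetwork(n_actions, learning_rate):
    mask_in = keras.layers.Input(shape=(n_actions,), name="action_mask")
    frames_in = keras.layers.Input(shape=(84, 84, 4), name="frames")
    x = keras.layers.Lambda(lambda t: t / 255.0)(frames_in)  # scale pixels to [0, 1]
    x = keras.layers.Conv2D(32, 8, strides=4, activation="relu")(x)
    x = keras.layers.Conv2D(64, 4, strides=2, activation="relu")(x)
    x = keras.layers.Conv2D(64, 3, strides=1, activation="relu")(x)
    x = keras.layers.Flatten()(x)
    x = keras.layers.Dense(512, activation="relu")(x)
    q_values = keras.layers.Dense(n_actions)(x)
    # Masking the output means fit() only updates the chosen action's Q-value.
    masked_q = keras.layers.Multiply()([q_values, mask_in])
    model = keras.Model(inputs=[mask_in, frames_in], outputs=masked_q)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate), loss="mse")
    return model
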
import json
import random
import time

import gym
import numpy as np
from tensorflow import keras  # the project may instead use the standalone "import keras"

# QNetwork, ReplayMemory and wrapEnv are assumed to come from the project's own
# helper modules (minimal sketches of ReplayMemory and QNetwork are given above;
# wrapEnv is assumed to apply the usual Atari preprocessing wrappers).


class DQNAgent():
    def __init__(self, game_name="pong"):
        self.LEARNING_RATE = 1e-4
        self.eps_init = 1
        self.eps_final = 0.02
        self.schedule_timesteps = 5e4  # total training steps * exploration fraction
        self.eps = self.eps_init
        self.GAMMA = 0.95
        self.BATCH_SIZE = 32
        self.TARGET_UPDATE_C = 1000
        self.MEMORY_CAPACITY_N = 200
        self.episode_M = 5000
        self.GAME_ENV = {
            "pong": "PongNoFrameskip-v4",
            "cartpole": "CartPole-v0",
        }
        self.game_name = game_name
        env_name = self.GAME_ENV[self.game_name]
        self.env = wrapEnv(gym.make(env_name))
        self.reward_list = []
        self.Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        self.Qnetwork.summary()

    def selectAction(self, eval_Qnetwork, state, is_train=True):
        """ 使用epsilon-greedy策略选择动作 
            使用神经网络近似值函数
        """
        if random.random() <= self.eps and is_train:
            action = self.env.action_space.sample()
        else:
            action = np.argmax(
                eval_Qnetwork.predict([
                    np.ones((1, self.env.action_space.n)),  # all-ones action mask
                    np.expand_dims(np.array(state), 0)  # add a batch axis -> (1, 84, 84, 4)
                ]))
        return action

    def updateQNetwork(self,
                       eval_Qnetwork,
                       target_Qnetwork,
                       sample_batch,
                       double=True):
        """ DDQN/DQN 梯度下降更新网络 """
        # states = np.array([a[0] for a in sample_batch])
        # actions = np.array([a[1] for a in sample_batch])
        # rewards = np.array([a[2] for a in sample_batch])
        # next_states = np.array([a[3] for a in sample_batch])
        # dones = np.array([a[4] for a in sample_batch])
        states, actions, rewards, next_states, dones = [], [], [], [], []
        for i in range(len(sample_batch)):
            states.append(np.array(sample_batch[i][0], copy=False))
            actions.append(sample_batch[i][1])
            rewards.append(sample_batch[i][2])
            next_states.append(np.array(sample_batch[i][3], copy=False))
            dones.append(sample_batch[i][4])
        states = np.array(states)
        actions = np.array(actions)
        rewards = np.array(rewards)
        next_states = np.array(next_states)
        dones = np.array(dones)
        ones_mat = np.ones((len(sample_batch), self.env.action_space.n))
        if double:
            eval_actions = np.argmax(eval_Qnetwork.predict(
                [ones_mat, next_states]),
                                     axis=1)
            target_action_Qvalue = target_Qnetwork.predict(
                [ones_mat, next_states])[range(len(sample_batch)),
                                         eval_actions]
        else:
            target_action_Qvalue = np.max(target_Qnetwork.predict(
                [ones_mat, next_states]),
                                          axis=1)
        # y_true = eval_Qnetwork.predict(states)
        # y_true[range(len(y_true)), actions] = rewards + (1-dones)*self.GAMMA * target_action_Qvalue
        select_actions = np.zeros((len(sample_batch), self.env.action_space.n))
        select_actions[range(len(sample_batch)), actions] = 1
        y_true = rewards + (1 - dones) * self.GAMMA * target_action_Qvalue
        eval_Qnetwork.fit(x=[select_actions, states],
                          y=select_actions * np.expand_dims(y_true, axis=1),
                          epochs=1,
                          batch_size=len(sample_batch),
                          verbose=0)

    def dqnTrain(self, double=True):
        step = 0
        memory = ReplayMemory(self.MEMORY_CAPACITY_N)
        eval_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        target_Qnetwork = QNetwork(self.env.action_space.n, self.LEARNING_RATE)
        eval_Qnetwork.set_weights(self.Qnetwork.get_weights())
        target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
        reward_list = self.reward_list
        time_start = time.time()
        for episode in range(1, self.episode_M + 1):
            episode_reward = 0
            state = self.env.reset()
            while True:
                step += 1
                action = self.selectAction(eval_Qnetwork, state)
                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward
                memory.add((state, action, reward, next_state, done))
                state = next_state
                if len(memory) > self.BATCH_SIZE:
                    sample_batch = memory.sample(self.BATCH_SIZE)
                    self.updateQNetwork(eval_Qnetwork, target_Qnetwork,
                                        sample_batch, double)
                    # Linear epsilon schedule: anneal eps from eps_init to
                    # eps_final over schedule_timesteps steps, then hold.
                    eps_fraction = min(
                        float(step) / self.schedule_timesteps, 1.0)
                    self.eps = self.eps_init + eps_fraction * (
                        self.eps_final - self.eps_init)
                if step % self.TARGET_UPDATE_C == 0:
                    target_Qnetwork.set_weights(eval_Qnetwork.get_weights())
                if done:
                    break
            reward_list.append(episode_reward)
            print(
                "episode: {}, reward: {}, tot_step: {}, {:.1f} min, eps: {:.3f}".format(
                    episode, episode_reward, step,
                    (time.time() - time_start) / 60, self.eps))
            if episode % 5 == 0:
                print(
                    "episode {}: last 5 episode rewards: {}, {:.1f} min, total steps: {}".
                    format(episode, self.reward_list[-5:],
                           (time.time() - time_start) / 60, step))
            if episode % 50 == 0:
                self.save(target_Qnetwork, reward_list)
        self.Qnetwork.set_weights(target_Qnetwork.get_weights())
        self.reward_list = reward_list
        return target_Qnetwork, reward_list

    def load(self, filename_prefix=None):
        if filename_prefix is None:
            filename_prefix = "pong/data/ddqn_bs_" + str(self.BATCH_SIZE)
        self.Qnetwork = keras.models.load_model(filename_prefix + "network.h5")
        with open(filename_prefix + "reward.json", 'r') as file_obj:
            self.reward_list = json.loads(file_obj.read())

    def save(self, Qnetwork=None, reward_list=None, filename_prefix=None):
        if Qnetwork is None:
            Qnetwork = self.Qnetwork
        if reward_list is None:
            reward_list = self.reward_list
        if filename_prefix is None:
            filename_prefix = "pong/data/ddqn_bs_" + str(self.BATCH_SIZE)
        Qnetwork.save(filename_prefix + "network.h5")
        with open(filename_prefix + "reward.json", 'w') as file_obj:
            file_obj.write(json.dumps(reward_list))

    def playByQv(self, Qnetwork=None, episode_num=1):
        if Qnetwork is None:
            Qnetwork = self.Qnetwork
        for episode in range(1, episode_num + 1):
            state = self.env.reset()
            while True:
                self.env.render()
                action = self.selectAction(Qnetwork, state, is_train=False)
                state, reward, done, _ = self.env.step(action)
                time.sleep(0.02)
                if done:
                    break

    def plotReward(self, reward_list=None):
        if reward_list is None:
            reward_list = self.reward_list