Example #1
            reward = max(min(reward, args.reward_clip), -args.reward_clip)  # Clip rewards
        mem.append(state, action, reward, done)  # Append transition to memory
        T += 1

        # Train and test
        if T >= args.learn_start:
            mem.priority_weight = min(mem.priority_weight + priority_weight_increase, 1)  # Anneal importance sampling weight β to 1
            weight_val_mem.append(state, action, reward, done)

            if T % args.replay_frequency == 0:
                if weight_val_mem.transitions.full:
                    dqn.update_meta_weights(weight_val_mem)
                    weight_val_mem.clear()
                dqn.learn(mem)  # Train with n-step distributional double-Q learning

            if T % args.evaluation_interval == 0:
                dqn.eval()  # Set DQN (online network) to evaluation mode
                avg_reward, avg_Q = test(args, T, dqn, val_mem)  # Test
                log('T = ' + str(T) + ' / ' + str(args.T_max) +
                    ' | Avg. reward: ' + str(avg_reward) +
                    ' | Avg. Q: ' + str(avg_Q))
                dqn.train()  # Set DQN (online network) back to training mode

            # Update target network
            if T % args.target_update == 0:
                dqn.update_target_net()
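The loop above anneals the prioritized-replay importance-sampling exponent β linearly toward 1 as training progresses. A minimal sketch of that schedule, assuming β starts at 0.4 and the per-step increase is derived from `args.T_max` and `args.learn_start` (the concrete values here are illustrative assumptions, not taken from the snippet):

def beta_schedule(T, T_max, learn_start, beta_start=0.4):
    """Linearly anneal the importance-sampling exponent β from beta_start
    at learn_start to 1.0 at T_max (illustrative values)."""
    if T < learn_start:
        return beta_start
    increase_per_step = (1.0 - beta_start) / (T_max - learn_start)
    return min(beta_start + increase_per_step * (T - learn_start), 1.0)

# β reaches exactly 1 at T_max
assert beta_schedule(10_000, T_max=10_000, learn_start=1_000) == 1.0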
Example #2
import numpy as np
import scipy.signal

# Policy, Value_function, ReplayMemory and BatchGenerator are assumed to be
# defined elsewhere in the project.


class Agent:
    def __init__(self, env, sess, horizon, epsilon, learning_rate_policy,
                 learning_rate_value, gamma, lam, logger):
        self.env = env
        self.sess = sess
        self.horizon = horizon
        self.epsilon = epsilon
        self.learning_rate_policy = learning_rate_policy
        self.learning_rate_value = learning_rate_value
        self.gamma = gamma
        self.lam = lam
        self.logger = logger

        self.observation_space = env.observation_space.shape[0]
        self.action_space = env.action_space.shape[0]

        self.policy = Policy(self.observation_space, self.action_space,
                             self.epsilon, self.learning_rate_policy)
        self.value_function = Value_function(self.observation_space,
                                             self.learning_rate_value)

        self.replay_memory = ReplayMemory(self.horizon, self.observation_space,
                                          self.action_space)

    def learn(self):
        """
         Learning process that loops forever if not stopped
        """
        while True:
            # Fill the replay memory with one trajectory
            self.run_trajectory()
            adv, vtarget = self.gae()
            # Snapshot the current policy as the "old" policy for the PPO ratio
            self.sess.run(self.policy.network.copy_to(self.policy.network_old))
            # Train the policy for 20 epochs on minibatches of 1000 transitions
            bg = BatchGenerator((self.replay_memory.observations,
                                 self.replay_memory.actions, adv), 1000)
            for _ in range(20):
                for ms, ma, madv in bg.iterate_once():
                    self.sess.run(
                        self.policy.optimizer, {
                            self.policy.network.input_pl: ms,
                            self.policy.network_old.input_pl: ms,
                            self.policy.action_pl: ma,
                            self.policy.adv_pl: madv
                        })
            # Train the value function for 10 epochs on minibatches of 250
            bg = BatchGenerator((self.replay_memory.observations, vtarget),
                                250)
            for _ in range(10):
                for ms, mvpred in bg.iterate_once():
                    self.sess.run(
                        self.value_function.optimizer, {
                            self.value_function.network.input_pl: ms,
                            self.value_function.value_pl: mvpred
                        })

    def run_trajectory(self):
        """
         Runs for one trajectory and fills the replay memory
        Returns:
         Nothing, data is stored in replay memory for later use
        """
        self.replay_memory.clear()
        observation = self.env.reset()
        episode_reward = 0
        for _ in range(self.horizon):
            observation = np.array([observation])  # Add a batch dimension for the network
            action = self.sess.run(
                self.policy.network.sample,
                {self.policy.network.input_pl: observation})[0]
            new_observation, reward, done, info = self.env.step(action)
            episode_reward += reward
            self.replay_memory.add(observation, action, reward,
                                   new_observation, done)
            if done:
                # Log episode reward and reset
                self.logger.add_reward(episode_reward)
                episode_reward = 0
                observation = self.env.reset()
            else:
                observation = new_observation

    def gae(self):
        """
         Takes data in replay memory and calculates general advantage estimate with it
        Returns:
         gae: general advantage estimate
         vtarget: predicted values
        """
        # Value predictions for the visited states...
        v = self.sess.run(self.value_function.network.predict, {
            self.value_function.network.input_pl:
            self.replay_memory.observations
        })
        # ...and for their successor states (used for bootstrapping)
        v1 = self.sess.run(
            self.value_function.network.predict, {
                self.value_function.network.input_pl:
                self.replay_memory.new_observations
            })
        # One-step TD residuals: δ_t = r_t + γ·V(s_{t+1})·(1 - done_t) - V(s_t)
        tds = self.replay_memory.rewards + self.gamma * v1 * (
            1 - self.replay_memory.done) - v
        # Discounted sum gae_t = Σ_k (γλ)^k · δ_{t+k}, computed as a linear
        # filter over the reversed residuals (note: the sum is not reset at
        # episode boundaries within the trajectory)
        gae = scipy.signal.lfilter([1.0], [1.0, -self.gamma * self.lam],
                                   tds[::-1])[::-1]
        vtarget = gae + v  # Value targets: advantage plus baseline
        gae = (gae - gae.mean()) / (gae.std() + 1e-6)  # Normalize advantages
        return gae, vtarget
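
The `scipy.signal.lfilter` call in `gae()` computes the discounted sum of TD residuals in one vectorized pass: filtering the reversed sequence with coefficients b = [1] and a = [1, -γλ] implements y_t = δ_t + γλ·y_{t+1}. A small self-contained check of that trick against the explicit backward recursion (the residual values here are made-up toy data):

import numpy as np
import scipy.signal

def discounted_sum(x, discount):
    # y_t = x_t + discount * y_{t+1}, via a linear filter on the reversed sequence
    return scipy.signal.lfilter([1.0], [1.0, -discount], x[::-1])[::-1]

gamma, lam = 0.99, 0.95
tds = np.array([0.5, -0.2, 1.0, 0.3])  # toy TD residuals

# Reference: explicit backward recursion gae_t = δ_t + γλ * gae_{t+1}
ref = np.zeros_like(tds)
running = 0.0
for t in reversed(range(len(tds))):
    running = tds[t] + gamma * lam * running
    ref[t] = running

assert np.allclose(discounted_sum(tds, gamma * lam), ref)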