Example #1
    def process_rollout(self, rollout, gamma, lambda_=1.0):
        """
    given a rollout, compute its returns and the advantage
    """
        batch_si = np.asarray(rollout.states)
        batch_a = np.asarray(rollout.actions)
        rewards = np.asarray(rollout.rewards)
        time = np.asarray(rollout.time)
        meta = np.asarray(rollout.meta)
        vpred_t = np.asarray(rollout.values + [rollout.r])

        rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
        batch_r = util.discount(rewards_plus_v, gamma)[:-1]
        delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
        # this formula for the advantage comes from "Generalized Advantage Estimation":
        # https://arxiv.org/abs/1506.02438
        batch_adv = util.discount(delta_t, gamma * lambda_)

        features = rollout.features[0]
        return util.Batch(si=batch_si,
                          a=batch_a,
                          adv=batch_adv,
                          r=batch_r,
                          terminal=rollout.terminal,
                          features=features,
                          reward=rewards,
                          step=time,
                          meta=meta)
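
Both this example and Example #3 rely on a util.discount helper that is not shown here. Below is a minimal sketch of what such a helper typically does (a reverse discounted cumulative sum), assuming the usual scipy.signal.lfilter trick found in A3C-style codebases; the helper name matches the call above, but the body is an assumption:

import numpy as np
import scipy.signal


def discount(x, gamma):
    # Assumed behaviour: y[t] = x[t] + gamma * y[t + 1], i.e. a discounted
    # running sum computed from the end of the sequence backwards.
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]

With this definition, util.discount(rewards_plus_v, gamma)[:-1] yields the bootstrapped discounted returns and util.discount(delta_t, gamma * lambda_) yields the GAE advantages. (Example #3 passes an extra time argument whose semantics are not shown, so this sketch only covers the two-argument form.)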
Example #2
    def train(self, rollout, sess, gamma, lam, bootstrap_value):
        rollout = np.array(rollout)
        observations = rollout[:, 0]
        actions = rollout[:, 1]
        rewards = rollout[:, 2]
        prev_rewards = [0] + rewards[:-1].tolist()
        prev_actions = ([np.array([0] * self.env.action_space.n)]
                        + actions[:-1].tolist())
        next_observations = rollout[:, 3]  # ARA - currently unused
        values = rollout[:, 5]

        # Here we take the rewards and values from the rollout, and use
        # them to generate the advantage and discounted returns.  The
        # advantage function uses "Generalized Advantage Estimation"
        # Based on: https://github.com/awjuliani/DeepRL-Agents
        self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
        discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
        self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
        advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
        advantages = discount(advantages, gamma * lam)

        # Update the global network using gradients from loss
        # Generate network statistics to periodically save
        rnn_state = self.start_rnn_state
        feed_dict = {
            self.local_AC.target_v: discounted_rewards,
            # ARA - using np.stack to support Ndarray states
            self.local_AC.inputs: np.stack(observations),
            self.local_AC.prev_actions: np.vstack(prev_actions),
            self.local_AC.prev_rewards: np.vstack(prev_rewards),
            self.local_AC.is_training_ph: True,
            self.local_AC.actions: np.vstack(actions),
            self.local_AC.advantages: advantages,
            self.local_AC.state_in[0]: rnn_state[0],
            self.local_AC.state_in[1]: rnn_state[1]
        }
        v_l, p_l, e_l, g_n, v_n, _ = sess.run(
            [self.local_AC.value_loss, self.local_AC.policy_loss,
             self.local_AC.entropy, self.local_AC.grad_norms,
             self.local_AC.var_norms, self.local_AC.apply_grads],
            feed_dict=feed_dict)
        return (v_l / len(rollout), p_l / len(rollout),
                e_l / len(rollout), g_n, v_n)
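
The two advantages lines above are Generalized Advantage Estimation in filter form: first the one-step TD residuals delta_t = r_t + gamma * V(s_{t+1}) - V(s_t), then a discounted sum of those residuals with factor gamma * lam. As a cross-check, here is a minimal sketch of the same quantity written as an explicit backward recursion; the standalone function name and signature are illustrative, not part of the original class:

import numpy as np


def gae_reference(rewards, values, bootstrap_value, gamma, lam):
    # A[t] = delta[t] + gamma * lam * A[t + 1], which is what
    # discount(deltas, gamma * lam) computes in the code above.
    rewards = np.asarray(rewards, dtype=np.float64)
    values_plus = np.append(np.asarray(values, dtype=np.float64), bootstrap_value)
    deltas = rewards + gamma * values_plus[1:] - values_plus[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages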
Example #3
    def process_rollout(self, rollout, gamma, lambda_=1.0):
        """
    given a rollout, compute its returns
    """
	#print ("shape of the roolout states:--------------")
	#print (len(rollout.states))
	#print ((rollout.states[0]).shape)
        batch_si = np.asarray(rollout.states)
        batch_a = np.asarray(rollout.actions)
        rewards = np.asarray(rollout.rewards)
        time = np.asarray(rollout.time)
        meta = np.asarray(rollout.meta)
        rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
        batch_r = util.discount(rewards_plus_v, gamma, time)
        features = rollout.features[0]

        return util.Batch(si=batch_si,
                          a=batch_a,
                          adv=None,
                          r=batch_r,
                          terminal=rollout.terminal,
                          features=features,
                          reward=rewards,
                          step=time,
                          meta=meta)
def lambda_advantage(rewards, values, gamma, td_lambda, bootstrap_value):
    td_advantages = td_return(rewards, values, gamma, bootstrap_value) - values
    # these terms telescope into lambda_advantage = G_t^lambda - V(S_t)
    lambda_advantages = discount(td_advantages, gamma * td_lambda)
    return lambda_advantages
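
lambda_advantage depends on a td_return helper that is not shown. For the telescoping mentioned in the comment to hold, td_return(rewards, values, gamma, bootstrap_value)[t] should be the one-step TD target r_t + gamma * V(s_{t+1}), with bootstrap_value standing in for the value after the last step, so that subtracting values yields the TD residuals delta_t. A sketch under that assumption (the body is inferred from the call site, not the original implementation):

import numpy as np


def td_return(rewards, values, gamma, bootstrap_value):
    # One-step TD targets: r_t + gamma * V(s_{t+1}), bootstrapping the last step.
    next_values = np.append(np.asarray(values, dtype=np.float64)[1:], bootstrap_value)
    return np.asarray(rewards, dtype=np.float64) + gamma * next_values

With this definition, lambda_advantage reproduces the same GAE advantages computed in the examples above.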