def process_rollout(self, rollout, gamma, lambda_=1.0):
    """Given a rollout, compute its discounted returns and the GAE advantage."""
    batch_si = np.asarray(rollout.states)
    batch_a = np.asarray(rollout.actions)
    rewards = np.asarray(rollout.rewards)
    time = np.asarray(rollout.time)
    meta = np.asarray(rollout.meta)

    # Append the bootstrap value rollout.r so the final step can be bootstrapped.
    vpred_t = np.asarray(rollout.values + [rollout.r])
    rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
    batch_r = util.discount(rewards_plus_v, gamma)[:-1]

    # TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    delta_t = rewards + gamma * vpred_t[1:] - vpred_t[:-1]
    # This formula for the advantage comes from "Generalized Advantage Estimation":
    # https://arxiv.org/abs/1506.02438
    batch_adv = util.discount(delta_t, gamma * lambda_)

    features = rollout.features[0]
    return util.Batch(si=batch_si,
                      a=batch_a,
                      adv=batch_adv,
                      r=batch_r,
                      terminal=rollout.terminal,
                      features=features,
                      reward=rewards,
                      step=time,
                      meta=meta)
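
# util.discount is not shown in this snippet. A minimal sketch of such a helper
# (an assumption about what util.discount does, not its actual implementation)
# computes the discounted cumulative sum y[t] = x[t] + gamma * y[t + 1] with an
# IIR filter applied to the reversed sequence:

import numpy as np
import scipy.signal


def discount(x, gamma):
    """Discounted cumulative sum along the time axis (hypothetical stand-in for util.discount)."""
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]
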
def train(self, rollout, sess, gamma, lam, bootstrap_value):
    rollout = np.array(rollout)
    observations = rollout[:, 0]
    actions = rollout[:, 1]
    rewards = rollout[:, 2]
    prev_rewards = [0] + rewards[:-1].tolist()
    prev_actions = [np.array([0] * self.env.action_space.n)] + actions[:-1].tolist()
    next_observations = rollout[:, 3]  # ARA - currently unused
    values = rollout[:, 5]

    # Here we take the rewards and values from the rollout and use them to
    # generate the advantages and discounted returns. The advantage function
    # uses "Generalized Advantage Estimation".
    # Based on: https://github.com/awjuliani/DeepRL-Agents
    self.rewards_plus = np.asarray(rewards.tolist() + [bootstrap_value])
    discounted_rewards = discount(self.rewards_plus, gamma)[:-1]
    self.value_plus = np.asarray(values.tolist() + [bootstrap_value])
    advantages = rewards + gamma * self.value_plus[1:] - self.value_plus[:-1]
    advantages = discount(advantages, gamma * lam)

    # Update the global network using gradients from the loss, and generate
    # network statistics to periodically save.
    rnn_state = self.start_rnn_state
    feed_dict = {
        self.local_AC.target_v: discounted_rewards,
        # ARA - using np.stack to support ndarray states
        self.local_AC.inputs: np.stack(observations),
        self.local_AC.prev_actions: np.vstack(prev_actions),
        self.local_AC.prev_rewards: np.vstack(prev_rewards),
        self.local_AC.is_training_ph: True,
        self.local_AC.actions: np.vstack(actions),
        self.local_AC.advantages: advantages,
        self.local_AC.state_in[0]: rnn_state[0],
        self.local_AC.state_in[1]: rnn_state[1]
    }
    v_l, p_l, e_l, g_n, v_n, _ = sess.run([
        self.local_AC.value_loss, self.local_AC.policy_loss,
        self.local_AC.entropy, self.local_AC.grad_norms,
        self.local_AC.var_norms, self.local_AC.apply_grads
    ], feed_dict=feed_dict)
    return (v_l / len(rollout), p_l / len(rollout), e_l / len(rollout),
            g_n, v_n)
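
# The advantage computation in train() follows GAE: the TD errors
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
# are discounted by gamma * lam, which is equivalent to the recursion
#   A_t = delta_t + gamma * lam * A_{t+1}.
# A small self-contained sanity check of that equivalence (the rewards, values,
# and bootstrap value below are made up purely for illustration):

import numpy as np

gamma, lam = 0.99, 0.95
rewards = np.array([1.0, 0.0, 1.0])   # made-up rewards
values = np.array([0.5, 0.4, 0.3])    # made-up value predictions
bootstrap_value = 0.2                 # made-up V(s_T)

value_plus = np.append(values, bootstrap_value)
deltas = rewards + gamma * value_plus[1:] - value_plus[:-1]

# direct form: A_t = sum_l (gamma * lam) ** l * delta_{t+l}
direct = np.array([
    sum((gamma * lam) ** l * deltas[t + l] for l in range(len(deltas) - t))
    for t in range(len(deltas))
])

# recursive form: A_t = delta_t + gamma * lam * A_{t+1}
recursive = np.zeros_like(deltas)
running = 0.0
for t in reversed(range(len(deltas))):
    running = deltas[t] + gamma * lam * running
    recursive[t] = running

assert np.allclose(direct, recursive)
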
def process_rollout(self, rollout, gamma, lambda_=1.0):
    """Given a rollout, compute its discounted returns."""
    # print("shape of the rollout states: --------------")
    # print(len(rollout.states))
    # print(rollout.states[0].shape)
    batch_si = np.asarray(rollout.states)
    batch_a = np.asarray(rollout.actions)
    rewards = np.asarray(rollout.rewards)
    time = np.asarray(rollout.time)
    meta = np.asarray(rollout.meta)

    rewards_plus_v = np.asarray(rollout.rewards + [rollout.r])
    batch_r = util.discount(rewards_plus_v, gamma, time)

    features = rollout.features[0]
    return util.Batch(si=batch_si,
                      a=batch_a,
                      adv=None,
                      r=batch_r,
                      terminal=rollout.terminal,
                      features=features,
                      reward=rewards,
                      step=time,
                      meta=meta)
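
# Unlike the GAE version of process_rollout above, this variant returns adv=None
# and passes rollout.time into util.discount. The three-argument util.discount is
# not shown here; one plausible reading (an assumption, not the actual util code)
# is a duration-aware discount in which each transition spans a variable amount
# of time:

import numpy as np


def discount_with_time(x, gamma, durations):
    """Hypothetical duration-aware discount: y[t] = x[t] + gamma ** durations[t] * y[t + 1].

    x is assumed to have one more element than durations (the appended bootstrap value).
    """
    y = np.asarray(x, dtype=float).copy()
    for t in reversed(range(len(y) - 1)):
        y[t] = x[t] + (gamma ** durations[t]) * y[t + 1]
    return y
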
def lambda_advantage(rewards, values, gamma, td_lambda, bootstrap_value):
    # TD errors: delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
    td_advantages = td_return(rewards, values, gamma, bootstrap_value) - values
    # these terms telescope into lambda_advantage = G_t^lambda - V(S_t)
    lambda_advantages = discount(td_advantages, gamma * td_lambda)
    return lambda_advantages
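
# td_return is not shown alongside lambda_advantage. A minimal sketch consistent
# with the telescoping comment above, i.e. one-step TD targets so that
# td_return(...) - values yields the TD errors delta_t (an assumption about the
# missing helper, not its actual implementation):

import numpy as np


def td_return(rewards, values, gamma, bootstrap_value):
    """One-step TD targets r_t + gamma * V(s_{t+1}), bootstrapping the last step (hypothetical helper)."""
    next_values = np.append(np.asarray(values)[1:], bootstrap_value)
    return np.asarray(rewards) + gamma * next_values


# Example: with made-up inputs, td_return(...) - values gives the TD errors.
rewards = np.array([1.0, 0.0])
values = np.array([0.5, 0.4])
deltas = td_return(rewards, values, 0.99, bootstrap_value=0.2) - values
# deltas[t] == rewards[t] + 0.99 * next_value[t] - values[t]
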