def compute_advantages(self, rewards, returns, discounts, value_preds):
  """Compute advantages, optionally using GAE.

  Based on the baselines ppo1 implementation. Removes the final timestep, as
  it is only needed for the next-step value prediction in the TD error
  computation.

  Args:
    rewards: Tensor of per-timestep rewards.
    returns: Tensor of per-timestep returns.
    discounts: Tensor of per-timestep discounts. Zero for terminal timesteps.
    value_preds: Cached value estimates from the data-collection policy.

  Returns:
    advantages: Tensor of length (len(rewards) - 1), because the final
      timestep is only used for next-step value prediction.
  """
  # Arg value_preds was appended with the final next-step value. Split it
  # into the per-timestep estimates (all but the last element) and the
  # final value prediction (the last element).
  final_value_pred = value_preds[:, -1]
  value_preds = value_preds[:, :-1]

  if not self._use_gae:
    with tf.name_scope('empirical_advantage'):
      advantages = returns - value_preds
  else:
    advantages = value_ops.generalized_advantage_estimation(
        values=value_preds,
        final_value=final_value_pred,
        rewards=rewards,
        discounts=discounts,
        td_lambda=self._lambda,
        time_major=False)

  return advantages
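# A minimal NumPy sketch of the batch-major (time_major=False) recursion that
# value_ops.generalized_advantage_estimation is assumed to implement above:
# delta_t = r_t + d_t * V(s_{t+1}) - V(s_t), accumulated backwards in time as
# A_t = delta_t + lambda * d_t * A_{t+1}. The name gae_reference is
# hypothetical, for illustration only; it is not the tf_agents op itself.
import numpy as np

def gae_reference(values, final_value, rewards, discounts, td_lambda):
  """Reference GAE for [batch, time] arrays."""
  # Shift values left by one step and append the bootstrap value, so that
  # next_values[:, t] == V(s_{t+1}).
  next_values = np.concatenate([values[:, 1:], final_value[:, None]], axis=1)
  # One-step TD errors; a zero discount at an episode end removes the
  # bootstrap term there.
  deltas = rewards + discounts * next_values - values
  advantages = np.zeros_like(deltas)
  accumulator = np.zeros_like(final_value)
  for t in reversed(range(deltas.shape[1])):
    # Zero discounts also cut the lambda-weighted trace across episodes.
    accumulator = deltas[:, t] + td_lambda * discounts[:, t] * accumulator
    advantages[:, t] = accumulator
  return advantages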
def compute_return_and_advantage(discount_factor, lambda_, rewards,
                                 next_time_steps, value_preds):
  """Compute the TD-lambda return and GAE(lambda) advantages.

  Normalization will be applied to the advantages.

  :param discount_factor: discount in [0, 1]
  :param lambda_: trace_decay in [0, 1]
  :param rewards: next-step rewards (possibly normalized)
  :param next_time_steps: batched tensor of TimeStep tuples after the action
    is taken.
  :param value_preds: Batched value prediction tensor. Should have one more
    entry in the time index than time_steps, with the final value
    corresponding to the value prediction of the final state.
  :return: tuple of (return, normalized_advantage); both are batched tensors.
  """
  discounts = next_time_steps.discount * tf.constant(
      discount_factor, dtype=tf.float32)

  # Make the discount 0.0 at the end of each episode to restart the
  # cumulative sum at each episode boundary.
  episode_mask = common.get_episode_mask(next_time_steps)
  discounts *= episode_mask

  # Arg value_preds was appended with the final next-step value. Split it
  # into the per-timestep estimates (all but the last element) and the
  # final value prediction (the last element).
  final_value_pred = value_preds[:, -1]
  value_preds = value_preds[:, :-1]

  # Compute advantages.
  advantages = value_ops.generalized_advantage_estimation(
      values=value_preds,
      final_value=final_value_pred,
      rewards=rewards,
      discounts=discounts,
      td_lambda=lambda_,
      time_major=False,
  )
  normalized_advantages = _normalize_advantages(advantages, axes=(0, 1))

  # Compute TD-lambda returns: since A_t = G_t^lambda - V(s_t), adding the
  # value predictions back recovers the lambda-returns.
  returns = tf.add(advantages, value_preds, name="td_lambda_returns")
  return returns, normalized_advantages
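# The helper _normalize_advantages is not shown here. A plausible minimal
# sketch, assuming the usual PPO advantage normalization, is mean/variance
# standardization over the batch and time axes; the variance_epsilon default
# is an assumption, and the actual helper may differ.
import tensorflow as tf

def _normalize_advantages(advantages, axes=(0, 1), variance_epsilon=1e-8):
  """Standardizes advantages to zero mean and unit variance over `axes`."""
  adv_mean, adv_var = tf.nn.moments(advantages, axes=list(axes), keepdims=True)
  return (advantages - adv_mean) / (tf.sqrt(adv_var) + variance_epsilon)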
def testAdvantagesAreCorrectlyComputed(self, batch_size, num_time_steps,
                                       td_lambda):
  rewards = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  discounts = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  values = np.random.rand(num_time_steps, batch_size).astype(np.float32)
  final_value = np.random.rand(batch_size).astype(np.float32)
  ground_truth = _naive_gae_as_ground_truth(
      discounts=discounts,
      rewards=rewards,
      values=values,
      final_value=final_value,
      td_lambda=td_lambda)

  advantages = value_ops.generalized_advantage_estimation(
      discounts=discounts,
      rewards=rewards,
      values=values,
      final_value=final_value,
      td_lambda=td_lambda)

  self.assertAllClose(advantages, ground_truth)
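# The test oracle _naive_gae_as_ground_truth is not shown. A plausible sketch
# is the direct O(T^2) expansion of the GAE sum over time-major [T, B] arrays,
# A_t = sum_{l>=0} (prod_{k=t}^{t+l-1} lambda * d_k) * delta_{t+l}, which the
# test compares against the op's linear-time backward recursion.
import numpy as np

def _naive_gae_as_ground_truth(discounts, rewards, values, final_value,
                               td_lambda):
  """Brute-force GAE over time-major arrays, as a test oracle."""
  next_values = np.concatenate([values[1:], final_value[None, :]], axis=0)
  deltas = rewards + discounts * next_values - values
  weighted_discounts = td_lambda * discounts
  num_time_steps = len(rewards)
  advantages = np.zeros_like(deltas)
  for t in range(num_time_steps):
    acc = np.copy(deltas[t])
    for l in range(t + 1, num_time_steps):
      # Coefficient for delta_l is the product of lambda * d_k over [t, l).
      acc = acc + np.prod(weighted_discounts[t:l], axis=0) * deltas[l]
    advantages[t] = acc
  return advantages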
def testAdvantagesMatchPrecomputedResult(self):
  advantages = value_ops.generalized_advantage_estimation(
      discounts=tf.constant([[1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0],
                             [1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0]]),
      rewards=tf.fill([2, 9], 1.0),
      values=tf.fill([2, 9], 3.0),
      final_value=tf.fill([2], 3.0),
      td_lambda=0.95,
      time_major=False)

  # Precomputed according to equation (16) in the GAE paper.
  ground_truth = tf.constant(
      [[2.0808625, 1.13775, 0.145, -0.9, -2.0,
        0.56016475, -0.16355, -1.01, -2.0],
       [2.0808625, 1.13775, 0.145, -0.9, -2.0,
        0.56016475, -0.16355, -1.01, -2.0]])

  self.assertAllClose(advantages, ground_truth)
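# A standalone worked check of the precomputed row above, assuming equation
# (16) reduces to the backward recursion A_t = delta_t + lambda * d_t *
# A_{t+1}. With r_t = 1 and V = 3 everywhere, delta_t = 1 + 3 * d_t - 3, so
# for example A_8 = -2.0 and A_7 = 0.7 + 0.95 * 0.9 * (-2.0) = -1.01.
import numpy as np

rewards = np.full(9, 1.0)
values = np.full(9, 3.0)
final_value = 3.0
discounts = np.array([1.0, 1.0, 1.0, 1.0, 0.0, 0.9, 0.9, 0.9, 0.0])
td_lambda = 0.95

# One-step TD errors, bootstrapping with final_value at the last step.
next_values = np.append(values[1:], final_value)
deltas = rewards + discounts * next_values - values

# Backward recursion over a single row.
advantages = np.zeros(9)
acc = 0.0
for t in reversed(range(9)):
  acc = deltas[t] + td_lambda * discounts[t] * acc
  advantages[t] = acc

np.testing.assert_allclose(
    advantages,
    [2.0808625, 1.13775, 0.145, -0.9, -2.0,
     0.56016475, -0.16355, -1.01, -2.0],
    rtol=1e-6)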