def _calc_returns_and_advantages(self, training_info, value):
    if self._use_vtrace:
        return value_ops.calc_vtrace_returns_and_advantages(
            training_info, value, self._gamma, self._action_spec,
            self._lambda, self._debug_summaries)

    returns = value_ops.discounted_return(
        rewards=training_info.reward,
        values=value,
        step_types=training_info.step_type,
        discounts=training_info.discount * self._gamma)
    returns = common.tensor_extend(returns, value[-1])

    if not self._use_gae:
        advantages = returns - value
    else:
        advantages = value_ops.generalized_advantage_estimation(
            rewards=training_info.reward,
            values=value,
            step_types=training_info.step_type,
            discounts=training_info.discount * self._gamma,
            td_lambda=self._lambda)
        advantages = common.tensor_extend_zero(advantages)
        if self._use_td_lambda_return:
            returns = advantages + value

    return returns, advantages
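
# A minimal sketch (illustrative only, not part of the algorithm above) of the
# non-vtrace path in _calc_returns_and_advantages, using toy time-major
# [T, B] tensors. The helper name and the constant values are assumptions;
# the called functions (value_ops.discounted_return,
# value_ops.generalized_advantage_estimation, common.tensor_extend,
# common.tensor_extend_zero) are the ones used above.
def _example_gae_path():
    rewards = tf.constant([[1.]] * 4)                      # [T=4, B=1]
    values = tf.constant([[0.5]] * 4)
    step_types = tf.constant([[StepType.MID]] * 4, tf.int32)
    discounts = tf.constant([[0.9]] * 4)
    # discounted_return / GAE produce [T-1, B]; extend back to [T, B] so the
    # results line up with `values`.
    returns = value_ops.discounted_return(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts)
    returns = common.tensor_extend(returns, values[-1])
    advantages = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        td_lambda=0.95)
    advantages = common.tensor_extend_zero(advantages)
    return returns, advantages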
def test_vtrace_returns_and_advantages_impl_on_policy_no_last_step(self):
    """Test vtrace_returns_and_advantages_impl on-policy, with no LAST step
    in the middle of the trajectory.
    """
    importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
    values = tf.constant([[2.] * 5], tf.float32)
    step_types = tf.constant([[StepType.MID] * 5], tf.int64)
    rewards = tf.constant([[3.] * 5], tf.float32)
    discounts = tf.constant([[0.9] * 5], tf.float32)
    td_lambda = 1.0

    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    # Compare against the reference vtrace implementation from scalable_agent.
    sa_returns, sa_adv = vtrace_scalable_agent(importance_ratio_clipped,
                                               discounts, rewards, values,
                                               step_types)
    self.assertAllClose(
        sa_adv, advantages, msg='advantages differ from scalable_agent')
    self.assertAllClose(
        sa_returns, returns, msg='returns differ from scalable_agent')

    # On policy (clipped ratio 1), vtrace advantages reduce to GAE.
    expected_advantages = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        td_lambda=td_lambda,
        time_major=False)
    expected_advantages = tf.transpose(a=expected_advantages)
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ from gold')

    # On policy, vtrace returns reduce to discounted returns.
    expected_returns = value_ops.discounted_return(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        time_major=False)
    expected_returns = tf.transpose(a=expected_returns)
    values = tf.transpose(a=values)
    expected_returns = common.tensor_extend(expected_returns, values[-1])
    expected_returns = tf.transpose(a=expected_returns)
    self.assertAllClose(
        expected_returns, returns, msg='returns differ from gold')
def test_vtrace_returns_and_advantages_impl_on_policy_has_last_step(self):
    """Test vtrace_returns_and_advantages_impl on-policy, with a LAST step
    in the middle of the trajectory.
    """
    importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
    values = tf.constant([[2., 2.1, 2.2, 2.3, 2.4]], tf.float32)
    step_types = tf.constant([[
        StepType.MID, StepType.MID, StepType.LAST, StepType.MID, StepType.MID
    ]], tf.int32)
    rewards = tf.constant([[3., 3.1, 3.2, 3.3, 3.4]], tf.float32)
    discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
    td_lambda = 1.0

    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    expected_advantages = value_ops.generalized_advantage_estimation(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        td_lambda=td_lambda,
        time_major=False)
    expected_advantages = tf.transpose(a=expected_advantages)
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ')

    expected_returns = value_ops.discounted_return(
        rewards=rewards,
        values=values,
        step_types=step_types,
        discounts=discounts,
        time_major=False)
    expected_returns = tf.transpose(a=expected_returns)
    values = tf.transpose(a=values)
    expected_returns = common.tensor_extend(expected_returns, values[-1])
    expected_returns = tf.transpose(a=expected_returns)
    self.assertAllClose(expected_returns, returns, msg='returns differ')
def vtrace_scalable_agent(imp_weights, discounts, rewards, values, step_types):
    # scalable_agent has a one-step shifted definition of some of these
    # values. E.g. action in alf is prev_action that caused the current
    # reward.
    log_imp_weights = tf.math.log(imp_weights)
    log_imp_weights = tf.transpose(a=log_imp_weights)[:-1]
    discounts = tf.transpose(a=discounts)[1:]
    rewards = tf.transpose(a=rewards)[1:]
    values = tf.transpose(a=values)
    final_value = values[-1]
    values = values[:-1]

    vtrace_returns = from_importance_weights(
        log_rhos=log_imp_weights,
        discounts=discounts,
        rewards=rewards,
        values=values,
        bootstrap_value=final_value,
        clip_rho_threshold=1.0,
        clip_pg_rho_threshold=1.0)

    vs = vtrace_returns.vs
    vs = common.tensor_extend(vs, final_value)
    adv = vtrace_returns.pg_advantages
    adv = common.tensor_extend_zero(adv)
    vs = tf.transpose(a=vs)
    adv = tf.transpose(a=adv)
    return vs, adv
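
# A small sketch (illustrative only; the helper name and constants are
# assumptions) of the one-step alignment done in vtrace_scalable_agent above:
# alf's reward[t] follows the action taken at step t-1, so rewards and
# discounts are shifted forward by one step and the last value is used as the
# bootstrap value.
def _example_shift_alignment():
    rewards = tf.constant([[3., 3.1, 3.2, 3.3]])  # [B, T], alf convention
    discounts = tf.constant([[0.9] * 4])
    values = tf.constant([[2., 2.1, 2.2, 2.3]])
    rewards_t = tf.transpose(a=rewards)[1:]       # drop the first reward
    discounts_t = tf.transpose(a=discounts)[1:]   # drop the first discount
    values_t = tf.transpose(a=values)
    bootstrap_value = values_t[-1]                # last value bootstraps
    values_t = values_t[:-1]                      # paired with shifted rewards
    return rewards_t, discounts_t, values_t, bootstrap_value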
def test_vtrace_returns_and_advantages_impl_off_policy_has_last_step(self):
    """Test vtrace_returns_and_advantages_impl off-policy, with a LAST step
    in the middle of the trajectory.
    """
    r = 0.999
    d = 0.9
    importance_ratio_clipped = tf.constant([[r] * 5], tf.float32)
    values = tf.constant([[2.] * 5], tf.float32)
    step_types = tf.constant([[
        StepType.MID, StepType.MID, StepType.LAST, StepType.MID, StepType.MID
    ]], tf.int32)
    rewards = tf.constant([[3.] * 5], tf.float32)
    discounts = tf.constant([[d, d, 0., d, d]])
    td_lambda = 1.0

    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    td3 = (3. + 2. * d - 2.) * r
    expected_returns = tf.constant(
        [[td3 + d * r * (3. - 2.) * r, r, 0, td3, 0]], tf.float32) + values
    # 5.695401, 2.999, 2., 4.7972, 2.
    self.assertAllClose(expected_returns, returns, msg='returns differ')

    is_lasts = tf.cast(
        tf.equal(tf.transpose(a=step_types), StepType.LAST), tf.float32)
    expected_advantages = (1 - is_lasts[:-1]) * r * (
        tf.transpose(a=rewards)[1:] +
        tf.transpose(a=discounts)[1:] * tf.transpose(a=expected_returns)[1:] -
        tf.transpose(a=values)[:-1])
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    # 3.695401, 0.999, 0., 2.7972, 0.
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ')

    # A case where values are not uniform over time.
    values = tf.constant([[0., 1., 2., 3., 4.]], tf.float32)
    returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
        importance_ratio_clipped,
        rewards,
        values,
        step_types,
        discounts,
        time_major=False)

    td3 = (3. + 4. * d - 3) * r
    td1 = 2 * r
    expected_returns = tf.constant(
        [[(3. + 1. * d - 0) * r + d * r * td1, td1, 0, td3, 0]],
        tf.float32) + values
    # 5.692502, 2.998, 2., 6.5964, 4.
    self.assertAllClose(expected_returns, returns, msg='returns differ')

    is_lasts = tf.cast(
        tf.equal(tf.transpose(a=step_types), StepType.LAST), tf.float32)
    expected_advantages = (1 - is_lasts[:-1]) * r * (
        tf.transpose(a=rewards)[1:] +
        tf.transpose(a=discounts)[1:] * tf.transpose(a=expected_returns)[1:] -
        tf.transpose(a=values)[:-1])
    expected_advantages = common.tensor_extend_zero(expected_advantages)
    expected_advantages = tf.transpose(a=expected_advantages)
    # 5.692502, 1.998, 0., 3.5964, 0.
    self.assertAllClose(
        expected_advantages, advantages, msg='advantages differ')
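
# A hand-check sketch (hypothetical helper, not used by the tests) that
# spells out the arithmetic behind the first expected_returns entry of the
# uniform-value case above, using the same constants r=0.999 and d=0.9.
def _example_vtrace_hand_check():
    r, d = 0.999, 0.9
    # Clipped one-step TD errors after shifting rewards/discounts by one:
    td0 = (3. + d * 2. - 2.) * r   # step 0: next value 2., own value 2.
    td1 = (3. + 0. * 2. - 2.) * r  # step 1: discount is 0 since step 2 is LAST
    # Return at step 0 = own value + td0 + the discounted, importance-weighted
    # correction from the next step.
    ret0 = 2. + td0 + d * r * td1  # ~5.695401, matches expected_returns[0][0]
    return ret0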
def vtrace_returns_and_advantages_impl(importance_ratio_clipped,
                                       rewards,
                                       values,
                                       step_types,
                                       discounts,
                                       td_lambda=1,
                                       time_major=True):
    """Actual implementation after the importance ratios have been computed.

    Args:
        importance_ratio_clipped (Tensor): shape is [T, B], the clipped vtrace
            importance sampling weights.
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T, B] (or [T]) representing values.
        step_types (Tensor): shape is [T, B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        td_lambda (float): A scalar in [0, 1]. It's used for variance
            reduction in temporal difference.
        time_major (bool): Whether input tensors are time major. False means
            input tensors have shape [B, T].
    Returns:
        Two tensors with shape [T, B] representing returns and advantages.
        Shape is [B, T] when time_major is False. The advantages returned
        are importance-weighted.
    """
    if not time_major:
        importance_ratio_clipped = tf.transpose(a=importance_ratio_clipped)
        discounts = tf.transpose(a=discounts)
        rewards = tf.transpose(a=rewards)
        values = tf.transpose(a=values)
        step_types = tf.transpose(a=step_types)

    importance_ratio_clipped = importance_ratio_clipped[:-1]
    rewards = rewards[1:]
    next_values = values[1:]
    final_value = values[-1]
    values = values[:-1]
    discounts = discounts[1:]
    step_types = step_types[:-1]

    is_lasts = tf.cast(tf.equal(step_types, StepType.LAST), tf.float32)
    # Importance-weighted one-step TD errors.
    tds = (importance_ratio_clipped *
           (rewards + discounts * next_values - values))
    weighted_discounts = discounts * importance_ratio_clipped * td_lambda

    def vs_target_minus_vs_fn(vs_target_minus_vs, params):
        weighted_discount, td, is_last = params
        return (1 - is_last) * (td + weighted_discount * vs_target_minus_vs)

    vs_target_minus_vs = tf.scan(
        fn=vs_target_minus_vs_fn,
        elems=(weighted_discounts, tds, is_lasts),
        initializer=tf.zeros_like(final_value),
        reverse=True,
        back_prop=False)

    returns = vs_target_minus_vs + values
    returns = common.tensor_extend(returns, final_value)

    next_vs_targets = returns[1:]
    # Note: the advantage of the last step cannot be computed, and is assumed
    # to be 0.
    advantages = (1 - is_lasts) * importance_ratio_clipped * (
        rewards + discounts * next_vs_targets - values)
    advantages = common.tensor_extend_zero(advantages)

    if not time_major:
        returns = tf.transpose(a=returns)
        advantages = tf.transpose(a=advantages)
    return returns, advantages
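
# A minimal usage sketch (illustrative only; the helper name and constants are
# assumptions) calling vtrace_returns_and_advantages_impl with batch-major
# [B, T] toy tensors. On-policy (clipped ratio 1), the results match
# discounted returns and GAE, as the tests above verify.
def _example_vtrace_usage():
    ratio = tf.ones([1, 4])                  # clipped IS weights, [B, T]
    rewards = tf.constant([[1., 1., 1., 1.]])
    values = tf.constant([[0.5, 0.5, 0.5, 0.5]])
    step_types = tf.constant([[StepType.MID] * 4], tf.int32)
    discounts = tf.constant([[0.9] * 4])
    returns, advantages = vtrace_returns_and_advantages_impl(
        ratio, rewards, values, step_types, discounts,
        td_lambda=1.0, time_major=False)
    # Both outputs have shape [B, T] = [1, 4]; the last-step advantage is 0.
    return returns, advantages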