Example #1
    def _calc_returns_and_advantages(self, training_info, value):
        if self._use_vtrace:
            return value_ops.calc_vtrace_returns_and_advantages(
                training_info, value, self._gamma, self._action_spec,
                self._lambda, self._debug_summaries)

        returns = value_ops.discounted_return(
            rewards=training_info.reward,
            values=value,
            step_types=training_info.step_type,
            discounts=training_info.discount * self._gamma)
        returns = common.tensor_extend(returns, value[-1])

        if not self._use_gae:
            advantages = returns - value
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=training_info.reward,
                values=value,
                step_types=training_info.step_type,
                discounts=training_info.discount * self._gamma,
                td_lambda=self._lambda)
            advantages = common.tensor_extend_zero(advantages)
            if self._use_td_lambda_return:
                returns = advantages + value

        return returns, advantages
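
The method above switches between vtrace returns, plain discounted returns, and GAE. As a minimal standalone sketch (an illustration, not part of the original source), assuming the same StepType, value_ops and common imports the tests below rely on, and time-major [T, B] tensors, the GAE branch can be exercised directly:

import tensorflow as tf

# Toy time-major batch: T = 5 steps, B = 1 environment; illustrative constants.
gamma, td_lambda = 0.99, 0.95
rewards = tf.ones([5, 1], tf.float32) * 3.
values = tf.ones([5, 1], tf.float32) * 2.
step_types = tf.constant([[StepType.MID]] * 5, tf.int32)
discounts = tf.ones([5, 1], tf.float32) * gamma

advantages = value_ops.generalized_advantage_estimation(
    rewards=rewards,
    values=values,
    step_types=step_types,
    discounts=discounts,
    td_lambda=td_lambda)
# The last step has no advantage estimate, so pad with zero to get back to [T, B].
advantages = common.tensor_extend_zero(advantages)
# TD(lambda) returns, as used when _use_td_lambda_return is True.
returns = advantages + values
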
Example #2
    def test_vtrace_returns_and_advantages_impl_on_policy_no_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy no last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2.] * 5], tf.float32)
        step_types = tf.constant([[StepType.MID] * 5], tf.int64)
        rewards = tf.constant([[3.] * 5], tf.float32)
        discounts = tf.constant([[0.9] * 5], tf.float32)
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)
        sa_returns, sa_adv = vtrace_scalable_agent(importance_ratio_clipped,
                                                   discounts, rewards, values,
                                                   step_types)
        self.assertAllClose(sa_adv,
                            advantages,
                            msg='advantages differ from scalable_agent')
        self.assertAllClose(sa_returns,
                            returns,
                            msg='returns differ from scalable_agent')
        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ from gold')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns,
                            returns,
                            msg='returns differ from gold')
Example #3
    def test_vtrace_returns_and_advantages_impl_on_policy_has_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy has last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2., 2.1, 2.2, 2.3, 2.4]], tf.float32)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        rewards = tf.constant([[3., 3.1, 3.2, 3.3, 3.4]], tf.float32)
        discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)

        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns, returns, msg='returns differ')
Example #4
def vtrace_scalable_agent(imp_weights, discounts, rewards, values, step_types):
    # scalable_agent uses a one-step-shifted definition of some of these values:
    # e.g. alf's action is the prev_action that caused the current reward.
    log_imp_weights = tf.math.log(imp_weights)
    log_imp_weights = tf.transpose(a=log_imp_weights)[:-1]
    discounts = tf.transpose(a=discounts)[1:]
    rewards = tf.transpose(a=rewards)[1:]
    values = tf.transpose(a=values)
    final_value = values[-1]
    values = values[:-1]
    vtrace_returns = from_importance_weights(log_rhos=log_imp_weights,
                                             discounts=discounts,
                                             rewards=rewards,
                                             values=values,
                                             bootstrap_value=final_value,
                                             clip_rho_threshold=1.0,
                                             clip_pg_rho_threshold=1.0)
    vs = vtrace_returns.vs
    vs = common.tensor_extend(vs, final_value)
    adv = vtrace_returns.pg_advantages
    adv = common.tensor_extend_zero(adv)
    vs = tf.transpose(a=vs)
    adv = tf.transpose(a=adv)
    return vs, adv
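
A small illustration of that one-step shift (hypothetical numbers, not taken from either agent): alf attaches reward[t] to the step it arrives with, i.e. it was produced by the action taken at step t - 1, whereas scalable_agent indexes the same reward by the step at which the action was taken. That is why the adapter above drops the first reward and discount and the last importance weight.

import numpy as np

# alf convention: reward[t] arrives with step t and was caused by the action
# taken at step t - 1.
alf_rewards = np.array([10., 11., 12., 13., 14.])

# scalable_agent convention: rewards[t] is the reward produced by the action
# taken at step t, so alf's reward[t + 1] plays that role.
sa_rewards = alf_rewards[1:]    # [11., 12., 13., 14.]
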
Example #5
    def test_vtrace_returns_and_advantages_impl_off_policy_has_last_step(self):
        """Test vtrace_returns_and_advantages_impl off policy has last_step
            in the middle of the trajectory.
        """
        r = 0.999
        d = 0.9
        importance_ratio_clipped = tf.constant([[r] * 5], tf.float32)
        values = tf.constant([[2.] * 5], tf.float32)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        rewards = tf.constant([[3.] * 5], tf.float32)
        discounts = tf.constant([[d, d, 0., d, d]])
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)

        td3 = (3. + 2. * d - 2.) * r
        expected_returns = tf.constant(
            [[td3 + d * r * (3. - 2.) * r, r, 0, td3, 0]], tf.float32) + values
        # 5.695401, 2.999   , 2.      , 4.7972  , 2.
        self.assertAllClose(expected_returns, returns, msg='returns differ')

        is_lasts = tf.cast(tf.equal(tf.transpose(a=step_types), StepType.LAST),
                           tf.float32)
        expected_advantages = (1 - is_lasts[:-1]) * r * (
            tf.transpose(a=rewards)[1:] + tf.transpose(a=discounts)[1:] *
            tf.transpose(a=expected_returns)[1:] - tf.transpose(a=values)[:-1])
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        # 3.695401, 0.999   , 0.      , 2.7972  , 0.
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ')

        # a case where values are not uniform over time.
        values = tf.constant([[0., 1., 2., 3., 4.]], tf.float32)
        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)

        td3 = (3. + 4. * d - 3) * r
        td1 = 2 * r
        expected_returns = tf.constant([[(3. + 1. * d - 0) * r + d * r * td1,
                                         td1, 0, td3, 0]], tf.float32) + values
        # 5.692502, 2.998   , 2.      , 6.5964  , 4.
        self.assertAllClose(expected_returns, returns, msg='returns differ')

        is_lasts = tf.cast(tf.equal(tf.transpose(a=step_types), StepType.LAST),
                           tf.float32)
        expected_advantages = (1 - is_lasts[:-1]) * r * (
            tf.transpose(a=rewards)[1:] + tf.transpose(a=discounts)[1:] *
            tf.transpose(a=expected_returns)[1:] - tf.transpose(a=values)[:-1])
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        # 5.692502, 1.998   , 0.      , 3.5964  , 0.
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ')
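
As a hand check of the first commented value above (arithmetic on the same constants, not part of the test): the first return entry is the baseline value plus the weighted TD error of step 0 plus the discounted, weighted TD error of step 1, whose own bootstrap discount is 0 because step 2 is a LAST step.

r, d = 0.999, 0.9
td0 = r * (3. + d * 2. - 2.)    # weighted TD error at step 0: 2.7972
td1 = r * (3. + 0. * 2. - 2.)   # weighted TD error at step 1: 0.999
print(2. + td0 + d * r * td1)   # 5.6954009..., the 5.695401 above
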
Example #6
def vtrace_returns_and_advantages_impl(importance_ratio_clipped,
                                       rewards,
                                       values,
                                       step_types,
                                       discounts,
                                       td_lambda=1,
                                       time_major=True):
    """Actual implementation after getting importance_ratios

    Args:
        importance_ratio_clipped (Tensor): shape is [T, B]; the clipped
            vtrace importance-sampling weights.
        rewards (Tensor): shape is [T, B] (or [T]) representing rewards.
        values (Tensor): shape is [T,B] (or [T]) representing values.
        step_types (Tensor): shape is [T,B] (or [T]) representing step types.
        discounts (Tensor): shape is [T, B] (or [T]) representing discounts.
        td_lambda (float): A scalar in [0, 1] used for variance reduction in
            the temporal-difference estimates.
        time_major (bool): Whether input tensors are time major.
            False means input tensors have shape [B, T].

    Returns:
        Two tensors with shape [T, B] representing returns and advantages.
        Shape is [B, T] when time_major is False.
        The advantages returned are importance-weighted.
    """
    if not time_major:
        importance_ratio_clipped = tf.transpose(a=importance_ratio_clipped)
        discounts = tf.transpose(a=discounts)
        rewards = tf.transpose(a=rewards)
        values = tf.transpose(a=values)
        step_types = tf.transpose(a=step_types)

    importance_ratio_clipped = importance_ratio_clipped[:-1]
    rewards = rewards[1:]
    next_values = values[1:]
    final_value = values[-1]
    values = values[:-1]
    discounts = discounts[1:]
    step_types = step_types[:-1]
    is_lasts = tf.cast(tf.equal(step_types, StepType.LAST), tf.float32)

    tds = (importance_ratio_clipped *
           (rewards + discounts * next_values - values))
    weighted_discounts = discounts * importance_ratio_clipped * td_lambda

    def vs_target_minus_vs_fn(vs_target_minus_vs, params):
        weighted_discount, td, is_last = params
        return (1 - is_last) * (td + weighted_discount * vs_target_minus_vs)

    vs_target_minus_vs = tf.scan(fn=vs_target_minus_vs_fn,
                                 elems=(weighted_discounts, tds, is_lasts),
                                 initializer=tf.zeros_like(final_value),
                                 reverse=True,
                                 back_prop=False)

    returns = vs_target_minus_vs + values
    returns = common.tensor_extend(returns, final_value)

    next_vs_targets = returns[1:]

    # Note: the advantage of the last step cannot be computed and is set to 0.
    advantages = (1 - is_lasts) * importance_ratio_clipped * (
        rewards + discounts * next_vs_targets - values)
    advantages = common.tensor_extend_zero(advantages)

    if not time_major:
        returns = tf.transpose(a=returns)
        advantages = tf.transpose(a=advantages)

    return returns, advantages
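
For reference, the reverse tf.scan above implements a backward recursion over importance-weighted TD errors that resets at episode boundaries. The plain-numpy sketch below (written for this listing, not alf code) mirrors the same slicing for a single environment and reproduces the first off-policy case from Example #5:

import numpy as np

def vtrace_returns_reference(ratio, rewards, values, discounts, is_lasts,
                             td_lambda=1.0):
    """Reference recursion for [T]-shaped (single environment) numpy arrays."""
    rho = ratio[:-1]
    r = rewards[1:]
    next_v = values[1:]
    v = values[:-1]
    disc = discounts[1:]
    last = is_lasts[:-1]

    tds = rho * (r + disc * next_v - v)
    weighted_disc = disc * rho * td_lambda

    # Backward pass: vs_target - vs, reset to 0 across episode boundaries.
    acc = 0.0
    vtmv = np.zeros_like(v)
    for t in reversed(range(len(v))):
        acc = (1. - last[t]) * (tds[t] + weighted_disc[t] * acc)
        vtmv[t] = acc

    returns = np.append(vtmv + v, values[-1])
    # The advantage of the final step cannot be computed and is padded with 0.
    adv = (1. - last) * rho * (r + disc * returns[1:] - v)
    return returns, np.append(adv, 0.)

returns, advantages = vtrace_returns_reference(
    ratio=np.full(5, 0.999),
    rewards=np.full(5, 3.),
    values=np.full(5, 2.),
    discounts=np.array([0.9, 0.9, 0., 0.9, 0.9]),
    is_lasts=np.array([0., 0., 1., 0., 0.]))
# returns    ~= [5.695401, 2.999, 2., 4.7972, 2.]
# advantages ~= [3.695401, 0.999, 0., 2.7972, 0.]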