Example #1
    def test_generalized_advantage_estimation(self):
        values = tf.constant([[2.] * 5], tf.float32)
        step_types = tf.constant([[StepType.MID] * 5], tf.int64)
        rewards = tf.constant([[3.] * 5], tf.float32)
        discounts = tf.constant([[0.9] * 5], tf.float32)
        td_lambda = 0.6 / 0.9  # chosen so that discount * td_lambda == 0.6

        d = 2 * 0.9 + 1  # one-step TD error: 3 + 0.9 * 2 - 2 = 2.8
        expected = tf.constant([[((d * 0.6 + d) * 0.6 + d) * 0.6 + d,
                                 (d * 0.6 + d) * 0.6 + d, d * 0.6 + d, d]],
                               dtype=tf.float32)
        self.assertAllClose(
            value_ops.generalized_advantage_estimation(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       td_lambda=td_lambda,
                                                       time_major=False),
            expected)

        # Two episodes; the first is cut off by the time limit, so the discount
        # at the LAST step is not zeroed.

        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        expected = tf.constant([[d * 0.6 + d, d, 0, d]], dtype=tf.float32)
        self.assertAllClose(
            value_ops.generalized_advantage_estimation(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       td_lambda=td_lambda,
                                                       time_major=False),
            expected)

        # Two episodes; the first ends normally, so the discount at the LAST
        # step is 0.
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
        expected = tf.constant([[1 * 0.6 + d, 1, 0, d]], dtype=tf.float32)

        self.assertAllClose(
            value_ops.generalized_advantage_estimation(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       td_lambda=td_lambda,
                                                       time_major=False),
            expected)
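
The arithmetic behind the first expected tensor can be reproduced with a short
NumPy sketch. This is only an illustration of one reading of the indexing
convention (rewards[t] and discounts[t] belong to the transition into step t, no
advantage is emitted for the final step, and episode boundaries are ignored);
the name gae_reference is made up here and is not part of value_ops.

import numpy as np

def gae_reference(rewards, values, discounts, td_lambda):
    # delta[t] = r[t+1] + gamma[t+1] * V[t+1] - V[t]
    delta = rewards[1:] + discounts[1:] * values[1:] - values[:-1]
    weight = discounts[1:] * td_lambda  # 0.9 * (0.6 / 0.9) = 0.6 in the test
    adv = np.zeros_like(delta)
    acc = 0.0
    for t in reversed(range(len(delta))):
        acc = delta[t] + weight[t] * acc  # adv[t] = delta[t] + gamma*lambda*adv[t+1]
        adv[t] = acc
    return adv

print(gae_reference(np.full(5, 3.), np.full(5, 2.), np.full(5, 0.9), 0.6 / 0.9))
# -> [6.0928 5.488 4.48 2.8], matching the first expected tensor with d = 2.8
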
Example #2
    def _calc_returns_and_advantages(self, training_info, value):
        if self._use_vtrace:
            return value_ops.calc_vtrace_returns_and_advantages(
                training_info, value, self._gamma, self._action_spec,
                self._lambda, self._debug_summaries)

        returns = value_ops.discounted_return(
            rewards=training_info.reward,
            values=value,
            step_types=training_info.step_type,
            discounts=training_info.discount * self._gamma)
        returns = common.tensor_extend(returns, value[-1])

        if not self._use_gae:
            advantages = returns - value
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=training_info.reward,
                values=value,
                step_types=training_info.step_type,
                discounts=training_info.discount * self._gamma,
                td_lambda=self._lambda)
            advantages = common.tensor_extend_zero(advantages)
            if self._use_td_lambda_return:
                returns = advantages + value

        return returns, advantages
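
A note on the helpers above: discounted_return and generalized_advantage_estimation
return one fewer time step than value, so the results are extended back to full
length before use. The following is only a sketch of the assumed semantics of
common.tensor_extend and common.tensor_extend_zero (time-major tensors, one entry
appended along the leading time axis), not the library code itself.

import torch

def tensor_extend(x, new_slice):
    # append new_slice as one extra time step: [T-1, B, ...] -> [T, B, ...]
    return torch.cat([x, new_slice.unsqueeze(0)], dim=0)

def tensor_extend_zero(x):
    # append a zero time step instead
    return torch.cat([x, torch.zeros_like(x[:1])], dim=0)
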
Example #3
    def test_vtrace_returns_and_advantages_impl_on_policy_no_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy no last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2.] * 5], tf.float32)
        step_types = tf.constant([[StepType.MID] * 5], tf.int64)
        rewards = tf.constant([[3.] * 5], tf.float32)
        discounts = tf.constant([[0.9] * 5], tf.float32)
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)
        sa_returns, sa_adv = vtrace_scalable_agent(importance_ratio_clipped,
                                                   discounts, rewards, values,
                                                   step_types)
        self.assertAllClose(sa_adv,
                            advantages,
                            msg='advantages differ from scalable_agent')
        self.assertAllClose(sa_returns,
                            returns,
                            msg='returns differ from scalable_agent')
        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ from gold')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns,
                            returns,
                            msg='returns differ from gold')
Example #4
    def _check(self, rewards, values, step_types, discounts, td_lambda,
               expected):
        np.testing.assert_array_almost_equal(
            value_ops.generalized_advantage_estimation(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       td_lambda=td_lambda,
                                                       time_major=False),
            expected)

        np.testing.assert_array_almost_equal(
            value_ops.generalized_advantage_estimation(
                rewards=torch.stack([rewards, 2 * rewards], dim=2),
                values=torch.stack([values, 2 * values], dim=2),
                step_types=step_types,
                discounts=discounts,
                td_lambda=td_lambda,
                time_major=False),
            torch.stack([expected, 2 * expected], dim=2),
            decimal=5)
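
The second assertion in _check relies on generalized_advantage_estimation
accepting an extra trailing reward dimension and on GAE being linear in
(rewards, values) for fixed discounts: doubling both doubles every TD error, and
the backward accumulation is linear in the TD errors. A self-contained check of
that linearity with a hand-rolled recursion (illustration only, same assumed
index convention as the sketch after Example #1):

import numpy as np

def gae_ref(r, v, g, lam):
    delta = r[1:] + g[1:] * v[1:] - v[:-1]
    adv, acc = np.zeros_like(delta), 0.0
    for t in reversed(range(len(delta))):
        acc = delta[t] + g[t + 1] * lam * acc
        adv[t] = acc
    return adv

r, v, g = np.random.rand(5), np.random.rand(5), np.full(5, 0.9)
print(np.allclose(gae_ref(2 * r, 2 * v, g, 0.5), 2 * gae_ref(r, v, g, 0.5)))  # True
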
Example #5
 def preprocess_experience(self, exp: Experience):
     """Compute advantages and put it into exp.info."""
     advantages = value_ops.generalized_advantage_estimation(
         rewards=exp.reward,
         values=exp.info.value,
         step_types=exp.step_type,
         discounts=exp.discount * self._loss._gamma,
         td_lambda=self._loss._lambda,
         time_major=False)
     advantages = tf.concat([
         advantages,
         tf.zeros(shape=common.concat_shape(tf.shape(advantages)[:-1], [1]),
                  dtype=advantages.dtype)
     ],
                            axis=-1)
     returns = exp.info.value + advantages
     return exp._replace(info=PPOInfo(returns, advantages))
Example #6
    def test_vtrace_returns_and_advantages_impl_on_policy_has_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy has last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2., 2.1, 2.2, 2.3, 2.4]], tf.float32)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        rewards = tf.constant([[3., 3.1, 3.2, 3.3, 3.4]], tf.float32)
        discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)

        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns, returns, msg='returns differ')
Example #7
 def preprocess_experience(self, exp: Experience):
     """Compute advantages and put it into exp.rollout_info."""
     advantages = value_ops.generalized_advantage_estimation(
         rewards=exp.reward,
         values=exp.rollout_info.value,
         step_types=exp.step_type,
         discounts=exp.discount * self._loss._gamma,
         td_lambda=self._loss._lambda,
         time_major=False)
     advantages = torch.cat([
         advantages,
         torch.zeros(*advantages.shape[:-1], 1, dtype=advantages.dtype)
     ],
                            dim=-1)
     returns = exp.rollout_info.value + advantages
     return exp._replace(rollout_info=PPOInfo(
         exp.rollout_info.action_distribution, returns, advantages))
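
Since time_major=False here, the trailing axis of advantages is the time axis;
under that assumption the concat above is equivalent to a one-line constant pad
(shown only as an alternative, not as the library's preferred form):

import torch.nn.functional as F

# append a single zero at the end of the last (time) axis
advantages = F.pad(advantages, (0, 1))
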
Example #8
 def preprocess_experience(self, exp: Experience):
     """Compute advantages and put it into exp.info."""
     reward = exp.reward
     if self._algorithm._reward_shaping_fn is not None:
         reward = self._algorithm._reward_shaping_fn(reward)
     reward = self._algorithm.calc_training_reward(reward, exp.info)
     advantages = value_ops.generalized_advantage_estimation(
         rewards=reward,
         values=exp.info.value,
         step_types=exp.step_type,
         discounts=exp.discount * self._algorithm._loss._gamma,
         td_lambda=self._algorithm._loss._lambda,
         time_major=False)
     advantages = tf.concat([
         advantages,
         tf.zeros(advantages.shape.as_list()[:-1] + [1],
                  dtype=advantages.dtype)
     ],
                            axis=-1)
     returns = exp.info.value + advantages
     return exp._replace(info=PPOInfo(returns, advantages))
Example #9
    def _calc_returns_and_advantages(self, experience, value):
        returns = value_ops.discounted_return(rewards=experience.reward,
                                              values=value,
                                              step_types=experience.step_type,
                                              discounts=experience.discount *
                                              self._gamma)
        returns = tensor_utils.tensor_extend(returns, value[-1])

        if not self._use_gae:
            advantages = returns - value
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=experience.reward,
                values=value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma,
                td_lambda=self._lambda)
            advantages = tensor_utils.tensor_extend_zero(advantages)
            if self._use_td_lambda_return:
                returns = advantages + value

        return returns, advantages
Example #10
    def forward(self, experience, value, target_value):
        """Cacluate the loss.

        The first dimension of all the tensors is the time dimension and the
        second dimension is the batch dimension.

        Args:
            experience (Experience): experience collected from ``unroll()`` or
                a replay buffer. All tensors are time-major.
            value (torch.Tensor): the time-major tensor for the value at each time
                step. The loss is between this and the calculated return.
            target_value (torch.Tensor): the time-major tensor for the value at
                each time step. This is used to calculate the return.
                ``target_value`` can be the same as ``value``.
        Returns:
            LossInfo: with the ``extra`` field same as ``loss``.
        """
        if self._lambda == 1.0:
            returns = value_ops.discounted_return(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma)
        elif self._lambda == 0.0:
            returns = value_ops.one_step_discounted_return(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma)
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma,
                td_lambda=self._lambda)
            returns = advantages + target_value[:-1]

        value = value[:-1]

        if self._debug_summaries and alf.summary.should_record_summaries():
            mask = experience.step_type[:-1] != StepType.LAST
            with alf.summary.scope(self._name):

                def _summarize(v, r, td, suffix):
                    alf.summary.scalar(
                        "explained_variance_of_return_by_value" + suffix,
                        tensor_utils.explained_variance(v, r, mask))
                    safe_mean_hist_summary('values' + suffix, v, mask)
                    safe_mean_hist_summary('returns' + suffix, r, mask)
                    safe_mean_hist_summary("td_error" + suffix, td, mask)

                if value.ndim == 2:
                    _summarize(value, returns, returns - value, '')
                else:
                    td = returns - value
                    for i in range(value.shape[2]):
                        suffix = '/' + str(i)
                        _summarize(value[..., i], returns[..., i], td[..., i],
                                   suffix)

        loss = self._td_error_loss_fn(returns.detach(), value)

        if loss.ndim == 3:
            # Multi-dimensional reward: average the critic loss over the reward
            # dimensions.
            loss = loss.mean(dim=2)

        # The loss shape expected by Algorithm.update_with_gradient is [T, B],
        # so we pad it with a trailing row of zeros.
        loss = tensor_utils.tensor_extend_zero(loss)
        return LossInfo(loss=loss, extra=loss)
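
The three branches above agree at the boundary: with td_lambda equal to 1 the
GAE advantage plus the value reproduces the full discounted return bootstrapped
with the final value. A small numeric check on the constant toy data from
Example #1, using hand-rolled recursions under the same assumed index convention
(illustration only):

import numpy as np

r, V, g = np.full(5, 3.), np.full(5, 2.), np.full(5, 0.9)
delta = r[1:] + g[1:] * V[1:] - V[:-1]
adv, ret = np.zeros(4), np.zeros(4)
acc_a, acc_r = 0.0, V[-1]
for t in reversed(range(4)):
    acc_a = delta[t] + g[t + 1] * acc_a  # GAE accumulation with td_lambda = 1
    acc_r = r[t + 1] + g[t + 1] * acc_r  # discounted return bootstrapped with V[-1]
    adv[t], ret[t] = acc_a, acc_r
print(np.allclose(adv + V[:-1], ret))  # True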