Example 1
    def _check(self, rewards, values, step_types, discounts, expected):
        np.testing.assert_array_almost_equal(
            value_ops.discounted_return(rewards=rewards,
                                        values=values,
                                        step_types=step_types,
                                        discounts=discounts,
                                        time_major=False), expected)

        np.testing.assert_array_almost_equal(
            value_ops.discounted_return(
                rewards=torch.stack([rewards, 2 * rewards], dim=2),
                values=torch.stack([values, 2 * values], dim=2),
                step_types=step_types,
                discounts=discounts,
                time_major=False), torch.stack([expected, 2 * expected],
                                               dim=2))
Example 2
    def _calc_returns_and_advantages(self, training_info, value):
        if self._use_vtrace:
            return value_ops.calc_vtrace_returns_and_advantages(
                training_info, value, self._gamma, self._action_spec,
                self._lambda, self._debug_summaries)

        returns = value_ops.discounted_return(
            rewards=training_info.reward,
            values=value,
            step_types=training_info.step_type,
            discounts=training_info.discount * self._gamma)
        returns = common.tensor_extend(returns, value[-1])

        if not self._use_gae:
            advantages = returns - value
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=training_info.reward,
                values=value,
                step_types=training_info.step_type,
                discounts=training_info.discount * self._gamma,
                td_lambda=self._lambda)
            advantages = common.tensor_extend_zero(advantages)
            if self._use_td_lambda_return:
                returns = advantages + value

        return returns, advantages
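
The `returns = advantages + value` branch above relies on the fact that GAE advantages plus the value estimates give the TD(lambda) return, and that for `td_lambda == 1` this equals the discounted return used in the non-GAE branch. The following is a minimal NumPy sketch of that identity, not ALF code; it assumes the indexing convention exercised by the tests in the other examples (reward and discount taken at step t+1, bootstrapping from the final value, no episode boundaries).

import numpy as np

def discounted_return(rewards, values, discounts):
    # R_t = rewards[t+1] + discounts[t+1] * R_{t+1}, bootstrapped from the
    # final value estimate; the result has T-1 entries.
    T = len(rewards)
    ret, acc = np.zeros(T - 1), values[-1]
    for t in reversed(range(T - 1)):
        acc = rewards[t + 1] + discounts[t + 1] * acc
        ret[t] = acc
    return ret

def gae(rewards, values, discounts, td_lambda):
    # A_t = delta_t + td_lambda * discounts[t+1] * A_{t+1}, where
    # delta_t = rewards[t+1] + discounts[t+1] * values[t+1] - values[t].
    T = len(rewards)
    adv, acc = np.zeros(T - 1), 0.0
    for t in reversed(range(T - 1)):
        delta = rewards[t + 1] + discounts[t + 1] * values[t + 1] - values[t]
        acc = delta + td_lambda * discounts[t + 1] * acc
        adv[t] = acc
    return adv

rewards = np.array([3., 3.1, 3.2, 3.3, 3.4])
values = np.array([2., 2.1, 2.2, 2.3, 2.4])
discounts = np.full(5, 0.9)

advantages = gae(rewards, values, discounts, td_lambda=1.0)
returns = discounted_return(rewards, values, discounts)
# With td_lambda == 1 the TD(lambda) return reduces to the discounted return,
# so advantages + values reproduces `returns`.
np.testing.assert_allclose(advantages + values[:-1], returns)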
Example 3
    def test_discounted_return(self):
        values = tf.constant([[1.] * 5], tf.float32)
        step_types = tf.constant([[StepType.MID] * 5], tf.int64)
        rewards = tf.constant([[2.] * 5], tf.float32)
        discounts = tf.constant([[0.9] * 5], tf.float32)
        expected = tf.constant(
            [[(((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
              ((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
              (1 * 0.9 + 2) * 0.9 + 2, 1 * 0.9 + 2]],
            dtype=tf.float32)
        self.assertAllClose(
            value_ops.discounted_return(rewards=rewards,
                                        values=values,
                                        step_types=step_types,
                                        discounts=discounts,
                                        time_major=False), expected)

        # two episodes; the first ends by exceeding the time limit (discount=1)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        expected = tf.constant(
            [[(1 * 0.9 + 2) * 0.9 + 2, 1 * 0.9 + 2, 1, 1 * 0.9 + 2]],
            dtype=tf.float32)
        self.assertAllClose(
            value_ops.discounted_return(rewards=rewards,
                                        values=values,
                                        step_types=step_types,
                                        discounts=discounts,
                                        time_major=False), expected)

        # two episodes; the first ends normally (discount=0)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
        expected = tf.constant([[(0 * 0.9 + 2) * 0.9 + 2, 2, 1, 1 * 0.9 + 2]],
                               dtype=tf.float32)

        self.assertAllClose(
            value_ops.discounted_return(rewards=rewards,
                                        values=values,
                                        step_types=step_types,
                                        discounts=discounts,
                                        time_major=False), expected)
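
The expected constants in this test pin down the recursion that `discounted_return` encodes: iterate backwards with R_t = rewards[t+1] + discounts[t+1] * R_{t+1}, bootstrap from the final value, and reset the accumulator to values[t] at a LAST step. The plain-Python sketch below (not ALF or TF code; the StepType values are hypothetical integer stand-ins) reproduces all three expected rows above under that reading.

import numpy as np

MID, LAST = 1, 2   # hypothetical stand-ins for StepType, only for this sketch

def reference_discounted_return(rewards, values, step_types, discounts):
    # Backward recursion R_t = rewards[t+1] + discounts[t+1] * R_{t+1},
    # bootstrapped from the final value; at a LAST step the accumulator is
    # reset to values[t], so the next episode starts fresh.
    T = len(rewards)
    ret, acc = [0.0] * (T - 1), values[-1]
    for t in reversed(range(T - 1)):
        acc = rewards[t + 1] + discounts[t + 1] * acc
        if step_types[t] == LAST:
            acc = values[t]
        ret[t] = acc
    return ret

rewards, values = [2.0] * 5, [1.0] * 5

# single episode with constant discount 0.9 (first case above)
np.testing.assert_allclose(
    reference_discounted_return(rewards, values, [MID] * 5, [0.9] * 5),
    [(((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
     ((1 * 0.9 + 2) * 0.9 + 2) * 0.9 + 2,
     (1 * 0.9 + 2) * 0.9 + 2, 1 * 0.9 + 2])

step_types = [MID, MID, LAST, MID, MID]

# first episode truncated by the time limit, discount stays non-zero (second case)
np.testing.assert_allclose(
    reference_discounted_return(rewards, values, step_types, [0.9] * 5),
    [(1 * 0.9 + 2) * 0.9 + 2, 1 * 0.9 + 2, 1, 1 * 0.9 + 2])

# first episode ends normally, so its discount is 0 (third case)
np.testing.assert_allclose(
    reference_discounted_return(rewards, values, step_types,
                                [0.9, 0.9, 0.0, 0.9, 0.9]),
    [(0 * 0.9 + 2) * 0.9 + 2, 2, 1, 1 * 0.9 + 2])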
Example 4
    def test_vtrace_returns_and_advantages_impl_on_policy_no_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy no last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2.] * 5], tf.float32)
        step_types = tf.constant([[StepType.MID] * 5], tf.int64)
        rewards = tf.constant([[3.] * 5], tf.float32)
        discounts = tf.constant([[0.9] * 5], tf.float32)
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)
        sa_returns, sa_adv = vtrace_scalable_agent(importance_ratio_clipped,
                                                   discounts, rewards, values,
                                                   step_types)
        self.assertAllClose(sa_adv,
                            advantages,
                            msg='advantages differ from scalable_agent')
        self.assertAllClose(sa_returns,
                            returns,
                            msg='returns differ from scalable_agent')
        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ from gold')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns,
                            returns,
                            msg='returns differ from gold')
Example 5
    def test_vtrace_returns_and_advantages_impl_on_policy_has_last_step(self):
        """Test vtrace_returns_and_advantages_impl on policy has last_step
            in the middle of the trajectory.
        """
        importance_ratio_clipped = tf.constant([[1.] * 5], tf.float32)
        values = tf.constant([[2., 2.1, 2.2, 2.3, 2.4]], tf.float32)
        step_types = tf.constant([[
            StepType.MID, StepType.MID, StepType.LAST, StepType.MID,
            StepType.MID
        ]], tf.int32)
        rewards = tf.constant([[3., 3.1, 3.2, 3.3, 3.4]], tf.float32)
        discounts = tf.constant([[0.9, 0.9, 0.0, 0.9, 0.9]])
        td_lambda = 1.0

        returns, advantages = value_ops.vtrace_returns_and_advantages_impl(
            importance_ratio_clipped,
            rewards,
            values,
            step_types,
            discounts,
            time_major=False)

        expected_advantages = value_ops.generalized_advantage_estimation(
            rewards=rewards,
            values=values,
            step_types=step_types,
            discounts=discounts,
            td_lambda=td_lambda,
            time_major=False)
        expected_advantages = tf.transpose(a=expected_advantages)
        expected_advantages = common.tensor_extend_zero(expected_advantages)
        expected_advantages = tf.transpose(a=expected_advantages)
        self.assertAllClose(expected_advantages,
                            advantages,
                            msg='advantages differ')

        expected_returns = value_ops.discounted_return(rewards=rewards,
                                                       values=values,
                                                       step_types=step_types,
                                                       discounts=discounts,
                                                       time_major=False)
        expected_returns = tf.transpose(a=expected_returns)
        values = tf.transpose(a=values)
        expected_returns = common.tensor_extend(expected_returns, values[-1])
        expected_returns = tf.transpose(a=expected_returns)
        self.assertAllClose(expected_returns, returns, msg='returns differ')
Example 6
    def _calc_returns_and_advantages(self, experience, value):
        returns = value_ops.discounted_return(rewards=experience.reward,
                                              values=value,
                                              step_types=experience.step_type,
                                              discounts=experience.discount *
                                              self._gamma)
        returns = tensor_utils.tensor_extend(returns, value[-1])

        if not self._use_gae:
            advantages = returns - value
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=experience.reward,
                values=value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma,
                td_lambda=self._lambda)
            advantages = tensor_utils.tensor_extend_zero(advantages)
            if self._use_td_lambda_return:
                returns = advantages + value

        return returns, advantages
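
In both variants of `_calc_returns_and_advantages`, `discounted_return` and `generalized_advantage_estimation` yield T-1 time steps, so `tensor_extend` / `tensor_extend_zero` append one more step to bring `returns` and `advantages` back to the same [T, B] shape as `value`. Below is a minimal torch sketch of that shape bookkeeping; the two helpers are stand-ins that assume the real `tensor_utils` functions append along the leading (time) dimension, not the actual implementations.

import torch

def tensor_extend(x, last):
    # append `last` as one extra time step (assumed behavior of the helper)
    return torch.cat([x, last.unsqueeze(0)], dim=0)

def tensor_extend_zero(x):
    # append one time step of zeros (assumed behavior of the helper)
    return torch.cat([x, torch.zeros_like(x[:1])], dim=0)

value = torch.randn(6, 4)        # [T, B] time-major value estimates
returns = torch.randn(5, 4)      # discounted_return yields T-1 steps
advantages = torch.randn(5, 4)   # GAE also yields T-1 steps

assert tensor_extend(returns, value[-1]).shape == value.shape
assert tensor_extend_zero(advantages).shape == value.shape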
Example 7
    def forward(self, experience, value, target_value):
        """Cacluate the loss.

        The first dimension of all the tensors is time dimension and the second
        dimesion is the batch dimension.

        Args:
            experience (Experience): experience collected from ``unroll()`` or
                a replay buffer. All tensors are time-major.
            value (torch.Tensor): the time-major tensor for the value at each time
                step. The loss is between this and the calculated return.
            target_value (torch.Tensor): the time-major tensor for the value at
                each time step. This is used to calculate the return.
                ``target_value`` can be the same as ``value``.
        Returns:
            LossInfo: with the ``extra`` field same as ``loss``.
        """
        if self._lambda == 1.0:
            returns = value_ops.discounted_return(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma)
        elif self._lambda == 0.0:
            returns = value_ops.one_step_discounted_return(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma)
        else:
            advantages = value_ops.generalized_advantage_estimation(
                rewards=experience.reward,
                values=target_value,
                step_types=experience.step_type,
                discounts=experience.discount * self._gamma,
                td_lambda=self._lambda)
            returns = advantages + target_value[:-1]

        value = value[:-1]

        if self._debug_summaries and alf.summary.should_record_summaries():
            mask = experience.step_type[:-1] != StepType.LAST
            with alf.summary.scope(self._name):

                def _summarize(v, r, td, suffix):
                    alf.summary.scalar(
                        "explained_variance_of_return_by_value" + suffix,
                        tensor_utils.explained_variance(v, r, mask))
                    safe_mean_hist_summary('values' + suffix, v, mask)
                    safe_mean_hist_summary('returns' + suffix, r, mask)
                    safe_mean_hist_summary("td_error" + suffix, td, mask)

                if value.ndim == 2:
                    _summarize(value, returns, returns - value, '')
                else:
                    td = returns - value
                    for i in range(value.shape[2]):
                        suffix = '/' + str(i)
                        _summarize(value[..., i], returns[..., i], td[..., i],
                                   suffix)

        loss = self._td_error_loss_fn(returns.detach(), value)

        if loss.ndim == 3:
            # Multi-dimensional reward: average the critic loss over the reward dimensions.
            loss = loss.mean(dim=2)

        # The loss shape expected by Algorithm.update_with_gradient is [T, B],
        # so we pad it with an additional time step of zeros.
        loss = tensor_utils.tensor_extend_zero(loss)
        return LossInfo(loss=loss, extra=loss)
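
For reference, the three branches in `forward` select the target differently: `td_lambda == 1` uses the full discounted return, `td_lambda == 0` uses one-step TD targets, and anything in between goes through the GAE recursion with `returns = advantages + target_value[:-1]`. A small NumPy illustration of the `td_lambda == 0` case follows; the form of `one_step_discounted_return` used here (reward plus discounted next value, ignoring episode boundaries) is an assumption for this sketch, not ALF's implementation.

import numpy as np

rewards = np.array([3., 3.1, 3.2, 3.3, 3.4])
target_value = np.array([2., 2.1, 2.2, 2.3, 2.4])
discounts = np.full(5, 0.9)

# Assumed one-step target: returns[t] = rewards[t+1] + discounts[t+1] * V[t+1]
one_step_returns = rewards[1:] + discounts[1:] * target_value[1:]
td_error = one_step_returns - target_value[:-1]   # td error if value == target_value
print(one_step_returns)
print(td_error)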