Example #1
    def process_samples(self, itr, paths):
        """Process sample data based on the collected paths.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.

        Returns:
            tuple: Processed sample data, with values
                * valids (list[int]): Number of valid time steps in each path.
                * obs (torch.Tensor): Padded and stacked observations.
                * actions (torch.Tensor): Padded and stacked actions.
                * rewards (torch.Tensor): Padded and stacked rewards.

        """
        for path in paths:
            path['returns'] = tensor_utils.discount_cumsum(
                path['rewards'], self.discount)

        valids = [len(path['actions']) for path in paths]
        obs = torch.stack([
            loss_function_utils.pad_to_last(path['observations'],
                                            total_length=self.max_path_length,
                                            axis=0) for path in paths
        ])
        actions = torch.stack([
            loss_function_utils.pad_to_last(path['actions'],
                                            total_length=self.max_path_length,
                                            axis=0) for path in paths
        ])
        rewards = torch.stack([
            loss_function_utils.pad_to_last(path['rewards'],
                                            total_length=self.max_path_length)
            for path in paths
        ])

        return valids, obs, actions, rewards
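
Note: tensor_utils.discount_cumsum above computes the discounted return-to-go for each step of a path. A minimal NumPy sketch of that behaviour, assuming the usual definition returns[t] = rewards[t] + discount * returns[t + 1] (the actual helper may be implemented differently):

    import numpy as np

    def discount_cumsum(x, discount):
        # returns[t] = x[t] + discount * x[t + 1] + discount**2 * x[t + 2] + ...
        returns = np.zeros(len(x))
        running = 0.0
        for t in reversed(range(len(x))):
            running = x[t] + discount * running
            returns[t] = running
        return returns

    # discount_cumsum([1., 1., 1.], 0.9) -> [2.71, 1.9, 1.]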
Example #2
    def test_add_padding_last_3d(self):
        max_length = 10

        tensor_padding = torch_loss_utils.pad_to_last(nums_3d, total_length=10)
        expected = F.pad(torch.Tensor(nums_3d),
                         (0, max_length - nums_3d.shape[-1], 0, 0, 0, 0))
        assert expected.eq(tensor_padding).all()

        tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                      total_length=10,
                                                      axis=0)
        expected = F.pad(torch.Tensor(nums_3d),
                         (0, 0, 0, 0, 0, max_length - nums_3d.shape[0]))
        assert expected.eq(tensor_padding).all()

        tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                      total_length=10,
                                                      axis=1)
        expected = F.pad(torch.Tensor(nums_3d),
                         (0, 0, 0, max_length - nums_3d.shape[1], 0, 0))
        assert expected.eq(tensor_padding).all()

        tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                      total_length=10,
                                                      axis=2)
        expected = F.pad(torch.Tensor(nums_3d),
                         (0, max_length - nums_3d.shape[-1], 0, 0, 0, 0))
        assert expected.eq(tensor_padding).all()
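
The pad tuples in this test follow torch.nn.functional.pad's convention: (left, right) amounts are given per dimension starting from the last one, which is why padding axis 0 of a 3-D tensor uses the final pair. A small standalone check of that ordering (shapes here are illustrative only):

    import torch
    import torch.nn.functional as F

    x = torch.ones(2, 3, 4)
    # Last dimension (axis=2): its (left, right) pair comes first in the tuple.
    assert F.pad(x, (0, 10 - x.shape[2], 0, 0, 0, 0)).shape == (2, 3, 10)
    # First dimension (axis=0): its pair comes last in the tuple.
    assert F.pad(x, (0, 0, 0, 0, 0, 10 - x.shape[0])).shape == (10, 3, 4)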
Example #3
    def test_add_padding_last_1d(self):
        max_length = 10

        expected = F.pad(torch.Tensor(nums_1d),
                         (0, max_length - nums_1d.shape[-1]))

        tensor_padding = torch_loss_utils.pad_to_last(nums_1d,
                                                      total_length=max_length)
        assert expected.eq(tensor_padding).all()

        tensor_padding = torch_loss_utils.pad_to_last(nums_1d,
                                                      total_length=10,
                                                      axis=0)
        assert expected.eq(tensor_padding).all()
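
These tests only rely on pad_to_last right-padding the chosen axis with zeros up to total_length and raising IndexError for an invalid axis. Below is a minimal sketch of such a helper built on F.pad; the real torch_loss_utils.pad_to_last may differ in naming and in how it handles padding values:

    import torch
    import torch.nn.functional as F

    def pad_to_last(nums, total_length, axis=-1, val=0):
        # Right-pad `nums` with `val` along `axis` until its length is `total_length`.
        tensor = torch.Tensor(nums)
        axis = axis + tensor.dim() if axis < 0 else axis
        if axis >= tensor.dim():
            raise IndexError('axis {} out of range for shape {}'.format(
                axis, tuple(tensor.shape)))
        # F.pad expects (left, right) pairs starting from the last dimension.
        padding = [0] * (2 * tensor.dim())
        padding[2 * (tensor.dim() - axis - 1) + 1] = total_length - tensor.shape[axis]
        return F.pad(tensor, padding, value=val)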
Example #4
    def _compute_loss(self, itr, paths, valids, obs, actions, rewards):
        """Compute mean value of loss.

        Args:
            itr (int): Iteration number.
            paths (list[dict]): A list of collected paths.
            valids (list[int]): Number of valid time steps in each path.
            obs (torch.Tensor): Padded observations from the environment.
            actions (torch.Tensor): Padded actions taken by the policy.
            rewards (torch.Tensor): Padded rewards from the environment.

        Returns:
            torch.Tensor: Calculated mean value of loss

        """
        # pylint: disable=unused-argument
        policy_entropies = self._compute_policy_entropy(obs)

        baselines = torch.stack([
            loss_function_utils.pad_to_last(self._get_baselines(path),
                                            total_length=self.max_path_length)
            for path in paths
        ])

        if self._maximum_entropy:
            rewards += self._policy_ent_coeff * policy_entropies

        advantages = loss_function_utils.compute_advantages(
            self.discount, self._gae_lambda, self.max_path_length, baselines,
            rewards)

        if self._center_adv:
            means, variances = list(
                zip(*[(valid_adv.mean(), valid_adv.var())
                      for valid_adv in loss_function_utils.filter_valids(
                          advantages, valids)]))
            advantages = F.batch_norm(advantages.t(),
                                      torch.Tensor(means),
                                      torch.Tensor(variances),
                                      eps=self._eps).t()

        if self._positive_adv:
            advantages -= advantages.min()

        objective = self._compute_objective(advantages, valids, obs, actions,
                                            rewards)

        if self._entropy_regularzied:
            objective += self._policy_ent_coeff * policy_entropies

        valid_objectives = loss_function_utils.filter_valids(objective, valids)
        return torch.cat(valid_objectives).mean()
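
For context, loss_function_utils.compute_advantages and filter_valids are used here as if they performed generalized advantage estimation over zero-padded (N, max_path_length) tensors and truncated each padded row back to its valid length. The sketches below capture that assumed behaviour and are not the actual implementations:

    import torch

    def compute_advantages(discount, gae_lambda, max_path_length, baselines,
                           rewards):
        # GAE over zero-padded (N, max_path_length) tensors:
        #   delta_t = r_t + discount * V(s_{t+1}) - V(s_t)
        #   A_t     = sum_k (discount * gae_lambda)**k * delta_{t+k}
        advantages = torch.zeros_like(rewards)
        gae = torch.zeros(rewards.shape[0])
        for t in reversed(range(max_path_length)):
            next_baseline = (baselines[:, t + 1] if t + 1 < max_path_length
                             else torch.zeros_like(gae))
            delta = rewards[:, t] + discount * next_baseline - baselines[:, t]
            gae = delta + discount * gae_lambda * gae
            advantages[:, t] = gae
        return advantages

    def filter_valids(tensor, valids):
        # Drop the padded tail of each row, keeping only the valid time steps.
        return [row[:valid] for row, valid in zip(tensor, valids)]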
Example #5
    def test_out_of_index_error(self, nums):
        with pytest.raises(IndexError):
            torch_loss_utils.pad_to_last(nums,
                                         total_length=10,
                                         axis=len(nums.shape))
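
The nums argument is presumably supplied by a fixture or pytest.mark.parametrize in the surrounding test class; a hypothetical parametrization covering 1-D through 3-D inputs could look like:

    import numpy as np
    import pytest

    @pytest.mark.parametrize('nums', [
        np.zeros(3),
        np.zeros((2, 3)),
        np.zeros((2, 2, 2)),
    ])
    def test_out_of_index_error(nums):
        # torch_loss_utils is the module under test in the examples above.
        with pytest.raises(IndexError):
            torch_loss_utils.pad_to_last(nums,
                                         total_length=10,
                                         axis=len(nums.shape))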