def process_samples(self, itr, paths):
    """Process sample data based on the collected paths.

    Computes discounted returns in place on each path, then pads every
    path to ``self.max_path_length`` and stacks them into batch tensors.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        tuple: 4-tuple of processed sample data:
            * valids (list[int]): Number of valid (unpadded) steps per path.
            * obs (torch.Tensor): Stacked, padded observations.
            * actions (torch.Tensor): Stacked, padded actions.
            * rewards (torch.Tensor): Stacked, padded rewards.
    """
    # pylint: disable=unused-argument
    # Discounted cumulative sum of rewards, stored back onto each path.
    for path in paths:
        path['returns'] = tensor_utils.discount_cumsum(
            path['rewards'], self.discount)

    valids = [len(path['actions']) for path in paths]

    # Pad along the time axis (axis=0) so variable-length paths stack.
    obs = torch.stack([
        loss_function_utils.pad_to_last(path['observations'],
                                        total_length=self.max_path_length,
                                        axis=0) for path in paths
    ])
    actions = torch.stack([
        loss_function_utils.pad_to_last(path['actions'],
                                        total_length=self.max_path_length,
                                        axis=0) for path in paths
    ])
    # Rewards are 1-D per path, so the default (last) axis is the time axis.
    rewards = torch.stack([
        loss_function_utils.pad_to_last(path['rewards'],
                                        total_length=self.max_path_length)
        for path in paths
    ])

    return valids, obs, actions, rewards
def test_add_padding_last_3d(self):
    """pad_to_last pads a 3-D tensor on the requested axis to total_length."""
    max_length = 10

    # Default axis: pad the last dimension.
    tensor_padding = torch_loss_utils.pad_to_last(nums_3d, total_length=10)
    expected = F.pad(torch.Tensor(nums_3d),
                     (0, max_length - nums_3d.shape[-1], 0, 0, 0, 0))
    assert expected.eq(tensor_padding).all()

    # axis=0: pad the first dimension.
    tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                  total_length=10,
                                                  axis=0)
    expected = F.pad(torch.Tensor(nums_3d),
                     (0, 0, 0, 0, 0, max_length - nums_3d.shape[0]))
    assert expected.eq(tensor_padding).all()

    # axis=1: pad the middle dimension.
    # BUG FIX: the pad amount must come from shape[1], not shape[-1];
    # the original expectation only matched when shape[1] == shape[2].
    tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                  total_length=10,
                                                  axis=1)
    expected = F.pad(torch.Tensor(nums_3d),
                     (0, 0, 0, max_length - nums_3d.shape[1], 0, 0))
    assert expected.eq(tensor_padding).all()

    # axis=2: same as the default last-axis case.
    tensor_padding = torch_loss_utils.pad_to_last(nums_3d,
                                                  total_length=10,
                                                  axis=2)
    expected = F.pad(torch.Tensor(nums_3d),
                     (0, max_length - nums_3d.shape[-1], 0, 0, 0, 0))
    assert expected.eq(tensor_padding).all()
def test_add_padding_last_1d(self):
    """pad_to_last pads a 1-D tensor up to total_length with zeros."""
    max_length = 10
    pad_amount = max_length - nums_1d.shape[-1]
    expected = F.pad(torch.Tensor(nums_1d), (0, pad_amount))

    # Default axis pads the last (and only) dimension.
    padded = torch_loss_utils.pad_to_last(nums_1d, total_length=max_length)
    assert expected.eq(padded).all()

    # For a 1-D input, axis=0 is equivalent to the default.
    padded = torch_loss_utils.pad_to_last(nums_1d, total_length=10, axis=0)
    assert expected.eq(padded).all()
def _compute_loss(self, itr, paths, valids, obs, actions, rewards): """Compute mean value of loss. Args: itr (int): Iteration number. paths (list[dict]): A list of collected paths valids (list[int]): Array of length of the valid values obs (torch.Tensor): Observation from the environment. actions (torch.Tensor): Predicted action. rewards (torch.Tensor): Feedback from the environment. Returns: torch.Tensor: Calculated mean value of loss """ # pylint: disable=unused-argument policy_entropies = self._compute_policy_entropy(obs) baselines = torch.stack([ loss_function_utils.pad_to_last(self._get_baselines(path), total_length=self.max_path_length) for path in paths ]) if self._maximum_entropy: rewards += self._policy_ent_coeff * policy_entropies advantages = loss_function_utils.compute_advantages( self.discount, self._gae_lambda, self.max_path_length, baselines, rewards) if self._center_adv: means, variances = list( zip(*[(valid_adv.mean(), valid_adv.var()) for valid_adv in loss_function_utils.filter_valids( advantages, valids)])) advantages = F.batch_norm(advantages.t(), torch.Tensor(means), torch.Tensor(variances), eps=self._eps).t() if self._positive_adv: advantages -= advantages.min() objective = self._compute_objective(advantages, valids, obs, actions, rewards) if self._entropy_regularzied: objective += self._policy_ent_coeff * policy_entropies valid_objectives = loss_function_utils.filter_valids(objective, valids) return torch.cat(valid_objectives).mean()
def test_out_of_index_error(self, nums):
    """pad_to_last raises IndexError when axis is out of range."""
    bad_axis = len(nums.shape)
    with pytest.raises(IndexError):
        torch_loss_utils.pad_to_last(nums, total_length=10, axis=bad_axis)