def _compute_loss(self, obs, actions, rewards, valids, baselines):
    r"""Compute mean value of loss.

    Notes: P is the maximum path length (self.max_path_length)

    Args:
        obs (torch.Tensor): Observation from the environment
            with shape :math:`(N, P, O*)`.
        actions (torch.Tensor): Actions fed to the environment
            with shape :math:`(N, P, A*)`.
        rewards (torch.Tensor): Acquired rewards
            with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each path.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Calculated negative mean scalar value of
            objective (float).

    """
    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    advantages_flat = self._compute_advantage(rewards, valids, baselines)

    return self._compute_loss_with_adv(obs_flat, actions_flat,
                                       rewards_flat, advantages_flat)
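
# A minimal sketch of the flattening this method relies on: `filter_valids`
# keeps only the first `valid` steps of each padded path and `torch.cat`
# stitches the surviving steps into one flat batch. `_filter_valids_sketch`
# below is an illustrative stand-in, not necessarily garage's implementation.
import torch


def _filter_valids_sketch(tensor, valids):
    """Return a list holding the valid prefix of each padded path."""
    return [tensor[i][:valid] for i, valid in enumerate(valids)]


padded_obs = torch.zeros(2, 5, 3)   # N=2 paths, P=5 max steps, O*=3
valids = [5, 2]                     # the second path only ran for 2 steps
flat = torch.cat(_filter_valids_sketch(padded_obs, valids))
assert flat.shape == (7, 3)         # 5 + 2 valid steps survive
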
def _compute_loss(self, itr, obs, avail_actions, actions, rewards, valids,
                  baselines):
    """Compute mean value of loss.

    Args:
        itr (int): Iteration number.
        obs (torch.Tensor): Observation from the environment.
        avail_actions (torch.Tensor): Mask of available actions at each
            step.
        actions (torch.Tensor): Predicted action.
        rewards (torch.Tensor): Feedback from the environment.
        valids (list[int]): Numbers of valid steps in each path.
        baselines (torch.Tensor): Value function estimation at each step.

    Returns:
        torch.Tensor: Calculated mean value of loss.

    """
    del itr

    if self.policy.recurrent:
        policy_entropies = self._compute_policy_entropy(
            obs, avail_actions, actions)
    else:
        policy_entropies = self._compute_policy_entropy(obs, avail_actions)

    if self._maximum_entropy:
        rewards += self._policy_ent_coeff * policy_entropies

    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)

    if self._center_adv:
        means, variances = list(
            zip(*[(valid_adv.mean(), valid_adv.var(unbiased=False))
                  for valid_adv in filter_valids(advantages, valids)]))
        advantages = F.batch_norm(advantages.t(), torch.Tensor(means),
                                  torch.Tensor(variances),
                                  eps=self._eps).t()

    if self._positive_adv:
        advantages -= advantages.min()

    objective = self._compute_objective(advantages, valids, obs,
                                        avail_actions, actions, rewards)

    if self._entropy_regularzied:
        objective += self._policy_ent_coeff * policy_entropies

    valid_objectives = filter_valids(objective, valids)
    return -torch.cat(valid_objectives).mean()
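
# The `_center_adv` branch above uses `F.batch_norm` in inference mode to
# standardize each path's advantages with that path's own statistics:
# transposing to (P, N) makes every path a "channel", and with training=False
# (the default) batch_norm applies (x - mean) / sqrt(var + eps) per channel.
# A minimal check of that equivalence, assuming only PyTorch:
import torch
import torch.nn.functional as F

advantages = torch.randn(3, 7)                      # N=3 paths, P=7 steps
means = advantages.mean(dim=1)                      # per-path mean
variances = advantages.var(dim=1, unbiased=False)   # per-path variance
eps = 1e-5

via_batch_norm = F.batch_norm(advantages.t(), means, variances, eps=eps).t()
by_hand = (advantages - means[:, None]) / torch.sqrt(variances[:, None] + eps)
assert torch.allclose(via_batch_norm, by_hand, atol=1e-6)
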
def _compute_loss(self, itr, paths, valids, obs, actions, rewards):
    """Compute mean value of loss.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.
        valids (list[int]): Numbers of valid steps in each path.
        obs (torch.Tensor): Observation from the environment.
        actions (torch.Tensor): Predicted action.
        rewards (torch.Tensor): Feedback from the environment.

    Returns:
        torch.Tensor: Calculated mean value of loss.

    """
    # pylint: disable=unused-argument
    policy_entropies = self._compute_policy_entropy(obs)

    baselines = torch.stack([
        pad_to_last(self._get_baselines(path),
                    total_length=self.max_path_length) for path in paths
    ])

    if self._maximum_entropy:
        rewards += self._policy_ent_coeff * policy_entropies

    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)

    if self._center_adv:
        means, variances = list(
            zip(*[(valid_adv.mean(), valid_adv.var())
                  for valid_adv in filter_valids(advantages, valids)]))
        advantages = F.batch_norm(advantages.t(), torch.Tensor(means),
                                  torch.Tensor(variances),
                                  eps=self._eps).t()

    if self._positive_adv:
        advantages -= advantages.min()

    objective = self._compute_objective(advantages, valids, obs, actions,
                                        rewards)

    if self._entropy_regularzied:
        objective += self._policy_ent_coeff * policy_entropies

    valid_objectives = filter_valids(objective, valids)
    return -torch.cat(valid_objectives).mean()
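
# This variant rebuilds the (N, P) baseline tensor by padding each path's
# baseline predictions out to max_path_length before stacking. A minimal
# sketch of that padding step for a 1-D input; `_pad_to_last_sketch` is an
# illustrative stand-in for the `pad_to_last` helper used above.
import torch
import torch.nn.functional as F


def _pad_to_last_sketch(vector, total_length):
    """Right-pad a 1-D tensor with zeros up to `total_length`."""
    return F.pad(vector, (0, total_length - vector.shape[-1]))


per_path_baselines = [torch.ones(5), torch.ones(2)]   # unequal path lengths
stacked = torch.stack(
    [_pad_to_last_sketch(b, total_length=6) for b in per_path_baselines])
assert stacked.shape == (2, 6)                        # N=2 paths, P=6 steps
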
def _compute_advantage(self, rewards, valids, baselines):
    r"""Compute flattened advantage values.

    Notes: P is the maximum path length (self.max_path_length)

    Args:
        rewards (torch.Tensor): Acquired rewards
            with shape :math:`(N, P)`.
        valids (list[int]): Numbers of valid steps in each path.
        baselines (torch.Tensor): Value function estimation at each step
            with shape :math:`(N, P)`.

    Returns:
        torch.Tensor: Calculated advantage values given rewards and
            baselines with shape :math:`(N \cdot [T], )`.

    """
    advantages = compute_advantages(self.discount, self._gae_lambda,
                                    self.max_path_length, baselines,
                                    rewards)
    advantage_flat = torch.cat(filter_valids(advantages, valids))

    if self._center_adv:
        means = advantage_flat.mean()
        variance = advantage_flat.var()
        advantage_flat = (advantage_flat - means) / (variance + 1e-8)

    if self._positive_adv:
        advantage_flat -= advantage_flat.min()

    return advantage_flat
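
# `compute_advantages` implements generalized advantage estimation (GAE):
# for one path the TD residual is delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
# and the advantage is A_t = sum_l (gamma * lambda)^l * delta_{t+l}. A
# per-path sketch of that recursion (illustrative only, not garage's batched
# helper), assuming the path terminates after its last step:
import torch


def _gae_sketch(discount, gae_lambda, rewards, baselines):
    """Compute GAE advantages for a single path of length T."""
    T = rewards.shape[0]
    advantages = torch.zeros(T)
    gae = 0.0
    for t in reversed(range(T)):
        next_value = baselines[t + 1] if t + 1 < T else 0.0
        delta = rewards[t] + discount * next_value - baselines[t]
        gae = delta + discount * gae_lambda * gae
        advantages[t] = gae
    return advantages


adv = _gae_sketch(0.99, 0.95, torch.ones(4), torch.zeros(4))
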
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, valids, baselines = self.process_samples(
        itr, paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    advantages_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        loss_before = self._compute_loss_with_adv(obs_flat, actions_flat,
                                                  rewards_flat,
                                                  advantages_flat)
        kl_before = self._compute_kl_constraint(obs)

    # Optimize the policy over several epochs of shuffled minibatches.
    step_size = (self._minibatch_size
                 if self._minibatch_size else len(rewards_flat))
    for epoch in range(self._max_optimization_epochs):
        shuffled_ids = np.random.permutation(len(rewards_flat))
        for start in range(0, len(rewards_flat), step_size):
            ids = shuffled_ids[start:start + step_size]
            loss = self._train(obs_flat[ids], actions_flat[ids],
                               rewards_flat[ids], advantages_flat[ids])
        logger.log('Mini epoch: {} | Loss: {}'.format(epoch, loss))

    # Fit the value function on the freshly collected paths.
    self._value_function.fit(paths)

    with torch.no_grad():
        loss_after = self._compute_loss_with_adv(obs_flat, actions_flat,
                                                 rewards_flat,
                                                 advantages_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', loss_before.item())
        tabular.record('/LossAfter', loss_after.item())
        tabular.record('/dLoss', loss_before.item() - loss_after.item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    return np.mean(undiscounted_returns)
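
# `_compute_kl_constraint` is not shown in this section; the KL values
# logged above are typically the mean KL divergence between the pre-update
# (old) policy's action distribution and the current policy's, evaluated on
# the sampled observations. A minimal sketch under that assumption, for
# policies that return a torch.distributions object when called on a batch:
import torch


def _kl_constraint_sketch(old_policy, policy, obs):
    """Mean KL(old || new) over a batch of observations (illustrative)."""
    with torch.no_grad():
        old_dist = old_policy(obs)
    new_dist = policy(obs)
    return torch.distributions.kl_divergence(old_dist, new_dist).mean()
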
def train_once(self, itr, paths):
    """Train the algorithm once.

    Args:
        itr (int): Iteration number.
        paths (list[dict]): A list of collected paths.

    Returns:
        numpy.float64: Calculated mean value of undiscounted returns.

    """
    obs, actions, rewards, returns, valids, baselines = \
        self.process_samples(itr, paths)

    if self._maximum_entropy:
        policy_entropies = self._compute_policy_entropy(obs)
        rewards += self._policy_ent_coeff * policy_entropies

    obs_flat = torch.cat(filter_valids(obs, valids))
    actions_flat = torch.cat(filter_valids(actions, valids))
    rewards_flat = torch.cat(filter_valids(rewards, valids))
    returns_flat = torch.cat(filter_valids(returns, valids))
    advs_flat = self._compute_advantage(rewards, valids, baselines)

    with torch.no_grad():
        policy_loss_before = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_before = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_before = self._compute_kl_constraint(obs)

    self._train(obs_flat, actions_flat, rewards_flat, returns_flat,
                advs_flat)

    with torch.no_grad():
        policy_loss_after = self._compute_loss_with_adv(
            obs_flat, actions_flat, rewards_flat, advs_flat)
        vf_loss_after = self._value_function.compute_loss(
            obs_flat, returns_flat)
        kl_after = self._compute_kl_constraint(obs)
        policy_entropy = self._compute_policy_entropy(obs)

    with tabular.prefix(self.policy.name):
        tabular.record('/LossBefore', policy_loss_before.item())
        tabular.record('/LossAfter', policy_loss_after.item())
        tabular.record('/dLoss',
                       (policy_loss_before - policy_loss_after).item())
        tabular.record('/KLBefore', kl_before.item())
        tabular.record('/KL', kl_after.item())
        tabular.record('/Entropy', policy_entropy.mean().item())

    with tabular.prefix(self._value_function.name):
        tabular.record('/LossBefore', vf_loss_before.item())
        tabular.record('/LossAfter', vf_loss_after.item())
        tabular.record('/dLoss',
                       vf_loss_before.item() - vf_loss_after.item())

    self._old_policy.load_state_dict(self.policy.state_dict())

    undiscounted_returns = log_performance(
        itr,
        TrajectoryBatch.from_trajectory_list(self.env_spec, paths),
        discount=self.discount)
    return np.mean(undiscounted_returns)
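
# Both `train_once` variants above are meant to be driven by an outer
# sampling loop: collect a batch of paths with the current policy, hand them
# to `train_once`, repeat. A schematic loop under that assumption; `sampler`,
# `obtain_samples`, and `n_itr` are hypothetical stand-ins for whatever
# runner/sampler the surrounding codebase actually provides.
def _training_loop_sketch(algo, sampler, n_itr):
    """Schematic outer loop around train_once (illustrative only)."""
    for itr in range(n_itr):
        paths = sampler.obtain_samples(itr)   # list[dict] of trajectories
        average_return = algo.train_once(itr, paths)
        print('itr {}: average return {:.2f}'.format(itr, average_return))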