def get_action_out(self, inputs: torch.Tensor, masks: torch.Tensor) -> Tuple[torch.Tensor, ...]:
    """
    Gets the tensors corresponding to the output of the policy network to be used for
    inference. Called by the Actor's forward call.
    :param inputs: The encoding from the network body
    :param masks: Action masks for discrete actions
    :return: A tuple of torch tensors corresponding to the inference output
    """
    dists = self._get_dists(inputs, masks)
    continuous_out, discrete_out, action_out_deprecated = None, None, None
    if self.action_spec.continuous_size > 0 and dists.continuous is not None:
        continuous_out = dists.continuous.exported_model_output()
        action_out_deprecated = dists.continuous.exported_model_output()
        if self._clip_action_on_export:
            continuous_out = torch.clamp(continuous_out, -3, 3) / 3
            action_out_deprecated = torch.clamp(action_out_deprecated, -3, 3) / 3
    if self.action_spec.discrete_size > 0 and dists.discrete is not None:
        discrete_out_list = [
            discrete_dist.exported_model_output()
            for discrete_dist in dists.discrete
        ]
        discrete_out = torch.cat(discrete_out_list, dim=1)
        action_out_deprecated = torch.cat(discrete_out_list, dim=1)
    # deprecated action field does not support hybrid action
    if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0:
        action_out_deprecated = None
    return continuous_out, discrete_out, action_out_deprecated
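
# A minimal, standalone sketch (not part of ML-Agents) illustrating the export-time
# clipping used above and in several functions below: torch.clamp(x, -3, 3) / 3
# squashes unbounded continuous actions into [-1, 1], the range expected by the
# exported model's consumers.
import torch

raw_actions = torch.tensor([-5.0, -1.5, 0.0, 2.0, 7.0])
clipped = torch.clamp(raw_actions, -3, 3) / 3
print(clipped)  # tensor([-1.0000, -0.5000,  0.0000,  0.6667,  1.0000]) -- all within [-1, 1]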
def ppo_value_loss(
    self,
    values: Dict[str, torch.Tensor],
    old_values: Dict[str, torch.Tensor],
    returns: Dict[str, torch.Tensor],
    epsilon: float,
    loss_masks: torch.Tensor,
) -> torch.Tensor:
    """
    Evaluates value loss for PPO.
    :param values: Value output of the current network.
    :param old_values: Value stored with experiences in buffer.
    :param returns: Computed returns.
    :param epsilon: Clipping value for value estimate.
    :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
    """
    value_losses = []
    for name, head in values.items():
        old_val_tensor = old_values[name]
        returns_tensor = returns[name]
        clipped_value_estimate = old_val_tensor + torch.clamp(
            head - old_val_tensor, -1 * epsilon, epsilon
        )
        v_opt_a = (returns_tensor - head) ** 2
        v_opt_b = (returns_tensor - clipped_value_estimate) ** 2
        value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b), loss_masks)
        value_losses.append(value_loss)
    value_loss = torch.mean(torch.stack(value_losses))
    return value_loss
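
# Hedged, self-contained sketch of the clipped value loss above on toy tensors
# (toy_masked_mean is illustrative, not the ML-Agents ModelUtils helper). It shows how
# the clamp keeps the value update within +/- epsilon of the old value estimate.
import torch

def toy_masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    return (tensor * masks).sum() / torch.clamp(masks.float().sum(), min=1.0)

values = torch.tensor([1.0, 2.5, 0.5])      # current value head output
old_values = torch.tensor([1.2, 1.0, 0.4])  # values stored in the buffer
returns = torch.tensor([1.5, 1.1, 0.9])     # computed returns
epsilon = 0.2
masks = torch.tensor([1.0, 1.0, 0.0])       # last experience is padding

clipped_value = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
v_opt_a = (returns - values) ** 2
v_opt_b = (returns - clipped_value) ** 2
value_loss = toy_masked_mean(torch.max(v_opt_a, v_opt_b), masks)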
def ppo_policy_loss(
    self,
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    loss_masks: torch.Tensor,
) -> torch.Tensor:
    """
    Evaluate PPO policy loss.
    :param advantages: Computed advantages.
    :param log_probs: Current policy probabilities
    :param old_log_probs: Past policy probabilities
    :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
    """
    advantage = advantages.unsqueeze(-1)
    decay_epsilon = self.hyperparameters.epsilon
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = (
        torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) * advantage
    )
    policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b), loss_masks)
    return policy_loss
def forward(
    self,
    vec_inputs: List[torch.Tensor],
    vis_inputs: List[torch.Tensor],
    masks: Optional[torch.Tensor] = None,
    memories: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, int, int, int, int]:
    """
    Note: This forward() method is required for exporting to ONNX. Don't modify the
    inputs and outputs.
    """
    dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
    if self.action_spec.is_continuous():
        action_list = self.sample_action(dists)
        action_out = torch.stack(action_list, dim=-1)
        if self._clip_action_on_export:
            action_out = torch.clamp(action_out, -3, 3) / 3
    else:
        action_out = torch.cat([dist.all_log_prob() for dist in dists], dim=1)
    return (
        action_out,
        self.version_number,
        torch.Tensor([self.network_body.memory_size]),
        self.is_continuous_int,
        self.act_size_vector,
    )
def evaluate(
    self, decision_requests: DecisionSteps, global_agent_ids: List[str]
) -> Dict[str, Any]:
    """
    Evaluates policy for the agent experiences provided.
    :param global_agent_ids: Global ids of the requesting agents, used to retrieve memories.
    :param decision_requests: DecisionStep object containing inputs.
    :return: Outputs from network as defined by self.inference_dict.
    """
    vec_vis_obs, masks = self._split_decision_step(decision_requests)
    vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
    vis_obs = [
        torch.as_tensor(vis_ob) for vis_ob in vec_vis_obs.visual_observations
    ]
    memories = torch.as_tensor(
        self.retrieve_memories(global_agent_ids)
    ).unsqueeze(0)
    run_out = {}
    with torch.no_grad():
        action, log_probs, entropy, memories = self.sample_actions(
            vec_obs, vis_obs, masks=masks, memories=memories
        )
    if self._clip_action and self.use_continuous_act:
        clipped_action = torch.clamp(action, -3, 3) / 3
    else:
        clipped_action = action
    run_out["pre_action"] = ModelUtils.to_numpy(action)
    run_out["action"] = ModelUtils.to_numpy(clipped_action)
    # Todo - make pre_action difference
    run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
    run_out["entropy"] = ModelUtils.to_numpy(entropy)
    run_out["learning_rate"] = 0.0
    if self.use_recurrent:
        run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
    return run_out
def forward(self, inputs: torch.Tensor) -> torch.Tensor:
    normalized_state = torch.clamp(
        (inputs - self.running_mean)
        / torch.sqrt(self.running_variance / self.normalization_steps),
        -5,
        5,
    )
    return normalized_state
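
# A small sketch (toy values, not the real Normalizer state) showing the effect of the
# clamp above: observations are standardized with running statistics and then clipped
# to [-5, 5] so outliers cannot dominate the network input.
import torch

running_mean = torch.tensor([0.0, 10.0])
running_variance = torch.tensor([4.0, 100.0])  # accumulated variance, divided by step count below
normalization_steps = torch.tensor(4.0)

obs = torch.tensor([[100.0, 9.0]])             # first feature is an extreme outlier
normalized = torch.clamp(
    (obs - running_mean) / torch.sqrt(running_variance / normalization_steps),
    -5,
    5,
)
print(normalized)  # tensor([[ 5.0000, -0.2000]]) -- the outlier is capped at 5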
def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
    """
    Returns the mean of the tensor but ignoring the values specified by masks.
    Used for masking out loss functions.
    :param tensor: Tensor which needs mean computation.
    :param masks: Boolean tensor of masks with same dimension as tensor.
    """
    return (tensor.T * masks).sum() / torch.clamp(
        (torch.ones_like(tensor.T) * masks).float().sum(), min=1.0
    )
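
# Illustrative use of masked_mean above (toy numbers; .T from the original is a no-op
# for this 1-D example): padded timesteps with mask 0 are excluded from the average,
# and clamp(min=1.0) guards against division by zero if every entry is masked out.
import torch

losses = torch.tensor([2.0, 4.0, 100.0])  # last entry is a padded (0'ed out) experience
masks = torch.tensor([True, True, False])

masked = (losses * masks).sum() / torch.clamp(
    (torch.ones_like(losses) * masks).float().sum(), min=1.0
)
print(masked)  # tensor(3.) -- plain losses.mean() would give ~35.3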
def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        log_sigma = self.log_sigma
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size
        log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size. Note that we're using
        # torch.cat here instead of torch.expand() because it is not supported in the
        # verified version of Barracuda (1.0.2).
        log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
    if self.tanh_squash:
        return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
    else:
        return [GaussianDistInstance(mu, torch.exp(log_sigma))]
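
# Quick standalone check (toy shapes) of the workaround above: repeating the parameter
# with torch.cat along dim 0 produces the same tensor torch.expand would, it just
# avoids the op the comment says the verified Barracuda version cannot import.
import torch

log_sigma = torch.zeros(1, 4)  # stand-in for the learned per-action log sigma
batch_size = 3

via_cat = torch.cat([log_sigma] * batch_size, dim=0)
via_expand = log_sigma.expand(batch_size, -1)
assert torch.equal(via_cat, via_expand)  # both are shape (3, 4)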
def sample_actions(
    self,
    vec_obs: List[torch.Tensor],
    vis_obs: List[torch.Tensor],
    masks: Optional[torch.Tensor] = None,
    memories: Optional[torch.Tensor] = None,
    seq_len: int = 1,
    all_log_probs: bool = False,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    :param vec_obs: List of vector observations.
    :param vis_obs: List of visual observations.
    :param masks: Loss masks for RNN, else None.
    :param memories: Input memories when using RNN, else None.
    :param seq_len: Sequence length when using RNN.
    :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
    :return: Tuple of actions, actions clipped to -1, 1, log probabilities (dependent on
        all_log_probs), entropies, and output memories, all as Torch Tensors.
    """
    if memories is None:
        dists, memories = self.actor_critic.get_dists(
            vec_obs, vis_obs, masks, memories, seq_len
        )
    else:
        # If we're using LSTM, we need to execute the values to get the critic memories
        dists, _, memories = self.actor_critic.get_dist_and_value(
            vec_obs, vis_obs, masks, memories, seq_len
        )
    action_list = self.actor_critic.sample_action(dists)
    log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
        action_list, dists
    )
    actions = torch.stack(action_list, dim=-1)
    if self.use_continuous_act:
        actions = actions[:, :, 0]
    else:
        actions = actions[:, 0, :]
    # Use the sum of entropy across actions, not the mean
    entropy_sum = torch.sum(entropies, dim=1)
    if self._clip_action and self.use_continuous_act:
        clipped_action = torch.clamp(actions, -3, 3) / 3
    else:
        clipped_action = actions
    return (
        actions,
        clipped_action,
        all_logs if all_log_probs else log_probs,
        entropy_sum,
        memories,
    )
def to_action_tuple(self, clip: bool = False) -> ActionTuple:
    """
    Returns an ActionTuple.
    """
    action_tuple = ActionTuple()
    if self.continuous_tensor is not None:
        _continuous_tensor = self.continuous_tensor
        if clip:
            _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
        continuous = ModelUtils.to_numpy(_continuous_tensor)
        action_tuple.add_continuous(continuous)
    if self.discrete_list is not None:
        discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
        action_tuple.add_discrete(discrete)
    return action_tuple
def forward(self, inputs: torch.Tensor) -> DistInstance:
    mu = self.mu(inputs)
    if self.conditional_sigma:
        log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
    else:
        # Expand so that entropy matches batch size. Note that we're using
        # mu * 0 here to get the batch size implicitly, since Barracuda 1.2.1
        # throws an error at runtime on broadcasting for an unknown reason. We
        # use this to replace torch.expand() because it is not supported in
        # the verified version of Barracuda (1.0.X).
        log_sigma = mu * 0 + self.log_sigma
    if self.tanh_squash:
        return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
    else:
        return GaussianDistInstance(mu, torch.exp(log_sigma))
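
# Toy check (assumed shapes) of the mu * 0 trick above: adding a zero tensor with the
# batched shape of mu broadcasts the single log_sigma row to the batch size without
# calling torch.expand, which the comment says the verified Barracuda build rejects.
import torch

mu = torch.randn(3, 4)         # (batch, action) output of self.mu
log_sigma = torch.zeros(1, 4)  # single learned row
batched_log_sigma = mu * 0 + log_sigma
assert batched_log_sigma.shape == (3, 4)
assert torch.equal(batched_log_sigma, log_sigma.expand(3, -1))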
def trust_region_policy_loss(
    advantages: torch.Tensor,
    log_probs: torch.Tensor,
    old_log_probs: torch.Tensor,
    loss_masks: torch.Tensor,
    epsilon: float,
) -> torch.Tensor:
    """
    Evaluate policy loss clipped to stay within a trust region. Used for PPO and POCA.
    :param advantages: Computed advantages.
    :param log_probs: Current policy probabilities
    :param old_log_probs: Past policy probabilities
    :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
    :param epsilon: Clipping value for the probability ratio.
    """
    advantage = advantages.unsqueeze(-1)
    r_theta = torch.exp(log_probs - old_log_probs)
    p_opt_a = r_theta * advantage
    p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
    policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b), loss_masks)
    return policy_loss
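
# Hedged toy example of the clipped surrogate used by ppo_policy_loss and
# trust_region_policy_loss above: when the probability ratio drifts outside
# [1 - epsilon, 1 + epsilon], the clamp removes the incentive to move it further.
import torch

advantage = torch.tensor([[1.0], [1.0], [-1.0]])
log_probs = torch.tensor([[0.5], [-0.1], [0.3]])
old_log_probs = torch.tensor([[0.0], [0.0], [0.0]])
epsilon = 0.2

r_theta = torch.exp(log_probs - old_log_probs)  # ratios ~ [1.65, 0.90, 1.35]
p_opt_a = r_theta * advantage
p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
surrogate = torch.min(p_opt_a, p_opt_b)         # first entry clipped to 1.2, others unclipped
policy_loss = -surrogate.mean()                 # the real code uses masked_mean here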
def _inverse_tanh(self, value):
    capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
    return 0.5 * torch.log((1 + capped_value) / (1 - capped_value) + EPSILON)
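
# Small sanity sketch for _inverse_tanh above (the EPSILON value is assumed here):
# aside from the EPSILON guards against infinities at +/-1, the expression is the
# standard artanh identity 0.5 * log((1 + x) / (1 - x)), so it closely matches
# torch.atanh away from the boundaries.
import torch

EPSILON = 1e-7
x = torch.tensor([-0.9, -0.5, 0.0, 0.5, 0.9])
capped = torch.clamp(x, -1 + EPSILON, 1 - EPSILON)
approx_atanh = 0.5 * torch.log((1 + capped) / (1 - capped) + EPSILON)
assert torch.allclose(approx_atanh, torch.atanh(x), atol=1e-4)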