Example 1
 def get_action_out(
     self, inputs: torch.Tensor, masks: torch.Tensor
 ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
     """
     Gets the tensors corresponding to the output of the policy network to be used for
     inference. Called by the Actor's forward call.
     :param inputs: The encoding from the network body
     :param masks: Action masks for discrete actions
     :return: A tuple of torch tensors corresponding to the inference output
     """
     dists = self._get_dists(inputs, masks)
     continuous_out, discrete_out, action_out_deprecated = None, None, None
     if self.action_spec.continuous_size > 0 and dists.continuous is not None:
         continuous_out = dists.continuous.exported_model_output()
         action_out_deprecated = dists.continuous.exported_model_output()
         if self._clip_action_on_export:
             continuous_out = torch.clamp(continuous_out, -3, 3) / 3
             action_out_deprecated = torch.clamp(action_out_deprecated, -3, 3) / 3
     if self.action_spec.discrete_size > 0 and dists.discrete is not None:
         discrete_out_list = [
             discrete_dist.exported_model_output()
             for discrete_dist in dists.discrete
         ]
         discrete_out = torch.cat(discrete_out_list, dim=1)
         action_out_deprecated = torch.cat(discrete_out_list, dim=1)
     # deprecated action field does not support hybrid action
     if self.action_spec.continuous_size > 0 and self.action_spec.discrete_size > 0:
         action_out_deprecated = None
     return continuous_out, discrete_out, action_out_deprecated
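
The torch.clamp(x, -3, 3) / 3 pattern that recurs throughout these examples clips unbounded continuous actions and rescales them into [-1, 1]. A quick standalone illustration with toy values (not part of the snippet above):

 import torch

 raw_actions = torch.tensor([-5.0, -1.5, 0.0, 2.0, 7.0])
 print(torch.clamp(raw_actions, -3, 3) / 3)
 # tensor([-1.0000, -0.5000,  0.0000,  0.6667,  1.0000])
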
Example 2
 def ppo_value_loss(
     self,
     values: Dict[str, torch.Tensor],
     old_values: Dict[str, torch.Tensor],
     returns: Dict[str, torch.Tensor],
     epsilon: float,
     loss_masks: torch.Tensor,
 ) -> torch.Tensor:
     """
     Evaluates value loss for PPO.
     :param values: Value output of the current network.
     :param old_values: Value stored with experiences in buffer.
     :param returns: Computed returns.
     :param epsilon: Clipping value for value estimate.
     :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
     """
     value_losses = []
     for name, head in values.items():
         old_val_tensor = old_values[name]
         returns_tensor = returns[name]
         clipped_value_estimate = old_val_tensor + torch.clamp(
             head - old_val_tensor, -1 * epsilon, epsilon)
         v_opt_a = (returns_tensor - head)**2
         v_opt_b = (returns_tensor - clipped_value_estimate)**2
         value_loss = ModelUtils.masked_mean(torch.max(v_opt_a, v_opt_b),
                                             loss_masks)
         value_losses.append(value_loss)
     value_loss = torch.mean(torch.stack(value_losses))
     return value_loss
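
A minimal sketch of the clipped value loss above on toy tensors, assuming a single value head and a simplified masked mean in place of ModelUtils.masked_mean:

 import torch

 epsilon = 0.2
 values = torch.tensor([0.5, 1.5, 2.0])       # current value estimates (one reward stream)
 old_values = torch.tensor([0.4, 1.0, 2.5])   # value estimates stored with the experiences
 returns = torch.tensor([1.0, 1.2, 2.2])      # computed returns
 loss_masks = torch.tensor([1.0, 1.0, 0.0])   # the last experience is masked out

 clipped_value_estimate = old_values + torch.clamp(values - old_values, -epsilon, epsilon)
 v_opt_a = (returns - values) ** 2
 v_opt_b = (returns - clipped_value_estimate) ** 2
 # Simplified masked mean: average the element-wise maximum over unmasked experiences only.
 value_loss = (torch.max(v_opt_a, v_opt_b) * loss_masks).sum() / torch.clamp(
     loss_masks.sum(), min=1.0)
 print(value_loss)
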
Example 3
    def ppo_policy_loss(
        self,
        advantages: torch.Tensor,
        log_probs: torch.Tensor,
        old_log_probs: torch.Tensor,
        loss_masks: torch.Tensor,
    ) -> torch.Tensor:
        """
        Evaluate PPO policy loss.
        :param advantages: Computed advantages.
        :param log_probs: Current policy log probabilities.
        :param old_log_probs: Past policy log probabilities.
        :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
        """
        advantage = advantages.unsqueeze(-1)

        decay_epsilon = self.hyperparameters.epsilon
        r_theta = torch.exp(log_probs - old_log_probs)
        p_opt_a = r_theta * advantage
        p_opt_b = (
            torch.clamp(r_theta, 1.0 - decay_epsilon, 1.0 + decay_epsilon) *
            advantage)
        policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b),
                                                  loss_masks)
        return policy_loss
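
A minimal sketch of the clipped surrogate objective above on toy values, assuming a single action dimension and a simplified masked mean in place of ModelUtils.masked_mean:

 import torch

 epsilon = 0.2
 advantages = torch.tensor([1.0, -1.0, 2.0])
 log_probs = torch.tensor([[-0.5], [-1.0], [-0.2]])      # current policy
 old_log_probs = torch.tensor([[-0.7], [-0.9], [-0.6]])  # policy that collected the data
 loss_masks = torch.tensor([1.0, 1.0, 1.0])

 advantage = advantages.unsqueeze(-1)
 r_theta = torch.exp(log_probs - old_log_probs)
 p_opt_a = r_theta * advantage
 p_opt_b = torch.clamp(r_theta, 1.0 - epsilon, 1.0 + epsilon) * advantage
 # Simplified masked mean: average the per-sample surrogate over unmasked experiences.
 surrogate = torch.min(p_opt_a, p_opt_b).squeeze(-1)
 policy_loss = -1 * (surrogate * loss_masks).sum() / torch.clamp(loss_masks.sum(), min=1.0)
 print(policy_loss)
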
Example 4
 def forward(
     self,
     vec_inputs: List[torch.Tensor],
     vis_inputs: List[torch.Tensor],
     masks: Optional[torch.Tensor] = None,
     memories: Optional[torch.Tensor] = None,
 ) -> Tuple[torch.Tensor, int, int, int, int]:
     """
     Note: This forward() method is required for exporting to ONNX. Don't modify the inputs and outputs.
     """
     dists, _ = self.get_dists(vec_inputs, vis_inputs, masks, memories, 1)
     if self.action_spec.is_continuous():
         action_list = self.sample_action(dists)
         action_out = torch.stack(action_list, dim=-1)
         if self._clip_action_on_export:
             action_out = torch.clamp(action_out, -3, 3) / 3
     else:
         action_out = torch.cat([dist.all_log_prob() for dist in dists],
                                dim=1)
     return (
         action_out,
         self.version_number,
         torch.Tensor([self.network_body.memory_size]),
         self.is_continuous_int,
         self.act_size_vector,
     )
Example 5
    def evaluate(self, decision_requests: DecisionSteps,
                 global_agent_ids: List[str]) -> Dict[str, Any]:
        """
        Evaluates policy for the agent experiences provided.
        :param global_agent_ids: Global identifiers of the agents, used to retrieve their memories.
        :param decision_requests: DecisionSteps object containing inputs.
        :return: Outputs from network as defined by self.inference_dict.
        """
        vec_vis_obs, masks = self._split_decision_step(decision_requests)
        vec_obs = [torch.as_tensor(vec_vis_obs.vector_observations)]
        vis_obs = [
            torch.as_tensor(vis_ob)
            for vis_ob in vec_vis_obs.visual_observations
        ]
        memories = torch.as_tensor(
            self.retrieve_memories(global_agent_ids)).unsqueeze(0)

        run_out = {}
        with torch.no_grad():
            action, log_probs, entropy, memories = self.sample_actions(
                vec_obs, vis_obs, masks=masks, memories=memories)

        if self._clip_action and self.use_continuous_act:
            clipped_action = torch.clamp(action, -3, 3) / 3
        else:
            clipped_action = action
        run_out["pre_action"] = ModelUtils.to_numpy(action)
        run_out["action"] = ModelUtils.to_numpy(clipped_action)
        # Todo - make pre_action difference
        run_out["log_probs"] = ModelUtils.to_numpy(log_probs)
        run_out["entropy"] = ModelUtils.to_numpy(entropy)
        run_out["learning_rate"] = 0.0
        if self.use_recurrent:
            run_out["memory_out"] = ModelUtils.to_numpy(memories).squeeze(0)
        return run_out
Example 6
 def forward(self, inputs: torch.Tensor) -> torch.Tensor:
     normalized_state = torch.clamp(
         (inputs - self.running_mean) /
         torch.sqrt(self.running_variance / self.normalization_steps),
         -5,
         5,
     )
     return normalized_state
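
A small standalone illustration with made-up statistics. In the module above, running_variance is a running sum that gets divided by normalization_steps; here that division is assumed to already be folded into running_variance. Observations are standardized and clamped to +/- 5 standard deviations:

 import torch

 inputs = torch.tensor([[10.0, -2.0, 100.0]])
 running_mean = torch.tensor([9.0, 0.0, 1.0])
 running_variance = torch.tensor([4.0, 1.0, 1.0])  # assumed per-step variance for this sketch

 normalized_state = torch.clamp(
     (inputs - running_mean) / torch.sqrt(running_variance), -5, 5)
 print(normalized_state)  # tensor([[ 0.5000, -2.0000,  5.0000]])
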
Example 7
 def masked_mean(tensor: torch.Tensor, masks: torch.Tensor) -> torch.Tensor:
     """
     Returns the mean of the tensor but ignoring the values specified by masks.
     Used for masking out loss functions.
     :param tensor: Tensor which needs mean computation.
     :param masks: Boolean tensor of masks with same dimension as tensor.
     """
     return (tensor.T * masks).sum() / torch.clamp(
         (torch.ones_like(tensor.T) * masks).float().sum(), min=1.0)
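
A quick usage check, assuming the masked_mean above is in scope: entries whose mask is 0 do not contribute to the mean, and the denominator is clamped to at least 1 so an all-zero mask cannot divide by zero.

 import torch

 tensor = torch.tensor([1.0, 2.0, 3.0, 4.0])
 masks = torch.tensor([1.0, 1.0, 0.0, 0.0])
 print(masked_mean(tensor, masks))  # tensor(1.5) -- only the first two entries count
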
Example 8
 def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
     mu = self.mu(inputs)
     if self.conditional_sigma:
         log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
     else:
         log_sigma = self.log_sigma
     if self.tanh_squash:
         return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
     else:
         return [GaussianDistInstance(mu, torch.exp(log_sigma))]
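
For reference, the clamp on log_sigma above bounds the standard deviation to a numerically safe range of roughly [exp(-20), exp(2)]:

 import torch

 log_sigma = torch.tensor([-50.0, 0.0, 10.0])
 sigma = torch.exp(torch.clamp(log_sigma, min=-20, max=2))
 print(sigma)  # tensor([2.0612e-09, 1.0000e+00, 7.3891e+00])
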
Example 9
 def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
     mu = self.mu(inputs)
     if self.conditional_sigma:
         log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
     else:
         # Expand so that entropy matches batch size
         log_sigma = self.log_sigma.expand(inputs.shape[0], -1)
     if self.tanh_squash:
         return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
     else:
         return [GaussianDistInstance(mu, torch.exp(log_sigma))]
Example 10
 def forward(self, inputs: torch.Tensor) -> List[DistInstance]:
     mu = self.mu(inputs)
     if self.conditional_sigma:
         log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
     else:
         # Expand so that entropy matches batch size. Note that we're using
         # torch.cat here instead of torch.expand() because expand is not supported
         # in the verified version of Barracuda (1.0.2).
         log_sigma = torch.cat([self.log_sigma] * inputs.shape[0], axis=0)
     if self.tanh_squash:
         return [TanhGaussianDistInstance(mu, torch.exp(log_sigma))]
     else:
         return [GaussianDistInstance(mu, torch.exp(log_sigma))]
Example 11
    def sample_actions(
        self,
        vec_obs: List[torch.Tensor],
        vis_obs: List[torch.Tensor],
        masks: Optional[torch.Tensor] = None,
        memories: Optional[torch.Tensor] = None,
        seq_len: int = 1,
        all_log_probs: bool = False,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor,
               torch.Tensor]:
        """
        :param vec_obs: List of vector observations.
        :param vis_obs: List of visual observations.
        :param masks: Loss masks for RNN, else None.
        :param memories: Input memories when using RNN, else None.
        :param seq_len: Sequence length when using RNN.
        :param all_log_probs: Returns (for discrete actions) a tensor of log probs, one for each action.
        :return: Tuple of actions, actions clipped to [-1, 1], log probabilities (dependent on all_log_probs),
            entropies, and output memories, all as Torch Tensors.
        """
        if memories is None:
            dists, memories = self.actor_critic.get_dists(
                vec_obs, vis_obs, masks, memories, seq_len)
        else:
            # If we're using LSTM, we need to execute the value estimates to get the critic memories
            dists, _, memories = self.actor_critic.get_dist_and_value(
                vec_obs, vis_obs, masks, memories, seq_len)
        action_list = self.actor_critic.sample_action(dists)
        log_probs, entropies, all_logs = ModelUtils.get_probs_and_entropy(
            action_list, dists)
        actions = torch.stack(action_list, dim=-1)
        if self.use_continuous_act:
            actions = actions[:, :, 0]
        else:
            actions = actions[:, 0, :]
        # Use the sum of entropy across actions, not the mean
        entropy_sum = torch.sum(entropies, dim=1)

        if self._clip_action and self.use_continuous_act:
            clipped_action = torch.clamp(actions, -3, 3) / 3
        else:
            clipped_action = actions
        return (
            actions,
            clipped_action,
            all_logs if all_log_probs else log_probs,
            entropy_sum,
            memories,
        )
Example 12
 def to_action_tuple(self, clip: bool = False) -> ActionTuple:
     """
     Returns an ActionTuple
     """
     action_tuple = ActionTuple()
     if self.continuous_tensor is not None:
         _continuous_tensor = self.continuous_tensor
         if clip:
             _continuous_tensor = torch.clamp(_continuous_tensor, -3, 3) / 3
         continuous = ModelUtils.to_numpy(_continuous_tensor)
         action_tuple.add_continuous(continuous)
     if self.discrete_list is not None:
         discrete = ModelUtils.to_numpy(self.discrete_tensor[:, 0, :])
         action_tuple.add_discrete(discrete)
     return action_tuple
Example 13
 def forward(self, inputs: torch.Tensor) -> DistInstance:
     mu = self.mu(inputs)
     if self.conditional_sigma:
         log_sigma = torch.clamp(self.log_sigma(inputs), min=-20, max=2)
     else:
         # Expand so that entropy matches batch size. Note that we're using
         # mu*0 here to get the batch size implicitly, since Barracuda 1.2.1
         # throws an error on runtime broadcasting for an unknown reason. We
         # use this in place of torch.expand() because it is not supported in
         # the verified version of Barracuda (1.0.X).
         log_sigma = mu * 0 + self.log_sigma
     if self.tanh_squash:
         return TanhGaussianDistInstance(mu, torch.exp(log_sigma))
     else:
         return GaussianDistInstance(mu, torch.exp(log_sigma))
Example 14
 def trust_region_policy_loss(
     advantages: torch.Tensor,
     log_probs: torch.Tensor,
     old_log_probs: torch.Tensor,
     loss_masks: torch.Tensor,
     epsilon: float,
 ) -> torch.Tensor:
     """
     Evaluate policy loss clipped to stay within a trust region. Used for PPO and POCA.
     :param advantages: Computed advantages.
     :param log_probs: Current policy log probabilities.
     :param old_log_probs: Past policy log probabilities.
     :param loss_masks: Mask for losses. Used with LSTM to ignore 0'ed out experiences.
     :param epsilon: Clipping value for the policy ratio (size of the trust region).
     """
     advantage = advantages.unsqueeze(-1)
     r_theta = torch.exp(log_probs - old_log_probs)
     p_opt_a = r_theta * advantage
     p_opt_b = torch.clamp(r_theta, 1.0 - epsilon,
                           1.0 + epsilon) * advantage
     policy_loss = -1 * ModelUtils.masked_mean(torch.min(p_opt_a, p_opt_b),
                                               loss_masks)
     return policy_loss
Example 15
 def _inverse_tanh(self, value):
     """
     Numerically stable inverse tanh (atanh): clamps the input away from +/- 1
     before applying 0.5 * log((1 + x) / (1 - x)).
     """
     capped_value = torch.clamp(value, -1 + EPSILON, 1 - EPSILON)
     return 0.5 * torch.log((1 + capped_value) /
                            (1 - capped_value) + EPSILON)