Beispiel #1
0
 def _unsquash(self, values):
     """Undo the squashing of `values` via atanh.

     `values` is first rescaled with the affine map that sends `self.low`
     to -1 and `self.high` to +1, then pulled slightly inside that
     interval, and finally passed through `atanh`.

     Args:
         values: Squashed values to invert.

     Returns:
         The result of `atanh` on the rescaled, clamped input.
     """
     span = self.high - self.low
     rescaled = (values - self.low) / span * 2.0 - 1.0
     # Stay a small margin away from -1 / +1 before calling atanh.
     lower_bound = -1.0 + SMALL_NUMBER
     upper_bound = 1.0 - SMALL_NUMBER
     return atanh(torch.clamp(rescaled, lower_bound, upper_bound))
Beispiel #2
0
 def bc_log(model, obs, actions):
     """Return the log-likelihood of `actions` under the model's policy.

     The policy output for `obs` is split in half along the last dimension
     into a mean and a log-std (both clamped), which parameterize a Normal.
     `actions` are mapped through atanh, scored under that Normal, and
     corrected with the tanh change-of-variables term; the result is summed
     over the last dimension.

     Args:
         model: Object exposing `get_policy_output(obs)`.
         obs: Observations forwarded to `model.get_policy_output`.
         actions: Actions to score. NOTE(review): presumably already
             squashed into [-1, 1] -- confirm against callers.

     Returns:
         Log-probabilities summed over the last dimension.
     """
     # Stabilize input to atanh: the inverse hyperbolic tangent diverges at
     # +/-1, which would turn the whole log-likelihood into inf/NaN.
     safe_actions = torch.clamp(actions, -1.0 + SMALL_NUMBER,
                                1.0 - SMALL_NUMBER)
     z = atanh(safe_actions)
     logits = model.get_policy_output(obs)
     mean, log_std = torch.chunk(logits, 2, dim=-1)
     # Mean Clamping for Stability
     mean = torch.clamp(mean, MEAN_MIN, MEAN_MAX)
     log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT,
                           MAX_LOG_NN_OUTPUT)
     std = torch.exp(log_std)
     normal_dist = torch.distributions.Normal(mean, std)
     # Use the same clamped actions in the correction term so that it
     # matches the z that is actually scored.
     return torch.sum(normal_dist.log_prob(z) -
                      torch.log(1 - safe_actions * safe_actions +
                                SMALL_NUMBER),
                      dim=-1)
Beispiel #3
0
        def bc_log(model, obs, actions):
            """Return the log-likelihood of `actions` under the model's policy.

            `actions` are rescaled with the affine map that sends
            `action_dist_t.low` to -1 and `action_dist_t.high` to +1,
            clamped just inside that interval and passed through atanh. The
            result is scored under a Normal built from the (clamped) mean
            and log-std halves of the policy output, corrected with the
            tanh change-of-variables term, and summed over the last
            dimension.

            Args:
                model: Object exposing `get_policy_output(obs)`.
                obs: Observations forwarded to `model.get_policy_output`.
                actions: Actions to score, expressed in the
                    [`action_dist_t.low`, `action_dist_t.high`] range.

            Returns:
                Log-probabilities summed over the last dimension.
            """
            # Stabilize input to atanh.
            normed_actions = \
                (actions - action_dist_t.low) / \
                (action_dist_t.high - action_dist_t.low) * 2.0 - 1.0
            safe_normed_actions = torch.clamp(
                normed_actions, -1.0 + SMALL_NUMBER, 1.0 - SMALL_NUMBER)
            z = atanh(safe_normed_actions)

            logits = model.get_policy_output(obs)
            mean, log_std = torch.chunk(logits, 2, dim=-1)
            # Mean Clamping for Stability
            mean = torch.clamp(mean, MEAN_MIN, MEAN_MAX)
            log_std = torch.clamp(log_std, MIN_LOG_NN_OUTPUT,
                                  MAX_LOG_NN_OUTPUT)
            std = torch.exp(log_std)
            normal_dist = torch.distributions.Normal(mean, std)
            # The correction term must use the normalized (tanh-space)
            # actions that produced z. Using the raw actions is only right
            # when the bounds are exactly [-1, 1]; otherwise
            # 1 - actions**2 can go negative and the log becomes NaN.
            return torch.sum(
                normal_dist.log_prob(z) -
                torch.log(1 - safe_normed_actions * safe_normed_actions +
                          SMALL_NUMBER),
                dim=-1)
Beispiel #4
0
 def _unsquash(self, values):
     return atanh((values - self.low) / (self.high - self.low) * 2.0 - 1.0)