Example #1
    def forward(self, state, action=None):
        a = t.relu(self.fc1(state))
        a = t.relu(self.fc2(a))
        mu = self.mu_head(a)
        sigma = softplus(self.sigma_head(a))
        dist = Normal(mu, sigma)
        act = (atanh(action / self.action_range)
               if action is not None
               else dist.rsample())
        act_entropy = dist.entropy()

        # the suggested way to confine your actions within a valid range
        # is not clamping, but remapping the distribution
        act_log_prob = dist.log_prob(act)
        act_tanh = t.tanh(act)
        act = act_tanh * self.action_range

        # the distribution remapping process used in the original paper.
        act_log_prob -= t.log(self.action_range *
                              (1 - act_tanh.pow(2)) +
                              1e-6)
        act_log_prob = act_log_prob.sum(1, keepdim=True)

        # If your distribution is different from "Normal" then you may either:
        # 1. deduce the remapping function for your distribution and a squashing
        #    function such as tanh
        # 2. clamp your actions, but please take care:
        #    1. do not clamp actions before calculating their log probability,
        #       because the log probability of clamped actions might be
        #       extremely small and will cause NaN
        #    2. do not clamp actions after sampling and before storing them in
        #       the replay buffer, because during the update the log probability
        #       will be re-evaluated; it might also be extremely small, and the
        #       network will produce NaN. (This might happen in PPO, but not in
        #       SAC, because there is no re-evaluation.)
        # Only clamp the actions sent to the environment. This is equivalent to
        # changing the action-reward distribution; it will not cause NaN, but it
        # makes your training environment differ further from your real
        # environment.
        return act, act_log_prob, act_entropy
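
A hedged aside (not part of the original snippet): the subtraction above is the change-of-variables correction for a = action_range * tanh(u). The sketch below, which assumes action_range is a positive scalar, checks the manual formula against torch.distributions' built-in TanhTransform and AffineTransform.

import torch
from torch.distributions import Normal, TransformedDistribution
from torch.distributions.transforms import AffineTransform, TanhTransform

action_range = 2.0                                  # assumed positive scalar
base = Normal(torch.zeros(3), torch.ones(3))
squashed = TransformedDistribution(
    base, [TanhTransform(), AffineTransform(loc=0.0, scale=action_range)])

u = base.rsample()                                  # pre-tanh sample
a = torch.tanh(u) * action_range                    # bounded action
manual = base.log_prob(u) - torch.log(action_range * (1 - torch.tanh(u).pow(2)) + 1e-6)
print(torch.allclose(manual, squashed.log_prob(a), atol=1e-3))  # True, up to the 1e-6 epsilon
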
    def act(self, states, TEST):
        # states = Variable(torch.from_numpy(states))
        # if self.use_cuda:
        #     states = states.cuda()
        value, action_mu, action_sigma, (self.hx, self.cx) = self.network(
            states, (self.hx, self.cx))
        a_dist = Normal(action_mu, action_sigma)
        if not TEST:
            action = a_dist.sample()
        else:
            action = action_mu
        a_log_probs = a_dist.log_prob(action)
        a_dist_entropy = a_dist.entropy()

        # print("action_mu:", action_mu)
        print("action_sigma:", action_sigma.data)
        # print("action:", action)
        # print("hx,cx:",self.hx,self.cx)
        # print "value:",
        # print value

        return value, action, a_log_probs, a_dist_entropy
    def optimize_model(self, epochs, variance):
        self.model.train()
        actions, states, old_probs, rewards = self.memory.return_mem()
        old_probs = torch.Tensor(old_probs).detach()
        for epoch in range(epochs):
            for i in range(len(states)):
                new_action_mean = self.model(states[i])
                dist = Normal(new_action_mean, variance)
                dist_entropy = dist.entropy()
                new_prob = dist.log_prob(actions[i])

                r = torch.exp(new_prob - old_probs[i])

                # PPO clipped surrogate: ratio (new prob / old prob) * reward,
                # with the ratio clamped to [1 - clip_factor, 1 + clip_factor]
                actor_loss = -torch.min(r * rewards[i],
                                        torch.clamp(r, 1-self.clip_factor, 1+self.clip_factor) * rewards[i])

                actor_loss = actor_loss - (0.01 * dist_entropy) # Small bonus for entropy

                self.optimizer.zero_grad()
                actor_loss.backward()
                self.optimizer.step()
        self.memory.clear_mem()
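
A hedged variant (not from the original snippet): the same clipped surrogate is usually computed over a whole batch at once, with advantage estimates in place of the raw rewards used above. The tensor names below are illustrative.

import torch

def ppo_clip_loss(dist, actions, old_log_probs, advantages,
                  clip_factor=0.2, entropy_coef=0.01):
    # dist: a Normal over the batch of actions; advantages: e.g. normalized GAE estimates
    new_log_probs = dist.log_prob(actions).sum(-1)      # sum over action dims
    ratio = torch.exp(new_log_probs - old_log_probs)    # pi_new / pi_old
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - clip_factor, 1 + clip_factor) * advantages
    policy_loss = -torch.min(unclipped, clipped).mean()
    entropy_bonus = dist.entropy().sum(-1).mean()       # small exploration bonus
    return policy_loss - entropy_coef * entropy_bonus
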
Example #4
    def forward(self, state, action=None):
        a = t.relu(self.fc1(state))
        a = t.relu(self.fc2(a))
        mu = t.tanh(self.mu_head(a)) * self.action_range
        sigma = softplus(self.sigma_head(a))
        dist = Normal(mu, sigma)
        act = (action
               if action is not None
               else dist.rsample())
        act_entropy = dist.entropy()
        # do not clamp actions here, because
        # the action probability might be extremely small,
        # and the network will produce NaN.
        act_log_prob = dist.log_prob(act)

        # do not clamp actions here, because
        # actions will be stored in the replay buffer
        # and the re-evaluated log probability during the update
        # might also be extremely small, and the network will produce NaN.

        # clamp actions only before sending your actions into
        # the environment.
        return act, act_log_prob, act_entropy
    def forward(self, state, action=None):
        x = state
        x = self.actor_bn(x)
        for l in self.actor_linears:
            x = l(x)
            x = self.relu(x)
        mu = self.tanh(self.mu(x))

        log_var = -3. - self.relu(self.log_var(x))
        # log_var = -3. - self.relu(self.log_var_const)
        sigmas = log_var.exp().sqrt() + 1.0e-5
        dists = Normal(mu, sigmas)
        if action is None:
            action = dists.sample()

        log_prob = dists.log_prob(action).sum(dim=-1, keepdim=True)

        x = state
        x = self.critic_bn(x)
        for l in self.critic_linears:
            x = l(x)
            x = self.relu(x)
        v = self.v(x)
        return action, log_prob, dists.entropy(), v
Example #6
    def forward(self, x):
        body_actor = F.tanh(self.body_actor(x))
        y = F.tanh(self.a(body_actor))
        mean = self.mean(y)
        logstd = self.logstd(y)
        std = logstd.exp()

        dist = Normal(mean, std)
        action = dist.sample()
        a_logp = dist.log_prob(action)
        entropy = dist.entropy()

        body_critic = F.relu(self.body_critic(x))
        z = F.relu(self.v1(body_critic))
        value = self.v2(z)

        return {
            'action': action,
            'a_logp': a_logp,
            'value': value,
            'entropy': entropy,
            'mean': mean,
            'logstd': logstd,
        }
    def forward(self, state, action=None):
        if type(state) != torch.Tensor:
            state = torch.FloatTensor(state).to(device)

        x = self.layers[0](state)

        for layer in self.layers[1:-1]:
            x = self.activ(layer(x))

        mean = torch.tanh(self.layers[-1](x))  # (-1, 1)

        # Always positive value.
        # See https://sefiks.com/2017/08/11/softplus-as-a-neural-networks-activation-function/
        std = F.softplus(self.std)

        dist = Normal(mean, std)

        if action is None:
            action = dist.sample()

        log_prob = dist.log_prob(action).sum(-1).unsqueeze(-1)
        entropy = dist.entropy().sum(-1).unsqueeze(-1)

        return mean, action, log_prob, entropy
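
A quick hedged check of the comment above: softplus(x) = log(1 + exp(x)) is strictly positive for every input, which is why it is a convenient parameterization for the standard deviation.

import torch
import torch.nn.functional as F

x = torch.tensor([-10.0, -1.0, 0.0, 1.0, 10.0])
print(F.softplus(x))  # every entry is > 0; for large positive x it approaches x
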
Example #8
 def forward_multiple_mcs(self,
                          model_params,
                          data,
                          var_params,
                          itr,
                          num_samples=5):
     '''
         Multi-sample Monte Carlo estimate of the ELBO.
         Alternative, if the prior permits an analytic KL:
         kl = torch.distributions.kl.kl_divergence(z_dist, self.prior).sum(-1)
     '''
     y, x = self.unpack_data(data)
     loc, log_scale = self.unpack_var_params(var_params)
     var_dist = Normal(loc, torch.exp(log_scale))
     #cov = torch.diag(torch.exp(log_scale))**2
     #scale_tril = cov.tril()
     #var_dist = MultivariateNormal(loc, scale_tril=scale_tril)
     samples = var_dist.rsample(torch.Size((num_samples, )))
     #data_term = self.model.log_joint(y, x, samples[0])
     data_terms = torch.empty(num_samples, device=device)
     for i in range(len(samples)):
         data_terms[i] = self.model.log_joint(model_params, y, x,
                                              samples[i])
     data_term = torch.mean(data_terms)
     entropy = torch.sum(var_dist.entropy())
     return (data_term + entropy)
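
A hedged sketch of the analytic-KL route mentioned in the docstring: when the variational posterior and the prior are both diagonal Gaussians, the entropy term can be replaced by a closed-form KL, giving ELBO = E_q[log p(y, x | z)] - KL(q || prior). The names model_log_lik and prior below are placeholders, not part of the original code.

import torch
from torch.distributions import Normal
from torch.distributions.kl import kl_divergence

def elbo_analytic_kl(loc, log_scale, prior, model_log_lik, num_samples=5):
    # prior: a Normal with the same shape as q, e.g. Normal(zeros_like(loc), ones_like(loc))
    # model_log_lik(z): log p(y, x | z), likelihood only (the prior is handled by the KL term)
    q = Normal(loc, torch.exp(log_scale))
    samples = q.rsample(torch.Size((num_samples,)))
    data_term = torch.stack([model_log_lik(z) for z in samples]).mean()
    kl = kl_divergence(q, prior).sum(-1)     # closed-form KL, summed over latent dims
    return data_term - kl
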
Example #9
    def evaluate(self, state, action):
        # return the value of the given state and the log probability of the actor taking {action}
        state_value = self.critic(state)

        if action is None:
            return state_value, None

        act_hid = self.actor(state)

        action_mean = self.action_mean(act_hid)
        action_log_std = self.action_log_std(act_hid)

        action_log_std = torch.clamp(action_log_std,
                                     min=LOG_SIG_MIN,
                                     max=LOG_SIG_MAX)
        action_std = action_log_std.exp()

        normal = Normal(action_mean, action_std)
        entropy = normal.entropy()

        # evaluate the log probability of the action that was passed in,
        # rather than sampling a fresh one
        log_prob = normal.log_prob(action)

        return state_value, log_prob, entropy
Example #10
class GaussianPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(GaussianPolicy, self).__init__()
        actor_layer_size = [input_dim] + hidden_layer
        actor_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            actor_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            actor_feature_layers.append(nn.ReLU())
        self.actor = nn.Sequential(*actor_feature_layers)
        self.mu_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                     nn.Tanh())
        self.std_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                      nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        actor_features = self.actor(x)
        mu = self.mu_head(actor_features)
        std = self.std_head(actor_features)
        self.dist = Normal(mu, std)
        if action is None:
            action = self.dist.sample()
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)

        return action, action_log_prob, value.squeeze(-1), entropy
Example #11
  def sample(self, obs):

    mean, log_std, hidden = self.actor.forward(obs)
    
    std = log_std.exp()
    normal = Normal(mean, std)
    x_t = normal.rsample()  # for reparameterization trick (mean + std * N(0,1))
    y_t = torch.tanh(x_t)

    action = y_t * self.hyperps['action_scale'] #+ self.hyperps['action_bias']
    action[:, 0] += self.hyperps['action_bias']
    log_prob = normal.log_prob(x_t)

    # Enforcing Action Bound
    log_prob -= torch.log(self.hyperps['action_scale'] * (1 - y_t.pow(2)) + self.hyperps['epsilon'])
    log_prob = log_prob.sum(1, keepdim=True)

    mean = torch.tanh(mean) * self.hyperps['action_scale'] + self.hyperps['action_bias']

    entropy = normal.entropy()
    entropy1, entropy2 = entropy[0][0].item(), entropy[0][1].item()

    #print('Std: {:2.3f}, {:2.3f}, log_std: {:2.3f},{:2.3f}, entropy:{:2.3f}, {:2.3f}'.format(std[0][0].item(),std[0][1].item(), log_std[0][0].item(), log_std[0][1].item(), entropy1, entropy2))
    return action, log_prob, mean, std, hidden
class DiagGaussianDistribution(Distribution):
    """
    Gaussian distribution with diagonal covariance matrix,
    for continuous actions.

    :param action_dim: (int)  Dimension of the action space.
    """
    def __init__(self, action_dim: int):
        super(DiagGaussianDistribution, self).__init__()
        self.distribution = None
        self.action_dim = action_dim
        self.mean_actions = None
        self.log_std = None

    def proba_distribution_net(
            self,
            latent_dim: int,
            log_std_init: float = 0.0) -> Tuple[nn.Module, nn.Parameter]:
        """
        Create the layers and parameter that represent the distribution:
        one output will be the mean of the Gaussian, the other parameter will be the
        standard deviation (log std in fact to allow negative values)

        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
        :param log_std_init: (float) Initial value for the log standard deviation
        :return: (nn.Linear, nn.Parameter)
        """
        mean_actions = nn.Linear(latent_dim, self.action_dim)
        # TODO: allow action dependent std
        log_std = nn.Parameter(th.ones(self.action_dim) * log_std_init,
                               requires_grad=True)
        return mean_actions, log_std

    def proba_distribution(self, mean_actions: th.Tensor,
                           log_std: th.Tensor) -> 'DiagGaussianDistribution':
        """
        Create the distribution given its parameters (mean, std)

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :return: (DiagGaussianDistribution)
        """
        action_std = th.ones_like(mean_actions) * log_std.exp()
        self.distribution = Normal(mean_actions, action_std)
        return self

    def mode(self) -> th.Tensor:
        return self.distribution.mean

    def sample(self) -> th.Tensor:
        # Reparametrization trick to pass gradients
        return self.distribution.rsample()

    def entropy(self) -> th.Tensor:
        return sum_independent_dims(self.distribution.entropy())

    def actions_from_params(self,
                            mean_actions: th.Tensor,
                            log_std: th.Tensor,
                            deterministic: bool = False) -> th.Tensor:
        # Update the proba distribution
        self.proba_distribution(mean_actions, log_std)
        return self.get_actions(deterministic=deterministic)

    def log_prob_from_params(
            self, mean_actions: th.Tensor,
            log_std: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        """
        Compute the log probability of taking an action
        given the distribution parameters.

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :return: (Tuple[th.Tensor, th.Tensor])
        """
        actions = self.actions_from_params(mean_actions, log_std)
        log_prob = self.log_prob(actions)
        return actions, log_prob

    def log_prob(self, actions: th.Tensor) -> th.Tensor:
        """
        Get the log probabilities of actions according to the distribution.
        Note that you must call ``proba_distribution()`` method before.

        :param actions: (th.Tensor)
        :return: (th.Tensor)
        """
        log_prob = self.distribution.log_prob(actions)
        return sum_independent_dims(log_prob)
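
A hedged usage sketch for the class above. It assumes stable-baselines3 is installed (the listing mirrors stable_baselines3.common.distributions.DiagGaussianDistribution, which also provides the sum_independent_dims helper); otherwise the class as listed can be used directly.

import torch as th
from stable_baselines3.common.distributions import DiagGaussianDistribution

latent_dim, action_dim = 64, 2
dist = DiagGaussianDistribution(action_dim)
mean_net, log_std = dist.proba_distribution_net(latent_dim, log_std_init=0.0)

latent = th.randn(8, latent_dim)            # batch of policy features
dist.proba_distribution(mean_net(latent), log_std)
actions = dist.sample()                     # reparameterized samples, shape (8, 2)
log_prob = dist.log_prob(actions)           # shape (8,), summed over action dims
entropy = dist.entropy()                    # shape (8,)
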
class StateDependentNoiseDistribution(Distribution):
    """
    Distribution class for using generalized State Dependent Exploration (gSDE).
    Paper: https://arxiv.org/abs/2005.05719

    It is used to create the noise exploration matrix and
    compute the log probability of an action with that noise.

    :param action_dim: (int) Dimension of the action space.
    :param full_std: (bool) Whether to use (n_features x n_actions) parameters
        for the std instead of only (n_features,)
    :param use_expln: (bool) Use ``expln()`` function instead of ``exp()`` to ensure
        a positive standard deviation (cf paper). It keeps the variance above zero
        and prevents it from growing too fast. In practice, ``exp()`` is usually enough.
    :param squash_output: (bool) Whether to squash the output using a tanh function,
        which ensures the actions respect their bounds.
    :param learn_features: (bool) Whether to learn features for gSDE or not.
        This will enable gradients to be backpropagated through the features
        ``latent_sde`` in the code.
    :param epsilon: (float) small value to avoid NaN due to numerical imprecision.
    """
    def __init__(self,
                 action_dim: int,
                 full_std: bool = True,
                 use_expln: bool = False,
                 squash_output: bool = False,
                 learn_features: bool = False,
                 epsilon: float = 1e-6):
        super(StateDependentNoiseDistribution, self).__init__()
        self.distribution = None
        self.action_dim = action_dim
        self.latent_sde_dim = None
        self.mean_actions = None
        self.log_std = None
        self.weights_dist = None
        self.exploration_mat = None
        self.exploration_matrices = None
        self._latent_sde = None
        self.use_expln = use_expln
        self.full_std = full_std
        self.epsilon = epsilon
        self.learn_features = learn_features
        if squash_output:
            self.bijector = TanhBijector(epsilon)
        else:
            self.bijector = None

    def get_std(self, log_std: th.Tensor) -> th.Tensor:
        """
        Get the standard deviation from the learned parameter
        (log of it by default). This ensures that the std is positive.

        :param log_std: (th.Tensor)
        :return: (th.Tensor)
        """
        if self.use_expln:
            # From the gSDE paper: keeps the variance above zero
            # and prevents it from growing too fast
            below_threshold = th.exp(log_std) * (log_std <= 0)
            # Avoid NaN: zero out the entries that are not above zero
            safe_log_std = log_std * (log_std > 0) + self.epsilon
            above_threshold = (th.log1p(safe_log_std) + 1.0) * (log_std > 0)
            std = below_threshold + above_threshold
        else:
            # Use normal exponential
            std = th.exp(log_std)

        if self.full_std:
            return std
        # Reduce the number of parameters:
        return th.ones(self.latent_sde_dim, self.action_dim).to(
            log_std.device) * std

    def sample_weights(self, log_std: th.Tensor, batch_size: int = 1) -> None:
        """
        Sample weights for the noise exploration matrix,
        using a centered Gaussian distribution.

        :param log_std: (th.Tensor)
        :param batch_size: (int)
        """
        std = self.get_std(log_std)
        self.weights_dist = Normal(th.zeros_like(std), std)
        # Reparametrization trick to pass gradients
        self.exploration_mat = self.weights_dist.rsample()
        # Pre-compute matrices in case of parallel exploration
        self.exploration_matrices = self.weights_dist.rsample((batch_size, ))

    def proba_distribution_net(
        self,
        latent_dim: int,
        log_std_init: float = -2.0,
        latent_sde_dim: Optional[int] = None
    ) -> Tuple[nn.Module, nn.Parameter]:
        """
        Create the layers and parameter that represent the distribution:
        one output will be the deterministic action, the other parameter will be the
        standard deviation of the distribution that controls the weights of the noise matrix.

        :param latent_dim: (int) Dimension of the last layer of the policy (before the action layer)
        :param log_std_init: (float) Initial value for the log standard deviation
        :param latent_sde_dim: (Optional[int]) Dimension of the last layer of the feature extractor
            for gSDE. By default, it is shared with the policy network.
        :return: (nn.Linear, nn.Parameter)
        """
        # Network for the deterministic action, it represents the mean of the distribution
        mean_actions_net = nn.Linear(latent_dim, self.action_dim)
        # When we learn features for the noise, the feature dimension
        # can be different between the policy and the noise network
        self.latent_sde_dim = latent_dim if latent_sde_dim is None else latent_sde_dim
        # Reduce the number of parameters if needed
        log_std = th.ones(self.latent_sde_dim,
                          self.action_dim) if self.full_std else th.ones(
                              self.latent_sde_dim, 1)
        # Transform it to a parameter so it can be optimized
        log_std = nn.Parameter(log_std * log_std_init, requires_grad=True)
        # Sample an exploration matrix
        self.sample_weights(log_std)
        return mean_actions_net, log_std

    def proba_distribution(
            self, mean_actions: th.Tensor, log_std: th.Tensor,
            latent_sde: th.Tensor) -> 'StateDependentNoiseDistribution':
        """
        Create the distribution given its parameters (mean, std)

        :param mean_actions: (th.Tensor)
        :param log_std: (th.Tensor)
        :param latent_sde: (th.Tensor)
        :return: (StateDependentNoiseDistribution)
        """
        # Stop gradient if we don't want to influence the features
        self._latent_sde = latent_sde if self.learn_features else latent_sde.detach(
        )
        variance = th.mm(self._latent_sde**2, self.get_std(log_std)**2)
        self.distribution = Normal(mean_actions,
                                   th.sqrt(variance + self.epsilon))
        return self

    def mode(self) -> th.Tensor:
        actions = self.distribution.mean
        if self.bijector is not None:
            return self.bijector.forward(actions)
        return actions

    def get_noise(self, latent_sde: th.Tensor) -> th.Tensor:
        latent_sde = latent_sde if self.learn_features else latent_sde.detach()
        # Default case: only one exploration matrix
        if len(latent_sde) == 1 or len(latent_sde) != len(
                self.exploration_matrices):
            return th.mm(latent_sde, self.exploration_mat)
        # Use batch matrix multiplication for efficient computation
        # (batch_size, n_features) -> (batch_size, 1, n_features)
        latent_sde = latent_sde.unsqueeze(1)
        # (batch_size, 1, n_actions)
        noise = th.bmm(latent_sde, self.exploration_matrices)
        return noise.squeeze(1)

    def sample(self) -> th.Tensor:
        noise = self.get_noise(self._latent_sde)
        actions = self.distribution.mean + noise
        if self.bijector is not None:
            return self.bijector.forward(actions)
        return actions

    def entropy(self) -> Optional[th.Tensor]:
        # No analytical form,
        # entropy needs to be estimated using -log_prob.mean()
        if self.bijector is not None:
            return None
        return sum_independent_dims(self.distribution.entropy())

    def actions_from_params(self,
                            mean_actions: th.Tensor,
                            log_std: th.Tensor,
                            latent_sde: th.Tensor,
                            deterministic: bool = False) -> th.Tensor:
        # Update the proba distribution
        self.proba_distribution(mean_actions, log_std, latent_sde)
        return self.get_actions(deterministic=deterministic)

    def log_prob_from_params(
            self, mean_actions: th.Tensor, log_std: th.Tensor,
            latent_sde: th.Tensor) -> Tuple[th.Tensor, th.Tensor]:
        actions = self.actions_from_params(mean_actions, log_std, latent_sde)
        log_prob = self.log_prob(actions)
        return actions, log_prob

    def log_prob(self, actions: th.Tensor) -> th.Tensor:
        if self.bijector is not None:
            gaussian_actions = self.bijector.inverse(actions)
        else:
            gaussian_actions = actions
        # log likelihood for a gaussian
        log_prob = self.distribution.log_prob(gaussian_actions)
        # Sum along action dim
        log_prob = sum_independent_dims(log_prob)

        if self.bijector is not None:
            # Squash correction (from original SAC implementation)
            log_prob -= th.sum(
                self.bijector.log_prob_correction(gaussian_actions), dim=1)
        return log_prob
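
A hedged usage sketch for the gSDE distribution above, again assuming the stable-baselines3 import. The key differences from the plain diagonal Gaussian are that the policy features (latent_sde) are passed in and that the exploration matrix is resampled between rollouts rather than per step.

import torch as th
from stable_baselines3.common.distributions import StateDependentNoiseDistribution

dist = StateDependentNoiseDistribution(action_dim=2, squash_output=False)
mean_net, log_std = dist.proba_distribution_net(latent_dim=64, log_std_init=-2.0)

latent = th.randn(8, 64)                    # features shared with the policy head
dist.proba_distribution(mean_net(latent), log_std, latent_sde=latent)
actions = dist.sample()                     # mean + state-dependent exploration noise
log_prob = dist.log_prob(actions)

dist.sample_weights(log_std, batch_size=8)  # resample the noise matrix, e.g. once per rollout
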
Example #14
    def entropy(self, datas):
        mean, std = datas

        distribution = Normal(mean, std)
        return distribution.entropy().float().to(set_device(self.use_gpu))
Example #15
class TanhNormal(Distribution):
    """
    Represent distribution of X where
        X ~ tanh(Z)
        Z ~ N(mean, std)

    Note: this is not very numerically stable.
    """
    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        """
        :param normal_mean: Mean of the normal distribution
        :param normal_std: Std of the normal distribution
        :param epsilon: Numerical stability epsilon when computing log-prob.
        """
        self.normal_mean = normal_mean
        self.normal_std = normal_std
        self.normal = Normal(normal_mean, normal_std)
        self.epsilon = epsilon

    def sample_n(self, n, return_pre_tanh_value=False):
        z = self.normal.sample_n(n)
        if return_pre_tanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def log_prob(self, value, pre_tanh_value=None):
        """

        :param value: some value, x
        :param pre_tanh_value: arctanh(x)
        :return:
        """
        if pre_tanh_value is None:
            pre_tanh_value = torch.log((1 + value) / (1 - value)) / 2
        return self.normal.log_prob(pre_tanh_value) - torch.log(1 -
                                                                value * value +
                                                                self.epsilon)

    def sample(self, return_pretanh_value=False):
        """
        Gradients will and should *not* pass through this operation.

        See https://github.com/pytorch/pytorch/issues/4620 for discussion.
        """
        z = self.normal.sample().detach()

        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def rsample(self, return_pretanh_value=False):
        """
        Sampling in the reparameterization case.
        """
        # z = (
        #     self.normal_mean +
        #     self.normal_std *
        #     Normal(
        #         ptu.zeros(self.normal_mean.size()),
        #         ptu.ones(self.normal_std.size())
        #     ).sample()
        # )
        # z.requires_grad_()
        z = self.normal.rsample()

        if return_pretanh_value:
            return torch.tanh(z), z
        else:
            return torch.tanh(z)

    def entropy(self):
        """Returns entropy of the underlying normal distribution.

        Returns:
            torch.Tensor: entropy of the underlying normal distribution.

        """
        return self.normal.entropy().sum(-1, keepdim=True)
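
A hedged usage sketch for the TanhNormal class above: keeping the pre-tanh value returned by rsample avoids the unstable atanh reconstruction inside log_prob, which is the source of the numerical-stability caveat in the class docstring.

import torch

mean, std = torch.zeros(3), torch.ones(3)
dist = TanhNormal(mean, std)
action, pre_tanh = dist.rsample(return_pretanh_value=True)
log_prob = dist.log_prob(action, pre_tanh_value=pre_tanh).sum(-1)  # stable path
entropy = dist.entropy()                                           # entropy of the base Normal
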
Example #16
    state = torch.Tensor([np.swapaxes(npa(state_raw), 0, 2)]).to(device)

    # encode to latent variables (mu/var)
    latent_mu, latent_stddev = policy.encode(state)
    m = Normal(latent_mu, latent_stddev)

    rewards = []
    rewards_raw = []
    log_probs = []
    entropies = []

    for k in range(SAMPLES):
        # sample K times
        action = m.sample()
        log_probs.append(m.log_prob(action))
        entropies.append(m.entropy())

        params = policy.decode(action)

        # render out an image for each of the K samples
        # IMPORTANT THIS CURRENTLY ASSUMES BATCH SIZE = 1
        next_state = env.render(params.detach().view(-1).cpu().numpy(), data_generator.cam)

        # calculate reward for each one of the K samples
        reward_raw = -(np.square(npa(state_raw) - npa(next_state))).mean(axis=None)
        rewards_raw.append(reward_raw)

    # subtract the average reward of the other K-1 samples (variance-reduction baseline)
    for k in range(SAMPLES):
        baseline = np.mean(rewards_raw[:k] + rewards_raw[k + 1:])
        rewards.append(rewards_raw[k] - baseline)
Example #17
 def entropy(self, mean, std):
     distribution    = Normal(mean, std)    
     return distribution.entropy().float().to(self.device)
Example #18
def train(episodes):
    # Initialize global counters
    first_batch = True
    episode_i = 0
    total_i = 0

    while episode_i < episodes:  # START MAIN LOOP
        # Initialize batch lists
        current_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        episode_in_batch = 0
        i_in_batch = 0

        # while i_in_batch < N_STEPS:  # START EPISODE BATCH LOOP
        while episode_in_batch < N_TRAJECTORIES:
            # Reset environment and get first state
            cur_state = env.reset()
            done = False
            ret = 0
            i_in_episode = 0

            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state).unsqueeze(0).float()

                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    val_out = ac_net_critic(torch_state)

                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                clamped_action_t = torch.clamp(action, -1.0, 1.0)
                clamped_action = clamped_action_t.data.numpy()

                for action_count in range(10):
                    # Step environment
                    next_state, reward, done, info = env.step(clamped_action)

                    # Append values to queues
                    current_state_q.append(cur_state)
                    next_state_q.append(next_state)
                    reward_q.append(float(reward))
                    value_q.append(val_out)
                    action_q.append(clamped_action)
                    action_log_prob_q.append(distribution.log_prob(clamped_action_t).data.numpy())
                    done_q.append(1-done)

                    ret += reward  # Sum total reward for episode

                    # Iterate counters, etc
                    cur_state = next_state
                    i_in_episode += 1
                    i_in_batch += 1
                    total_i += 1

                    if i_in_episode % 10 == 0 and episode_i % 25 == 0 and episode_i >= 0:
                        env.render()

                    # TODO get args
                    if i_in_episode > 3000:
                        done = True
                    if done:
                        break

            # END SINGLE EPISODE

            episode_in_batch += 1
            episode_i += 1
            avg_reward.append(ret)
            avg_reward_batch.append(ret)

        # END EPISODE BATCH LOOP


        # START CUMULATIVE REWARD CALC
        discounted_reward = []
        cumul_reward = 0
        for reward, done, in zip(reversed(reward_q), reversed(done_q)):
            if done == 1:
                cumul_reward = cumul_reward*gamma + reward
                discounted_reward.insert(0, cumul_reward)
            elif done == 0:
                cumul_reward = reward
                discounted_reward.insert(0, cumul_reward)

        # SET UP TENSORS
        batch_length = len(current_state_q)

        current_state_t = torch.tensor(current_state_q).float()
        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()

        # CALCULATE ADVANTAGE
        value_t_new = ac_net_critic(current_state_t)
        for reward_i, value_i in zip(np.asarray(discounted_reward), value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        # TODO check how this is converted between numpy and tensor

        advantage_q_new = (advantage_q_new-np.mean(advantage_q_new))/(np.std(advantage_q_new))

        advantage_t = torch.tensor(advantage_q_new).float()

        # START UPDATING NETWORKS

        # START BASELINE OPTIMIZE
        for epoch in range(B_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0

            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)

                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)

                # Calculate loss compared with reward and optimize
                critic_loss_batch = criterion_val(new_val, batch_reward_t.unsqueeze(1))

                # Do optimization
                optimizer_c.zero_grad()
                critic_loss_batch.backward()
                optimizer_c.step()

                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0

            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx).float()
                batch_action_log_prob_t = torch.index_select(action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)

                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)

                # Expand advantage to dimensions of r_theta_i
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)

                # Calculate the options
                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - EPSILON, 1 + EPSILON) * batch_advantage_t4
                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(torch.pow(batch_entropy, 2))

                # Choose minimum of surrogates and calculate L_clip as final loss function
                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                L_clip = -torch.sum(r_theta_surrogate_min) / r_theta_surrogate_min.size()[0] + 0.03 * batch_entropy_loss

                # if batch_entropy_loss > 1.2:
                #     L_clip = L_clip + 0.05 * batch_entropy_loss

                # Optimize
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()

                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END UPDATING ACTOR

        if episode_i % return_time == 0:
            print("%4d, %6.0d, %6.2f, %6.2f | %6.2f"
                  % (episode_i, total_i, np.mean(avg_reward_batch), np.mean(avg_reward), torch.mean(batch_entropy).item()))

        with open('C:\\Users\\genia\\source\\repos\\Box2dEnv\\Box2dEnv\\saves\\{}.csv'.format("testWrite"), 'a+') as csv:
            for ret_write in zip(np.asarray(avg_reward_batch)):
                csv.write("{:2.2f}\n".format(ret_write[0]))

    return episode_i
Example #19
    def get_entropy(self, state):
        mu = self.forward(state)
        std = torch.exp(self.log_std)

        ac_dist = Normal(mu, std)
        return ac_dist.entropy()
Example #20
def Worker(global_actor, n_steps, multi):
    if n_steps == 1 and multi == 1:
        mode = "SS"
    elif n_steps != 1 and multi == 1:
        mode = "MS"
    elif n_steps !=1 and multi != 1:
        mode = "MM"

    env = gym.make('InvertedPendulumSwingupBulletEnv-v0')

    local_actor = ActorCritic()

    if mode == "SS":
        lr = 0.001
    elif mode == "MS":
        lr = 0.0005
    elif mode == "MM":
        lr=0.0001
    
    optimizer = optim.Adam(global_actor.parameters(), lr=lr)

    t = 1
    score = 0.0
    beta = 0.05
    start_time = time.time()
    for train_episode in range(3000):
        local_actor.load_state_dict(global_actor.state_dict())

        t_start = t - 1

        state = env.reset()
        done = False

        rewards, log_probs, values, Rs = [], [], [], []
        policy_losses, value_losses = [], []
        entropies = []
        R = 0

        while True:
            # get action
            mu, sigma = local_actor.act(torch.from_numpy(state).float())
            norm_dist = Normal(mu, sigma)
            action = norm_dist.sample()
            action = torch.clamp(action, min=-ACT_LIMIT, max=ACT_LIMIT)

            # get next_state and reward according to action
            next_state, reward, done, _ = env.step(action)
            score += reward

            log_prob = norm_dist.log_prob(action)
            value = local_actor.cri(torch.from_numpy(state).float())
            entropy = norm_dist.entropy()

            log_probs.append(log_prob)
            values.append(value)
            rewards.append(reward)
            entropies.append(entropy)

            # gradient update
            if t - t_start == n_steps or done:
                if done:
                    R = 0
                else:
                    R = local_actor.cri(torch.from_numpy(next_state).float())

                for r in rewards[::-1]:
                    R = r + GAMMA * R
                    Rs.insert(0, R)

                for log_prob, value, entropy, R in zip(log_probs, values, entropies, Rs):
                    advantage = R - value.item()

                    policy_losses.append(-(log_prob * advantage + beta * entropy))
                    value_losses.append(F.mse_loss(value, torch.tensor([R])))

                loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
                
                optimizer.zero_grad()
                loss.backward()
                
                for local_param, global_param in zip(local_actor.parameters(), global_actor.parameters()):
                    global_param._grad = local_param.grad

                optimizer.step()

                local_actor.load_state_dict(global_actor.state_dict())

                rewards, log_probs, values, Rs = [], [], [], []
                policy_losses, value_losses = [], []
                entropies = []
                R = 0

                state = next_state
                t += 1
                t_start = t - 1

                if done:
                    if mode == "SS":
                        if score > 500:
                            optimizer.param_groups[0]['lr'] = 0.00005
                        elif score > 400:
                            optimizer.param_groups[0]['lr'] = 0.0001
                        elif score > 300:
                            optimizer.param_groups[0]['lr'] = 0.0002
                        elif score > 200:
                            optimizer.param_groups[0]['lr'] = 0.0003
                        elif score > 100:
                            optimizer.param_groups[0]['lr'] = 0.0005

                    beta = beta * 0.999 if beta > 0.025 else 0.025
                    break
            else:
                state = next_state
                t += 1

        #print("Train Episode: {}, Score: {:.1f}, Time: {:.2f}".format(train_episode, score, time.time() - start_time))
        score = 0.0

    env.close()
    print("Training process reached maximum episode.")
Example #21
import numpy as np
import torch
from torch.distributions import Independent, LogNormal, Normal

def get_x(theta, design):
  # NOTE: the snippet originally started mid-function; the imports, signature and
  # the column unpacking below are reconstructed from the call sites further down.
  theta1, theta2, theta3 = theta[:, 0:1], theta[:, 1:2], theta[:, 2:3]
  x = 400. * theta2 * (torch.exp(-theta1*design) - torch.exp(-theta2*design)) / (theta3*(theta2-theta1))
  return x

n_inner = 1000
n_outer = 100
loc = torch.tensor(np.log((0.1, 1., 20.)), dtype=torch.float64)
scale = torch.tensor(np.sqrt((0.05, 0.05, 0.05)), dtype=torch.float64)
prior = LogNormal(loc, scale)
prior = Independent(prior, 1)
theta_inner = prior.sample((n_inner,))
theta_outer = prior.sample((n_outer,))
loc = torch.zeros(15, dtype=torch.float64)
scale = 0.1 * torch.ones(15, dtype=torch.float64)
noise = Normal(loc, scale)
noise = Independent(noise, 1)
noise_entropy = noise.entropy()
noise_outer = noise.sample((n_outer,))

def objective(design):
  x_outer = get_x(theta_outer, design)
  x_inner = get_x(theta_inner, design)
  y_outer = x_outer + noise_outer
  # Get matrix of all y_outer-x_inner values
  diff = y_outer.unsqueeze(1) - x_inner.unsqueeze(0)
  log_prob_diff = noise.log_prob(diff)
  log_evidence = torch.logsumexp(log_prob_diff, dim=1) - np.log(n_inner)
  sig = noise_entropy - log_evidence.mean()
  print('Design ', np.sort(design))
  print('SIG {:.3f}'.format(sig.numpy()))
  return -sig.numpy()
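
A hedged reading of the estimator above, in formula form: objective() returns the negative of a nested Monte Carlo estimate of the Shannon information gain (expected information gain) of a design d, where the noise entropy is available in closed form for the Gaussian model:

\[
\widehat{\mathrm{SIG}}(d) = \mathrm{H}\big[p(y \mid \theta, d)\big]
- \frac{1}{n_{\mathrm{outer}}} \sum_{n=1}^{n_{\mathrm{outer}}}
\log\!\Big( \frac{1}{n_{\mathrm{inner}}} \sum_{m=1}^{n_{\mathrm{inner}}}
p\big(y_n \mid \theta_m, d\big) \Big),
\qquad y_n = x(\theta_n, d) + \varepsilon_n,
\]

with the inner sum computed via torch.logsumexp for numerical stability, exactly as in the code.
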
Example #22
 def compute_action(self, cur_obs_tensor):
     m, std, v = self.model(cur_obs_tensor)
     dist = Normal(m, std)
     entropy = dist.entropy().sum(1, keepdim=True)
     return dist, entropy, v
Example #23
 def evaluate(self, state, action):
     mu, sigma = self.actor(state)
     sigma = sigma.expand_as(mu)
     dist = Normal(mu, sigma)
     return dist.log_prob(action).sum(dim=-1), dist.entropy().sum(
         dim=-1), torch.squeeze(self.critic(state), 1)
Example #24
class ActorCriticPPO(StochasticContinuousNeuralNet):
    def __init__(self,
                 architecture,
                 weight_init=gauss_weights_init(0, 0.02),
                 activation_functions=None):
        super(ActorCriticPPO, self).__init__()
        if len(architecture) < 2:
            raise Exception(
                "Architecture needs at least two numbers to create network")
        #assert architecture[-1]%2 == 1, "Last layer has to represent 2*actions_space for the Gaussian + 1 for value"
        self.activation_functions = activation_functions
        self.layer_list = []
        self.layer_list_val = []
        self.siglog = tor.zeros(1, requires_grad=True)

        self.siglog = nn.Parameter(self.siglog)

        for i in range(len(architecture) - 1):
            self.layer_list.append(
                nn.Linear(architecture[i], architecture[i + 1]))
            setattr(self, "fc" + str(i), self.layer_list[-1])

        for i in range(len(architecture) - 2):
            self.layer_list_val.append(
                nn.Linear(architecture[i], architecture[i + 1]))
            setattr(self, "fc_val" + str(i), self.layer_list_val[-1])

        self.layer_list_val.append(nn.Linear(architecture[-2], 1))
        setattr(self, "fc_val" + str(len(architecture) - 1),
                self.layer_list_val[-1])

        self.apply(weight_init)

    def policy_forward(self, x):

        # Policy network

        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list[i](x))
        else:
            for i, layer in enumerate(self.layer_list[:-1]):
                x = self.tanh(layer(x))

        x = self.layer_list[-1](x)

        self._means = self.tanh(x)
        self._dist = Normal(self._means, tor.exp(self.siglog))

        self.sampled = self._dist.rsample()
        x = self.sampled

        return x

    def mu(self):
        return self._means

    def value_forward(self, x):

        if self.activation_functions:
            for i, func in enumerate(self.activation_functions):
                x = func(self.layer_list_val[i](x))
        else:
            for i, layer in enumerate(self.layer_list_val[:-1]):
                x = self.tanh(layer(x))

        x = self.layer_list_val[-1](x)

        return x

    def forward(self, x):

        # Policy network
        action = self.policy_forward(x)
        value = self.value_forward(x)

        return tor.cat([action, value], dim=1)

    def __call__(self, state):
        #self.sigma_log -= sigma_epsilon
        action, value = self.policy_forward(state), self.value_forward(state)

        return action, value

    def sigma(self):
        # `self.sigmas` is never assigned; the policy std is exp(siglog)
        return tor.exp(self.siglog)

    def mu(self):
        return self._means

    def logprob(self, values):
        return self._dist.log_prob(values)

    def entropy(self):
        return self._dist.entropy()
    def _optimize(self, obs, acts, advs, est_rs):

        self.obs, self.acts, self.advs, self.est_rs = obs, acts, advs, est_rs

        self.obs = Tensor(self.obs)
        self.acts = Tensor(self.acts)
        self.advs = Tensor(self.advs).unsqueeze(1)
        self.est_rs = Tensor(self.est_rs).unsqueeze(1)

        # Calculate Advantage & Normalize it
        self.advs = (self.advs - self.advs.mean()) / (self.advs.std() + 1e-8)

        # Surrogate loss with Entropy

        if self.continuous:
            mean, std, values = self.model(self.obs)

            dis = Normal(mean, std)

            log_prob = dis.log_prob(self.acts).sum(-1, keepdim=True)

            ent = dis.entropy().sum(-1, keepdim=True)

            probs_new = torch.exp(log_prob)
            probs_old = probs_new.detach() + 1e-8

        else:

            probs, values = self.model(self.obs)

            dis = F.softmax(probs, dim=1)

            self.acts = self.acts.long()

            probs_new = dis.gather(1, self.acts)
            probs_old = probs_new + 1e-8

            ent = -(dis.log() * dis).sum(-1)

        ratio = probs_new / probs_old

        surrogate_loss = -torch.mean(
            ratio * self.advs) - self.entropy_para * ent.mean()

        # criterion = torch.nn.MSELoss()
        # empty_value_loss = criterion( values, values.detach() )

        # Calculate the gradient of the surrogate loss
        self.model.zero_grad()
        surrogate_loss.backward()
        policy_gradient = parameters_to_vector([
            p.grad for p in self.model.policy_parameters()
        ]).squeeze(0).detach()

        # ensure gradient is not zero
        if policy_gradient.nonzero().size()[0]:
            # Use Conjugate gradient to calculate step direction
            step_direction = self.conjugate_gradient(-policy_gradient)
            # line search for step
            shs = .5 * step_direction.dot(
                self.hessian_vector_product(step_direction))

            lm = torch.sqrt(shs / self.max_kl)
            fullstep = step_direction / lm

            gdotstepdir = -policy_gradient.dot(step_direction)
            theta = self.linesearch(
                parameters_to_vector(self.model.policy_parameters()).detach(),
                fullstep, gdotstepdir / lm)
            # Update parameters of policy model
            old_model = copy.deepcopy(self.model)
            old_model.load_state_dict(self.model.state_dict())

            if any(np.isnan(theta.cpu().detach().numpy())):
                print("NaN detected. Skipping update...")
            else:
                # for param in self.model.policy_parameters():
                #     print(param)
                vector_to_parameters(theta, self.model.policy_parameters())

            kl_old_new = self.mean_kl_divergence(old_model)
            print('KL:{:10} , Entropy:{:10}'.format(kl_old_new.item(),
                                                    ent.mean().item()))

        else:
            print("Policy gradient is 0. Skipping update...")
            print(policy_gradient.shape)

        self.model.zero_grad()

        if self.continuous:
            _, _, values = self.model(self.obs)
        else:
            _, values = self.model(self.obs)

        criterion = torch.nn.MSELoss()
        critic_loss = self.value_loss_coeff * criterion(values, self.est_rs)
        critic_loss.backward()
        self.optim.step()
        print("MSELoss for Value Net:{}".format(critic_loss.item()))
Example #26
def train(episodes):
    env.env.unwrapped.seed(random_seed)
    first_batch = True
    episode_i = 0
    total_i = 0
    curious_reward_std = 0.2
    while episode_i < episodes:  # START MAIN LOOP
        cur_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        avg_curious_reward_batch = []
        curious_reward_q = []
        avg_max_height = []
        i_in_batch = 0
        completed_q = []

        while i_in_batch < N_STEPS:  # START EPISODE BATCH LOOP
            cur_state = env.reset()
            cur_state_copy = cur_state.copy()
            cur_state_copy[1] = cur_state_copy[1]/0.035

            done = False
            ret = 0
            curious_ret = 0
            i_in_episode = 0
            episode_distance_q = []
            next_cur_state_episode_q = []
            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state_copy).unsqueeze(0).float()
                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    # val_out = ac_net_critic(torch_state)
                    # curious_out = ac_net_c_critic(torch_state)
                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                if episode_i < 15:
                    clamped_action = torch.clamp(action, min=-1, max=1).data.numpy()
                else:
                    clamped_action = torch.clamp(action, min=-1, max=1).data.numpy()

                episode_distance_q.append(cur_state[0])
                # Step environment
                next_state, reward, done, info = env.step(clamped_action)

                # Append values to queues
                cur_state_q.append(cur_state_copy)

                next_state_copy = next_state.copy()
                next_state_copy[1] = next_state_copy[1]/0.035
                next_cur_state_episode_q.append(next_state_copy)
                next_state_q.append(next_state_copy)

                reward_i = reward/20.0
                reward_q.append(float(reward_i))
                # value_q.append(val_out)
                action_q.append(action.data.numpy())
                action_log_prob_q.append(distribution.log_prob(torch.tensor(clamped_action)).data.numpy())
                done_q.append(1-done)  # Why 1-done?

                ret += reward  # Sum total reward for episode

                # Iterate counters, etc
                cur_state = next_state
                cur_state_copy = next_state_copy
                i_in_episode += 1
                i_in_batch += 1
                total_i += 1
                if i_in_episode % 50 == 0 and episode_i % 10 == 0 and episode_i >= 0:
                    env.render()
                # if i_in_episode > 500:
                #     done = True
                if done:
                    break

            # END SINGLE EPISODE

            if ret > 0.01:
                completed_q += np.ones((len(episode_distance_q), 1)).tolist()
            else:
                completed_q += np.zeros((len(episode_distance_q), 1)).tolist()

            next_state_episode = np.asarray(next_cur_state_episode_q)
            next_curious_state = get_curious_state(next_state_episode, p1, p2)

            with torch.no_grad():
                rnd_val = ac_net_rnd(next_curious_state)
                pred_val = ac_net_pred(next_curious_state)

            curious_reward_episode = torch.pow((rnd_val - pred_val), 2)
            curious_rewards_episode = (curious_reward_episode.data.numpy())
            curious_reward_q += curious_rewards_episode.tolist()
            curious_ret = np.sum(curious_rewards_episode)
            avg_curious_ret = curious_ret/i_in_episode

            episode_i += 1
            avg_reward.append(ret)
            avg_curious_reward.append(curious_ret)
            avg_reward_batch.append(ret)
            avg_curious_reward_batch.append(curious_ret)
            avg_max_height_q.append(np.max(episode_distance_q))
            avg_max_height.append(np.max(episode_distance_q))
            print("%4d, %6.2f, %6.0f | " % (episode_i, np.max(episode_distance_q), curious_ret))

        # print("")
        # END EPISODE BATCH LOOP

        max_achieved_height_in_batch = np.max(avg_max_height)

        # NORMALIZE CURIOUS REWARD
        if first_batch:
            curious_reward_std = np.std(np.asarray(curious_reward_q))
            first_batch = False


        # START CUMULATIVE REWARD CALC
        curious_reward_q = curious_reward_q / curious_reward_std
        discounted_reward = []
        discounted_curious_reward = []
        cul_reward = 0
        cul_curious_reward = 0
        for reward, cur_reward, done, in zip(reversed(reward_q), reversed(curious_reward_q), reversed(done_q)):
            if done == 1:
                cul_reward = cul_reward*gamma1 + reward
                cul_curious_reward = cul_curious_reward*gamma2 + cur_reward
                discounted_reward.insert(0, cul_reward)
                discounted_curious_reward.insert(0, cul_curious_reward)
            elif done == 0:
                cul_reward = reward
                cul_curious_reward = cul_curious_reward*gamma2 + cur_reward
                discounted_reward.insert(0, cul_reward)
                discounted_curious_reward.insert(0, cul_curious_reward)

        # CALCULATE ADVANTAGE
        # TODO: vectorize this advantage computation instead of looping
        current_state_t = torch.tensor(cur_state_q).float()
        curious_advantage_q_new = []
        advantage_q_new = []
        with torch.no_grad():
            value_t_new = ac_net_critic(current_state_t)
            curious_value_t_new = ac_net_c_critic(current_state_t)

        for reward_i, value_i in zip(np.asarray(discounted_reward), value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        for reward_i, value_i in zip(np.asarray(discounted_curious_reward), curious_value_t_new.data.numpy()):
            curious_advantage_q_new.append(reward_i - value_i)
        curious_advantage_q_new = np.asarray(curious_advantage_q_new)

        advantage_q_new = (advantage_q_new-np.mean(advantage_q_new))/(np.std(advantage_q_new))  # Should advantage be recalculated at each optimize step?
        # curious_advantage_q_new = (curious_advantage_q_new-np.mean(curious_advantage_q_new))/(np.std(curious_advantage_q_new))  # Should advantage be recalculated at each optimize step?
        curious_advantage_q_new = (np.asarray(discounted_curious_reward) - np.mean(discounted_curious_reward))/(np.std(discounted_curious_reward))  # Should advantage be recalculated at each optimize step?

        plotted_data = np.transpose(np.asarray((np.asarray(cur_state_q)[:, 0], np.squeeze(discounted_curious_reward))))
        plotted_data = np.transpose(np.asarray((np.asarray(cur_state_q)[:, 0], np.squeeze(curious_advantage_q_new))))

        plt.plot(plotted_data)
        plt.show()

        max_curious_advantage = np.max(curious_advantage_q_new)
        std_curious_advantage = np.std(curious_advantage_q_new)
        mean_curious_advantage = np.mean(curious_advantage_q_new)

        max_advantage = np.max(advantage_q_new)
        std_advantage = np.std(advantage_q_new)
        mean_advantage = np.mean(advantage_q_new)

        advantage_t = torch.tensor(advantage_q_new).float()
        curious_advantage_t = torch.tensor(curious_advantage_q_new).float()
        completed_t = torch.tensor(np.asarray(completed_q)).float()
        # advantage_t = completed_t * advantage_t
        a_prop = 0.5
        summed_advantage_t = torch.add(torch.mul(advantage_t, 1), torch.mul(curious_advantage_t, 1))

        # START UPDATING NETWORKS

        batch_length = len(cur_state_q)

        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()
        curious_reward_t = torch.tensor(discounted_curious_reward).float()
        summed_reward_t = torch.add(curious_reward_t, reward_t)

        # START BASELINE OPTIMIZE
        avg_baseline_loss = []
        for epoch in range(B_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            avg_baseline_batch_loss = []
            avg_baseline_curious_batch_loss = []
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors

                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                batch_curious_reward_t = torch.index_select(curious_reward_t, 0, batch_idx)
                batch_summed_reward_t = torch.index_select(summed_reward_t, 0, batch_idx)
                batch_start = batch_end

                n_batch += 1

                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)
                new_curious_val = ac_net_c_critic(batch_state_t)
                # Regress each critic onto its own discounted return and
                # optimise both below (a single combined critic target is left
                # commented out).
                # new_summed_val = new_val + new_curious_val
                critic_loss_batch = criterion_val(new_val, batch_reward_t.unsqueeze(1))
                critic_curious_loss_batch = criterion_val(new_curious_val, batch_curious_reward_t)
                # critic_loss_batch = criterion_val(new_summed_val, batch_summed_reward_t.unsqueeze(1))
                # critic_loss_both = critic_curious_loss_batch  # + critic_loss_batch
                optimizer_c.zero_grad()
                optimizer_cc.zero_grad()

                critic_loss_batch.backward()
                critic_curious_loss_batch.backward()
                optimizer_cc.step()
                optimizer_c.step()

                # avg_value_STD.append(critic_loss_batch.item())
                avg_baseline_batch_loss.append(critic_loss_batch.item())
                avg_baseline_curious_batch_loss.append(critic_curious_loss_batch.item())
            # print(np.mean(avg_baseline_batch_loss), np.mean(avg_baseline_curious_batch_loss), " ", end="")
            # avg_baseline_loss.append(np.mean(avg_baseline_batch_loss))

        # print("")
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0, batch_idx).float()
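                # Advantage selection: when CURIOUS is set, the combined
                # advantage is used once the batch contains a non-negligible
                # extrinsic reward (max > 0.01) and the pure curiosity
                # advantage otherwise; without CURIOUS the plain extrinsic
                # advantage is always used.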
                if np.max(reward_q) > 0.01:
                    if CURIOUS:
                        batch_advantage_t = torch.index_select(summed_advantage_t, 0, batch_idx)
                    else:
                        batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx)
                else:
                    if CURIOUS:
                        batch_advantage_t = torch.index_select(curious_advantage_t, 0, batch_idx)
                    else:
                        batch_advantage_t = torch.index_select(advantage_t, 0, batch_idx)

                # batch_advantage_t = torch.index_select(summed_advantage_t, 0, batch_idx)

                batch_action_log_prob_t = torch.index_select(action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)

                batch_start = batch_end
                n_batch += 1

                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)

                # Broadcast the selected advantage to the per-dimension shape
                # of r_theta_i. (Open question from the author: should the
                # advantage be recalculated each epoch?)
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)

                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - epsilon, 1 + epsilon) * batch_advantage_t4

                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(batch_entropy)

                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                # Mean clipped surrogate, negated for gradient descent, plus a
                # hard-coded 0.03 entropy term (note the + sign on a minimised
                # loss penalises entropy rather than encouraging it).
                L_clip = -torch.sum(r_theta_surrogate_min) / r_theta_surrogate_min.size()[0] + 0.03 * batch_entropy_loss
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()

        # END OPTIMIZE POLICY
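
        # The loss minimised above is the negative of the PPO clipped
        # objective (plus the entropy term):
        #   L_CLIP(theta) = E_t[ min(r_t(theta) * A_t,
        #                            clip(r_t(theta), 1 - eps, 1 + eps) * A_t) ]
        # with r_t(theta) = pi_theta(a_t | s_t) / pi_theta_old(a_t | s_t).
        # Hedged sketch (illustrative only, not called here): computing the
        # ratio as exp(new_log_prob - old_log_prob) is numerically safer than
        # exponentiating both log probabilities and dividing, as done above.
        def _ppo_ratio(new_log_prob, old_log_prob):
            return torch.exp(new_log_prob - old_log_prob)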

        # START OPTIMIZE CURIOUS

        # Build the RND network inputs (get_curious_state is defined elsewhere
        # in this file) and train the predictor on only N_CURIOUS_BATCH
        # randomly chosen states per epoch, which limits how quickly the
        # predictor catches up with the fixed target network.
        curious_state_t = get_curious_state(np.asarray(cur_state_q), p1, p2)
        avg_curious_loss = []
        curious_batch_length = N_CURIOUS_BATCH
        for epoch in range(R_epochs):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0
            # Loop over permutation
            # avg_curious_loss = []
            while batch_end < curious_batch_length:
                # Get batch indexes
                batch_end = batch_start + N_CURIOUS_BATCH
                if batch_end > curious_batch_length:
                    batch_end = curious_batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(curious_state_t, 0, batch_idx).float()
                # batch_state_t = batch_state_t.unsqueeze(1)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)
                # batch_summed_reward_t = torch.index_select(summed_reward_t, 0, batch_idx)
                batch_start = batch_end
                n_batch += 1

                with torch.no_grad():
                    rnd_val = ac_net_rnd(batch_state_t)
                pred_val = ac_net_pred(batch_state_t)
                # Calculate loss compared with reward and optimize
                optimizer_rnd.zero_grad()
                pred_loss_batch_curious = criterion_val(pred_val, rnd_val)
                pred_loss_batch_curious.backward()
                # nn.utils.clip_grad_norm(ac_net_pred.parameters(), 1)
                # nn.utils.clip_grad_value_(ac_net_pred.parameters(), 100)
                # clip_min_grad_value_(ac_net_pred.parameters(), 0.2)

                optimizer_rnd.step()
                avg_curious_loss.append(pred_loss_batch_curious.item())

            # print((pred_loss_batch_curious.data.numpy()), " ", end="")
            # print("")
            # print(epoch)
        # print("")

        if episode_i % return_time == 0:
            print("%4d | %6.0d | %6.1f, %6.1f | %6.1f, %6.1f | %6.2f, %6.2f, %6.2f | %6.2f, %6.2f, %6.2f | %6.2f, %6.2f | %6.2f"
                  % (episode_i, total_i,
                     np.mean(avg_reward_batch), np.mean(avg_reward),
                     np.mean(avg_curious_reward_batch), np.mean(avg_curious_reward),
                     max_advantage, mean_advantage, std_advantage,
                     max_curious_advantage, mean_curious_advantage, std_curious_advantage,
                     max_achieved_height_in_batch, np.mean(avg_max_height_q),
                     torch.mean(batch_entropy).item()))
        # END UPDATING ACTOR

    return episode_i


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('run_number', help="Consecutive number of this run")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-ns', '--network-size', type=int, default=128)
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-rl', '--reward-level', type=int, default=3)
    parser.add_argument('-rn', '--random-level', type=int, default=3)
    parser.add_argument('-sc', '--reward-scale', type=int, default=6)
    parser.add_argument('-rp',
                        '--repeat',
                        type=int,
                        default=1,
                        help='How many times to repeat an action')
    parser.add_argument('-bt', '--batch-size', type=int)
    parser.add_argument('-os', '--optimization-steps', type=int)
    parser.add_argument('-bs', '--baseline-steps', type=int)
    parser.add_argument('-mb', '--mini-batch', type=int, default=128)
    parser.add_argument('-sd',
                        '--seed',
                        type=int,
                        default=None,
                        help='Random seed for this trial')
    parser.add_argument('-tk', '--task', type=int, default=0)
    parser.add_argument('-gm', '--gamma', type=float, default=0.99)
    parser.add_argument('-lr', '--epsilon', type=float, default=0.2,
                        help="PPO clipping epsilon (note: '-lr' is not a learning rate)")
    parser.add_argument('-en', '--entropy', type=float, default=0.0)
    args = parser.parse_args()

    random_seed = args.seed
    # torch.manual_seed() requires an int, so only seed explicitly when a
    # seed was actually supplied (the default is None).
    if random_seed is not None:
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)
        random.seed(a=random_seed)

    # Make environment and set parameters
    env = gym.make('EnvTestContinuousR-v2')
    env.unwrapped.set_reward(args.reward_level)
    env.unwrapped.set_random(args.random_level)
    env.unwrapped.set_reward_scale(args.reward_scale)
    env.unwrapped.set_task(args.task)
    env.unwrapped.seed(random_seed)
    env.unwrapped.set_repeat(args.repeat)

    return_time = 1

    # Set network parameters and initialize
    N_STATES = 5
    N_ACTIONS = 3

    # Initialise network and hyper params
    NETWORK_SIZE = args.network_size
    ac_net_critic = Net_Critic.Net(N_STATES, NETWORK_SIZE)
    ac_net_actor = Net_Actor.Net(N_STATES, N_ACTIONS, NETWORK_SIZE)

    criterion_val = nn.MSELoss()
    optimizer_c = torch.optim.Adam(ac_net_critic.parameters(),
                                   lr=0.0001,
                                   betas=(0.9, 0.999),
                                   eps=1e-08,
                                   weight_decay=0.00,
                                   amsgrad=False)
    optimizer_a = torch.optim.Adam(ac_net_actor.parameters(),
                                   lr=0.0001,
                                   betas=(0.9, 0.999),
                                   eps=1e-08,
                                   weight_decay=0.00,
                                   amsgrad=False)
    # optimizer_c = torch.optim.SGD(ac_net_critic.parameters(), lr=0.001, momentum=0.9, nesterov=True)
    # optimizer_a = torch.optim.SGD(ac_net_actor.parameters(), lr=0.001, momentum=0.9, nesterov=True)

    gamma = args.gamma

    N_TRAJECTORIES = args.batch_size
    K_EPOCHS = args.optimization_steps
    B_EPOCHS = args.baseline_steps
    N_MINI_BATCH = args.mini_batch
    EPSILON = args.epsilon

    # Initialize tracking queues
    avg_reward = deque(maxlen=100)

    # Setup filename
    run_number = args.run_number
    # Naming variables
    nNum = str(run_number).zfill(4)
    task = env.unwrapped.task
    if task == 'LIFT':
        nTask = 'L'
    else:
        nTask = 'P'
    nReward = env.unwrapped.reward_level
    nRandom = env.unwrapped.rand_level
    nSeed = str(random_seed).zfill(2)
    nAlg = 'mPPO'

    nName = ("{}-{}{}{}-{}-{}".format(nNum, nTask, nReward, nRandom, nSeed,
                                      nAlg))

    # Initialize global counters
    episode_i = 0
    total_i = 0
    episodes = args.episodes
    # noinspection PyCallingNonCallable
    while episode_i < episodes:  # START MAIN LOOP
        # Initialize batch lists
        current_state_q = []
        next_state_q = []
        reward_q = []
        action_log_prob_q = []
        value_q = []
        advantage_q_new = []
        done_q = []
        action_q = []
        avg_reward_batch = []
        episode_in_batch = 0
        i_in_batch = 0

        #if episode_i > 500:
        #    env.unwrapped.set_repeat(int(args.repeat/2))

        while episode_in_batch < N_TRAJECTORIES:
            # Reset environment and get first state
            cur_state = env.reset()
            done = False
            ret = 0
            i_in_episode = 0

            while not done:  # RUN SINGLE EPISODE
                # Get parameters for distribution and assign action
                torch_state = torch.tensor(cur_state).unsqueeze(0).float()

                with torch.no_grad():
                    mu, sd = ac_net_actor(torch_state)
                    val_out = ac_net_critic(torch_state)

                distribution = Normal(mu[0], sd[0])
                action = distribution.sample()
                clamped_action_t = torch.clamp(action, -1.0, 1.0)
                clamped_action = clamped_action_t.data.numpy()

                # Step environment
                next_state, reward, done, info = env.step(clamped_action)

                # Append values to queues
                current_state_q.append(cur_state)
                next_state_q.append(next_state)
                reward_q.append(float(reward))
                value_q.append(val_out)
                action_q.append(clamped_action)
                action_log_prob_q.append(
                    distribution.log_prob(clamped_action_t).data.numpy())
                done_q.append(1 - done)

                ret += reward  # Sum total reward for episode

                # Iterate counters, etc
                cur_state = next_state
                i_in_episode += 1
                i_in_batch += 1
                total_i += 1

                #if i_in_episode % 1 == 0 and episode_i % 10 == 0 and episode_i >= 0:
                #    env.render()
                # Guard against the default of None before comparing
                if (args.max_episode_timesteps is not None
                        and i_in_episode > args.max_episode_timesteps):
                    done = True
                if done:
                    break

            # END SINGLE EPISODE

            episode_in_batch += 1
            episode_i += 1
            avg_reward.append(ret)
            avg_reward_batch.append(ret)

        # END EPISODE BATCH LOOP

        # START CUMULATIVE REWARD CALC (done_q stores 1 - done, so 0 marks an
        # episode boundary where the running return is reset)
        discounted_reward = []
        cumul_reward = 0
        for reward, done in zip(reversed(reward_q), reversed(done_q)):
            if done == 1:
                cumul_reward = cumul_reward * gamma + reward
                discounted_reward.insert(0, cumul_reward)
            elif done == 0:
                cumul_reward = reward
                discounted_reward.insert(0, cumul_reward)

        # SET UP TENSORS
        batch_length = len(current_state_q)

        current_state_t = torch.tensor(current_state_q).float()
        action_log_prob_t = torch.tensor(action_log_prob_q).float()
        action_t = torch.tensor(action_q).float()
        reward_t = torch.tensor(discounted_reward).float()

        # CALCULATE ADVANTAGE
        value_t_new = ac_net_critic(current_state_t)
        for reward_i, value_i in zip(np.asarray(discounted_reward),
                                     value_t_new.data.numpy()):
            advantage_q_new.append(reward_i - value_i)
        advantage_q_new = np.asarray(advantage_q_new)
        # TODO check how this is converted between numpy and tensor

        advantage_q_new = (advantage_q_new - np.mean(advantage_q_new)) / (
            np.std(advantage_q_new))

        advantage_t = torch.tensor(advantage_q_new).float()

        # START UPDATING NETWORKS

        # START BASELINE OPTIMIZE
        for epoch in range(B_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(
                torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0

            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0,
                                                   batch_idx).float()
                batch_reward_t = torch.index_select(reward_t, 0, batch_idx)

                # Get new baseline values
                new_val = ac_net_critic(batch_state_t)

                # Calculate loss compared with reward and optimize
                critic_loss_batch = criterion_val(new_val,
                                                  batch_reward_t.unsqueeze(1))

                # Do optimization
                optimizer_c.zero_grad()
                critic_loss_batch.backward()
                optimizer_c.step()

                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END BASELINE OPTIMIZE

        # START POLICY OPTIMIZE
        for epoch in range(K_EPOCHS):
            # Get random permutation of indexes
            indexes = torch.tensor(np.random.permutation(batch_length)).type(
                torch.LongTensor)
            n_batch = 0
            batch_start = 0
            batch_end = 0

            # Loop over permutation
            while batch_end < batch_length:
                # Get batch indexes
                batch_end = batch_start + N_MINI_BATCH
                if batch_end > batch_length:
                    batch_end = batch_length

                batch_idx = indexes[batch_start:batch_end]

                # Gather data from saved tensors
                batch_state_t = torch.index_select(current_state_t, 0,
                                                   batch_idx).float()
                batch_advantage_t = torch.index_select(advantage_t, 0,
                                                       batch_idx).float()
                batch_action_log_prob_t = torch.index_select(
                    action_log_prob_t, 0, batch_idx)
                batch_action_t = torch.index_select(action_t, 0, batch_idx)
                # batch_reward_t = torch.index_select(reward_t, 0, batch_idx)

                # Get new batch of parameters and action log probs
                mu_batch, sd_batch = ac_net_actor(batch_state_t)
                batch_distribution = Normal(mu_batch, sd_batch)
                exp_probs = batch_distribution.log_prob(batch_action_t).exp()
                old_exp_probs = batch_action_log_prob_t.exp()
                r_theta_i = torch.div(exp_probs, old_exp_probs)

                # Expand advantage to dimensions of r_theta_i
                batch_advantage_t4 = batch_advantage_t.expand_as(r_theta_i)

                # Calculate the options
                surrogate1 = r_theta_i * batch_advantage_t4
                surrogate2 = torch.clamp(r_theta_i, 1 - EPSILON,
                                         1 + EPSILON) * batch_advantage_t4

                # Calculate batch entropy (squared here, and scaled below by
                # args.entropy, which defaults to 0 and thus disables the term)
                batch_entropy = batch_distribution.entropy()
                batch_entropy_loss = torch.mean(torch.pow(batch_entropy, 2))

                # Choose minimum of surrogates and calculate L_clip as final loss function
                r_theta_surrogate_min = torch.min(surrogate1, surrogate2)
                L_clip = -torch.sum(r_theta_surrogate_min) / (
                    r_theta_surrogate_min.size()[0]
                ) + args.entropy * batch_entropy_loss

                # if batch_entropy_loss > 1.2:
                #     L_clip = L_clip + 0.05 * batch_entropy_loss

                # Optimize
                optimizer_a.zero_grad()
                L_clip.backward()
                optimizer_a.step()

                # Iterate counters
                batch_start = batch_end
                n_batch += 1
        # END UPDATING ACTOR

        if episode_i % return_time == 0:
            print("%4d, %6.0d, %6.2f, %6.2f | %6.2f" %
                  (episode_i, total_i, np.mean(avg_reward_batch),
                   np.mean(avg_reward), torch.mean(batch_entropy).item()))

        with open(
                '/home/adf/exp715/Box2dEnv/Box2dEnv/saves/{}.csv'.format(
                    nName), 'a+') as csv:
            for ret_write in np.asarray(avg_reward_batch):
                csv.write("{:2.2f}\n".format(ret_write))

        # END UPDATE OF BATCH - RETURN TO TOP WHILE STILL EPISODES TO GO

    # END MAIN LOOP

    env.close()
Exemple #28
0
    def lowtrain(self):
        buffer, buffer_capacity, batch_size = self.lowmemory.show()
        s = torch.tensor(buffer['s'], dtype=torch.double).to(self.device)
        option = torch.tensor(buffer['option'],
                              dtype=torch.double).view(-1, 1).to(self.device)

        s_ = torch.tensor(buffer['s_'], dtype=torch.double).to(self.device)
        option_ = torch.tensor(buffer['option_'],
                               dtype=torch.double).view(-1, 1).to(self.device)

        a = torch.tensor(buffer['a'], dtype=torch.double).to(self.device)
        old_a_logp = torch.tensor(buffer['a_logp'],
                                  dtype=torch.double).view(-1,
                                                           1).to(self.device)
        r = torch.tensor(buffer['r'],
                         dtype=torch.double).view(-1, 1).to(self.device)
        done = torch.tensor(buffer['done'],
                            dtype=torch.double).view(-1, 1).to(self.device)

        action_loss_record, value_loss_record, entropy_record, loop_record = 0, 0, 0, 0

        with torch.no_grad():
            value_next = self.lownet(s_)['value']
            option_change_next = torch.where(option_ > 5,
                                             torch.zeros_like(option_),
                                             option_)
            value_next_zeros = torch.gather(value_next, 1,
                                            option_change_next.long())
            value_next = torch.where(
                option_ > 5,
                value_next.sum(dim=1, keepdim=True) /
                self.config.get('num_options'), value_next_zeros)

            value_now = self.lownet(s)['value']
            option_change_now = torch.where(option > 5,
                                            torch.zeros_like(option), option)
            value_now_zeros = torch.gather(value_now, 1,
                                           option_change_now.long())
            value_now = torch.where(
                option > 5,
                value_now.sum(dim=1, keepdim=True) /
                self.config.get('num_options'), value_now_zeros)

            delta = r + (
                1 - done) * self.config.get('gamma') * value_next - value_now
            adv = torch.zeros_like(delta)
            adv[-1] = delta[-1]
            # GAE
            for i in reversed(range(buffer_capacity - 1)):
                adv[i] = delta[i] + self.config.get('tau') * (
                    1 - done[i]) * adv[i + 1]

            target_v = value_now + adv
            # Normalize advantage; np.float was removed in recent NumPy
            # releases, so use the float64 epsilon explicitly.
            adv = (adv - adv.mean()) / (adv.std() + np.finfo(np.float64).eps)
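            # The recursion above is Generalised Advantage Estimation (GAE):
            #   delta_t = r_t + gamma * (1 - done_t) * V(s_{t+1}) - V(s_t)
            #   A_t     = delta_t + gamma * lambda * (1 - done_t) * A_{t+1}
            # where the config value 'tau' plays the role of the gamma * lambda
            # factor as written here. target_v = V(s_t) + A_t is the value
            # target, and the advantage is standardised before the PPO update.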

        for _ in range(self.config.get('ppoepoch')):
            for index in BatchSampler(
                    SubsetRandomSampler(range(buffer_capacity)), batch_size,
                    False):
                mean, logstd = self.lownet(s[index])['mean'], self.lownet(
                    s[index])['logstd']
                std = logstd.exp()
                dist = Normal(mean, std)
                a_logp = dist.log_prob(a[index])
                option_short = option[index]

                mask = torch.zeros_like(a_logp).double()
                index_list = [
                    torch.where(option_short == i)[0]
                    for i in range(self.config.get('num_options'))
                ]
                input_list = torch.zeros(self.config.get('num_options'),
                                         self.config.get('action_dim'))
                start_list = self.config.get('start_list')
                end_list = self.config.get('end_list')
                for i in range(self.config.get('num_options')):
                    input_list[i][start_list[i]:end_list[i]] = 1

                for i in range(self.config.get('num_options')):
                    if index_list[i].shape[0] != 0:
                        mask[index_list[i]] = torch.ones(
                            index_list[i].shape[0],
                            self.config.get('action_dim')).double().to(
                                self.device) * input_list[i].double().to(
                                    self.device)

                a_logp = a_logp * mask
                a_p_1 = a_logp.sum(dim=1, keepdim=True)
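                # The mask zeroes out the log-probabilities of action
                # dimensions that do not belong to the sampled option (each
                # option owns the slice start_list[i]:end_list[i] of the
                # action vector), so a_p_1 is the joint log-probability over
                # only the dimensions that option actually controls.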

                ratio = torch.exp((a_p_1 - old_a_logp[index]))
                surr1 = ratio * adv[index]
                surr2 = torch.clamp(
                    ratio, 1.0 - self.config.get('clip_param'),
                    1.0 + self.config.get('clip_param')) * adv[index]
                action_loss = -torch.min(surr1, surr2).mean()
                entropy = dist.entropy() * mask

                value_now = self.lownet(s[index])['value']
                option_change_now = torch.where(
                    option[index] > 5, torch.zeros_like(option[index]),
                    option[index])
                value_now_zeros = torch.gather(value_now, 1,
                                               option_change_now.long())
                value_now = torch.where(
                    option[index] > 5,
                    value_now.sum(dim=1, keepdim=True) /
                    self.config.get('num_options'), value_now_zeros)

                value_loss = F.smooth_l1_loss(value_now, target_v[index])
                self.lowoptimizition.zero_grad()
                loss = action_loss + value_loss - self.config.get(
                    'entropy_para_low') * entropy.mean()
                loss.backward()
                nn.utils.clip_grad_norm_(self.lownet.parameters(),
                                         self.config.get('max_grad_norm'))
                self.lowoptimizition.step()

                action_loss_record += action_loss.cpu().detach()
                value_loss_record += value_loss.cpu().detach()
                entropy_record += entropy.mean().cpu().detach()
                loop_record += 1

        return {
            'actionloss': action_loss_record / loop_record,
            'valueloss': value_loss_record / loop_record,
            'entropy': entropy_record / loop_record,
        }
Exemple #29
0
def get_entropy(mu, std):
    dist = Normal(mu, std)
    entropy = dist.entropy().mean()
    return entropy
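# For a diagonal Gaussian, dist.entropy() is the closed-form per-dimension
# entropy 0.5 * log(2 * pi * e * std**2); the function above simply averages
# it over all batch elements and dimensions. Hedged usage sketch:
#   ent = get_entropy(torch.zeros(8, 3), torch.ones(8, 3))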
Exemple #30
0
    def entropy(self):
        # Note: if self.log_var holds a log-variance, the standard deviation
        # passed as `scale` would normally be (0.5 * self.log_var).exp();
        # here the exponentiated log_var is used directly.
        distribution = Normal(loc=self.mu, scale=self.log_var.exp())
        return distribution.entropy().mean()