Code Example #1
File: models.py  Project: MartinBertran/ReaPER
 def get_action(self, belief, state, det=False, scale=None):
     action_mean, action_std = self.forward(belief, state)
     if scale:
         #exploration distribution
         dist = Normal(action_mean,
                       action_std + action_std.detach() * (1 - scale))
         dist = TransformedDistribution(dist, TanhBijector())
         dist = torch.distributions.Independent(dist, 1)
         dist = SampleDist(dist)
         action = dist.mode() if det else dist.rsample()
         proposal_loglike = dist.log_prob(action).detach()
         #true distribution
         dist = Normal(action_mean, action_std)
         dist = TransformedDistribution(dist, TanhBijector())
         dist = torch.distributions.Independent(dist, 1)
         dist = SampleDist(dist)
         policy_loglike = dist.log_prob(action)
         return action, policy_loglike, proposal_loglike
     else:
         dist = Normal(action_mean, action_std)
         dist = TransformedDistribution(dist, TanhBijector())
         dist = torch.distributions.Independent(dist, 1)
         dist = SampleDist(dist)
         action = dist.mode() if det else dist.rsample()
         return action
Code Example #2
def evaluate_policy(observation,
                    deterministic: bool = True,
                    with_log_prob: bool = False):
    if not isinstance(observation, torch.Tensor):
        observation = torch.as_tensor(observation, dtype=torch.float32)
    output = policy.forward(observation)
    mu = policy_mu_layer.forward(output)
    log_sigma = policy_log_sigma_layer.forward(output).clamp(
        LOG_STD_MIN, LOG_STD_MAX)
    std = torch.exp(log_sigma)

    pi_distribution = Normal(mu, std)
    if deterministic:
        # Only used for evaluating policy at test time.
        action = mu
    else:
        action = pi_distribution.rsample()

    if with_log_prob:
        log_prob = pi_distribution.log_prob(action).sum(axis=-1)
        log_prob -= (2 * (np.log(2) - action - F.softplus(-2 * action))).sum(
            axis=-1)
    else:
        log_prob = None

    action = torch.tanh(action)
    action = action_higher_bound * action

    return action, log_prob
Code Example #3
    def forward(self, obs, deterministic = False, with_logprob = True):
        obs = F.relu(self.layer1(obs))
        obs = F.relu(self.layer2(obs))

        mean = self.mean_layer(obs)
        log_std = self.log_std_layer(obs)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)

        dist = Normal(mean, std)

        if deterministic:
            action = mean
        else:
            action = dist.rsample()

        if with_logprob:
            logp_prob = dist.log_prob(action).sum(axis=-1)
            logp_prob -= (2*(np.log(2) - action - F.softplus(-2*action))).sum(axis=-1)
        else:
            logp_prob = None

        action = torch.tanh(action)
        action = self.act_limit * action

        # logp_prob is None when with_logprob=False, so only unsqueeze a real tensor
        return action, logp_prob.unsqueeze(-1) if logp_prob is not None else None
Code Example #4
    def forward(self, o, deterministic=False, squash_action=True):
        net_ouput = self.net(o)
        mu = self.mu(net_ouput)
        log_std = self.log_std(net_ouput)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        #std = torch.exp(log_std)
        std = torch.sigmoid(log_std)

        policy = Normal(mu, std)
        if deterministic:
            pi = mu
        else:
            pi = policy.rsample()  # .sample() : no grad

        # Squash those unbounded actions
        if squash_action:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            # NOTE: The correction formula is a little bit magic. To get an understanding
            # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
            # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
            # Try deriving it yourself as a (very difficult) exercise. :)

            # gaussian likelihood
            logp_pi = policy.log_prob(pi).sum(dim=1)
            logp_pi -= (2 * (np.log(2) - pi - F.softplus(-2 * pi))).sum(axis=1)
        else:
            logp_pi = None
        pi = torch.tanh(pi)
        return mu, pi, logp_pi
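
The "magic" correction in the comment above is simply a numerically stable rewrite of the tanh change-of-variables term: 2*(log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2). A small stand-alone check (not part of any of the listed projects) confirms this:

import numpy as np
import torch
import torch.nn.functional as F

u = torch.linspace(-5.0, 5.0, steps=11, dtype=torch.float64)
naive = torch.log(1.0 - torch.tanh(u) ** 2)        # direct form; loses precision for large |u|
stable = 2.0 * (np.log(2) - u - F.softplus(-2.0 * u))
print(torch.allclose(naive, stable))               # True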
Code Example #5
    def forward(self, state, deterministic=False):
        """
        Param state is a torch tensor
        """
        x = F.relu(self.linear1(state))
        x = F.relu(self.linear2(x))
        x = self.linear3(x)

        mu = self.mu_layer(x)
        log_std = self.log_std_layer(x)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        std = torch.exp(log_std)

        # Pre-squash distribution and sample
        pi_distribution = Normal(mu, std)
        if deterministic:
            pi_action = mu
        else:
            pi_action = pi_distribution.rsample()

        # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        logp_pi -= (2*(np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(axis=1)

        pi_action = torch.tanh(pi_action)
        return pi_action, logp_pi
Code Example #6
File: networks.py  Project: Aks-Dmv/maddpg-pytorch
    def forward(self, X):
        """
        Inputs:
            X (PyTorch Matrix): Batch of observations
        Outputs:
            out (PyTorch Matrix): Output of network (actions, values, etc)
        """
        h1 = self.nonlin(self.fc1(self.in_fn(X)))
        h2 = self.nonlin(self.fc2(h1))

        mu = self.mu_layer(h2)
        log_std = self.log_std_layer(h2)
        log_std = torch.clamp(log_std, -3, 1)
        std = torch.exp(log_std)

        pi_distribution = Normal(mu, std)
        if self.training:
            pi_action = pi_distribution.rsample()
        else:
            pi_action = mu

        logp_pi = pi_distribution.log_prob(pi_action).sum(dim=-1)
        log_2_torch = torch.log(torch.Tensor([2.]))
        if pi_action.is_cuda:
            log_2_torch = log_2_torch.cuda()
        logp_pi -= (
            2 *
            (log_2_torch - pi_action - F.softplus(-2 * pi_action))).sum(dim=-1)

        pi_action = self.out_fn(pi_action)
        return pi_action, logp_pi
Code Example #7
File: networks.py  Project: LeeDaeil/CNS_Autonomous
    def sample_normal(self, state, reparameterize=True):
        if self.control_strategy == 'Continuouse':
            mu, sigma = self.forward(state)
            probabilities = Normal(mu, sigma)

            if reparameterize:
                actions = probabilities.rsample()
            else:
                actions = probabilities.sample()

            action = T.tanh(actions) * T.tensor(self.max_action).to(
                self.device)
            log_probs = probabilities.log_prob(actions)
            log_probs -= T.log(1 - action.pow(2) + self.reparam_noise)

            log_probs = log_probs.sum(1, keepdim=True)  # needed for the loss computation

            # Not used.
            action_probs, greedy_actions = 0, 0
        else:
            action_probs = self.forward(state)
            greedy_actions = T.argmax(action_probs, dim=1, keepdim=True)

            categorical = Categorical(action_probs)
            action = categorical.sample().view(-1, 1)

            log_probs = T.log(action_probs +
                              (action_probs == 0.0).float() * 1e-8)

        return action, log_probs, action_probs, greedy_actions
Code Example #8
    def sample_normal(self, state, reparameterize=True):
        """
        Calculation of the actual policy.

        Policy = "What's the probability of choosing a certain action given a certain state?"

        This is assuming a continuous action space.

        TODO change this to a discrete action space! (simpler = faster = easier)
        http://www.youtube.com/watch?v=ioidsRlf79o&t=25m30s
        :param state:
        :param reparameterize: if True, use rsample() (the reparameterization trick) so gradients flow through the sampled action
        :return:
        """
        mu, sigma = self.forward(state)
        probabilities = Normal(mu, sigma)

        if reparameterize:  # differentiable sample via the reparameterization trick
            actions = probabilities.rsample()
        else:  # non-differentiable sample
            actions = probabilities.sample()

        action = T.tanh(actions) * T.tensor(self.max_action).to(self.device)
        log_probs = probabilities.log_prob(actions)
        log_probs -= T.log(1 - action.pow(2) + self.reparam_noise)
        log_probs = log_probs.sum(1, keepdim=True)

        return action, log_probs
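
A side note (not taken from the repository above): the practical difference between the two branches is that rsample() draws mu + std * eps, keeping the result differentiable with respect to the distribution parameters, while sample() returns a detached draw. A quick check:

import torch
from torch.distributions import Normal

mu = torch.zeros(3, requires_grad=True)
std = torch.ones(3, requires_grad=True)
dist = Normal(mu, std)

print(dist.rsample().requires_grad)   # True  -> gradients can flow back into mu and std
print(dist.sample().requires_grad)    # False -> the draw is detached from the graph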
Code Example #9
    def forward(self, x, with_logprob=False):
        x = self.layers(x)
        mean = self.mean_layer(x)
        std = self.log_std_layer(x).clamp(-20, 2).exp()
        pi_distribution = Normal(mean, std)
        pi_action = pi_distribution.rsample()

        if with_logprob:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            # NOTE: The correction formula is a little bit magic. To get an understanding
            # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
            # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
            # Try deriving it yourself as a (very difficult) exercise. :)
            logp = pi_distribution.log_prob(pi_action).sum(axis=-1)
            logp -= (2 *
                     (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                         axis=1)
        else:
            logp = None

        x = torch.tanh(pi_action)  # [N, action_dim]

        # scale (-1, 1) to [action.low, action_high]
        action = (x + 1) * (self.action_high -
                            self.action_low) / 2 + self.action_low

        if with_logprob:
            return (action, logp)
        else:
            return action
Code Example #10
    def forward(self, obs, deterministic=False, with_logprob=True):
        net_out = self.net(obs)
        mu = self.mu_layer(net_out)
        log_std = self.log_std_layer(net_out)
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        std = torch.exp(log_std)

        pi_distribution = Normal(mu, std)
        if deterministic:
            pi_action = mu  # Used for evaluating policy at test time.
        else:
            pi_action = pi_distribution.rsample()

        if with_logprob:
            logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
            logp_pi -= (
                2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                    axis=1)
        else:
            logp_pi = None

        pi_action = torch.tanh(pi_action)
        pi_action = self.act_limit * pi_action

        return pi_action, logp_pi
Code Example #11
    def sample_action(self, state, reparameterize=False, actor_network=None):
        if actor_network is None:
            actor_network = self.actor

        # Forward
        if isinstance(state, np.ndarray):
            state = torch.from_numpy(state).to(self.device)

        actor_output = actor_network(state)

        if len(state.shape) > 1:  # It's a batch
            actions_means = actor_output[:, :self.nb_actions]
            actions_stds = actor_output[:, self.nb_actions:]
        else:
            actions_means = actor_output[:self.nb_actions]
            actions_stds = actor_output[self.nb_actions:]

        actions_stds = torch.clamp(actions_stds, min=self.min_std, max=1)

        actions_distribution = Normal(actions_means, actions_stds)

        if reparameterize:
            actions = actions_distribution.rsample()
        else:
            actions = actions_distribution.sample()

        action = torch.tanh(
            actions) * self.actions_bounds_range + self.actions_bounds_mean
        log_probs = actions_distribution.log_prob(actions)
        log_probs -= torch.log(1 - action.pow(2) + self.min_std)
        log_probs = log_probs.sum(dim=-1)

        return action, log_probs
Code Example #12
def perplexity(model, data, device):
    model.eval()
    total_per = 0
    for step, (input, targets, lenghts) in enumerate(data):
        input = input.to(device)
        targets = targets.to(device)
        batch_size = input.shape[0]
        seq_len = input.shape[1]
        lenghts = torch.tensor(lenghts).to(device).float()
        mean, std = model.encoder(input)

        #Reparameterization trick
        q_z = Normal(mean, std)
        sample_z = q_z.rsample()

        h_0 = torch.tanh(model.upscale(sample_z)).unsqueeze(0)

        if (model.skip):
            z = model.z_lin(sample_z).unsqueeze(1)
            px_logits, _ = model.skip_decoder(input, h_0, z, device)
        else:
            px_logits, _ = model.decoder(input, h_0)

        criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')
        perplexity = 0
        for i in range(batch_size):
            seq = px_logits[i, :, :]
            target = targets[i, :]
            perplexity += torch.exp(criterion(seq, target) / lenghts[i])
        perplexity /= batch_size
        total_per += perplexity.detach()
    return total_per / (step + 1)  # step is zero-based, so (step + 1) batches were processed
Code Example #13
    def forward(self, rnn_outputs):
        #Create Mu
        mu = []
        for _i in range(self.steps):
            #mu_tmp = torch.matmul(rnn_outputs[:,_i,:],mu_w)+mu_b
            mu_tmp = self.mu(rnn_outputs[:, _i, :])
            mu.append(mu_tmp)
        mu = torch.cat(mu, dim=1).view(-1, self.steps, self.p_att_shape[1])

        #Create Sigma
        sigma = []
        for _k in range(self.steps):
            sigma_tmp = self.sigma(rnn_outputs[:, _k, :])
            sigma.append(sigma_tmp)
        sigma = torch.cat(sigma, 1).view(-1, self.steps, self.p_att_shape[1])
        sigma = self.softplus(sigma)

        distribution = Normal(loc=mu, scale=sigma)

        att = distribution.rsample([1])
        att = torch.squeeze(att, 0)

        if self.strd_id == 'alpha':
            squashed_att = self.softmax(att)
            #print('Done with generating alpha attention.')
        elif self.strd_id == 'beta':
            squashed_att = self.tanh(att)
            #print('Done with generating beta attention.')
        else:
            raise ValueError(
                "Invalid attention id: expected 'alpha' or 'beta'."
            )

        return squashed_att
Code Example #14
    def forward(self, obs, deterministic=False, with_logprob=True):
        net_out = self.net(obs)
        mu = self.mu_layer(net_out)
        log_std = self.log_std_layer(net_out)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)

        # Pre-squash distribution and sample
        pi_distribution = Normal(mu, std)
        if deterministic:
            # Only used for evaluating policy at test time.
            pi_action = mu
        else:
            pi_action = pi_distribution.rsample()

        if with_logprob:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
            logp_pi -= (
                2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                    axis=1)
        else:
            logp_pi = None

        pi_action = torch.tanh(pi_action)
        pi_action = self.act_limit * pi_action

        return pi_action, logp_pi
Code Example #15
def compute_stochastic_elbo(a, b, nu, omega, x, y, a_0, b_0, mu_0):
    """
    Return a monte-carlo estimate of the ELBO, using a single sample from Q(sigma^-2, beta)
    
    a, b are the Gamma 'shape' and 'rate' parameters for the variational posterior over *precision*: q(tau) = q(sigma^-2)
    nu, omega are the Normal 'mean' and 'precision' parameters for the variational posterior over weights: q(beta_k)
    x is an n by k matrix, where each row contains the regression inputs [1, x, x^2, x^3]
    y is an n by 1 vector of targets
    a_0, b_0 are the parameters for the Gamma prior over precision P(tau) = P(sigma^-2)
    mu_0 is the mean of the Normal prior over the weights beta
    """
    
    # Define mean field variational distribution over (beta, tau).
    Q_beta = Normal(nu, omega**-0.5)
    Q_tau = Gamma(a, b) 
    
    # Sample from variational distribution: (tau, beta) ~ Q
    # Use rsample to make sure that the result is differentiable.
    tau = Q_tau.rsample()
    sigma = tau**-0.5
    beta = Q_beta.rsample()
    
    # Create a single sample monte-carlo estimate of ELBO.
    P_tau = Gamma(a_0, b_0) 
    P_beta = Normal(mu_0, sigma) 
    P_y = Normal((beta[None, :]*x).sum(dim=1, keepdim=True), sigma) 
    
    kl_tau = Q_tau.log_prob(tau) - P_tau.log_prob(tau)
    kl_beta = Q_beta.log_prob(beta).sum() - P_beta.log_prob(beta).sum()
    log_likelihood = P_y.log_prob(y).sum()

    elbo = log_likelihood - kl_tau - kl_beta
    return elbo
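
A hypothetical usage sketch (the parameter names, prior values, and hyperparameters below are assumptions, not part of the original snippet): the single-sample ELBO above can be maximized by gradient ascent on the variational parameters, with softplus keeping a, b, and omega strictly positive.

import torch
import torch.nn.functional as F

# x: (n, k) design matrix, y: (n, 1) targets, as described in the docstring above
k = x.shape[1]

raw_a = torch.zeros((), requires_grad=True)       # unconstrained q(tau) shape
raw_b = torch.zeros((), requires_grad=True)       # unconstrained q(tau) rate
nu = torch.zeros(k, requires_grad=True)           # q(beta) means
raw_omega = torch.zeros(k, requires_grad=True)    # unconstrained q(beta) precisions

opt = torch.optim.Adam([raw_a, raw_b, nu, raw_omega], lr=1e-2)
for _ in range(2000):
    opt.zero_grad()
    a, b = F.softplus(raw_a), F.softplus(raw_b)
    omega = F.softplus(raw_omega)
    loss = -compute_stochastic_elbo(a, b, nu, omega, x, y,
                                    a_0=1.0, b_0=1.0, mu_0=0.0)
    loss.backward()
    opt.step()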
Code Example #16
File: sac.py  Project: tmjeong1103/RL_with_RAY
    def forward(self, o, deterministic=False, get_logprob=True):
        net_ouput = self.net(o)
        mu = self.mu(net_ouput)
        log_std = self.log_std(net_ouput)

        LOG_STD_MIN, LOG_STD_MAX = -10.0, +2.0
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)  #log_std
        #std = torch.exp(log_std)
        std = torch.sigmoid(log_std)  #std

        # Pre-squash distribution and sample
        dist = Normal(mu, std)
        if deterministic:
            pi = mu
        else:
            pi = dist.rsample()  # sampled

        if get_logprob:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            # NOTE: The correction formula is a little bit magic. To get an understanding
            # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
            # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
            # Try deriving it yourself as a (very difficult) exercise. :)
            logp_pi = dist.log_prob(pi).sum(
                axis=-1)  #gaussian log_likelihood # modified axis
            logp_pi -= (2 * (np.log(2) - pi - F.softplus(-2 * pi))).sum(axis=1)
        else:
            logp_pi = None
        pi = torch.tanh(pi)
        return pi, logp_pi
Code Example #17
File: SkipVAE.py  Project: Sasafrass/NLP2_DGM
    def forward(self, input, targets, lengths, device):
        """
        Given input, perform an encoding and decoding step and return the
        negative average elbo for the given batch.
        """
        batch_size = input.shape[0]
        seq_len = input.shape[1]
        
        average_negative_elbo = None
        mean, std = self.encoder(input)
        
        #Reparameterization trick
        q_z = Normal(mean,std)
        sample_z = q_z.rsample()

        h_0 = torch.tanh(self.upscale(sample_z)).unsqueeze(0)
        z = self.z_lin(sample_z).unsqueeze(1)
        px_logits, _ = self.decoder(input,h_0,z,device)
        p_x = Categorical(logits=px_logits)
        
        prior = Normal(torch.zeros(self.z_dim).to(device),torch.ones(self.z_dim).to(device))
        
        KLD = distributions.kl_divergence(q_z, prior)

        criterion =  nn.CrossEntropyLoss(ignore_index=0)
        recon_loss = criterion(p_x.logits.view(batch_size*seq_len,-1),targets.view(-1))*seq_len
        average_negative_elbo = torch.sum(torch.mean(KLD,dim=0)) + recon_loss
        
        return average_negative_elbo, KLD
Code Example #18
    def evaluate(self,
                 state,
                 deterministic: bool = False,
                 with_log_probability: bool = True):
        mean, std = self.forward(state)
        distribution = Normal(mean, std)
        sample = distribution.rsample()

        if deterministic:
            action = mean
        else:
            action = torch.tanh(
                sample
            )  # todo when sac working, multiply by action_scale and add action_bias

        if with_log_probability:
            # Implementation that I originally implemented
            # the "_" are only here for now to debug the values and the shapes
            # log_probability_ = distribution.log_prob(sample) - torch.log((1 - action.pow(2)) + self.epsilon)
            # log_probability = log_probability_.sum(1, keepdim=True)

            # OPENAI Implementation
            # https://github.com/openai/spinningup/blob/038665d62d569055401d91856abb287263096178/spinup/algos/pytorch/sac/core.py#L59
            log_probability_ = distribution.log_prob(sample).sum(axis=-1,
                                                                 keepdim=True)
            log_probability__ = (
                2 * (np.log(2) - sample - F.softplus(-2 * sample))).sum(
                    axis=1).unsqueeze(1)
            log_probability = log_probability_ - log_probability__
        else:
            log_probability = None

        return action, log_probability
Code Example #19
    def log_forward(self, x):

        out = torch.Tensor(x).reshape(-1, self.in_dim)

        out = self.l1(out)
        out = self.leaky_relu(out)
        out = self.l2(out)
        out = self.leaky_relu(out)
        out = self.l3(out)
        out = self.leaky_relu(out)
        out = self.l4(out)
        #out = self.tanh(out)

        mu = self.mu_linear(out)
        log_std = self.log_linear(out)

        log_std = torch.clamp(log_std, -20, 2)
        std = torch.exp(log_std)
        distribution = Normal(mu, std)

        action = distribution.rsample()
        log_p = distribution.log_prob(action)
        log_p -= (2 * (np.log(2) - action - F.softplus(-2 * action)))

        action = torch.tanh(action)

        return action, log_p
Code Example #20
	def forward(self, observation, deterministic=False, with_log_prob=True):
		net_out = self.net(observation)
		# compute the \mu and \sigma of the gaussian
		mu = self.mu_layer(net_out)

		log_std = self.log_std_layer(net_out)
		log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
		std = torch.exp(log_std)

		# Pre-squash distribution and sample
		pi_distribution = Normal(mu, std)

		if deterministic:
			# only used for evaluating policy at test time.
			pi_action = mu
		else:
			pi_action = pi_distribution.rsample()

		if with_log_prob:
			# Appendix C
			log_pro_pi = pi_distribution.log_prob(pi_action).sum(dim=-1)
			log_pro_pi -= (2 * (np.log(2) - pi_action - F.softplus(-2*pi_action))).sum(dim=-1)
		else:
			log_pro_pi = None

		pi_action = torch.tanh(pi_action)
		pi_action = self.act_limit * pi_action
		return pi_action, log_pro_pi
Code Example #21
    def forward(self, obs, deterministic=False, with_logprob=True):  # Actor

        actor_tmp = F.relu(self.actor_layer1(obs))
        actor_tmp = F.relu(self.actor_layer2(actor_tmp))
        mu = self.mu_layer(actor_tmp)
        log_std = self.log_std_layer(actor_tmp)
        log_std = torch.clamp(log_std, self.log_std_min, self.log_std_max)
        std = torch.exp(log_std)

        distribution = Normal(mu, std)
        if deterministic:
            action = mu
        else:
            action = distribution.rsample()

        # Reference for OpenAI SpinningUp's Implementation of SAC
        if with_logprob:
            logp_prob = distribution.log_prob(action).sum(axis=-1)
            logp_prob -= (2 *
                          (np.log(2) - action - F.softplus(-2 * action))).sum(
                              axis=1)
        else:
            logp_prob = None

        action = torch.tanh(action)
        action = self.act_limit * action

        return action, logp_prob
Code Example #22
    def forward(self,
                x,
                deterministic=False,
                repara_trick=False,
                with_logprob=True):
        x = torch.FloatTensor(x.reshape(1, -1)).to(self.args.device)
        x = F.relu(self.l1(x))
        log_std = self.log_std_layer(x)
        std = torch.exp(log_std)
        x = F.relu(self.l2(x))
        x = F.relu(self.l2_additional(x))
        mu = self.max_action * torch.tanh(self.l3(x))
        pi_distribution = Normal(mu, std)

        if deterministic:
            pi_action = mu
        elif repara_trick:
            pi_action = pi_distribution.rsample()
        else:
            pi_action = pi_distribution.sample()

        if with_logprob:
            logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
            logp_pi -= (
                2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                    axis=1)
        else:
            logp_pi = None

        pi_action = torch.tanh(pi_action)
        pi_action = self.max_action * pi_action

        return pi_action.cpu().data.numpy().flatten()
Code Example #23
File: popsan.py  Project: starhxh/pop-spiking-deep-rl
 def forward(self, obs, batch_size, deterministic=False, with_logprob=True):
     """
     :param obs: observation
     :param batch_size: batch size
     :param deterministic: If true use deterministic action
     :param with_logprob: if true return log prob
     :return: action scale with action limit
     """
     in_pop_spikes = self.encoder(obs, batch_size)
     out_pop_activity = self.snn(in_pop_spikes, batch_size)
     mu = self.decoder(out_pop_activity)
     log_std = self.log_std_network(obs)
     log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
     std = torch.exp(log_std)
     # Pre-squash distribution and sample
     pi_distribution = Normal(mu, std)
     if deterministic:
         # Only used for evaluating policy at test time.
         pi_action = mu
     else:
         pi_action = pi_distribution.rsample()
     if with_logprob:
         logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
         logp_pi -= (
             2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                 axis=1)
     else:
         logp_pi = None
     pi_action = torch.tanh(pi_action)
     pi_action = self.act_limit * pi_action
     return pi_action, logp_pi
Code Example #24
File: core.py  Project: hari-sikchi/AWAC
    def forward(self, obs, deterministic=False, with_logprob=True):
        net_out = self.net(obs)
        mu = self.mu_layer(net_out)
        log_std = self.log_std_layer(net_out)
        log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
        std = torch.exp(log_std)

        # Pre-squash distribution and sample
        pi_distribution = Normal(mu, std)
        if deterministic:
            # Only used for evaluating policy at test time.
            pi_action = mu
        else:
            pi_action = pi_distribution.rsample()

        if with_logprob:
            # Compute logprob from Gaussian, and then apply correction for Tanh squashing.
            # NOTE: The correction formula is a little bit magic. To get an understanding
            # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
            # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
            # Try deriving it yourself as a (very difficult) exercise. :)
            logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
            logp_pi -= (
                2 * (np.log(2) - pi_action - F.softplus(-2 * pi_action))).sum(
                    axis=1)
        else:
            logp_pi = None

        pi_action = torch.tanh(pi_action)
        pi_action = self.act_limit * pi_action

        return pi_action, logp_pi
Code Example #25
File: main.py  Project: insperatum/gpmws
def getProposalDistribution(i, x):
    if random.random() < 0.5 or any(n_observations[i] == 0):
        # Propose using recognition model
        phi, _ = r(x)
        return phi
    else:
        # Local proposal
        model, train_x, train_y = getGPModel(i)
        model.eval()
        likelihood.eval()
        v = Variable(train_x, requires_grad=True)
        m = model(v).mean
        best_score, best_idx = torch.max(m, dim=1)
        v_unrolled = v.data.view(-1, *v.size()[2:])
        best_phi = v_unrolled[
            torch.arange(0, len(v_unrolled), v.size(1)).cuda() + best_idx]
        #m.sum().backward()
        #grad_unrolled = v.grad.view(-1, *v.size()[2:])
        #best_grad = grad_unrolled[torch.arange(0, len(v_unrolled), v.size(1)).cuda() + best_idx]
        lr = 0.01  #Variable(torch.Tensor([0.001]).cuda(), requires_grad=True)
        #step = best_grad*lr
        mu = best_phi  # + step
        sigma = lr  #step.abs()
        dist = Normal(mu, sigma)
        return dist.rsample()
Code Example #26
File: main.py  Project: insperatum/gpmws
 def forward(self, z, x=None):
     coeffs = z
     mu = (coeffs[:, :, None] * self.inputs[None, :, :]).sum(dim=1)
     dist = Normal(mu, self.sigma)
     if x is None: x = dist.rsample()
     score = dist.log_prob(x)
     score = score.sum(dim=1)
     return x, score
Code Example #27
 def sample_prior(self, shape=torch.Size([]), store=False):
     p_mu = Normal(self.mu_loc_prior * torch.ones(self.dim_in),
                   self.mu_scale_prior * torch.ones(self.dim_in))
     p_s = LogitNormal(loc=p_mu.rsample(shape),
                       scale=self.s_scale_prior * torch.ones(self.dim_in))
     s = p_s.rsample(shape)
     if store: self.s = s
     return s
Code Example #28
File: sac.py  Project: seolhokim/Mujoco-Pytorch
 def get_action(self, state):
     mu, std = self.actor(state)
     dist = Normal(mu, std)
     u = dist.rsample()
     u_log_prob = dist.log_prob(u)
     a = torch.tanh(u)
     a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
     return a, a_log_prob.sum(-1, keepdim=True)
Code Example #29
 def forward(self, x):
     hidden = self.model(x)
     mu = self.mu(hidden)
     sigma = self.sigma(hidden)
     std = nn.functional.softplus(sigma)
     dist = Normal(mu, std)
     z = dist.rsample()
     return z, dist, mu
Code Example #30
File: models.py  Project: zivzone/dreamer-pytorch
 def get_action(self, belief, state, det=False):
     action_mean, action_std = self.forward(belief, state)
     dist = Normal(action_mean, action_std)
     dist = TransformedDistribution(dist, TanhBijector())
     dist = torch.distributions.Independent(dist, 1)
     dist = SampleDist(dist)
     if det: return dist.mode()
     else: return dist.rsample()
Code Example #31
File: example_pixelcnn.py  Project: insperatum/vhe
	def forward(self, inputs, c, z=None):
		inputs = inputs.view(-1, 1, 28, 28) #huh?
		mu = self.localization_mu(inputs)
		sigma = self.localization_sigma(inputs)
		dist = Normal(mu, sigma)
		if z is None: 
			z = dist.rsample()
		score = dist.log_prob(z).sum(dim=1).sum(dim=1).sum(dim=1)
		return z, score
Code Example #32
File: example_czx.py  Project: insperatum/vhe
 def forward(self, inputs, c=None):    
     inputs_permuted = inputs.transpose(0,1) # |D| * batch * ... 
     embeddings = [self.enc(x) for x in inputs_permuted]
     mean_embedding = sum(embeddings)/len(embeddings)
     mu_c = self.mu_c(mean_embedding)
     sigma_c = self.sigma_c(mean_embedding)
     dist = Normal(mu_c, sigma_c)
     if c is None: c = dist.rsample()
     return c, dist.log_prob(c).sum(dim=1) # Return value, score
Code Example #33
File: example_pixelcnn.py  Project: insperatum/vhe
	def forward(self, inputs, c=None):
		# transform the input
		xs = [self.stn(inputs[:,i,:,:,:]) for i in range(inputs.size(1))]

		embs = [self.conv_post_stn(x) for x in xs]
		emb = sum(embs)/len(embs)
		mu = self.conv_mu(emb)
		sigma = self.conv_sigma(emb)
		dist = Normal(mu, sigma)
		if c is None: c = dist.rsample()
		return c, dist.log_prob(c).sum(dim=1).sum(dim=1).sum(dim=1)
Code Example #34
File: example_czx.py  Project: insperatum/vhe
 def forward(self, inputs, c, z=None):    
     mu_z = self.mu_z(inputs[:, 0])
     sigma_z = self.sigma_z(inputs[:, 0])
     dist = Normal(mu_z, sigma_z)
     if z is None: z = dist.rsample()
     return z, dist.log_prob(z).sum(dim=1) # Return value, score
Code Example #35
File: example_czx.py  Project: insperatum/vhe
 def forward(self, c, z, x=None):
     cz = torch.cat([c,z], dim=1)
     dist = Normal(self.mu(cz), self.sigma(cz))
     if x is None: x = dist.rsample()
     return x, dist.log_prob(x).sum(dim=1) # Return value, score