def get_action(self, belief, state, det=False, scale=None):
    """Pick an action from the tanh-squashed Gaussian policy.

    The mean and standard deviation come from ``self.forward(belief, state)``.
    When ``scale`` is truthy the action is drawn from an exploration
    distribution whose standard deviation is widened to
    ``std + std.detach() * (1 - scale)``, and the log-likelihood of that
    action under both distributions is returned alongside it.

    Args:
        belief: first input forwarded to ``self.forward``.
        state: second input forwarded to ``self.forward``.
        det: when true use ``dist.mode()`` instead of ``dist.rsample()``.
        scale: exploration scale. ``None`` or any other falsy value
            (including ``0``) selects the plain policy branch.

    Returns:
        ``action`` alone when ``scale`` is falsy; otherwise the tuple
        ``(action, policy_loglike, proposal_loglike)`` where
        ``proposal_loglike`` is detached from the autograd graph.
    """
    action_mean, action_std = self.forward(belief, state)

    def build(std):
        # Normal -> tanh transform -> last dim treated as the event -> SampleDist.
        squashed = TransformedDistribution(Normal(action_mean, std), TanhBijector())
        return SampleDist(torch.distributions.Independent(squashed, 1))

    if not scale:
        policy = build(action_std)
        return policy.mode() if det else policy.rsample()

    # Exploration (proposal) distribution: widened std, extra term carries no grad.
    proposal = build(action_std + action_std.detach() * (1 - scale))
    action = proposal.mode() if det else proposal.rsample()
    proposal_loglike = proposal.log_prob(action).detach()

    # Score the very same action under the unmodified policy.
    policy_loglike = build(action_std).log_prob(action)
    return action, policy_loglike, proposal_loglike
def evaluate_policy(observation, deterministic: bool = True, with_log_prob: bool = False):
    """Run the module-level policy network and return a bounded action.

    Args:
        observation: a tensor, or anything ``torch.as_tensor`` can convert
            (converted to ``float32``).
        deterministic: when true the Gaussian mean is used instead of a
            reparameterized sample.
        with_log_prob: when true also return the log-probability of the
            tanh-squashed action.

    Returns:
        ``(action, log_prob)``; ``action`` is ``action_higher_bound * tanh(u)``
        and ``log_prob`` is ``None`` unless ``with_log_prob`` is set.
    """
    obs = observation
    if not isinstance(obs, torch.Tensor):
        obs = torch.as_tensor(obs, dtype=torch.float32)

    features = policy.forward(obs)
    mu = policy_mu_layer.forward(features)
    log_sigma = torch.clamp(
        policy_log_sigma_layer.forward(features), LOG_STD_MIN, LOG_STD_MAX)
    pi_distribution = Normal(mu, torch.exp(log_sigma))

    # The mean is only meant for evaluating the policy at test time.
    pre_squash = mu if deterministic else pi_distribution.rsample()

    log_prob = None
    if with_log_prob:
        gaussian = pi_distribution.log_prob(pre_squash).sum(axis=-1)
        # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
        tanh_term = (2 * (np.log(2) - pre_squash
                          - F.softplus(-2 * pre_squash))).sum(axis=-1)
        log_prob = gaussian - tanh_term

    return action_higher_bound * torch.tanh(pre_squash), log_prob
def forward(self, obs, deterministic=False, with_logprob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        obs: observation tensor fed through ``layer1`` and ``layer2``.
        deterministic: when true the Gaussian mean is used instead of a
            reparameterized sample.
        with_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(action, logp)``. ``action`` is ``self.act_limit * tanh(u)``.
        ``logp`` has a trailing singleton dimension, or is ``None`` when
        ``with_logprob`` is false.
    """
    hidden = F.relu(self.layer1(obs))
    hidden = F.relu(self.layer2(hidden))
    mean = self.mean_layer(hidden)
    # torch.clamp is out-of-place: its result must be kept, otherwise the
    # log-std stays unbounded and exp() can overflow.
    log_std = torch.clamp(self.log_std_layer(hidden), LOG_STD_MIN, LOG_STD_MAX)
    dist = Normal(mean, torch.exp(log_std))

    action = mean if deterministic else dist.rsample()

    if with_logprob:
        # Reduce over the last (action) axis in both terms so batched and
        # unbatched inputs behave the same.
        logp_prob = dist.log_prob(action).sum(axis=-1)
        # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
        logp_prob -= (2 * (np.log(2) - action
                           - F.softplus(-2 * action))).sum(axis=-1)
        logp_prob = logp_prob.unsqueeze(-1)
    else:
        # Nothing to unsqueeze when no log-probability was requested.
        logp_prob = None

    action = self.act_limit * torch.tanh(action)
    return action, logp_prob
def forward(self, o, deterministic=False, squash_action=True):
    """Gaussian policy head returning the mean, the tanh-squashed action and
    (optionally) its log-probability.

    Args:
        o: observation tensor fed to ``self.net``.
        deterministic: when true the Gaussian mean is used instead of a
            reparameterized sample.
        squash_action: when true compute the log-probability of the squashed
            action; otherwise ``logp_pi`` is ``None``.

    Returns:
        ``(mu, pi, logp_pi)`` where ``pi`` is ``tanh`` of the pre-squash action.
    """
    features = self.net(o)
    mu = self.mu(features)
    bounded = torch.clamp(self.log_std(features), LOG_STD_MIN, LOG_STD_MAX)
    # The scale is sigmoid(...) of the clamped output rather than exp(...).
    std = torch.sigmoid(bounded)
    policy = Normal(mu, std)

    # rsample (not sample) keeps the draw differentiable.
    pi = mu if deterministic else policy.rsample()

    logp_pi = None
    if squash_action:
        # Gaussian log-likelihood minus the tanh change-of-variables term
        # (SAC paper, arXiv 1801.01290, appendix C); the softplus form is a
        # numerically stable way of writing log(1 - tanh(u)^2).
        gaussian = policy.log_prob(pi).sum(dim=1)
        tanh_term = (2 * (np.log(2) - pi - F.softplus(-2 * pi))).sum(axis=1)
        logp_pi = gaussian - tanh_term

    pi = torch.tanh(pi)
    return mu, pi, logp_pi
def forward(self, state, deterministic=False):
    """Tanh-squashed Gaussian policy.

    Args:
        state: torch tensor of observations; the last axis is treated as the
            feature/action axis, so both batched and unbatched inputs work.
        deterministic: when true the Gaussian mean is used instead of a
            reparameterized sample.

    Returns:
        ``(pi_action, logp_pi)``: the action in (-1, 1) and its
        log-probability summed over the last axis.
    """
    hidden = F.relu(self.linear1(state))
    hidden = F.relu(self.linear2(hidden))
    hidden = self.linear3(hidden)

    mu = self.mu_layer(hidden)
    log_std = torch.clamp(
        self.log_std_layer(hidden), self.LOG_STD_MIN, self.LOG_STD_MAX)

    # Pre-squash distribution and sample.
    pi_distribution = Normal(mu, torch.exp(log_std))
    pi_action = mu if deterministic else pi_distribution.rsample()

    # Gaussian log-likelihood minus the tanh correction. Both reductions use
    # the last axis (the correction previously used axis=1, which fails for
    # an unbatched 1-D state).
    logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
    logp_pi -= (2 * (np.log(2) - pi_action
                     - F.softplus(-2 * pi_action))).sum(axis=-1)

    return torch.tanh(pi_action), logp_pi
def forward(self, X):
    """Sample an action and its log-probability from the Gaussian policy.

    In training mode (``self.training``) the action is a reparameterized
    sample; otherwise the mean is used.

    Args:
        X: batch of observations (passed through ``self.in_fn`` first).

    Returns:
        ``(pi_action, logp_pi)``: the action after ``self.out_fn`` and the
        log-probability summed over the last axis, including the
        ``2 * (log 2 - u - softplus(-2u))`` correction term.
    """
    import math

    h1 = self.nonlin(self.fc1(self.in_fn(X)))
    h2 = self.nonlin(self.fc2(h1))

    mu = self.mu_layer(h2)
    log_std = torch.clamp(self.log_std_layer(h2), -3, 1)
    pi_distribution = Normal(mu, torch.exp(log_std))

    pi_action = pi_distribution.rsample() if self.training else mu

    logp_pi = pi_distribution.log_prob(pi_action).sum(dim=-1)
    # A plain Python constant broadcasts on whatever device/dtype pi_action
    # uses; the former tensor + .cuda() could land on a different GPU than
    # the action tensor.
    log_2 = math.log(2.0)
    logp_pi -= (2 * (log_2 - pi_action
                     - F.softplus(-2 * pi_action))).sum(dim=-1)

    pi_action = self.out_fn(pi_action)
    return pi_action, logp_pi
def sample_normal(self, state, reparameterize=True):
    """Sample an action and its log-probability for either control strategy.

    Continuous branch (``self.control_strategy == 'Continuouse'``): draws
    from ``Normal(mu, sigma)`` given by ``self.forward``, squashes with
    ``tanh`` and scales by ``self.max_action``.

    Otherwise ``self.forward`` is taken to return categorical action
    probabilities; an action is sampled from them.

    Args:
        state: input forwarded to ``self.forward``.
        reparameterize: continuous branch only. Use ``rsample`` (gradients
            flow through the sample) instead of ``sample``.

    Returns:
        ``(action, log_probs, action_probs, greedy_actions)``. In the
        continuous branch the last two are the placeholder ``0``; in the
        other branch ``log_probs`` is the log of all action probabilities.
    """
    if self.control_strategy == 'Continuouse':
        mu, sigma = self.forward(state)
        probabilities = Normal(mu, sigma)

        if reparameterize:
            actions = probabilities.rsample()
        else:
            actions = probabilities.sample()

        squashed = T.tanh(actions)
        action = squashed * T.tensor(self.max_action).to(self.device)

        # Needed to compute the loss. The tanh Jacobian must be taken on the
        # unit-range value: using the max_action-scaled action makes
        # 1 - action^2 negative (NaN) whenever |action| > 1.
        log_probs = probabilities.log_prob(actions)
        log_probs -= T.log(1 - squashed.pow(2) + self.reparam_noise)
        log_probs = log_probs.sum(1, keepdim=True)

        # Not used in the continuous case.
        action_probs, greedy_actions = 0, 0
    else:
        action_probs = self.forward(state)
        greedy_actions = T.argmax(action_probs, dim=1, keepdim=True)

        categorical = Categorical(action_probs)
        action = categorical.sample().view(-1, 1)

        # Replace exact zeros by 1e-8 before the log to avoid -inf.
        log_probs = T.log(action_probs + (action_probs == 0.0).float() * 1e-8)

    return action, log_probs, action_probs, greedy_actions
def sample_normal(self, state, reparameterize=True):
    """Sample an action from the policy and return its log-probability.

    The policy answers "what is the probability of choosing a certain action
    given a certain state?". This assumes a continuous action space.

    TODO change this to a discrete action space! (simpler = faster = easier)
    http://www.youtube.com/watch?v=ioidsRlf79o&t=25m30s

    :param state: input forwarded to ``self.forward``, which returns the
        Gaussian ``(mu, sigma)``.
    :param reparameterize: use ``rsample`` so gradients can flow through the
        sampled action; otherwise use plain ``sample``.
    :return: ``(action, log_probs)`` where ``action`` is
        ``tanh(u) * max_action`` and ``log_probs`` is summed over dim 1
        (``keepdim=True``).
    """
    mu, sigma = self.forward(state)
    probabilities = Normal(mu, sigma)

    if reparameterize:
        actions = probabilities.rsample()
    else:
        actions = probabilities.sample()

    squashed = T.tanh(actions)
    action = squashed * T.tensor(self.max_action).to(self.device)

    # The tanh Jacobian must be taken on the unit-range value: using the
    # max_action-scaled action makes 1 - action^2 negative (NaN) whenever
    # |action| > 1.
    log_probs = probabilities.log_prob(actions)
    log_probs -= T.log(1 - squashed.pow(2) + self.reparam_noise)
    log_probs = log_probs.sum(1, keepdim=True)

    return action, log_probs
def forward(self, x, with_logprob=False):
    """Sample a bounded action from the tanh-squashed Gaussian policy.

    The action is always a reparameterized sample. After ``tanh`` it is
    rescaled from (-1, 1) to ``[self.action_low, self.action_high]``.

    Args:
        x: observation tensor fed to ``self.layers``.
        with_logprob: when true also return the log-probability of the
            squashed (pre-rescaling) action.

    Returns:
        ``action`` alone, or ``(action, logp)`` when ``with_logprob`` is true.
    """
    hidden = self.layers(x)
    mean = self.mean_layer(hidden)
    std = self.log_std_layer(hidden).clamp(-20, 2).exp()
    pi_distribution = Normal(mean, std)
    pi_action = pi_distribution.rsample()

    squashed = torch.tanh(pi_action)  # [N, action_dim]
    # Scale (-1, 1) to [action_low, action_high].
    action = (squashed + 1) * (self.action_high - self.action_low) / 2 \
        + self.action_low

    if not with_logprob:
        return action

    # Gaussian log-likelihood minus the tanh correction (SAC paper,
    # arXiv 1801.01290, appendix C). Both reductions use the last axis; the
    # correction previously used axis=1, which fails for unbatched input.
    logp = pi_distribution.log_prob(pi_action).sum(axis=-1)
    logp -= (2 * (np.log(2) - pi_action
                  - F.softplus(-2 * pi_action))).sum(axis=-1)
    return (action, logp)
def forward(self, obs, deterministic=False, with_logprob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        obs: observation tensor fed to ``self.net``.
        deterministic: when true use the Gaussian mean (for evaluating the
            policy at test time) instead of a reparameterized sample.
        with_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(pi_action, logp_pi)``; ``pi_action`` is
        ``self.act_limit * tanh(u)`` and ``logp_pi`` is ``None`` when
        ``with_logprob`` is false.
    """
    net_out = self.net(obs)
    mu = self.mu_layer(net_out)
    log_std = torch.clamp(
        self.log_std_layer(net_out), self.LOG_STD_MIN, self.LOG_STD_MAX)
    pi_distribution = Normal(mu, torch.exp(log_std))

    pi_action = mu if deterministic else pi_distribution.rsample()

    logp_pi = None
    if with_logprob:
        # Both reductions run over the last axis so that unbatched
        # observations work too (the correction previously used axis=1).
        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
        logp_pi -= (2 * (np.log(2) - pi_action
                         - F.softplus(-2 * pi_action))).sum(axis=-1)

    pi_action = self.act_limit * torch.tanh(pi_action)
    return pi_action, logp_pi
def sample_action(self, state, reparameterize=False, actor_network=None):
    """Sample a bounded action and its log-probability from an actor network.

    The network output is split into ``self.nb_actions`` means followed by
    the standard deviations, which are clamped to ``[self.min_std, 1]``.

    Args:
        state: tensor, or numpy array (moved to ``self.device``).
        reparameterize: use ``rsample`` (differentiable) instead of
            ``sample``.
        actor_network: network to evaluate; defaults to ``self.actor``.

    Returns:
        ``(action, log_probs)`` where ``action`` is
        ``tanh(u) * actions_bounds_range + actions_bounds_mean`` and
        ``log_probs`` is summed over the last axis.
    """
    if actor_network is None:
        actor_network = self.actor

    if isinstance(state, np.ndarray):
        state = torch.from_numpy(state).to(self.device)
    actor_output = actor_network(state)

    if len(state.shape) > 1:  # It's a batch
        actions_means = actor_output[:, :self.nb_actions]
        actions_stds = actor_output[:, self.nb_actions:]
    else:
        actions_means = actor_output[:self.nb_actions]
        actions_stds = actor_output[self.nb_actions:]

    actions_stds = torch.clamp(actions_stds, min=self.min_std, max=1)
    actions_distribution = Normal(actions_means, actions_stds)

    if reparameterize:
        actions = actions_distribution.rsample()
    else:
        actions = actions_distribution.sample()

    squashed = torch.tanh(actions)
    action = squashed * self.actions_bounds_range + self.actions_bounds_mean

    # The tanh Jacobian is taken on the unit-range value. Using the rescaled
    # and shifted action makes 1 - action^2 negative (NaN) once the action
    # leaves [-1, 1].
    log_probs = actions_distribution.log_prob(actions)
    log_probs -= torch.log(1 - squashed.pow(2) + self.min_std)
    log_probs = log_probs.sum(dim=-1)

    return action, log_probs
def perplexity(model, data, device):
    """Return the mean per-batch perplexity of ``model`` over ``data``.

    For every batch a latent is sampled from the encoder's Normal, decoded
    (through ``model.skip_decoder`` when ``model.skip`` is set), and the
    per-sequence perplexity ``exp(sum_token_CE / length)`` is averaged over
    the batch. Targets equal to 0 are ignored by the loss.

    Args:
        model: object providing ``eval``, ``encoder``, ``upscale``, ``skip``
            and ``decoder`` (plus ``z_lin`` / ``skip_decoder`` when
            ``skip`` is true).
        data: iterable of ``(input, targets, lengths)`` batches.
        device: device the batch tensors are moved to.

    Returns:
        Tensor holding the average of the per-batch perplexities.

    Raises:
        ValueError: if ``data`` yields no batches.
    """
    model.eval()
    # Stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss(ignore_index=0, reduction='sum')

    total_per = 0
    num_batches = 0
    # Evaluation only: no autograd graph is needed.
    with torch.no_grad():
        for inputs, targets, lenghts in data:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_size = inputs.shape[0]
            lenghts = torch.as_tensor(lenghts).to(device).float()

            mean, std = model.encoder(inputs)

            # Reparameterization trick
            q_z = Normal(mean, std)
            sample_z = q_z.rsample()

            h_0 = torch.tanh(model.upscale(sample_z)).unsqueeze(0)
            if model.skip:
                z = model.z_lin(sample_z).unsqueeze(1)
                px_logits, _ = model.skip_decoder(inputs, h_0, z, device)
            else:
                px_logits, _ = model.decoder(inputs, h_0)

            batch_ppl = 0
            for i in range(batch_size):
                seq_loss = criterion(px_logits[i, :, :], targets[i, :])
                batch_ppl += torch.exp(seq_loss / lenghts[i])
            batch_ppl /= batch_size

            total_per += batch_ppl.detach()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("perplexity() needs at least one batch in `data`")

    # Average over the number of batches. The last enumerate index is one
    # short of that (and zero for a single batch).
    return total_per / num_batches
def forward(self, rnn_outputs):
    """Sample a stochastic attention tensor and squash it.

    ``self.mu`` and ``self.sigma`` are applied to each of the first
    ``self.steps`` slices ``rnn_outputs[:, t, :]``; the results are
    concatenated and viewed as ``(-1, steps, p_att_shape[1])``. A
    reparameterized Normal sample (scale passed through ``self.softplus``)
    is then squashed with ``self.softmax`` for ``strd_id == 'alpha'`` or
    ``self.tanh`` for ``strd_id == 'beta'``.

    Raises:
        ValueError: for any other ``self.strd_id``.
    """
    width = self.p_att_shape[1]
    step_ids = range(self.steps)

    # Location of the attention distribution, one slice per step.
    mu_parts = [self.mu(rnn_outputs[:, t, :]) for t in step_ids]
    mu = torch.cat(mu_parts, dim=1).view(-1, self.steps, width)

    # Scale of the attention distribution; softplus keeps it positive.
    sigma_parts = [self.sigma(rnn_outputs[:, t, :]) for t in step_ids]
    sigma = torch.cat(sigma_parts, 1).view(-1, self.steps, width)
    sigma = self.softplus(sigma)

    # Draw one sample and drop the leading sample dimension again.
    att = Normal(loc=mu, scale=sigma).rsample([1]).squeeze(0)

    if self.strd_id == 'alpha':
        return self.softmax(att)
    if self.strd_id == 'beta':
        return self.tanh(att)
    raise ValueError(
        'You must re-check the attention id. required to \'alpha\' or \'beta\''
    )
def forward(self, obs, deterministic=False, with_logprob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        obs: observation tensor fed to ``self.net``.
        deterministic: when true use the Gaussian mean instead of a
            reparameterized sample (only used for evaluating the policy at
            test time).
        with_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(pi_action, logp_pi)``; ``pi_action`` is
        ``self.act_limit * tanh(u)`` and ``logp_pi`` is ``None`` when
        ``with_logprob`` is false.
    """
    net_out = self.net(obs)
    mu = self.mu_layer(net_out)
    log_std = torch.clamp(self.log_std_layer(net_out), LOG_STD_MIN, LOG_STD_MAX)
    std = torch.exp(log_std)

    # Pre-squash distribution and sample.
    pi_distribution = Normal(mu, std)
    if deterministic:
        pi_action = mu
    else:
        pi_action = pi_distribution.rsample()

    if with_logprob:
        # Compute logprob from the Gaussian, then apply the correction for
        # tanh squashing. Both reductions use the last axis so unbatched
        # observations work as well (the correction previously used axis=1).
        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        logp_pi -= (2 * (np.log(2) - pi_action
                         - F.softplus(-2 * pi_action))).sum(axis=-1)
    else:
        logp_pi = None

    pi_action = torch.tanh(pi_action)
    pi_action = self.act_limit * pi_action
    return pi_action, logp_pi
def compute_stochastic_elbo(a, b, nu, omega, x, y, a_0, b_0, mu_0):
    """Single-sample Monte-Carlo estimate of the ELBO.

    One joint sample ``(tau, beta)`` is drawn from the mean-field variational
    distribution and plugged into
    ``log p(y | beta, tau) - KL_tau - KL_beta``, where each KL term is the
    single-sample estimate ``log q(.) - log p(.)``.

    Args:
        a, b: Gamma 'shape' and 'rate' of the variational posterior over the
            precision ``tau = sigma^-2``.
        nu, omega: Normal 'mean' and 'precision' of the variational posterior
            over the weights ``beta``.
        x: n-by-k matrix of regression inputs (documented upstream as rows
            ``[1, x, x^2, x^3]``).
        y: n-by-1 targets.
        a_0, b_0: parameters of the Gamma prior over the precision.
        mu_0: mean of the Normal prior on the weights; its standard deviation
            is the sampled ``sigma``.

    Returns:
        Scalar tensor with the ELBO estimate, differentiable w.r.t. the
        variational parameters thanks to ``rsample``.
    """
    # Mean-field variational family over (beta, tau).
    q_beta = Normal(nu, omega**-0.5)
    q_tau = Gamma(a, b)

    # Reparameterized draws keep the estimate differentiable.
    tau = q_tau.rsample()
    sigma = tau**-0.5
    beta = q_beta.rsample()

    # Model terms evaluated at the sampled values.
    prior_tau = Gamma(a_0, b_0)
    prior_beta = Normal(mu_0, sigma)
    predicted_mean = (beta[None, :] * x).sum(dim=1, keepdim=True)
    likelihood = Normal(predicted_mean, sigma)

    kl_tau = q_tau.log_prob(tau) - prior_tau.log_prob(tau)
    kl_beta = q_beta.log_prob(beta).sum() - prior_beta.log_prob(beta).sum()
    log_likelihood = likelihood.log_prob(y).sum()

    return log_likelihood - kl_tau - kl_beta
def forward(self, o, deterministic=False, get_logprob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        o: observation tensor fed to ``self.net``.
        deterministic: when true use the Gaussian mean instead of a
            reparameterized sample.
        get_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(pi, logp_pi)``; ``pi`` is ``tanh`` of the pre-squash action and
        ``logp_pi`` is ``None`` when ``get_logprob`` is false.
    """
    net_ouput = self.net(o)
    mu = self.mu(net_ouput)

    LOG_STD_MIN, LOG_STD_MAX = -10.0, +2.0
    log_std = torch.clamp(self.log_std(net_ouput), LOG_STD_MIN, LOG_STD_MAX)
    # The scale is sigmoid(...) of the clamped output rather than exp(...).
    std = torch.sigmoid(log_std)

    # Pre-squash distribution and sample.
    dist = Normal(mu, std)
    pi = mu if deterministic else dist.rsample()

    if get_logprob:
        # Gaussian log-likelihood minus the tanh change-of-variables term
        # (SAC paper, arXiv 1801.01290, appendix C); the softplus form is a
        # numerically stable way of writing log(1 - tanh(u)^2).
        # Both reductions use the last axis; the correction previously used
        # axis=1, which fails for unbatched input.
        logp_pi = dist.log_prob(pi).sum(axis=-1)
        logp_pi -= (2 * (np.log(2) - pi - F.softplus(-2 * pi))).sum(axis=-1)
    else:
        logp_pi = None

    pi = torch.tanh(pi)
    return pi, logp_pi
def forward(self, input, targets, lengths, device):
    """Encode, sample a latent, decode, and return the negative average ELBO.

    Args:
        input: batch of input sequences; dims 0 and 1 are read as batch size
            and sequence length.
        targets: target token ids; id 0 is ignored by the loss.
        lengths: not used by this method.
        device: device for the prior tensors, also handed to the decoder.

    Returns:
        ``(average_negative_elbo, KLD)`` where ``KLD`` is the element-wise
        KL divergence between the encoder's Normal and a standard Normal.
    """
    batch_size, seq_len = input.shape[0], input.shape[1]

    # Reparameterization trick: sample z ~ q(z | input).
    mean, std = self.encoder(input)
    q_z = Normal(mean, std)
    sample_z = q_z.rsample()

    # The latent drives both the initial hidden state and an extra input.
    h_0 = torch.tanh(self.upscale(sample_z)).unsqueeze(0)
    z = self.z_lin(sample_z).unsqueeze(1)
    px_logits, _ = self.decoder(input, h_0, z, device)
    p_x = Categorical(logits=px_logits)

    # KL against a standard-Normal prior.
    prior_loc = torch.zeros(self.z_dim).to(device)
    prior_scale = torch.ones(self.z_dim).to(device)
    KLD = distributions.kl_divergence(q_z, Normal(prior_loc, prior_scale))

    # Mean token cross-entropy, rescaled by the sequence length.
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    flat_logits = p_x.logits.view(batch_size * seq_len, -1)
    recon_loss = criterion(flat_logits, targets.view(-1)) * seq_len

    average_negative_elbo = torch.sum(torch.mean(KLD, dim=0)) + recon_loss
    return average_negative_elbo, KLD
def evaluate(self, state, deterministic: bool = False,
             with_log_probability: bool = True):
    """Return a tanh-squashed action and, optionally, its log-probability.

    Args:
        state: input forwarded to ``self.forward``, which returns the
            Gaussian ``(mean, std)``.
        deterministic: when true the mean is used instead of a
            reparameterized sample.
        with_log_probability: when true also return the log-probability of
            the returned action.

    Returns:
        ``(action, log_probability)``. ``action`` lies in (-1, 1);
        ``log_probability`` keeps a trailing singleton dimension, or is
        ``None`` when not requested.
    """
    mean, std = self.forward(state)
    distribution = Normal(mean, std)

    # Choose the pre-squash value once, so that the action and its
    # log-probability always refer to the same point. Previously the
    # deterministic branch returned the raw unbounded mean while the
    # log-probability was computed for an unrelated sample.
    pre_squash = mean if deterministic else distribution.rsample()

    # todo when sac working, multiply by action_scale and add action_bias
    action = torch.tanh(pre_squash)

    if with_log_probability:
        # Same formulation as the OpenAI Spinning Up implementation:
        # https://github.com/openai/spinningup/blob/038665d62d569055401d91856abb287263096178/spinup/algos/pytorch/sac/core.py#L59
        # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
        gaussian = distribution.log_prob(pre_squash).sum(axis=-1, keepdim=True)
        tanh_term = (2 * (np.log(2) - pre_squash
                          - F.softplus(-2 * pre_squash))).sum(
                              axis=-1, keepdim=True)
        log_probability = gaussian - tanh_term
    else:
        log_probability = None

    return action, log_probability
def log_forward(self, x):
    """Sample a tanh-squashed action and its per-dimension log-probability.

    ``x`` is converted with ``torch.Tensor`` and reshaped to
    ``(-1, self.in_dim)`` before going through ``l1``..``l4`` (leaky ReLU
    after the first three).

    Returns:
        ``(action, log_p)``: ``action`` is ``tanh`` of a reparameterized
        sample; ``log_p`` is the element-wise log-density with the tanh
        correction applied and is not summed over action dimensions.
    """
    hidden = torch.Tensor(x).reshape(-1, self.in_dim)
    for layer in (self.l1, self.l2, self.l3):
        hidden = self.leaky_relu(layer(hidden))
    hidden = self.l4(hidden)  # no activation after the last layer

    mu = self.mu_linear(hidden)
    log_std = torch.clamp(self.log_linear(hidden), -20, 2)
    distribution = Normal(mu, torch.exp(log_std))

    pre_squash = distribution.rsample()
    # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
    tanh_term = 2 * (np.log(2) - pre_squash - F.softplus(-2 * pre_squash))
    log_p = distribution.log_prob(pre_squash) - tanh_term

    return torch.tanh(pre_squash), log_p
def forward(self, observation, deterministic=False, with_log_prob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        observation: tensor fed to ``self.net``.
        deterministic: when true use the Gaussian mean (for evaluating the
            policy at test time) instead of a reparameterized sample.
        with_log_prob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(pi_action, log_pro_pi)``; ``pi_action`` is
        ``self.act_limit * tanh(u)`` and ``log_pro_pi`` is ``None`` when
        ``with_log_prob`` is false.
    """
    features = self.net(observation)

    # Mean and (clamped) log standard deviation of the Gaussian.
    mu = self.mu_layer(features)
    log_std = torch.clamp(
        self.log_std_layer(features), self.log_std_min, self.log_std_max)

    # Pre-squash distribution and sample.
    pi_distribution = Normal(mu, torch.exp(log_std))
    pre_squash = mu if deterministic else pi_distribution.rsample()

    log_pro_pi = None
    if with_log_prob:
        # SAC paper, appendix C: Gaussian log-likelihood minus the tanh
        # change-of-variables term, written in its softplus form.
        gaussian = pi_distribution.log_prob(pre_squash).sum(dim=-1)
        tanh_term = (2 * (np.log(2) - pre_squash
                          - F.softplus(-2 * pre_squash))).sum(dim=-1)
        log_pro_pi = gaussian - tanh_term

    return self.act_limit * torch.tanh(pre_squash), log_pro_pi
def forward(self, obs, deterministic=False, with_logprob=True):
    """Actor forward pass: tanh-squashed Gaussian policy.

    Args:
        obs: observation tensor fed through the two actor layers.
        deterministic: when true use the Gaussian mean instead of a
            reparameterized sample.
        with_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(action, logp_prob)``; ``action`` is ``self.act_limit * tanh(u)``
        and ``logp_prob`` is ``None`` when ``with_logprob`` is false.
    """
    # Actor
    hidden = F.relu(self.actor_layer1(obs))
    hidden = F.relu(self.actor_layer2(hidden))

    mu = self.mu_layer(hidden)
    log_std = torch.clamp(
        self.log_std_layer(hidden), self.log_std_min, self.log_std_max)
    distribution = Normal(mu, torch.exp(log_std))

    action = mu if deterministic else distribution.rsample()

    # Log-probability as in OpenAI Spinning Up's SAC implementation.
    if with_logprob:
        # Both reductions use the last axis so unbatched observations work
        # too (the correction previously used axis=1).
        logp_prob = distribution.log_prob(action).sum(axis=-1)
        logp_prob -= (2 * (np.log(2) - action
                           - F.softplus(-2 * action))).sum(axis=-1)
    else:
        logp_prob = None

    action = self.act_limit * torch.tanh(action)
    return action, logp_prob
def forward(self, x, deterministic=False, repara_trick=False,
            with_logprob=True):
    """Compute an action for a single observation and return it as numpy.

    Args:
        x: array-like with a ``reshape`` method; flattened to one row and
            moved to ``self.args.device``.
        deterministic: when true return the action derived from the mean.
        repara_trick: when sampling, use ``rsample`` instead of ``sample``.
        with_logprob: accepted for interface compatibility. No
            log-probability is returned, so none is computed.

    Returns:
        1-D numpy array: ``self.max_action * tanh(u)`` where ``u`` is the
        mean or a sample of ``Normal(mu, std)``.
    """
    x = torch.FloatTensor(x.reshape(1, -1)).to(self.args.device)

    hidden = F.relu(self.l1(x))
    # The log-std head branches off after the first layer.
    std = torch.exp(self.log_std_layer(hidden))

    hidden = F.relu(self.l2(hidden))
    hidden = F.relu(self.l2_additional(hidden))
    # NOTE(review): mu is already squashed and scaled here, and the chosen
    # action is squashed and scaled again below. Kept as-is because existing
    # checkpoints depend on it; confirm whether this is intended.
    mu = self.max_action * torch.tanh(self.l3(hidden))

    pi_distribution = Normal(mu, std)
    if deterministic:
        pi_action = mu
    elif repara_trick:
        pi_action = pi_distribution.rsample()
    else:
        pi_action = pi_distribution.sample()

    # The log-probability used to be computed here and then thrown away;
    # that dead computation has been dropped.
    pi_action = self.max_action * torch.tanh(pi_action)
    return pi_action.cpu().data.numpy().flatten()
def forward(self, obs, batch_size, deterministic=False, with_logprob=True):
    """Tanh-squashed Gaussian policy whose mean comes from the
    encoder -> snn -> decoder pipeline.

    :param obs: observation
    :param batch_size: batch size, forwarded to ``self.encoder`` and
        ``self.snn``
    :param deterministic: if true use the deterministic action (the mean);
        only used for evaluating the policy at test time
    :param with_logprob: if true also return the log-probability
    :return: ``(pi_action, logp_pi)``; the action is scaled by
        ``self.act_limit`` and ``logp_pi`` is ``None`` when not requested
    """
    in_pop_spikes = self.encoder(obs, batch_size)
    out_pop_activity = self.snn(in_pop_spikes, batch_size)
    mu = self.decoder(out_pop_activity)

    # The log-std is predicted by a separate network, directly from obs.
    log_std = torch.clamp(self.log_std_network(obs), LOG_STD_MIN, LOG_STD_MAX)
    std = torch.exp(log_std)

    # Pre-squash distribution and sample.
    pi_distribution = Normal(mu, std)
    pi_action = mu if deterministic else pi_distribution.rsample()

    if with_logprob:
        # Both reductions use the last axis (the correction previously used
        # axis=1, which only works for 2-D batches).
        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        # 2 * (log 2 - u - softplus(-2u)) equals log(1 - tanh(u)^2).
        logp_pi -= (2 * (np.log(2) - pi_action
                         - F.softplus(-2 * pi_action))).sum(axis=-1)
    else:
        logp_pi = None

    pi_action = self.act_limit * torch.tanh(pi_action)
    return pi_action, logp_pi
def forward(self, obs, deterministic=False, with_logprob=True):
    """Tanh-squashed Gaussian policy head.

    Args:
        obs: observation tensor fed to ``self.net``.
        deterministic: when true use the Gaussian mean instead of a
            reparameterized sample (only used for evaluating the policy at
            test time).
        with_logprob: when true also return the log-probability of the
            squashed action.

    Returns:
        ``(pi_action, logp_pi)``; ``pi_action`` is
        ``self.act_limit * tanh(u)`` and ``logp_pi`` is ``None`` when
        ``with_logprob`` is false.
    """
    net_out = self.net(obs)
    mu = self.mu_layer(net_out)
    log_std = self.log_std_layer(net_out)
    log_std = torch.clamp(log_std, LOG_STD_MIN, LOG_STD_MAX)
    std = torch.exp(log_std)

    # Pre-squash distribution and sample.
    pi_distribution = Normal(mu, std)
    pi_action = mu if deterministic else pi_distribution.rsample()

    logp_pi = None
    if with_logprob:
        # Compute logprob from the Gaussian, then apply the correction for
        # tanh squashing. The correction comes from the SAC paper
        # (arXiv 1801.01290, appendix C); this softplus form is a more
        # numerically stable equivalent of its Eq. 21.
        # Both reductions use the last axis so unbatched observations work
        # as well (the correction previously used axis=1).
        logp_pi = pi_distribution.log_prob(pi_action).sum(axis=-1)
        logp_pi -= (2 * (np.log(2) - pi_action
                         - F.softplus(-2 * pi_action))).sum(axis=-1)

    pi_action = torch.tanh(pi_action)
    pi_action = self.act_limit * pi_action
    return pi_action, logp_pi
def getProposalDistribution(i, x):
    """Propose a value of phi for item ``i``.

    With probability 0.5, or whenever any entry of ``n_observations[i]`` is
    zero, the proposal is the first output of the recognition model ``r(x)``.
    Otherwise a local proposal is drawn from a narrow Normal (scale 0.01)
    centred on the training input with the highest GP posterior mean.
    """
    use_recognition = random.random() < 0.5 or any(n_observations[i] == 0)
    if use_recognition:
        # Propose using the recognition model.
        phi, _ = r(x)
        return phi

    # Local proposal around the best point seen so far.
    model, train_x, _train_y = getGPModel(i)
    model.eval()
    likelihood.eval()

    v = Variable(train_x, requires_grad=True)
    posterior_mean = model(v).mean
    _, best_idx = torch.max(posterior_mean, dim=1)

    # Flatten the first two dims so that one row per group can be picked
    # with a flat index: group offset + argmax within the group.
    v_unrolled = v.data.view(-1, *v.size()[2:])
    group_offsets = torch.arange(0, len(v_unrolled), v.size(1)).cuda()
    best_phi = v_unrolled[group_offsets + best_idx]

    # A gradient step on the mean was tried here at some point and is
    # disabled; the fixed "learning rate" now only serves as the scale.
    lr = 0.01
    return Normal(best_phi, lr).rsample()
def forward(self, z, x=None):
    """Sample (or score) ``x`` under a Normal whose mean is a linear
    combination of ``self.inputs`` weighted by ``z``.

    Args:
        z: coefficients; ``z[:, k]`` weights row ``k`` of ``self.inputs``.
        x: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(x, score)`` with ``score`` the log-density summed over dim 1.
    """
    weighted = z[:, :, None] * self.inputs[None, :, :]
    dist = Normal(weighted.sum(dim=1), self.sigma)
    value = dist.rsample() if x is None else x
    return value, dist.log_prob(value).sum(dim=1)
def sample_prior(self, shape=torch.Size([]), store=False):
    """Draw ``s`` from the two-level prior.

    A location is sampled from ``Normal(mu_loc_prior, mu_scale_prior)``
    (one entry per ``dim_in``); ``s`` is then sampled from a ``LogitNormal``
    with that location and scale ``s_scale_prior``.

    Args:
        shape: sample shape passed to both ``rsample`` calls.
        store: when true the sample is also kept on ``self.s``.

    Returns:
        The sampled ``s``.
    """
    ones = torch.ones(self.dim_in)
    p_mu = Normal(self.mu_loc_prior * ones, self.mu_scale_prior * ones)
    # NOTE(review): `shape` is applied to the location draw and again to the
    # draw of s below — confirm the resulting sample shape is intended.
    location = p_mu.rsample(shape)
    p_s = LogitNormal(loc=location, scale=self.s_scale_prior * ones)
    s = p_s.rsample(shape)
    if store:
        self.s = s
    return s
def get_action(self, state):
    """Sample a tanh-squashed action from the actor's Gaussian.

    Args:
        state: input forwarded to ``self.actor``, which returns ``(mu, std)``.

    Returns:
        ``(a, log_prob)``: the action in (-1, 1) and its log-probability
        summed over the last axis (``keepdim=True``).
    """
    mu, std = self.actor(state)
    gaussian = Normal(mu, std)
    pre_squash = gaussian.rsample()
    a = torch.tanh(pre_squash)
    # Change of variables for tanh; 1e-3 keeps the log argument positive.
    per_dim = gaussian.log_prob(pre_squash) - torch.log(1 - a.square() + 1e-3)
    return a, per_dim.sum(-1, keepdim=True)
def forward(self, x):
    """Encode ``x`` into a Normal and draw a reparameterized sample.

    Returns:
        ``(z, dist, mu)``: the sample, the ``Normal`` it came from (scale is
        the softplus of ``self.sigma``'s output), and the mean.
    """
    features = self.model(x)
    mu = self.mu(features)
    # softplus keeps the scale strictly positive.
    scale = nn.functional.softplus(self.sigma(features))
    dist = Normal(mu, scale)
    return dist.rsample(), dist, mu
def get_action(self, belief, state, det=False):
    """Pick an action from the tanh-squashed Gaussian policy.

    Args:
        belief: first input forwarded to ``self.forward``.
        state: second input forwarded to ``self.forward``.
        det: when true return ``dist.mode()``, otherwise a reparameterized
            sample.

    Returns:
        The selected action.
    """
    action_mean, action_std = self.forward(belief, state)
    # Normal -> tanh transform -> last dim treated as the event -> SampleDist.
    squashed = TransformedDistribution(
        Normal(action_mean, action_std), TanhBijector())
    policy = SampleDist(torch.distributions.Independent(squashed, 1))
    return policy.mode() if det else policy.rsample()
def forward(self, inputs, c, z=None):
    """Sample (or score) the latent ``z`` from image inputs.

    Args:
        inputs: tensor viewed as ``(-1, 1, 28, 28)``.
        c: not used by this method.
        z: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(z, score)`` where ``score`` is the log-density with dim 1 summed
        out three times in a row.
    """
    images = inputs.view(-1, 1, 28, 28)
    dist = Normal(self.localization_mu(images), self.localization_sigma(images))
    if z is None:
        z = dist.rsample()
    score = dist.log_prob(z)
    # Collapse the three non-batch dimensions one at a time.
    for _ in range(3):
        score = score.sum(dim=1)
    return z, score
def forward(self, inputs, c=None):
    """Sample (or score) ``c`` from the mean embedding of a set of inputs.

    Every slice along dim 1 of ``inputs`` is encoded with ``self.enc``; the
    embeddings are averaged and mapped to the Normal's mean and scale.

    Args:
        inputs: tensor laid out as batch * |D| * ...
        c: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(c, score)`` with ``score`` the log-density summed over dim 1.
    """
    # Iterating the transposed tensor walks over the |D| set elements.
    embeddings = []
    for element in inputs.transpose(0, 1):  # |D| * batch * ...
        embeddings.append(self.enc(element))
    mean_embedding = sum(embeddings) / len(embeddings)

    dist = Normal(self.mu_c(mean_embedding), self.sigma_c(mean_embedding))
    value = dist.rsample() if c is None else c
    return value, dist.log_prob(value).sum(dim=1)  # Return value, score
def forward(self, inputs, c=None):
    """Sample (or score) ``c`` from the averaged embedding of a set of inputs.

    Each element ``inputs[:, i]`` is passed through ``self.stn`` and then
    ``self.conv_post_stn``; the embeddings are averaged and mapped to the
    Normal's mean and scale.

    Args:
        inputs: 5-D tensor indexed as ``inputs[:, i, :, :, :]``.
        c: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(c, score)`` where ``score`` is the log-density with dim 1 summed
        out three times in a row.
    """
    # Transform every set element first, then embed the transformed ones.
    transformed = []
    for i in range(inputs.size(1)):
        transformed.append(self.stn(inputs[:, i, :, :, :]))
    embs = [self.conv_post_stn(t) for t in transformed]
    emb = sum(embs) / len(embs)

    dist = Normal(self.conv_mu(emb), self.conv_sigma(emb))
    if c is None:
        c = dist.rsample()

    score = dist.log_prob(c)
    for _ in range(3):
        score = score.sum(dim=1)
    return c, score
def forward(self, inputs, c, z=None):
    """Sample (or score) ``z`` from the first element of each input set.

    Args:
        inputs: tensor; only ``inputs[:, 0]`` is used.
        c: not used by this method.
        z: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(z, score)`` with ``score`` the log-density summed over dim 1.
    """
    first = inputs[:, 0]
    dist = Normal(self.mu_z(first), self.sigma_z(inputs[:, 0]))
    value = dist.rsample() if z is None else z
    return value, dist.log_prob(value).sum(dim=1)  # Return value, score
def forward(self, c, z, x=None):
    """Sample (or score) ``x`` conditioned on ``c`` and ``z``.

    ``c`` and ``z`` are concatenated along dim 1 and mapped by ``self.mu``
    and ``self.sigma`` to the parameters of a Normal.

    Args:
        c: first conditioning tensor.
        z: second conditioning tensor.
        x: value to score; when ``None`` a reparameterized sample is drawn.

    Returns:
        ``(x, score)`` with ``score`` the log-density summed over dim 1.
    """
    conditioning = torch.cat([c, z], dim=1)
    loc = self.mu(conditioning)
    scale = self.sigma(conditioning)
    dist = Normal(loc, scale)
    value = dist.rsample() if x is None else x
    return value, dist.log_prob(value).sum(dim=1)  # Return value, score