def sample_conditional_a(self, resid_image, var_so_far, pixel_1d):
    is_on = (pixel_1d < (self.n_discrete_latent - 1)).float()

    # pass through galaxy encoder
    pixel_2d = self.one_galaxy_vae.pixel_1d_to_2d(pixel_1d)
    z_mean, z_var = self.one_galaxy_vae.enc(resid_image, pixel_2d)

    # sample z
    q_z = Normal(z_mean, z_var.sqrt())
    z_sample = q_z.rsample()

    # kl term for continuous latent vars
    log_q_z = q_z.log_prob(z_sample).sum(1)
    p_z = Normal(torch.zeros_like(z_sample), torch.ones_like(z_sample))
    log_p_z = p_z.log_prob(z_sample).sum(1)
    kl_z = is_on * (log_q_z - log_p_z)

    # run through decoder
    recon_mean, recon_var = self.one_galaxy_vae.dec(is_on, pixel_2d, z_sample)

    # NOTE: we will have to update the recon means once we do more detections
    # recon_means = recon_mean + image_so_far
    # recon_vars = recon_var + var_so_far

    return recon_mean, recon_var, is_on, kl_z
def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Expected Improvement on the candidate set X.

    Args:
        X: A `b1 x ... bk x 1 x d`-dim batched tensor of `d`-dim design
            points. Expected Improvement is computed for each point
            individually, i.e., what is considered are the marginal
            posteriors, not the joint.

    Returns:
        A `b1 x ... bk`-dim tensor of Expected Improvement values at the
        given design points `X`.
    """
    self.best_f = self.best_f.to(X)
    posterior = self.model.posterior(X)
    self._validate_single_output_posterior(posterior)
    mean = posterior.mean
    # deal with batch evaluation and broadcasting
    view_shape = mean.shape[:-2] if mean.dim() >= X.dim() else X.shape[:-2]
    mean = mean.view(view_shape)
    sigma = posterior.variance.clamp_min(1e-9).sqrt().view(view_shape)
    u = (mean - self.best_f.expand_as(mean)) / sigma
    if not self.maximize:
        u = -u
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    ucdf = normal.cdf(u)
    updf = torch.exp(normal.log_prob(u))
    ei = sigma * (updf + u * ucdf)
    return ei
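# --- Usage sketch (assumption: illustrative only, not part of the original API).
# The method above implements the closed form EI(x) = sigma * (phi(u) + u * Phi(u))
# with u = (mu - f*) / sigma. A standalone version for plain tensors:
import torch
from torch.distributions import Normal

def expected_improvement(mean: torch.Tensor, sigma: torch.Tensor, best_f: float) -> torch.Tensor:
    u = (mean - best_f) / sigma
    normal = Normal(torch.zeros_like(u), torch.ones_like(u))
    return sigma * (torch.exp(normal.log_prob(u)) + u * normal.cdf(u))

# ei = expected_improvement(torch.tensor([0.2, 0.5]), torch.tensor([0.1, 0.3]), best_f=0.4)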
def forward(self, X: Tensor) -> Tensor:
    r"""Evaluate Constrained Expected Improvement on the candidate set X.

    Args:
        X: A `(b) x 1 x d`-dim Tensor of `(b)` t-batches of `d`-dim design
            points each.

    Returns:
        A `(b)`-dim Tensor of Expected Improvement values at the given
        design points `X`.
    """
    posterior = self.model.posterior(X)
    means = posterior.mean.squeeze(dim=-2)  # (b) x t
    sigmas = posterior.variance.squeeze(dim=-2).sqrt().clamp_min(1e-9)  # (b) x t
    # (b) x 1
    mean_obj = means[..., [self.objective_index]]
    sigma_obj = sigmas[..., [self.objective_index]]
    u = (mean_obj - self.best_f.expand_as(mean_obj)) / sigma_obj
    if not self.maximize:
        u = -u
    normal = Normal(
        torch.zeros(1, device=u.device, dtype=u.dtype),
        torch.ones(1, device=u.device, dtype=u.dtype),
    )
    ei_pdf = torch.exp(normal.log_prob(u))  # (b) x 1
    ei_cdf = normal.cdf(u)
    ei = sigma_obj * (ei_pdf + u * ei_cdf)
    prob_feas = self._compute_prob_feas(X=X, means=means, sigmas=sigmas)
    ei = ei.mul(prob_feas)
    return ei.squeeze(dim=-1)
def forward(self, state: torch.Tensor,
            rsample: bool = True) -> Tuple[torch.Tensor, torch.Tensor]:
    nn_out = self.nn(state)
    means = nn_out[:, self.action_dim:]
    logstds = nn_out[:, :self.action_dim]
    stds = logstds.exp()
    action_dist = Normal(means, stds)
    action = action_dist.rsample() if rsample else means
    # Stabler version of the tanh log-probability correction
    # Reference: Spinning Up implementation of SAC
    logprobs = action_dist.log_prob(action).sum(-1, keepdim=True)
    logprobs -= (2 * (np.log(2) - action - F.softplus(-2 * action))).sum(
        -1, keepdim=True)
    action = torch.tanh(action)
    return action, logprobs
def log_prob(self, x, context, should_sum=True, feedback=None):
    mean, std = self.cond_dist_params(context, feedback=feedback)
    dist = Normal(torch.zeros(mean.shape, device=mean.device),
                  torch.ones(std.shape, device=std.device))
    adjusted_x = (x - mean) / std
    adjusted_a = (0 - mean) / std
    log_gx = dist.log_prob(adjusted_x)
    log_c = ((1 - dist.cdf(adjusted_a)) * std).log()
    log_prob = log_gx - log_c
    # return sum_except_batch(dist.log_prob((x - mean).abs()))
    '''
    # Folded normal distribution
    mean, std = self.cond_dist_params(context)
    dist1 = Normal(mean, std)
    dist2 = Normal(-mean, std)
    log_prob = (dist1.log_prob(x).exp() + dist2.log_prob(x).exp()).log()
    '''
    if should_sum:
        return sum_except_batch(log_prob)
    else:
        return log_prob
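# --- Sanity-check sketch (assumption: illustrative, not from the original repo).
# The expression log_gx - log_c above is the log-density of a normal truncated
# to [0, inf): phi((x - mu)/sigma) / (sigma * (1 - Phi(-mu/sigma))).
# A quick numerical check that it integrates to ~1 on [0, inf):
import torch
from torch.distributions import Normal

mu, sigma = torch.tensor(0.3), torch.tensor(0.8)
std_normal = Normal(torch.zeros(()), torch.ones(()))
x = torch.linspace(0.0, 10.0, 100001)
log_gx = std_normal.log_prob((x - mu) / sigma)
log_c = ((1 - std_normal.cdf((0 - mu) / sigma)) * sigma).log()
density = (log_gx - log_c).exp()
print(torch.trapz(density, x))  # ~= 1.0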
def UsSs(self, hessian_approx, X=None, y=None):
    arg_check(hessian_approx)
    X, y = (torch.from_numpy(X) if X is not None else self.Xt,
            torch.from_numpy(y) if y is not None else self.y)
    Us = list()
    Ss = np.ones(len(y))
    for xi, yi in zip(X, y):
        self.model.zero_grad()
        output = self.model.forward(xi)
        if hessian_approx == 'g':
            likelihood = Normal(output.flatten(), self.sigma_noise)
            loss = -likelihood.log_prob(yi)
            loss.backward()
            Us.append(self.model.gradient)
        elif hessian_approx == 'J':
            output.backward()
            Us.append(self.model.gradient)
        elif hessian_approx == 'H':
            raise NotImplementedError
    return np.stack(Us), (Ss if hessian_approx == 'g' else self.bn * Ss)
def sample(self, obs, prev_acts, rnn_hidden_states, available_actions=None):
    # TODO: review this method
    means, log_stds, h_outs = self.forward(obs, prev_acts, rnn_hidden_states)
    stds = log_stds.exp()
    normal = Normal(means, stds)
    x_t = normal.rsample()
    y_t = torch.tanh(x_t)
    sampled_actions = y_t * self.action_scale + self.action_bias
    log_probs = normal.log_prob(x_t)
    log_probs -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
    log_probs = log_probs.sum(2, keepdim=True)
    means = torch.tanh(means) * self.action_scale + self.action_bias
    return sampled_actions, log_probs, means, h_outs
def forward(self, state: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
    x = F.relu(self.hidden1(state))
    x = F.relu(self.hidden2(x))

    mu = self.mu_layer(x).tanh()
    log_std = self.log_std_layer(x).tanh()
    log_std = self.log_std_min + 0.5 * (
        self.log_std_max - self.log_std_min
    ) * (log_std + 1)
    std = torch.exp(log_std)

    dist = Normal(mu, std)
    z = dist.rsample()

    action = z.tanh()
    log_prob = dist.log_prob(z) - torch.log(1 - action.pow(2) + 1e-7)
    log_prob = log_prob.sum(-1, keepdim=True)

    return action, log_prob
def sample(self, mean: t.Tensor, action=None):
    """
    You must call this function to sample an action and its log probability
    during forward().

    Args:
        mean: Mean tensor of shape ``[batch, action_num]`` produced by the
            policy network; used as the location of the Gaussian.
        action: The action to be evaluated. Set to ``None`` if you are
            sampling a new batch of actions.

    Returns:
        Action tensor of shape ``[batch, action_dim]``,
        Action log probability tensor of shape ``[batch, 1]``.
    """
    self.action_param = mean
    dist = Normal(loc=mean, scale=t.exp(self.action_log_std))
    if action is None:
        action = dist.sample()
    return action, dist.log_prob(action).sum(dim=1, keepdim=True)
class TanhNormal(Distribution):
    def __init__(self, loc, scale):
        super().__init__()
        self.normal = Normal(loc, scale)

    def sample(self):
        return torch.tanh(self.normal.sample())

    def rsample(self):
        return torch.tanh(self.normal.rsample())

    # Calculates log probability of value using the change-of-variables
    # technique (uses log1p = log(1 + x) for extra numerical stability)
    def log_prob(self, value):
        inv_value = (torch.log1p(value) - torch.log1p(-value)) / 2  # artanh(y)
        # log p(f^-1(y)) + log |det(J(f^-1(y)))|
        return self.normal.log_prob(inv_value) - torch.log1p(
            -value.pow(2) + 1e-6)

    @property
    def mean(self):
        return torch.tanh(self.normal.mean)
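# --- Usage sketch (assumption: illustrative, not part of the original code).
# Numerically verify TanhNormal.log_prob against the analytic change of variables:
import torch
from torch.distributions import Normal

dist = TanhNormal(loc=torch.zeros(3), scale=torch.ones(3))
y = dist.sample()  # values in (-1, 1)
lp = dist.log_prob(y)
# change of variables by hand: log N(artanh(y)) - log(1 - y^2)
x = torch.atanh(y)
manual = Normal(torch.zeros(3), torch.ones(3)).log_prob(x) - torch.log(1 - y.pow(2))
print(torch.allclose(lp, manual, atol=1e-4))  # True, up to the 1e-6 stabilizer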
def log_prob_for_single_animal(self, inputs, **kwargs):
    """
    calculate the log prob for inputs[1:] based on inputs[:-1]
    :param inputs: (T, 2)
    :param kwargs:
    :return: (T-1, K)
    """
    T, d = inputs.shape
    assert d == 2, d

    # get the mu and cov based on the observations except the last one
    mu, cov = self.get_mu_and_cov_for_single_animal(inputs[:-1], **kwargs)
    # mean: (T-1, K, 2), covariance: (T-1, K, 2)
    m = Normal(mu, torch.sqrt(cov))

    # evaluate the observations except the first one. (T-1, 1, 2)
    log_prob = m.log_prob(inputs[1:, None])  # (T-1, K, 2)
    log_prob = torch.sum(log_prob, dim=-1)
    assert log_prob.shape == (T - 1, self.K), log_prob.shape
    return log_prob
class TanhNormal(Distribution):
    def __init__(self, normal_mean, normal_std):
        super().__init__()
        self.normal_mean = normal_mean
        self.normal_std = normal_std
        self.standard_normal = Normal(
            torch.zeros_like(self.normal_mean, device=DEVICE),
            torch.ones_like(self.normal_std, device=DEVICE))
        self.normal = Normal(normal_mean, normal_std)

    def log_prob(self, pre_tanh):
        # log(1 - tanh(x)^2) = 2*log(2) + logsigmoid(2x) + logsigmoid(-2x)
        log_det = 2 * np.log(2) + logsigmoid(2 * pre_tanh) + logsigmoid(
            -2 * pre_tanh)
        result = self.normal.log_prob(pre_tanh) - log_det
        return result

    def rsample(self):
        # reparameterization trick
        pretanh = self.normal_mean + \
            self.normal_std * self.standard_normal.sample()
        return torch.tanh(pretanh), pretanh
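# --- Verification sketch (assumption: illustrative only).
# Check the stable log-determinant identity used in log_prob above:
# log(1 - tanh(x)^2) == 2*log(2) + logsigmoid(2x) + logsigmoid(-2x)
import numpy as np
import torch
from torch.nn.functional import logsigmoid

x = torch.linspace(-5, 5, 101)
stable = 2 * np.log(2) + logsigmoid(2 * x) + logsigmoid(-2 * x)
naive = torch.log(1 - torch.tanh(x).pow(2))
print(torch.allclose(stable, naive, atol=1e-6))  # True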
def sample_with_logp(self, x):
    mu, log_std = self.forward(x)
    std = torch.exp(log_std)
    # print("mu", mu)
    # print("log_std", log_std)
    # print("std", std)
    normal = Normal(mu, std)
    x_t = normal.rsample()
    # print("x_t", x_t)
    logp = normal.log_prob(x_t)
    # print("logp 1", logp)
    y_t = torch.tanh(x_t)
    logp -= torch.log(1 - torch.pow(y_t, 2) + 1e-6)
    # print("y_t", y_t)
    # print("logp 2", logp)
    return y_t, logp
def sample(self, state):
    '''
    :param state: (batch_num, state_dim)
    :return:
        action: (batch_num, action_dim, option_num)
        log_prob: (batch_num, option_num)
        mean_mat: (batch_num, action_dim, option_num)
    '''
    mean_mat, log_std_mat = self.forward(state)
    std_mat = log_std_mat.exp()
    normal = Normal(mean_mat, std_mat)
    x_t = normal.rsample()  # reparameterization trick (mean + std * N(0, 1))
    y_t = torch.tanh(x_t)
    action = y_t * self.action_scale + self.action_bias
    log_prob = normal.log_prob(x_t)  # log(pi(at|st))
    # Enforce the action bound: tanh squashes the Gaussian support from
    # (-inf, inf) to (-1, 1), so the density must be corrected accordingly
    log_prob -= torch.log(self.action_scale * (1 - y_t.pow(2)) + epsilon)
    log_prob = log_prob.sum(1, keepdim=True)
    mean_mat = torch.tanh(mean_mat) * self.action_scale + self.action_bias
    return action, log_prob, mean_mat
def fwd_step(self, z):
    z = self.cast(z)
    z_ω = self.flt_ω(z)  # ω only observes PD params
    (μ_ω, σ_ω) = self.ω(z_ω)
    dist_ω = Normal(μ_ω, σ_ω)
    a_ω = dist_ω.rsample()
    z_ = self.env.step_batch(z, a_ω.detach())  # ω-step
    z_lst = [z_]
    for t in range(1, self.n):
        # n state propagations with no PD update
        z_ = self.env.step_batch(z_, torch.zeros_like(a_ω))
        z_lst.append(z_)
    z_q = self.flt_q(z, z_lst)  # q does not observe PD_t+n
    (μ_q, σ_q) = self.q(z_q)
    dist_q = Normal(μ_q, σ_q)
    self.it += 1
    return (dist_q.log_prob(a_ω) - dist_ω.log_prob(a_ω)).sum(-1)
def mix_gaussian_loss_1d(x, l):
    """
    log-likelihood for a mixture of continuous Gaussians; assumes the data
    has been rescaled to the [-1, 1] interval

    Args:
        x (Tensor): Target (B x D x D x 1) (B batch size, D dimensions,
            1 channel (B/W image))
        l (Tensor): Predictive distribution, (B x D x D x 3*nr_mix)
            (B batch size, D dimensions, 3 * number of mixture components
            channels)

    Returns:
        Tensor: loss
    """
    x = x.permute(0, 2, 3, 1)
    l = l.permute(0, 2, 3, 1)
    xs = [int(y) for y in x.size()]
    ls = [int(y) for y in l.size()]

    # unpack the params of the mixture of Gaussians
    nr_mix = int(ls[-1] / 3)
    logit_probs = l[:, :, :, :nr_mix]
    means = l[:, :, :, nr_mix:2 * nr_mix]
    log_scales = torch.clamp(l[:, :, :, 2 * nr_mix:3 * nr_mix], min=-7.)

    x = x.expand_as(means)
    centered_x = x - means
    dist = Normal(loc=0., scale=torch.exp(log_scales))
    # do we need to add a trick to avoid log(0)?
    log_probs = dist.log_prob(centered_x)
    if nr_mix > 1:
        log_probs = log_probs + F.log_softmax(logit_probs, -1)
    # with nr_mix == 1 the log_softmax term is zero, so both cases reduce to:
    return -log_sum_exp(log_probs)
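# --- Cross-check sketch (assumption: illustrative; MixtureSameFamily needs
# --- PyTorch >= 1.4). The unpack-and-logsumexp computation above can be
# --- cross-checked against torch.distributions.MixtureSameFamily on a single pixel:
import torch
from torch.distributions import Categorical, MixtureSameFamily, Normal

logit_probs = torch.randn(5)  # mixture logits for nr_mix = 5
means = torch.randn(5)
log_scales = torch.randn(5).clamp(min=-7.0)
mix = MixtureSameFamily(Categorical(logits=logit_probs),
                        Normal(means, log_scales.exp()))
x = torch.tensor(0.3)
manual = torch.logsumexp(
    Normal(means, log_scales.exp()).log_prob(x)
    + torch.log_softmax(logit_probs, -1), dim=-1)
print(torch.allclose(mix.log_prob(x), manual))  # True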
def update(self, optim, trajectory):
    states = torch.stack(trajectory["states"]).float()
    actions = torch.stack(trajectory["actions"]).float()
    next_states = torch.stack(trajectory["next_states"]).float()
    beta_log_probs = torch.stack(trajectory["log_probs"]).float()
    rewards = torch.stack(trajectory["rewards"]).float()
    values = torch.stack(trajectory["values"]).float()
    masks = torch.stack(trajectory["dones"])

    returns = self.__Tensor(rewards.size(0), 1)
    deltas = self.__Tensor(rewards.size(0), 1)
    advantages = self.__Tensor(rewards.size(0), 1)

    prev_return = 0
    prev_value = 0
    prev_advantage = 0
    for i in reversed(range(rewards.size(0))):
        if masks[i] == 0:
            next_action, _, v_next = self.select_action(next_states[i])
            state_action = torch.cat([next_states[i], next_action], dim=0)
            state_inf = next_states[i] + self.__dynamics(state_action)
            _, _, v_inf = self.select_action(state_inf)
            v_fin = v_next.detach() + self.__gamma * v_inf.detach()
        else:
            v_fin = 0
        returns[i] = rewards[i] + self.__gamma * (prev_return * masks[i] + v_fin)
        deltas[i] = rewards[i] + self.__gamma * (prev_value * masks[i] + v_fin) - values.data[i]
        advantages[i] = deltas[i] + self.__gamma * self.__lmbd * prev_advantage * masks[i]
        prev_return = returns[i, 0]
        prev_value = values.data[i, 0]
        prev_advantage = advantages[i, 0]

    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-10)
    returns = (returns - returns.mean()) / (returns.std() + 1e-10)

    mu_pi, logvar_pi, _ = self.__pi(states)
    dist_pi = Normal(mu_pi, logvar_pi.exp().sqrt())
    pi_log_probs = dist_pi.log_prob(actions)
    ratio = (pi_log_probs - beta_log_probs.detach()).sum(dim=1, keepdim=True).exp()

    optim.zero_grad()
    actor_loss = -torch.min(
        ratio * advantages,
        torch.clamp(ratio, 1 - self.__eps, 1 + self.__eps) * advantages).mean()
    critic_loss = F.smooth_l1_loss(values, returns)
    loss = actor_loss + critic_loss
    loss.backward(retain_graph=True)
    optim.step()
class SigmoidNormal(Distribution):
    """
    Represent distribution of X where
        X ~ sigmoid(Z)
        Z ~ N(mean, std)

    Note: this is not very numerically stable.
    """
    def __init__(self, normal_mean, normal_std, epsilon=1e-6):
        """
        :param normal_mean: Mean of the normal distribution
        :param normal_std: Std of the normal distribution
        :param epsilon: Numerical stability epsilon when computing log-prob.
        """
        super(SigmoidNormal, self).__init__()
        self.normal = Normal(normal_mean, normal_std)
        self.epsilon = epsilon

    def sample_n(self, n, return_pre_sigmoid_value=False):
        z = self.normal.sample_n(n)
        if return_pre_sigmoid_value:
            return torch.sigmoid(z), z
        else:
            return torch.sigmoid(z)

    def log_prob(self, value, pre_sigmoid_value=None):
        """
        :param value: some value, x
        :param pre_sigmoid_value: logit(x), i.e. the inverse sigmoid of x
        :return:
        """
        if pre_sigmoid_value is None:
            pre_sigmoid_value = torch.log(value / (1 - value))
        return self.normal.log_prob(pre_sigmoid_value) - torch.log(
            value * (1 - value) + self.epsilon)

    def sample(self, return_pre_sigmoid_value=False):
        z = self.normal.sample()
        if return_pre_sigmoid_value:
            return torch.sigmoid(z), z
        else:
            return torch.sigmoid(z)
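# --- Usage sketch (assumption: illustrative only).
# Check SigmoidNormal.log_prob against the change-of-variables density
# p(x) = N(logit(x); mean, std) / (x * (1 - x)):
import torch
from torch.distributions import Normal

dist = SigmoidNormal(normal_mean=torch.zeros(4), normal_std=torch.ones(4))
x, z = dist.sample(return_pre_sigmoid_value=True)
manual = Normal(torch.zeros(4), torch.ones(4)).log_prob(z) - torch.log(x * (1 - x))
print(torch.allclose(dist.log_prob(x, pre_sigmoid_value=z), manual, atol=1e-4))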
def log_px_z(self, tensors, z):
    """
    Usage reserved for Annealed Importance Sampling
    """
    n_latent = z.shape[-1]
    z_prior = Normal(torch.zeros(n_latent, device=z.device),
                     torch.ones(n_latent, device=z.device))
    log_pz = z_prior.log_prob(z).sum(-1)

    x = tensors[0]
    px_mean, px_var, qz_m, qz_v, _, log_qz_given_x = self.inference(
        x, n_samples=1, reparam=True)
    # The following step is required to be consistent with the method below
    z = z.unsqueeze(0)
    _, log_pxz, _ = self.log_ratio(x, px_mean, px_var, log_qz_given_x, z,
                                   return_full=True)
    log_pxz = log_pxz.squeeze()
    return log_pxz - log_pz
def forward(self, state, action=None):
    x = state
    x = self.actor_bn(x)
    for l in self.actor_linears:
        x = l(x)
        x = self.relu(x)
    mu = self.tanh(self.mu(x))
    log_var = -self.relu(self.log_var(x))  # constrains log-variance to be <= 0
    sigmas = log_var.exp().sqrt()
    dists = Normal(mu, sigmas)
    if action is None:
        action = dists.sample()
    log_prob = dists.log_prob(action).sum(dim=-1, keepdim=True)

    x = state
    x = self.critic_bn(x)
    for l in self.critic_linears:
        x = l(x)
        x = self.relu(x)
    v = self.v(x)
    return action, log_prob, dists.entropy(), v
def _loss_function(self, x_context, y_context, x_target, y_target):
    """
    :param x_context: [batch, N_con, x_dim]
    :param y_context: [batch, N_con, y_dim]
    :param x_target: [batch, N_tar, x_dim]
    :param y_target: [batch, N_tar, y_dim]
    :return:
    """
    x_all = torch.cat([x_context, x_target], dim=-2)
    y_all = torch.cat([y_context, y_target], dim=-2)

    z_all_mu, z_all_sigma = self._calc_zparam(x_all, y_all)  # z_all [batch, z_dim]
    z_c_mu, z_c_sigma = self._calc_zparam(x_context, y_context)  # z_c [batch, z_dim]
    z_all_sample = self._sample_z(z_all_mu, z_all_sigma, self.N_z)  # [batch, N_z, z_dim]

    y_t_mu, y_t_sigma = self.model.Decoder(z_all_sample, x_target)  # y_t [batch, N_t, N_z, y_dim]
    y_normal = Normal(loc=y_t_mu, scale=y_t_sigma)
    y_target_exd = y_target.unsqueeze(dim=-2).expand(-1, -1, self.N_z, -1)  # to [batch, N_t, N_z, y_dim]
    # sum over N_t and y_dim, mean over N_z and batch
    loglik = -y_normal.log_prob(y_target_exd).sum(dim=[-1, -3]).mean()
    # sum over z_dim, mean over batch
    kldiv = NeuralProcessTrainer.calc_kldiv_gaussian(z_all_mu, z_all_sigma,
                                                     z_c_mu, z_c_sigma)
    return loglik, kldiv
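# --- Sketch of a possible calc_kldiv_gaussian (assumption: the actual helper
# --- in the repo may differ; this shows the standard diagonal-Gaussian KL
# --- with the reduction described in the comment above).
import torch
from torch.distributions import Normal, kl_divergence

def calc_kldiv_gaussian(mu_q, sigma_q, mu_p, sigma_p):
    # KL( N(mu_q, sigma_q) || N(mu_p, sigma_p) ), summed over z_dim,
    # averaged over the batch
    kl = kl_divergence(Normal(mu_q, sigma_q), Normal(mu_p, sigma_p))
    return kl.sum(dim=-1).mean()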
def get_action(self, inputs, std_scale=None, epsilon=1e-6, mean_pi=False,
               probs=False, entropy=False):
    mean, log_std = self(inputs)
    if mean_pi:
        return T.tanh(mean)
    std = log_std.exp()
    if std_scale is not None:
        std *= std_scale
    dist = Normal(mean, std)
    z = dist.rsample()
    action = T.tanh(z)
    if not probs:
        return action * self.action_scaling
    if action.shape == (self.action_dim,):
        action = action.reshape((1, self.action_dim))
    log_probs = (dist.log_prob(z) -
                 T.log(1 - action.pow(2) + epsilon)).sum(1, keepdim=True)
    if not entropy:
        return action * self.action_scaling, log_probs
    return action * self.action_scaling, log_probs, dist.entropy()
def get_probs(self, z, q):
    # Calculate log q(z|x)
    log_qz_x = q.log_prob(z).sum(dim=1)

    # Calculate log p(z)
    pz = Normal(loc=torch.zeros_like(z), scale=1)
    log_pz = pz.log_prob(z).sum(dim=1)

    # Calculate log q(z)
    batch_size = len(z)
    mat_log_qz = BtcvaeLoss.matrix_log_density_gaussian(z, q)
    if self.is_mss:
        log_iw_mat = BtcvaeLoss.log_importance_weight_matrix(
            batch_size, self.dataset_size)
        log_iw_mat = torch.unsqueeze(log_iw_mat, dim=-1).to(z.device)
        mat_log_qz = mat_log_qz + log_iw_mat
    log_qz = torch.logsumexp(mat_log_qz.sum(dim=2), dim=1, keepdim=False)
    log_prod_qzi = torch.logsumexp(mat_log_qz, dim=1, keepdim=False).sum(dim=1)

    return log_pz, log_qz, log_prod_qzi, log_qz_x
def _slow(mu, std, x):
    log_probs = []
    # Iterate over all density components
    for i in range(self.num_density):
        # Retrieve means and stds
        mu_i = mu[:, i, :]
        std_i = std[:, i, :]
        # Thresholding std: if std is 0, it leads to NaN loss.
        # std_i = torch.clamp(std_i, min=min_std, max=std_i.max().item())
        # Create Gaussian distribution
        dist = Normal(loc=mu_i, scale=std_i)
        # Calculate the log-probability
        logp = dist.log_prob(x)
        # Record the log probability for the current density
        log_probs.append(logp)
    # Stack log-probabilities with shape [N, K, D]
    log_probs = torch.stack(log_probs, dim=1)
    return log_probs
def select_action(self, state):
    # make the state a Tensor
    state = torch.from_numpy(state).float().unsqueeze(0)

    # get mean and std
    mean, std = self.policy(state)

    # create normal distribution
    normal = Normal(mean, std)

    # sample action
    action = normal.sample()

    # get log prob of that action
    ln_prob = normal.log_prob(action)
    ln_prob = ln_prob.sum()

    # squash action into [-1, 1]
    action = torch.tanh(action)

    # turn action into a numpy array
    action = action.numpy()

    return action[0], ln_prob  # , mean, std
def tile_map_prior(prior: ImagePrior, tile_map):
    # Source probabilities
    dist_sources = Poisson(torch.tensor(prior.mean_sources))
    log_prob_no_source = dist_sources.log_prob(torch.tensor(0))
    log_prob_one_source = dist_sources.log_prob(torch.tensor(1))
    log_prob_source = (tile_map["n_sources"] == 0) * log_prob_no_source + (
        tile_map["n_sources"] == 1) * log_prob_one_source

    # Binary probabilities
    galaxy_log_prob = torch.tensor(0.7).log()
    star_log_prob = torch.tensor(0.3).log()
    log_prob_binary = (galaxy_log_prob * tile_map["galaxy_bools"] +
                       star_log_prob * tile_map["star_bools"])

    # Galaxy probabilities
    gal_dist = Normal(0.0, 1.0)
    galaxy_probs = gal_dist.log_prob(
        tile_map["galaxy_params"]) * tile_map["galaxy_bools"]

    return log_prob_source.sum() + log_prob_binary.sum() + galaxy_probs.sum()
def select_action_continuous(state, policy: SimplePolicyContinuous,
                             training_info: TrainingInfo, env: gym.Env):
    """
    Given a policy which outputs a mean and a standard deviation, constructs
    a Normal distribution and returns an action sampled from it. This
    function also logs the entropy of the distribution and the log
    probability of the sampled action.
    """
    # Get distribution
    state = prepare_state(state)
    mu, sigma = policy.forward(state)

    # Sample action and remember its log probability
    n = Normal(mu, sigma)
    action = n.sample()
    action = tensor_clamp(action, env.action_space.low, env.action_space.high)

    # This is not very clean. TODO: clean this up
    training_info.log_probs.append(n.log_prob(action).sum())
    training_info.entropies.append(n.entropy())

    return action
def _choose_action(loc, scale):
    """sample an action from a Gaussian distribution, given its parameters

    Parameters
    ----------
    loc : torch.Tensor
        the mean parameter
    scale : torch.Tensor
        the scale parameter

    Returns
    -------
    a_t : torch.Tensor
        the sampled action
    log_prob_a_t : torch.Tensor
        the log probability of the sampled action
    ent_t : torch.Tensor
        the entropy of the action distribution
    """
    m = Normal(loc, scale)
    a_t = m.sample()
    log_prob_a_t = m.log_prob(a_t)
    ent_t = gaussian_entropy(m)
    return a_t, log_prob_a_t, ent_t
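# --- Sketch of a possible gaussian_entropy (assumption: the repo's actual
# --- helper may differ; the closed form for N(mu, sigma) is
# --- 0.5 * log(2 * pi * e * sigma^2) per dimension).
import math
import torch
from torch.distributions import Normal

def gaussian_entropy(m: Normal) -> torch.Tensor:
    # equivalent to summing 0.5 * log(2*pi*e*sigma^2) over dimensions
    return m.entropy().sum()

# quick check against the closed form:
m = Normal(torch.zeros(3), torch.full((3,), 2.0))
closed_form = 0.5 * torch.log(2 * math.pi * math.e * torch.full((3,), 2.0) ** 2).sum()
print(torch.allclose(gaussian_entropy(m), closed_form))  # True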
def forward(self, state, action=None):
    a = t.relu(self.fc1(state))
    a = t.relu(self.fc2(a))
    mu = self.mu_head(a)
    sigma = softplus(self.sigma_head(a))
    dist = Normal(mu, sigma)
    act = (atanh(action / self.action_range)
           if action is not None else dist.rsample())
    act_entropy = dist.entropy()

    # the suggested way to confine your actions within a valid range
    # is not clamping, but remapping the distribution
    act_log_prob = dist.log_prob(act)
    act_tanh = t.tanh(act)
    act = act_tanh * self.action_range

    # the distribution remapping process used in the original paper
    act_log_prob -= t.log(self.action_range * (1 - act_tanh.pow(2)) + 1e-6)
    act_log_prob = act_log_prob.sum(1, keepdim=True)

    return act, act_log_prob, act_entropy
def __call__(self, x_sample, x):
    if self.layer_outputs is None:
        raise ValueError("The model needs to return the latent space "
                         "distribution parameters z_mu, z_var.")
    if self.use_distributions:
        p = x_sample
        q = self.layer_outputs["q"]
    else:
        z_mu = self.layer_outputs["z_mu"]
        z_var = self.layer_outputs["z_var"]
        p = Normal(x_sample, 0.5)
        q = Normal(z_mu, z_var.pow(0.5))
    # reconstruction loss: negative log likelihood
    ll_loss = -p.log_prob(x).sum(-1, keepdim=True)
    # regularization loss: KL divergence
    kl_loss = kl_divergence(q, Normal(0, 1)).sum(-1, keepdim=True)
    combined_loss = ll_loss + kl_loss
    return combined_loss, {"ll_loss": ll_loss, "kl_loss": kl_loss}
def loss_function(pi, sigma, mu, target):
    """
    Calculate the loss for our MDN. This is the negative log-likelihood -
    based on https://github.com/sagelywizard/pytorch-mdn and adapted to use
    torch.distributions.Normal

    The original paper:
    http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.120.5685&rep=rep1&type=pdf
    """
    sequence = pi.size()[1]
    target = target.view(-1, sequence, 1, VAE.z_size)
    normal_distributions = Normal(mu, sigma)
    # log_prob(y) ... log of the pdf at value y; weight each component's
    # density by its mixture coefficient pi (the original exp(pi * log_prob)
    # computed pdf**pi instead of pi * pdf)
    probabilities = pi * torch.exp(normal_distributions.log_prob(target))
    result = -torch.log(torch.sum(probabilities, dim=2))
    return torch.mean(result)
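# --- Numerically stabler variant (assumption: illustrative rewrite, not from
# --- the original repo): work in log-space with logsumexp instead of
# --- exponentiating the component densities.
import torch
from torch.distributions import Normal

def mdn_nll(pi, sigma, mu, target):
    # pi, sigma, mu: [..., K, D]; target broadcastable to the same shape;
    # dim=2 is the mixture dimension, as in loss_function above
    log_component = Normal(mu, sigma).log_prob(target)
    log_mix = torch.logsumexp(torch.log(pi) + log_component, dim=2)
    return -log_mix.mean()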
def forward(self, state: np.ndarray, goal: np.ndarray, deterministic=False,
            compute_log_prob=True) -> Tuple[torch.Tensor, torch.Tensor]:
    """Returns the actions and their log probs as torch Tensors
    (gradients can be computed)."""
    if self.has_goal:
        state, goal = get_tensor(state), get_tensor(goal)
        total_input = torch.cat([state, goal], dim=-1)  # format [states | goals]
    else:
        total_input = get_tensor(state)

    hidden_state = self.layers.forward(total_input)
    mu = self.mu_layer(hidden_state)
    log_std = self.sigma_layer(hidden_state)
    log_std = LOG_SIGMA_MIN + (LOG_SIGMA_MAX - LOG_SIGMA_MIN) * (torch.tanh(log_std) + 1) / 2.0
    # log_std = torch.clamp(log_std, LOG_SIGMA_MIN, LOG_SIGMA_MAX)
    std = torch.exp(log_std)

    policy_distribution = Normal(mu, std)
    actions = mu if deterministic else policy_distribution.rsample()

    if compute_log_prob:
        # Exact source: https://github.com/openai/spinningup/blob/master/spinup/algos/pytorch/sac/core.py#L54
        # "Compute logprob from Gaussian, and then apply correction for Tanh squashing.
        # NOTE: The correction formula is a little bit magic. To get an understanding
        # of where it comes from, check out the original SAC paper (arXiv 1801.01290)
        # and look in appendix C. This is a more numerically-stable equivalent to Eq 21.
        # Try deriving it yourself as a (very difficult) exercise. :)"
        log_prob = policy_distribution.log_prob(actions).sum(dim=-1)
        try:
            log_prob -= (2 * (np.log(2) - actions - F.softplus(-2 * actions))).sum(dim=1)
        except IndexError:
            log_prob -= (2 * (np.log(2) - actions - F.softplus(-2 * actions))).sum()
    else:
        log_prob = None

    actions = torch.tanh(actions)  # the log_prob above already accounts for this tanh squashing
    action_center = (self.action_high + self.action_low) / 2
    action_range = (self.action_high - self.action_low) / 2
    actions_in_range = action_center + actions * action_range
    # print(f"Mu {mu}\t sigma {std}\tactions {actions}\taction_in_range {actions_in_range}")
    return actions_in_range, log_prob
class GaussianSeparatedPolicy(nn.Module):
    def __init__(self, input_dim, action_dim, hidden_layer=[64, 64]):
        super(GaussianSeparatedPolicy, self).__init__()

        actor_layer_size = [input_dim] + hidden_layer
        mu_feature_layers = nn.ModuleList([])
        std_feature_layers = nn.ModuleList([])
        for i in range(len(actor_layer_size) - 1):
            mu_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            mu_feature_layers.append(nn.ReLU())
            std_feature_layers.append(
                nn.Linear(actor_layer_size[i], actor_layer_size[i + 1]))
            std_feature_layers.append(nn.ReLU())
        self.mu_body = nn.Sequential(*mu_feature_layers)
        self.std_body = nn.Sequential(*std_feature_layers)
        self.mu_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                     nn.Tanh())
        self.std_head = nn.Sequential(nn.Linear(hidden_layer[-1], action_dim),
                                      nn.Softplus())

        critic_layer_size = [input_dim] + hidden_layer
        critic_layers = nn.ModuleList([])
        for i in range(len(critic_layer_size) - 1):
            critic_layers.append(
                nn.Linear(critic_layer_size[i], critic_layer_size[i + 1]))
            critic_layers.append(nn.ReLU())
        critic_layers.append(nn.Linear(hidden_layer[-1], 1))
        self.critic = nn.Sequential(*critic_layers)

    def forward(self, x, action=None):
        mu = self.mu_head(self.mu_body(x))
        std = self.std_head(self.std_body(x))
        self.dist = Normal(mu, std)
        if action is None:
            action = self.dist.sample()
        action_log_prob = self.dist.log_prob(action).sum(-1)
        entropy = self.dist.entropy().sum(-1)
        value = self.critic(x)
        return action, action_log_prob, value.squeeze(-1), entropy