def get_reinforce_ps_loss(phi, p0, reinforce=False):
    # Returns a pseudo-loss: a scalar whose gradient w.r.t. phi is an unbiased
    # estimate of the gradient of E[(b - p0)^2], with b ~ Bernoulli(sigmoid(phi)).
    d = len(p0)
    e_b = torch.sigmoid(phi)
    bn_rv = Bernoulli(probs=torch.ones(d) * e_b)
    binary_samples = bn_rv.sample().detach()

    if reinforce:
        # Single-sample baseline: an independent draw from the same distribution.
        binary_samples_ = bn_rv.sample().detach()
        baseline = torch.sum((binary_samples_ - p0)**2)
    else:
        baseline = 0.0

    sampled_loss = torch.sum((binary_samples - p0)**2)
    # Score-function (REINFORCE) term: log q(b | phi) of the drawn sample.
    sampled_log_q = bn_rv.log_prob(binary_samples).sum()

    ps_loss = (sampled_loss - baseline).detach() * sampled_log_q
    return ps_loss
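# Hedged usage sketch (illustrative, not from the original source): one
# REINFORCE gradient step on phi using get_reinforce_ps_loss. The values of
# `phi`, `p0`, and `lr` are stand-ins.
import torch

phi = torch.zeros(3, requires_grad=True)
p0 = torch.tensor([0.2, 0.5, 0.9])
lr = 0.1
ps_loss = get_reinforce_ps_loss(phi, p0, reinforce=True)
ps_loss.backward()            # grad of the pseudo-loss is the REINFORCE estimate
with torch.no_grad():
    phi -= lr * phi.grad      # plain SGD step
    phi.grad.zero_()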
def add_noise_to_canv(self, canv, **kwargs):
    if self.noise_method == 'blur':
        canv = self.noise_blur(canv)
    elif self.noise_method == 'peaky':
        canv = self.noise_peaky(canv)
    elif self.noise_method == 'peaky_blur':
        # Mix the original canvas with a peaky-then-blurred version.
        prop_orig = np.random.rand()
        canv = canv * prop_orig + (1 - prop_orig) * self.noise_blur(self.noise_peaky(canv))
    elif self.noise_method == 'bern':
        # Independently flip pixels off and on with small random probabilities.
        p_subtract = 0.1 * np.random.rand()
        p_add = 0.1 * np.random.rand()
        bern_noise_subtract = Bernoulli(p_subtract * torch.ones(self.canv_shape))
        bern_noise_add = Bernoulli(p_add * torch.ones(self.canv_shape))
        canv = canv - bern_noise_subtract.sample()
        canv = canv.clamp(0.0, 1.0)
        canv = canv + bern_noise_add.sample()
        canv = canv.clamp(0.0, 1.0)
    else:
        canv = self.noise_gaussian(canv, **kwargs)
    canv = canv.clamp(0.0, 1.0)
    return canv
def test_bernoulli_shape_tensor_params(self):
    bernoulli = Bernoulli(torch.Tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]]))
    self.assertEqual(bernoulli._batch_shape, torch.Size((3, 2)))
    self.assertEqual(bernoulli._event_shape, torch.Size(()))
    self.assertEqual(bernoulli.sample().size(), torch.Size((3, 2)))
    self.assertEqual(bernoulli.sample((3, 2)).size(), torch.Size((3, 2, 3, 2)))
    self.assertEqual(bernoulli.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
    self.assertRaises(ValueError, bernoulli.log_prob, self.tensor_sample_2)
def test_bernoulli_shape_scalar_params(self):
    bernoulli = Bernoulli(0.3)
    self.assertEqual(bernoulli._batch_shape, torch.Size())
    self.assertEqual(bernoulli._event_shape, torch.Size())
    self.assertEqual(bernoulli.sample().size(), torch.Size((1,)))
    self.assertEqual(bernoulli.sample((3, 2)).size(), torch.Size((3, 2)))
    self.assertRaises(ValueError, bernoulli.log_prob, self.scalar_sample)
    self.assertEqual(bernoulli.log_prob(self.tensor_sample_1).size(), torch.Size((3, 2)))
    self.assertEqual(bernoulli.log_prob(self.tensor_sample_2).size(), torch.Size((3, 2, 3)))
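# Hedged illustration (not from the original test file) of the shape semantics
# the two tests above exercise: sample_shape is prepended to batch_shape.
import torch
from torch.distributions import Bernoulli

b = Bernoulli(torch.tensor([[0.6, 0.3], [0.6, 0.3], [0.6, 0.3]]))
assert b.sample().shape == (3, 2)              # one draw per batch entry
assert b.sample((3, 2)).shape == (3, 2, 3, 2)  # sample_shape + batch_shape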
def backward_pass(self, hid, temperature=1):
    """Computes samples of visible neurons."""
    vis_logits = hid @ self._W.T + self._bv
    vis_probs = torch.sigmoid(vis_logits / temperature)
    bernoulli = Bernoulli(vis_probs)
    vis = bernoulli.sample()
    return vis
def forward_pass(self, vis, temperature=1):
    """Computes samples of hidden neurons."""
    hid_logits = vis @ self._W + self._bh
    hid_probs = torch.sigmoid(hid_logits / temperature)
    bernoulli = Bernoulli(hid_probs)
    hid = bernoulli.sample()
    return hid
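# Hedged sketch (assumption, not from the original source): one block-Gibbs
# sweep of the RBM using the two passes above, called as plain functions on a
# minimal stand-in object. The sizes and stand-in names are illustrative.
import torch
from types import SimpleNamespace

rbm = SimpleNamespace(_W=torch.randn(6, 4) * 0.1,
                      _bv=torch.zeros(6),
                      _bh=torch.zeros(4))
vis0 = torch.bernoulli(torch.full((2, 6), 0.5))  # random binary batch
hid0 = forward_pass(rbm, vis0)      # h  ~ p(h | v)
vis1 = backward_pass(rbm, hid0)     # v' ~ p(v | h)
hid1 = forward_pass(rbm, vis1)      # h' ~ p(h | v'), e.g. for CD-1 statistics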
def forward(self, h_t, eps=0.):
    """
    Parameters
    ----------
    h_t : torch.float
        Hidden state of the RNN at timestep t.
    eps : float
        Epsilon controls the explore/exploit trade-off during training;
        this value is decreased as training progresses.

    Returns
    -------
    halt : torch.long
        Binary halting decision (0: wait, 1: halt).
    log_pi : torch.float
        Log probability of the selected action.
    -torch.log(probs) : torch.float
        Negative log of the halting probability.
    """
    probs = torch.sigmoid(self.fc(h_t.detach()))                # compute halting probability
    probs = (1 - eps) * probs + eps * torch.FloatTensor([0.5])  # mix in randomness according to eps
    m = Bernoulli(probs=probs)  # Bernoulli parameterized by the predicted probability
    halt = m.sample()           # sample the halting action
    log_pi = m.log_prob(halt)   # log probability, used for optimization
    return halt, log_pi, -torch.log(probs)
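# Hedged sketch (illustrative, not from the original source): the returned
# log_pi is typically weighted by a reward or advantage signal for a
# REINFORCE-style update. Both tensors below are stand-ins.
import torch

log_pi = torch.tensor([-0.3], requires_grad=True)  # stand-in for the module's output
advantage = torch.tensor([1.5])                    # stand-in reward signal
halting_loss = (-log_pi * advantage).mean()
halting_loss.backward()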
def be_sample(segm: Tensor) -> Tuple[Tensor, Mask]:
    dist = Bernoulli(segm)
    mask_sample = dist.sample()
    # Mean log-likelihood of the sampled mask under the Bernoulli probabilities.
    L: Tensor = (segm * mask_sample + (1 - segm) * (1 - mask_sample)).log().sum() / segm.numel()
    return L, Mask(mask_sample)
def tensor_to_probabilistic_action_dict(
        env: gym.Env, x: torch.Tensor) -> Tuple[OrderedDict, torch.Tensor]:
    actions = env.action_space.noop()
    # Initialize as tensors so torch.stack below never sees a plain int.
    log_probs = [torch.tensor(0.0, device=DEVICE) for _ in range(10)]

    m_1 = Normal(x[0], x[2])
    camera_action_1 = m_1.rsample()
    log_prob_1 = m_1.log_prob(camera_action_1)
    log_probs[0] = log_prob_1 if not torch.isnan(log_prob_1).any() \
        else torch.tensor(0.0, device=DEVICE)

    m_2 = Normal(x[1], x[3])
    camera_action_2 = m_2.rsample()
    log_prob_2 = m_2.log_prob(camera_action_2)
    log_probs[1] = log_prob_2 if not torch.isnan(log_prob_2).any() \
        else torch.tensor(0.0, device=DEVICE)

    actions['camera'] = (float(camera_action_1.item()), float(camera_action_2.item()))

    for idx, action in enumerate(BINARY_CONSTANTS, start=4):
        m = Bernoulli(x[idx])
        sampled_action = m.sample()
        log_probs[idx - 2] = m.log_prob(sampled_action)
        actions[action] = int(sampled_action)

    return actions, torch.stack(log_probs).sum()
def resample_Zik(self, X, Z, A, i, k):
    '''
    m = number of observations, excluding Z_ik, that contain feature k.
    Prior:     p(z_ik=1) = m / (N-1)
    Posterior: combines the prior with the likelihood,
               p(z_ik=1 | Z_-nk, A, X) propto p(z_ik=1) p(X | Z, A)
    '''
    N, D = X.size()
    Z_k = Z[:, k]
    # Called m_-nk in the paper.
    m = Z_k.sum() - Z_k[i]

    # Score if Z_nk were 0.
    Z_if_0 = Z.clone()
    Z_if_0[i, k] = 0
    log_prior_if_0 = (1 - (m / (N - 1))).log()
    log_likelihood_if_0 = self.log_likelihood_given_ZA(X, Z_if_0, A)
    log_score_if_0 = log_prior_if_0 + log_likelihood_if_0

    # Score if Z_nk were 1.
    Z_if_1 = Z.clone()
    Z_if_1[i, k] = 1
    log_prior_if_1 = (m / (N - 1)).log()
    log_likelihood_if_1 = self.log_likelihood_given_ZA(X, Z_if_1, A)
    log_score_if_1 = log_prior_if_1 + log_likelihood_if_1

    # Exponentiate, normalize, and sample.
    log_scores = torch.cat((log_score_if_0, log_score_if_1), 0)
    probs = self.renormalize_log_probs(log_scores)
    p_znk = Bern(probs[1])
    return p_znk.sample()
def _adapted_sampling(self, shape, device, dtype):
    """The Bernoulli sampling function, used to sample from a batch."""
    _bernoulli = Bernoulli(torch.tensor(float(self.p), device=device, dtype=dtype))
    target = _bernoulli.sample((shape,)).bool()
    return target
def forward(self, x, gamma):
    # x shape: (bsize, channels, height, width)
    if self.training:
        batch_size, channels, height, width = x.shape
        bernoulli = Bernoulli(gamma)
        # Sample an indicator for each valid upper-left corner of a block.
        mask = bernoulli.sample((
            batch_size,
            channels,
            height - (self.block_size - 1),
            width - (self.block_size - 1),
        ))
        mask = mask.to(x.device)  # keep the mask on the same device as the input
        block_mask = self._compute_block_mask(mask)
        # Rescale so the expected activation magnitude is preserved.
        countM = block_mask.numel()
        count_ones = block_mask.sum()
        return block_mask * x * (countM / count_ones)
    else:
        return x
def make_move(self, x):
    probability = self.forward(x)
    distribution = Bernoulli(probability)
    move = distribution.sample()
    return move.item()
def sample(self, epoch, num=64):
    z = torch.randn(num, self.latent_dim)
    x_probs = self.decode(z)
    dist = Bernoulli(x_probs)
    x_sample = dist.sample()
    save_image(x_sample.view(num, 1, 28, 28),
               'results/epoch_{}_samples.png'.format(epoch))
def simulate_data(model, batch_size=10, n_batch=1):
    """Simulate data from the VAE model.

    Sample from the joint distribution p(z)p(x|z). This is equivalent to
    sampling from p(x)p(z|x), i.e. z is from the posterior.

    Bidirectional Monte Carlo only works on simulated data, where we can
    obtain exact posterior samples.

    Args:
        model: VAE model for simulation
        batch_size: batch size for simulated data
        n_batch: number of batches

    Returns:
        iterator that loops over batches of torch Tensor pairs x, z
    """
    batches = []
    for i in range(n_batch):
        # Assume the VAE prior is a unit Gaussian.
        z = torch.randn(batch_size, model.latent_dim).cuda()
        x_logits = model.decode(z)
        if isinstance(x_logits, tuple):
            x_logits = x_logits[0]
        x_bernoulli_dist = Bernoulli(probs=x_logits.sigmoid())
        x = x_bernoulli_dist.sample().data
        batches.append((x, z))
    return iter(batches)
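# Hedged usage sketch (illustrative; `model` is a stand-in for a trained VAE
# exposing .latent_dim and .decode, and a CUDA device is assumed by the
# .cuda() call above):
for x, z in simulate_data(model, batch_size=10, n_batch=5):
    assert x.shape[0] == z.shape[0] == 10  # paired samples from p(z)p(x|z)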
def act(batch_states, theta, values):
    batch_states = torch.from_numpy(batch_states).long()
    probs = torch.sigmoid(theta)[batch_states]
    m = Bernoulli(1 - probs)
    actions = m.sample()
    log_probs_actions = m.log_prob(actions)
    return actions.numpy().astype(int), log_probs_actions, values[batch_states]
def forward(self, x, gamma):
    """Given a 4-d tensor, apply DropBlock.

    Args:
        x (torch.Tensor): (batch_size, num_channel, h, w)
        gamma (float): the probability that each upper-left corner of a
            block is zeroed out; a rough estimate of how to set this value
            is given in the DropBlock paper.

    Returns:
        torch.Tensor: x with (block_size, block_size) blocks randomly
        zeroed out in each channel.
    """
    if self.training:
        batch_size, channels, height, width = x.shape
        bernoulli = Bernoulli(gamma)
        # mask indicates the upper-left corners of the blocks to be zeroed out
        mask = bernoulli.sample(
            sample_shape=(batch_size, channels,
                          height - (self.block_size - 1),
                          width - (self.block_size - 1))).to(x.device)
        block_mask = self._compute_block_mask(mask)
        # Rescale to preserve the expected activation magnitude.
        countM = block_mask.size()[0] * block_mask.size()[1] * \
            block_mask.size()[2] * block_mask.size()[3]
        count_ones = block_mask.sum()
        return block_mask * x * (countM / count_ones)
    else:
        return x
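# Hedged sketch (the rough estimate from the DropBlock paper; the function
# name is illustrative, not from the original source): one way to choose the
# gamma argument for the forward pass above.
def estimate_gamma(drop_prob, block_size, feat_size):
    # Per-corner probability chosen so that, in expectation, about drop_prob
    # of all activations fall inside a dropped block.
    return (drop_prob / block_size ** 2) * \
        (feat_size ** 2 / (feat_size - block_size + 1) ** 2)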
def get_action_from_actor(self, state, deterministic=False):
    """Given the state, produces an action, the probability of the action,
    the log probability of the action, and the argmax action."""
    action_probabilities = self.actor_local(state)               # output size should be [B, H, W]
    action_probabilities = torch.sigmoid(action_probabilities)  # squash probs into [0, 1]
    B, _, _ = action_probabilities.shape
    action_probabilities = action_probabilities.view(B, -1)
    # TODO: revisit; argmax returns the index of the most likely action
    max_probability_action = torch.argmax(action_probabilities, dim=-1)
    assert action_probabilities.size(1) == self.action_size, \
        "Actor output the wrong size"
    if deterministic:
        # Use the deterministic policy at test time: threshold the probabilities.
        action = (action_probabilities > 0.5).float().cpu()
    else:
        # Use the stochastic policy at training time: sample the discrete
        # action and copy it to the CPU.
        action_distribution = Bernoulli(action_probabilities)
        action = action_distribution.sample().cpu()
    # Guard against log(0) when a probability is exactly zero.
    z = (action_probabilities == 0.0).float() * 1e-8
    log_action_probabilities = torch.log(action_probabilities + z)
    return action, (action_probabilities, log_action_probabilities), max_probability_action
def forward(self, x):
    prev_h = [self.agent(x)]
    prev_h.extend([torch.zeros_like(prev_h[0]) for _ in range(self.num_layers - 1)])
    prev_c = [torch.zeros_like(prev_h[0]) for _ in range(self.num_layers)]  # only used for LSTM

    input = torch.stack([self.sos_embedding] * x.size(0))

    symb_seq, stop_seq = [], []
    symb_logits, stop_logits = [], []
    symb_entropy, stop_entropy = [], []

    for step in range(self.max_len):
        for i, layer in enumerate(self.cells):
            # Inject Gaussian noise into the recurrent state during training.
            e_t = float(self.training) * (
                self.noise_loc +
                self.noise_scale * torch.randn_like(prev_h[0]).to(prev_h[0]))
            if isinstance(layer, nn.LSTMCell):
                h_t, c_t = layer(input, (prev_h[i], prev_c[i]))
                c_t = c_t + e_t
                prev_c[i] = c_t
            else:
                h_t = layer(input, prev_h[i])
                h_t = h_t + e_t
            prev_h[i] = h_t
            input = h_t

        symb_probs = F.softmax(self.output_symbol(h_t), dim=1)
        stop_probs = torch.sigmoid(torch.squeeze(self.whether_to_stop(h_t), 1))

        symb_distr = Categorical(probs=symb_probs)
        stop_distr = Bernoulli(probs=stop_probs)

        # Sample during training; act greedily during evaluation.
        symb = symb_distr.sample() if self.training else symb_probs.argmax(dim=1)
        stop = stop_distr.sample() if self.training else (stop_probs > 0.5).float()

        symb_logits.append(symb_distr.log_prob(symb))
        stop_logits.append(stop_distr.log_prob(stop))
        symb_entropy.append(symb_distr.entropy())
        stop_entropy.append(stop_distr.entropy())
        symb_seq.append(symb)
        stop_seq.append(stop)

        input = self.embedding(symb)

    symb_seq = torch.stack(symb_seq).permute(1, 0)
    stop_seq = torch.stack(stop_seq).permute(1, 0).long()
    symb_logits = torch.stack(symb_logits).permute(1, 0)
    stop_logits = torch.stack(stop_logits).permute(1, 0)
    symb_entropy = torch.stack(symb_entropy).permute(1, 0)
    stop_entropy = torch.stack(stop_entropy).permute(1, 0)

    logits = (symb_logits, stop_logits)
    entropy = (symb_entropy, stop_entropy)
    return symb_seq, stop_seq, logits, entropy
def sample(self, x_index, init_hidden, use_cuda):
    lstm_i2h_h0, lstm_i2h_c0, lstm_h2s_h0, lstm_h2s_c0 = init_hidden
    batch_size, seq_len = x_index.size()
    x = torch.transpose(self.embed(x_index), 1, 0)
    hidden_i2h, (_, _) = self.lstm_i2h(x, (lstm_i2h_h0, lstm_i2h_c0))

    if use_cuda:
        z = torch.zeros((seq_len, batch_size)).cuda()
    else:
        z = torch.zeros((seq_len, batch_size))
    z = Variable(z)

    s_h2s_h = lstm_h2s_h0
    s_h2s_c = lstm_h2s_c0
    for i in range(seq_len):
        cur_p_z = self.h2o(torch.cat((hidden_i2h[i], s_h2s_h[0]), dim=1))
        cur_p_z = torch.sigmoid(torch.squeeze(cur_p_z, 1))
        m = Bernoulli(cur_p_z)
        z[i] = m.sample()
        # Feed the sampled z back into the second LSTM alongside the hidden state.
        cat_hidden_z = torch.unsqueeze(
            torch.cat((hidden_i2h[i], torch.unsqueeze(z[i], 1)), dim=1), 0)
        _, (s_h2s_h, s_h2s_c) = self.lstm_h2s(cat_hidden_z, (s_h2s_h, s_h2s_c))
    return torch.transpose(z, 1, 0)
def sample_mask(cls, p, n):
    """Returns the mask of the weights"""
    bn = Bernoulli(p)
    mask = bn.sample((n, 1))
    return mask
def forward(self, x, hidden=None):
    seq_len, batch_size = x.size(0), x.size(1)
    if hidden is None:
        hidden = (torch.zeros(self.first_dim_hidden, batch_size,
                              self.hidden_size).to(x.device),
                  torch.zeros(self.first_dim_hidden, batch_size,
                              self.hidden_size).to(x.device))
    dist_params, actions = [], []
    for element in x:
        output, hidden_ = self.decoder(element.unsqueeze(0), hidden)
        pred = self.predictor(output)
        dist_params.append(pred)
        sampler = Bernoulli(pred)
        selected_prev_y = sampler.sample()
        actions.append(selected_prev_y)
        # Bug? This line previously read: output, _ = ...
        output, hidden = self.hierarchical_decoder(element.unsqueeze(0), hidden)
        hidden = [h_ + h * pred for h_, h in zip(hidden_, hidden)]
    params = torch.stack(dist_params, dim=0).view(seq_len, batch_size)
    actions = torch.stack(actions, dim=0).view(seq_len, batch_size)
    return params, actions
class Dropout(Module):
    """Dropout module.

    Attributes
    ----------
    p : float
        Probability of an activation being ignored.
    """

    def __init__(self, p=0.5):
        super().__init__()
        from torch.distributions import Bernoulli
        self.p = p
        # Keep-probability is 1 - p; dividing by it gives inverted dropout.
        self._bernoulli = Bernoulli(1 - self.p)
        self._sample = None

    def forward(self, input):
        if self.training:
            self._sample = self._bernoulli.sample(input.shape).float() / (1 - self.p)
            input = input * self._sample
        return input

    def backward(self, gradient):
        if self.training:
            # The same mask is applied to the gradient on the way back.
            gradient = gradient * self._sample
        return gradient
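# Hedged usage sketch (illustrative; assumes the Module base class above
# exposes a `training` flag in the usual way):
import torch

drop = Dropout(p=0.5)
drop.training = True
y = drop.forward(torch.ones(4, 8))   # ~half the activations zeroed, survivors scaled by 2
g = drop.backward(torch.ones(4, 8))  # gradient masked with the same pattern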
def forward(self, state):
    policy_p = self.fn_approximator(state)
    policy_p = torch.sigmoid(policy_p)
    try:
        stochastic_policy = Bernoulli(policy_p)
        actions = stochastic_policy.sample()
        log_probs = stochastic_policy.log_prob(actions)
    except RuntimeError:
        logging.debug('Runtime error occurred. policy_p was {}'.format(policy_p))
        logging.debug('State was: {}'.format(state))
        logging.debug('Function approximator return was: {}'.format(
            self.fn_approximator(state)))
        logging.debug('This has occurred before when parameters of the network became NaNs.')
        logging.debug('Check the learning rate, or change eps in adaptive '
                      'gradient descent methods.')
        raise RuntimeError('BernoulliPolicy returned NaN information. '
                           'Logger level DEBUG will have more information.')
    return actions, log_probs
def select_action(policy, state: int = 0, mode="param"):
    """
    Select an action (0 or 1) by running the policy model and choosing
    based on the probabilities it assigns given the state.

    input:
        policy: nn.Module, the policy to use
        state: int, the numerical encoding of the state
    return:
        action: int, the selected action (0 or 1)
    """
    assert mode in ["param", "NN"]
    with torch.no_grad():
        # Convert the state to a one-hot vector.
        state = get_one_hot(state, 6)
        # Run the policy to obtain action probabilities.
        action_prob = policy(state)
        if mode == "param":
            # Clamp, then build a Bernoulli distribution over the action.
            action_prob = torch.clamp(action_prob, min=0.0, max=1.0)
            c = Bernoulli(action_prob)
        else:
            # Build a categorical distribution over actions.
            c = Categorical(action_prob)
        # Sample an action from this distribution.
        action = c.sample()
    return action.item()
class DropBlock(nn.Module):
    def __init__(self, drop_prob, block_size, feat_size):
        super(DropBlock, self).__init__()
        assert feat_size > block_size, \
            "block_size can't exceed feat_size"
        self.drop_prob = drop_prob
        self.block_size = block_size
        self.feat_size = feat_size
        self.gamma = self._compute_gamma()
        self.bernoulli = Bernoulli(self.gamma)

    def forward(self, x):
        # x shape: (bsize, channels, height, width)
        assert x.dim() == 4, \
            "Expected input with 4 dimensions (bsize, channels, height, width)"
        if not self.training:
            return x
        else:
            mask = self.bernoulli.sample((x.shape[-2], x.shape[-1]))
            # Build the mask on the CPU, then move it to the input's device.
            block_mask = self._compute_block_mask(mask).to(x.device)
            out = x * block_mask[None, None, :, :]
            # Rescale to preserve the expected activation magnitude.
            out = out * block_mask.numel() / block_mask.sum()
            return out

    def _compute_block_mask(self, mask):
        height, width = mask.shape
        non_zero_idxs = mask.nonzero()
        nr_blocks = non_zero_idxs.shape[0]

        # Offsets enumerate every cell of a (block_size, block_size) square.
        offsets = torch.stack([
            torch.arange(self.block_size).view(-1, 1).expand(
                self.block_size, self.block_size).reshape(-1),
            torch.arange(self.block_size).repeat(self.block_size)
        ]).t()

        non_zero_idxs = non_zero_idxs.repeat(self.block_size ** 2, 1)
        offsets = offsets.repeat(1, nr_blocks).view(-1, 2)
        block_idxs = non_zero_idxs + offsets

        padded_mask = F.pad(mask, (0, self.block_size, 0, self.block_size))
        padded_mask[block_idxs[:, 0], block_idxs[:, 1]] = 1.
        block_mask = padded_mask[:height, :width]
        return 1 - block_mask

    def _compute_gamma(self):
        return (self.drop_prob / (self.block_size ** 2)) * \
            ((self.feat_size ** 2) / ((self.feat_size - self.block_size + 1) ** 2))

    def set_drop_probability(self, drop_prob):
        self.drop_prob = drop_prob
        self.gamma = self._compute_gamma()
        self.bernoulli = Bernoulli(self.gamma)
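# Hedged usage sketch (illustrative, not from the original source): apply the
# DropBlock above to a batch of 56x56 feature maps during training.
import torch

block = DropBlock(drop_prob=0.1, block_size=5, feat_size=56)
block.train()
feats = torch.randn(8, 64, 56, 56)
out = block(feats)               # contiguous 5x5 regions zeroed, rest rescaled
block.set_drop_probability(0.2)  # gamma and the Bernoulli are recomputed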
def act(self, obs, actor, critic):
    obs = torch.from_numpy(obs).long()
    prob = torch.sigmoid(actor)[obs]
    bernoulli = Bernoulli(1 - prob)
    action = bernoulli.sample()
    logprob = bernoulli.log_prob(action)
    return action.numpy().astype(int), logprob, critic[obs]
def predict_edges(G_t, adjacencies, prev_i):
    edge_weights = adjacencies.data.squeeze().sigmoid()
    m = Bernoulli(edge_weights)
    preds = m.sample()
    # Fill the first prev_i + 1 columns with the sampled edge indicators.
    G_t[:, :prev_i + 1] = torch.narrow(preds, 0, 0, prev_i + 1)
    return G_t
def train(epoch):
    agent.train()
    rnet.train()

    matches, rewards, policies = [], [], []
    for batch_idx, (inputs, targets) in tqdm.tqdm(enumerate(trainloader),
                                                  total=len(trainloader)):
        inputs = Variable(inputs)
        targets = Variable(targets).cuda(non_blocking=True)
        if not args.parallel:
            inputs = inputs.cuda()

        probs, value = agent(inputs)

        # ------------------------------------------------------------------ #
        # Deterministic policy: threshold the probabilities at 0.5.
        policy_map = probs.data.clone()
        policy_map[policy_map < 0.5] = 0.0
        policy_map[policy_map >= 0.5] = 1.0
        policy_map = Variable(policy_map)

        # Smooth the probabilities towards 0.5 before sampling.
        probs = probs * args.alpha + (1 - probs) * (1 - args.alpha)
        distr = Bernoulli(probs)
        policy = distr.sample()

        v_inputs = Variable(inputs.data, volatile=True)
        preds_map = rnet.forward(v_inputs, policy_map)
        preds_sample = rnet.forward(inputs, policy)

        reward_map, _ = get_reward(preds_map, targets, policy_map.data)
        reward_sample, match = get_reward(preds_sample, targets, policy.data)

        # Self-critical baseline: advantage of the sampled policy over the greedy one.
        advantage = reward_sample - reward_map

        loss = -distr.log_prob(policy).sum(1, keepdim=True) * Variable(advantage)
        loss = loss.sum()
        # ------------------------------------------------------------------ #

        loss += F.cross_entropy(preds_sample, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        matches.append(match.cpu())
        rewards.append(reward_sample.cpu())
        policies.append(policy.data.cpu())

    accuracy, reward, sparsity, variance, policy_set = utils.performance_stats(
        policies, rewards, matches)

    log_str = 'E: %d | A: %.3f | R: %.2E | S: %.3f | V: %.3f | #: %d' % (
        epoch, accuracy, reward, sparsity, variance, len(policy_set))
    print(log_str)

    log_value('train_accuracy', accuracy, epoch)
    log_value('train_reward', reward, epoch)
    log_value('train_sparsity', sparsity, epoch)
    log_value('train_variance', variance, epoch)
    log_value('train_unique_policies', len(policy_set), epoch)
class RewardHighVelocity(gym.RewardWrapper):
    """Wrapper to modify environment rewards of 'Cheetah', 'Walker' and
    'Hopper'.

    Penalizes with a certain probability if the velocity of the agent is
    greater than a predefined max velocity.

    Parameters
    ----------
    kwargs: dict with keys:
        'prob_vel_penal': probability of penalization
        'cost_vel': cost of penalization
        'max_vel': max velocity

    Methods
    -------
    step(action): next_state, reward, done, info
        Execute a step in the environment.
    """

    def __init__(self, env, **kwargs):
        super(RewardHighVelocity, self).__init__(env)
        self.penal_v_distr = Bernoulli(kwargs['prob_vel_penal'])
        self.penal = kwargs['cost_vel']
        self.max_vel = kwargs['max_vel']
        allowed_envs = ['Cheetah', 'Hopper', 'Walker']
        assert(any(e in self.env.unwrapped.spec.id for e in allowed_envs)), \
            f'Env {self.env.unwrapped.spec.id} not allowed for RewardWrapper'

    def step(self, action):
        observation, reward, done, info = self.env.step(action)
        vel = info['x_velocity']
        info['risky_state'] = vel > self.max_vel
        info['angle'] = self.env.sim.data.qpos[2]
        # Cheetah, Walker and Hopper are all handled the same way.
        return observation, self.new_reward(reward, info), done, info

    def new_reward(self, reward, info):
        if 'Cheetah' in self.env.unwrapped.spec.id:
            forward_reward = info['reward_run']
        else:
            forward_reward = info['x_velocity']

        penal = info['risky_state'] * \
            self.penal_v_distr.sample().item() * self.penal

        # If the penalty is applied, also subtract the forward reward from the
        # total reward. Original reward = rew_healthy + forward_reward - ctrl_cost.
        new_reward = penal + reward + (penal != 0) * (-forward_reward)
        return new_reward

    @property
    def name(self):
        return f'{self.__class__.__name__}{self.env}'
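# Hedged usage sketch (illustrative; assumes an older gym API with 4-tuple
# step returns and the MuJoCo HalfCheetah-v3 environment installed):
import gym

env = RewardHighVelocity(gym.make('HalfCheetah-v3'),
                         prob_vel_penal=0.25, cost_vel=-60, max_vel=3.0)
obs = env.reset()
obs, r, done, info = env.step(env.action_space.sample())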
def forward(self, image):
    latent_means = self.encoder.forward(image)
    bernoulli_rv = Bernoulli(latent_means)
    bernoulli_samples = bernoulli_rv.sample().detach()
    image_mean = self.decoder.forward(bernoulli_samples)
    return image_mean, latent_means, bernoulli_samples
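# Hedged sketch (assumption, not from the original source): Bernoulli.sample()
# above is not reparameterizable, so no gradient reaches the encoder through
# bernoulli_samples; training such a model typically uses a score-function
# estimator like get_reinforce_ps_loss above, or a relaxation as shown here.
# The shape of latent_means is illustrative.
import torch
from torch.distributions import RelaxedBernoulli

latent_means = torch.rand(4, 20, requires_grad=True)  # stand-in for the encoder output
relaxed = RelaxedBernoulli(temperature=torch.tensor(0.5), probs=latent_means)
soft_samples = relaxed.rsample()  # differentiable w.r.t. latent_means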