# Assumes the usual module-level imports for this file: numpy as np, torch,
# plus TanhNormal, batch_flatten, and the LOG_SIG_* / LOG_MEAN_* clamp
# constants from the surrounding codebase.
def forward(
        self,
        observation,
        state,
        actions=None,  # unused in this forward pass
        reparameterize=True,
        deterministic=False,
        return_log_prob=False,
):
    tensor = observation
    # Optional normalization for uint8 images, disabled by default:
    # tensor = torch.div(tensor, 255.)
    # Convolutional encoder over the image observation.
    for layer in self.convs:
        tensor = layer(tensor)
        tensor = self.conv_activation(tensor)
    tensor = batch_flatten(tensor)
    # Optionally concatenate a low-dimensional state vector to the image features.
    if state is not None:
        tensor = torch.cat((tensor, state), 1)
    # Fully connected head. After the loop, h holds the input to the last fc
    # layer, which also feeds the log-std head below.
    for i, layer in enumerate(self.fcs):
        h = tensor
        tensor = layer(tensor)
        if i == len(self.fcs) - 1:
            tensor = self.output_activation(tensor)
        else:
            tensor = self.fcs_activation(tensor)
    mean = tensor
    log_std = self.last_fc_log_std(h)
    log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
    std = torch.exp(log_std)
    mean = torch.clamp(mean, LOG_MEAN_MIN, LOG_MEAN_MAX)

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(action,
                                            pre_tanh_value=pre_tanh_value)
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )
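# --- Illustrative sketch (not part of the original class) -------------------
# Both forward() variants in this section defer to TanhNormal.log_prob. A
# minimal standalone version of the underlying change-of-variables math,
# assuming a diagonal Gaussian over pre-tanh values u with action a = tanh(u):
#
#   log pi(a) = log N(u; mean, std) - log(1 - tanh(u)^2)   (per dimension)
#
# The helper name and the eps term are illustrative assumptions, not the
# codebase's actual TanhNormal implementation.
def _tanh_gaussian_log_prob_sketch(pre_tanh_value, mean, std, eps=1e-6):
    normal = torch.distributions.Normal(mean, std)
    log_prob = normal.log_prob(pre_tanh_value)
    # Jacobian correction for the tanh squashing; eps guards against log(0).
    correction = torch.log(1.0 - torch.tanh(pre_tanh_value) ** 2 + eps)
    return log_prob - correction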
def forward(
        self,
        obs,
        reparameterize=True,
        deterministic=False,
        return_log_prob=False,
        return_entropy=False,
        return_log_prob_of_mean=False,
):
    """
    :param obs: Observation
    :param deterministic: If True, do not sample; return tanh(mean)
    :param return_log_prob: If True, return a sample and its log probability
    :param return_entropy: If True, return the policy entropy. Currently not
        implemented: the differential entropy of a tanh(Gaussian) has no
        closed form.
    :param return_log_prob_of_mean: If True, also return the log probability
        of the deterministic action tanh(mean). This does not need to be
        differentiated through, so it can be treated as a number.
    """
    h = obs
    for fc in self.fcs:
        h = self.hidden_activation(fc(h))
    mean = self.last_fc(h)
    if self.std is None:
        # State-dependent std, predicted by a separate head.
        log_std = self.last_fc_log_std(h)
        log_std = torch.clamp(log_std, LOG_SIG_MIN, LOG_SIG_MAX)
        std = torch.exp(log_std)
    else:
        # Fixed std provided at construction time.
        std = self.std
        log_std = self.log_std

    log_prob = None
    entropy = None
    mean_action_log_prob = None
    pre_tanh_value = None
    if deterministic:
        action = torch.tanh(mean)
    else:
        tanh_normal = TanhNormal(mean, std)
        if return_log_prob:
            if reparameterize:
                action, pre_tanh_value = tanh_normal.rsample(
                    return_pretanh_value=True)
            else:
                action, pre_tanh_value = tanh_normal.sample(
                    return_pretanh_value=True)
            log_prob = tanh_normal.log_prob(action,
                                            pre_tanh_value=pre_tanh_value)
            log_prob = log_prob.sum(dim=1, keepdim=True)
        else:
            if reparameterize:
                action = tanh_normal.rsample()
            else:
                action = tanh_normal.sample()

    if return_entropy:
        # The entropy of the pre-tanh Gaussian would be
        #   log_std + 0.5 * (1 + log(2 * pi))  per dimension,
        # but the differential entropy of tanh(Gaussian) has no closed form.
        raise NotImplementedError(
            "Differential entropy of tanh(Gaussian) is not implemented.")
    if return_log_prob_of_mean:
        tanh_normal = TanhNormal(mean, std)
        mean_action_log_prob = tanh_normal.log_prob(
            torch.tanh(mean),
            pre_tanh_value=mean,
        )
        mean_action_log_prob = mean_action_log_prob.sum(dim=1, keepdim=True)
    return (
        action,
        mean,
        log_std,
        log_prob,
        entropy,
        std,
        mean_action_log_prob,
        pre_tanh_value,
    )
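# --- Illustrative sketch (not part of the original class) -------------------
# The return_entropy branch above raises NotImplementedError because
# tanh(Gaussian) has no closed-form differential entropy. For reference, the
# entropy of the *pre-tanh* diagonal Gaussian is available in closed form;
# this helper is a hypothetical stand-in, not part of the original API.
def _gaussian_entropy_sketch(log_std):
    # H(N(mu, sigma)) = log_std + 0.5 * (1 + log(2 * pi)) per dimension,
    # summed over action dimensions, giving one value per batch row.
    per_dim = log_std + 0.5 * (1.0 + np.log(2.0 * np.pi))
    return per_dim.sum(dim=1, keepdim=True)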