class ActorMuLogstd(BaseModel):
    """Actor network for PPO/PG-style algorithms.

    Input: vector of state.
    Output: [stochastic action mean (mu), log of std].

    When ``condition_sigma`` is true the log-std is state-dependent
    (its own MLP head); otherwise it is a single learned parameter
    broadcast over the batch dims.
    """

    def __init__(self, obs_spec, rep_net_params, output_shape, network_settings):
        super().__init__(obs_spec, rep_net_params)
        self.condition_sigma = network_settings['condition_sigma']
        self.log_std_min, self.log_std_max = network_settings['log_std_bound']

        hidden = network_settings['hidden_units']
        self.share = MLP(self.rep_net.h_dim, hidden)
        # Head input width: last hidden layer if any, else representation dim.
        head_in = hidden[-1] if hidden else self.rep_net.h_dim
        self.mu = MLP(head_in, [], output_shape=output_shape, out_act='tanh')
        if self.condition_sigma:
            self.log_std = MLP(head_in, [], output_shape=output_shape)
        else:
            self.log_std = nn.Parameter(-0.5 * th.ones(output_shape))

    def forward(self, x, **kwargs):
        feat = self.share(self.repre(x, **kwargs))
        mu = self.mu(feat)
        if self.condition_sigma:
            log_std = self.log_std(feat)  # [T, B, *] or [B, *]
        else:
            # Tile the shared parameter over mu's leading (time/batch) dims.
            log_std = self.log_std.repeat(mu.shape[:-1] + (1,))  # [T, B, *] or [B, *]
        return mu, log_std.clamp(self.log_std_min, self.log_std_max)
class ActorCriticValueCts(BaseModel):
    """Combined actor-critic network with shared layers.

    Use for continuous action spaces.
    Input: vector of state.
    Output: mean (mu) of the Gaussian distribution of actions given a
    state, log of std, and the state value v(s).
    """

    def __init__(self, obs_spec, rep_net_params, output_shape, network_settings):
        super().__init__(obs_spec, rep_net_params)
        self.condition_sigma = network_settings['condition_sigma']
        self.log_std_min, self.log_std_max = network_settings['log_std_bound']

        share_units = network_settings['share']
        self.share = MLP(self.rep_net.h_dim, share_units)
        # Width feeding the branch MLPs: last shared layer if any, else rep dim.
        branch_in = share_units[-1] if share_units else self.rep_net.h_dim
        self.mu_logstd_share = MLP(branch_in, network_settings['mu'])
        self.v = MLP(branch_in, network_settings['v'], output_shape=1)

        # Head input width falls through to branch_in when 'mu' has no layers.
        head_in = network_settings['mu'][-1] if network_settings['mu'] else branch_in
        self.mu = MLP(head_in, [], output_shape=output_shape, out_act='tanh')
        if self.condition_sigma:
            self.log_std = MLP(head_in, [], output_shape=output_shape)
        else:
            self.log_std = nn.Parameter(-0.5 * th.ones(output_shape))

    def forward(self, x, **kwargs):
        feat = self.share(self.repre(x, **kwargs))
        v = self.v(feat)
        actor_feat = self.mu_logstd_share(feat)
        mu = self.mu(actor_feat)
        if self.condition_sigma:
            log_std = self.log_std(actor_feat)  # [T, B, *] or [B, *]
        else:
            # Tile the shared parameter over mu's leading (time/batch) dims.
            log_std = self.log_std.repeat(mu.shape[:-1] + (1,))  # [T, B, *] or [B, *]
        return mu, log_std.clamp(self.log_std_min, self.log_std_max), v