Example #1
import torch as th
import torch.nn as nn

# `BaseModel` and `MLP` are assumed to be provided by the surrounding project.
class ActorMuLogstd(BaseModel):
    """
    use for PPO/PG algorithms' actor network.
    input: vector of state
    output: [stochastic action(mu), log of std]
    """
    def __init__(self, obs_spec, rep_net_params, output_shape,
                 network_settings):
        super().__init__(obs_spec, rep_net_params)
        self.condition_sigma = network_settings['condition_sigma']
        self.log_std_min, self.log_std_max = network_settings['log_std_bound']

        self.share = MLP(self.rep_net.h_dim, network_settings['hidden_units'])
        if network_settings['hidden_units']:
            ins = network_settings['hidden_units'][-1]
        else:
            ins = self.rep_net.h_dim
        self.mu = MLP(ins, [], output_shape=output_shape, out_act='tanh')
        if self.condition_sigma:
            # state-dependent log-std: predicted from the shared features
            self.log_std = MLP(ins, [], output_shape=output_shape)
        else:
            # state-independent log-std: a learnable parameter, initialized to -0.5
            self.log_std = nn.Parameter(-0.5 * th.ones(output_shape))

    def forward(self, x, **kwargs):
        x = self.repre(x, **kwargs)
        x = self.share(x)
        mu = self.mu(x)
        if self.condition_sigma:
            log_std = self.log_std(x)  # [T, B, *] or [B, *]
        else:
            # broadcast the shared log-std to match mu's batch shape
            log_std = self.log_std.repeat(mu.shape[:-1] +
                                          (1, ))  # [T, B, *] or [B, *]
        log_std = log_std.clamp(self.log_std_min, self.log_std_max)
        return mu, log_std
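
A minimal usage sketch (not from the source): `obs_spec`, `rep_net_params`, `action_dim`, and `state_batch` are placeholders for the project's own types, and the `network_settings` keys mirror the ones read in `__init__` above. Note that clamping `log_std` bounds the std to [exp(min), exp(max)], which keeps the resulting Gaussian numerically stable.

settings = {'condition_sigma': True,
            'log_std_bound': [-20.0, 2.0],  # illustrative bounds
            'hidden_units': [64, 64]}
actor = ActorMuLogstd(obs_spec, rep_net_params,
                      output_shape=action_dim,
                      network_settings=settings)
mu, log_std = actor(state_batch)                   # mu is tanh-squashed into (-1, 1)
dist = th.distributions.Normal(mu, log_std.exp())  # Gaussian policy head
action = dist.sample()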
Example #2
class ActorCriticValueCts(BaseModel):
    """
    combine actor network and critic network, share some nn layers. use for continuous action space.
    input: vector of state
    output: mean(mu) of Gaussian Distribution of actions given a state, v(s)
    """
    def __init__(self, obs_spec, rep_net_params, output_shape,
                 network_settings):
        super().__init__(obs_spec, rep_net_params)
        self.condition_sigma = network_settings['condition_sigma']
        self.log_std_min, self.log_std_max = network_settings['log_std_bound']

        self.share = MLP(self.rep_net.h_dim, network_settings['share'])
        if network_settings['share']:
            ins = network_settings['share'][-1]
        else:
            ins = self.rep_net.h_dim
        self.mu_logstd_share = MLP(ins, network_settings['mu'])
        self.v = MLP(ins, network_settings['v'], output_shape=1)
        if network_settings['mu']:
            # the mu and log-std heads read from the last layer of the mu trunk
            ins = network_settings['mu'][-1]
        self.mu = MLP(ins, [], output_shape=output_shape, out_act='tanh')
        if self.condition_sigma:
            self.log_std = MLP(ins, [], output_shape=output_shape)
        else:
            self.log_std = nn.Parameter(-0.5 * th.ones(output_shape))

    def forward(self, x, **kwargs):
        x = self.repre(x, **kwargs)
        x = self.share(x)
        v = self.v(x)
        x_mu_logstd = self.mu_logstd_share(x)
        mu = self.mu(x_mu_logstd)
        if self.condition_sigma:
            log_std = self.log_std(x_mu_logstd)  # [T, B, *] or [B, *]
        else:
            log_std = self.log_std.repeat(mu.shape[:-1] +
                                          (1, ))  # [T, B, *] or [B, *]
        log_std = log_std.clamp(self.log_std_min, self.log_std_max)
        return mu, log_std, v
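
A minimal usage sketch (not from the source), with the same placeholder names as above; the `share`, `mu`, and `v` keys give the hidden layer sizes of the shared trunk and the two heads. A single forward pass yields both the policy parameters and the state value.

settings = {'condition_sigma': True,
            'log_std_bound': [-20.0, 2.0],
            'share': [128],
            'mu': [64],
            'v': [64]}
ac = ActorCriticValueCts(obs_spec, rep_net_params,
                         output_shape=action_dim,
                         network_settings=settings)
mu, log_std, v = ac(state_batch)  # policy parameters and state value in one pass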