Example 1
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

# ObsNorm, AddBias and weights_init_mlp are helper modules defined elsewhere in
# the same project and are assumed to be importable here.


class MLPPolicy(torch.nn.Module):
    def __init__(self, num_inputs, action_space):
        super(MLPPolicy, self).__init__()

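        # Observation normalization filter, applied in forward() before either network.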
        self.obs_filter = ObsNorm((1, num_inputs), clip=5)
        self.action_space = action_space

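        # Actor: linear layers without built-in biases; AddBias modules supply
        # the biases as separate parameters.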
        self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.a_ab1 = AddBias(64)
        self.a_fc2 = nn.Linear(64, 64, bias=False)
        self.a_ab2 = AddBias(64)
        self.a_fc_mean = nn.Linear(64, action_space.shape[0], bias=False)
        self.a_ab_mean = AddBias(action_space.shape[0])
        self.a_ab_logstd = AddBias(action_space.shape[0])

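        # Critic: same structure, ending in a single state-value output.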
        self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.v_ab1 = AddBias(64)
        self.v_fc2 = nn.Linear(64, 64, bias=False)
        self.v_ab2 = AddBias(64)
        self.v_fc3 = nn.Linear(64, 1, bias=False)
        self.v_ab3 = AddBias(1)

        self.apply(weights_init_mlp)

        tanh_gain = nn.init.calculate_gain('tanh')
        #self.a_fc1.weight.data.mul_(tanh_gain)
        #self.a_fc2.weight.data.mul_(tanh_gain)
        self.a_fc_mean.weight.data.mul_(0.01)
        #self.v_fc1.weight.data.mul_(tanh_gain)
        #self.v_fc2.weight.data.mul_(tanh_gain)

        self.train()

    def cuda(self, **args):
        super(MLPPolicy, self).cuda(**args)
        self.obs_filter.cuda()

    def cpu(self, **args):
        super(MLPPolicy, self).cpu(**args)
        self.obs_filter.cpu()

    def forward(self, inputs):
        inputs.data = self.obs_filter(inputs.data)

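        # Critic branch: two tanh hidden layers followed by a scalar state value.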
        x = self.v_fc1(inputs)
        x = self.v_ab1(x)
        x = F.tanh(x)

        x = self.v_fc2(x)
        x = self.v_ab2(x)
        x = F.tanh(x)

        x = self.v_fc3(x)
        x = self.v_ab3(x)
        value = x

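        # Actor branch: two tanh hidden layers producing the action mean.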
        x = self.a_fc1(inputs)
        x = self.a_ab1(x)
        x = F.tanh(x)

        x = self.a_fc2(x)
        x = self.a_ab2(x)
        x = F.tanh(x)

        x = self.a_fc_mean(x)
        x = self.a_ab_mean(x)
        action_mean = x

        #  An ugly hack for my KFAC implementation.
        zeros = Variable(torch.zeros(x.size()), volatile=x.volatile)
        if x.is_cuda:
            zeros = zeros.cuda()

        x = self.a_ab_logstd(zeros)
        action_logstd = x

        return value, action_mean, action_logstd

    def act(self, inputs, deterministic=False):
        value, action_mean, action_logstd = self(inputs)

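        # Sample from the diagonal Gaussian N(action_mean, action_std) unless a
        # deterministic action is requested.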
        action_std = action_logstd.exp()

        noise = Variable(torch.randn(action_std.size()))
        if action_std.is_cuda:
            noise = noise.cuda()

        if deterministic is False:
            action = action_mean + action_std * noise
        else:
            action = action_mean
        return value, action

    def evaluate_actions(self, inputs, actions):
        assert inputs.dim() == 2, \
            "Expected inputs in (num_processes * num_steps) x ... format"

        value, action_mean, action_logstd = self(inputs)

        action_std = action_logstd.exp()

        # Log-density of a diagonal Gaussian, summed over action dimensions.
        action_log_probs = -0.5 * ((actions - action_mean) / action_std).pow(2) \
            - 0.5 * math.log(2 * math.pi) - action_logstd
        action_log_probs = action_log_probs.sum(1, keepdim=True)

        # Entropy of a diagonal Gaussian: 0.5 * log(2 * pi * e) + log(std) per dimension.
        dist_entropy = 0.5 + 0.5 * math.log(2 * math.pi) + action_logstd
        dist_entropy = dist_entropy.sum(-1).mean()

        return value, action_log_probs, dist_entropy
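A minimal usage sketch for this example (not part of the original snippet): it assumes the helpers above are importable, a continuous (Box) control task from gym such as Pendulum-v0, and the old-style PyTorch Variable API that the class itself uses.

import gym
import torch
from torch.autograd import Variable

env = gym.make("Pendulum-v0")  # any Box-action environment would do
policy = MLPPolicy(env.observation_space.shape[0], env.action_space)

obs = env.reset()
obs_v = Variable(torch.from_numpy(obs).float().unsqueeze(0))  # shape (1, num_inputs)
value, action = policy.act(obs_v, deterministic=False)
obs, reward, done, info = env.step(action.data.numpy()[0])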
Example 2
import torch.nn as nn
import torch.nn.functional as F

# FFPolicy, ObsNorm, AddBias, weights_init_mlp, Categorical and DiagGaussian
# are defined elsewhere in the same project and are assumed to be importable here.


class MLPPolicy(FFPolicy):
    def __init__(self, num_inputs, action_space):
        super(MLPPolicy, self).__init__()

        self.obs_filter = ObsNorm((1, num_inputs), clip=5)
        self.action_space = action_space

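        # Actor feature extractor: two 64-unit layers with biases factored out
        # into AddBias modules.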
        self.a_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.a_ab1 = AddBias(64)
        self.a_fc2 = nn.Linear(64, 64, bias=False)
        self.a_ab2 = AddBias(64)

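        # Critic: same structure plus a final scalar value layer.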
        self.v_fc1 = nn.Linear(num_inputs, 64, bias=False)
        self.v_ab1 = AddBias(64)
        self.v_fc2 = nn.Linear(64, 64, bias=False)
        self.v_ab2 = AddBias(64)
        self.v_fc3 = nn.Linear(64, 1, bias=False)
        self.v_ab3 = AddBias(1)

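        # Pick the output distribution head to match the action space:
        # Categorical for Discrete, DiagGaussian for continuous Box spaces.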
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(64, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(64, num_outputs)
        else:
            raise NotImplementedError

        self.apply(weights_init_mlp)

        tanh_gain = nn.init.calculate_gain('tanh')
        #self.a_fc1.weight.data.mul_(tanh_gain)
        #self.a_fc2.weight.data.mul_(tanh_gain)
        #self.v_fc1.weight.data.mul_(tanh_gain)
        #self.v_fc2.weight.data.mul_(tanh_gain)

        if action_space.__class__.__name__ == "Box":
            self.dist.fc_mean.weight.data.mul_(0.01)

        self.train()

    def cuda(self, **args):
        super(MLPPolicy, self).cuda(**args)
        self.obs_filter.cuda()

    def cpu(self, **args):
        super(MLPPolicy, self).cpu(**args)
        self.obs_filter.cpu()

    def forward(self, inputs):
        inputs.data = self.obs_filter(inputs.data)

        x = self.v_fc1(inputs)
        x = self.v_ab1(x)
        x = F.tanh(x)

        x = self.v_fc2(x)
        x = self.v_ab2(x)
        x = F.tanh(x)

        x = self.v_fc3(x)
        x = self.v_ab3(x)
        value = x

        x = self.a_fc1(inputs)
        x = self.a_ab1(x)
        x = F.tanh(x)

        x = self.a_fc2(x)
        x = self.a_ab2(x)
        x = F.tanh(x)

        return value, x
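In this variant forward() returns the critic value together with the actor's 64-dimensional features; the self.dist head is presumably applied to those features by act() and evaluate_actions() in the FFPolicy base class, which are not shown here. A minimal construction sketch (the environment choice and the import of the helpers are assumptions, not part of the snippet):

import gym
import torch
from torch.autograd import Variable

env = gym.make("CartPole-v1")  # Discrete action space -> Categorical head
policy = MLPPolicy(env.observation_space.shape[0], env.action_space)

obs_v = Variable(torch.from_numpy(env.reset()).float().unsqueeze(0))
value, actor_features = policy(obs_v)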