class CNNPolicy(nn.Module):
    """Atari-style actor-critic CNN policy.

    Encodes stacked image observations (pixel values in [0, 255]) through
    three conv layers and one linear layer into a shared 512-d feature,
    which feeds a scalar critic head and an action-distribution head.
    """

    def __init__(self, num_inputs, action_space):
        """
        Args:
            num_inputs: number of input image channels (stacked frames).
            action_space: gym-style space. ``Discrete`` -> Categorical head,
                ``Box`` -> DiagGaussian head. Anything else is treated as a
                bare action count and handed straight to ``Categorical``.
        """
        super(CNNPolicy, self).__init__()
        # Standard DQN conv trunk: 84x84 input -> 7x7x32 feature map.
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.act_func = F.leaky_relu

        self.linear1 = nn.Linear(32 * 7 * 7, 512)

        self.critic_linear = nn.Linear(512, 1)

        # Compare by class name to avoid importing gym just for isinstance.
        if action_space.__class__.__name__ == "Discrete":
            self.dist = Categorical(512, action_space.n)
        elif action_space.__class__.__name__ == "Box":
            self.dist = DiagGaussian(512, action_space.shape[0])
        else:
            # Fallback: assume a plain integer number of discrete actions.
            self.dist = Categorical(512, action_space)

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize all weights: base init plus gain rescaling."""
        self.apply(weights_init)

        # NOTE(review): gain is computed for 'relu' but the activation is
        # leaky_relu -- kept as-is to preserve the original initialization.
        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)

        if self.dist.__class__.__name__ == "DiagGaussian":
            # Small initial action means keep early policies near zero.
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Map raw [0, 255] observations to 512-d pre-activation features."""
        x = self.act_func(self.conv1(inputs / 255.0))
        x = self.act_func(self.conv2(x))
        x = self.act_func(self.conv3(x))
        x = x.view(-1, 32 * 7 * 7)
        return self.linear1(x)

    def predict_for_action(self, inputs):
        """Activate the shared feature for the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Activate the shared feature and return the scalar value estimate."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return (value, action-feature) for a batch of observations."""
        x = self.encode(inputs)
        return self.predict_for_value(x), self.predict_for_action(x)

    def action_dist(self, inputs):
        """Return action probabilities for a batch of observations."""
        x = self.encode(inputs)
        return self.dist.action_probs(self.predict_for_action(x))

    def action_logdist(self, inputs):
        """Return action log-probabilities for a batch of observations."""
        x = self.encode(inputs)
        return self.dist.action_logprobs(self.predict_for_action(x))

    def act(self, inputs, deterministic=False):
        """Sample an action; return (value, action, log_prob, entropy)."""
        value, x_action = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            x_action, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy
# Example #2 -- second pasted variant of CNNPolicy (scraper separator)
class CNNPolicy(nn.Module):
    """Actor-critic CNN policy with a discrete (Categorical) action head.

    Takes a plain integer action count rather than a gym space, and does
    not rescale pixel inputs in ``encode``.
    """

    def __init__(self, num_inputs, action_size):
        """
        Args:
            num_inputs: number of input image channels.
            action_size: number of discrete actions.
        """
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.act_func = F.leaky_relu

        # Flattened conv-output size depends on the observation resolution;
        # 6 input channels marks the larger-resolution variant.
        # NOTE(review): magic numbers inherited from the original -- confirm
        # against the actual environment observation shapes.
        if num_inputs == 6:
            self.intermediate_size = 11264
        else:
            self.intermediate_size = 32 * 7 * 7

        self.linear1 = nn.Linear(self.intermediate_size, 512)

        self.critic_linear = nn.Linear(512, 1)

        self.dist = Categorical(512, action_size)

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Re-initialize all weights: base init plus gain rescaling."""
        self.apply(weights_init)

        # NOTE(review): gain is computed for 'relu' but the activation is
        # leaky_relu -- kept as-is to preserve the original initialization.
        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)

        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Map raw observations (no rescaling) to 512-d features."""
        x = self.act_func(self.conv1(inputs))
        x = self.act_func(self.conv2(x))
        x = self.act_func(self.conv3(x))
        x = x.view(-1, self.intermediate_size)
        return self.linear1(x)

    def predict_for_action(self, inputs):
        """Activate the shared feature for the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Activate the shared feature and return the scalar value estimate."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return (value, action-feature) for a batch of observations."""
        x = self.encode(inputs)
        return self.predict_for_value(x), self.predict_for_action(x)

    def action_dist(self, inputs):
        """Return action probabilities for a batch of observations."""
        x = self.encode(inputs)
        return self.dist.action_probs(self.predict_for_action(x))

    def action_logdist(self, inputs):
        """Return action log-probabilities for a batch of observations."""
        x = self.encode(inputs)
        return self.dist.action_logprobs(self.predict_for_action(x))

    def act(self, inputs, deterministic=False):
        """Sample an action; return (value, action, log_prob, entropy)."""
        value, x_action = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            x_action, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy
class CNNPolicy(nn.Module):
    """Convolutional actor-critic network.

    A three-conv trunk plus one linear projection produces a shared 512-d
    feature; separate heads yield a state-value estimate and the input to
    an action distribution.  Observations are consumed as-is (no /255
    rescaling in this variant).
    """

    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.act_func = F.leaky_relu

        self.linear1 = nn.Linear(32 * 7 * 7, 512)
        self.critic_linear = nn.Linear(512, 1)

        space_kind = action_space.__class__.__name__
        if space_kind == "Discrete":
            self.dist = Categorical(512, action_space.n)
        elif space_kind == "Box":
            self.dist = DiagGaussian(512, action_space.shape[0])
        else:
            # Anything else is assumed to be a raw action count.
            self.dist = Categorical(512, action_space)

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Apply the project-wide init, then rescale weights by the relu gain."""
        self.apply(weights_init)

        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3, self.linear1):
            layer.weight.data.mul_(gain)

        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Run the conv trunk and projection; returns pre-activation features."""
        x = inputs
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.act_func(conv(x))
        return self.linear1(x.view(-1, 32 * 7 * 7))

    def predict_for_action(self, inputs):
        """Feature activation feeding the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Feature activation followed by the critic's linear head."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return (value, action-feature) for a batch of observations."""
        features = self.encode(inputs)
        return self.predict_for_value(features), self.predict_for_action(features)

    def action_dist(self, inputs):
        """Action probabilities for a batch of observations."""
        features = self.encode(inputs)
        return self.dist.action_probs(self.predict_for_action(features))

    def action_logdist(self, inputs):
        """Action log-probabilities for a batch of observations."""
        features = self.encode(inputs)
        return self.dist.action_logprobs(self.predict_for_action(features))

    def act(self, inputs, deterministic=False):
        """Sample (or argmax) an action; returns value, action, log-prob, entropy."""
        value, action_feat = self(inputs)
        action, log_probs, entropy = self.dist.sample2(
            action_feat, deterministic=deterministic)
        return value, action, log_probs, entropy
class CNNPolicy(nn.Module):
    """Convolutional actor-critic network with a fixed Categorical head.

    Takes an integer action count directly; the flattened conv size is
    chosen from the channel count, and observations are consumed as-is
    (no /255 rescaling in this variant).
    """

    def __init__(self, num_inputs, action_size):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)

        self.act_func = F.leaky_relu

        # Flattened conv-output size: 6-channel observations use the
        # larger-resolution variant.
        self.intermediate_size = 11264 if num_inputs == 6 else 32 * 7 * 7

        self.linear1 = nn.Linear(self.intermediate_size, 512)
        self.critic_linear = nn.Linear(512, 1)

        self.dist = Categorical(512, action_size)

        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Apply the project-wide init, then rescale weights by the relu gain."""
        self.apply(weights_init)

        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3, self.linear1):
            layer.weight.data.mul_(gain)

        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Run the conv trunk and projection; returns pre-activation features."""
        x = inputs
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.act_func(conv(x))
        return self.linear1(x.view(-1, self.intermediate_size))

    def predict_for_action(self, inputs):
        """Feature activation feeding the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Feature activation followed by the critic's linear head."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return (value, action-feature) for a batch of observations."""
        features = self.encode(inputs)
        return self.predict_for_value(features), self.predict_for_action(features)

    def action_dist(self, inputs):
        """Action probabilities for a batch of observations."""
        features = self.encode(inputs)
        return self.dist.action_probs(self.predict_for_action(features))

    def action_logdist(self, inputs):
        """Action log-probabilities for a batch of observations."""
        features = self.encode(inputs)
        return self.dist.action_logprobs(self.predict_for_action(features))

    def act(self, inputs, deterministic=False):
        """Sample (or argmax) an action; returns value, action, log-prob, entropy."""
        value, action_feat = self(inputs)
        action, log_probs, entropy = self.dist.sample2(
            action_feat, deterministic=deterministic)
        return value, action, log_probs, entropy