class CNNPolicy(nn.Module):
    """Actor-critic CNN policy (Nature-DQN style trunk).

    Three conv layers feed a 512-unit linear layer, which feeds two heads:
    a scalar value head (``critic_linear``) and an action-distribution
    head (``self.dist``).  ``encode`` rescales pixel inputs by 1/255, so
    callers pass raw uint8-range image tensors.  The 32*7*7 flatten size
    assumes 84x84 inputs -- TODO confirm against the caller.

    Args:
        num_inputs: number of input image channels.
        action_space: gym space -- ``Discrete`` selects a Categorical
            head, ``Box`` a DiagGaussian head; anything else is treated
            as a plain int giving the number of discrete actions.
    """

    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        # Shared nonlinearity for trunk and heads.
        # Alternatives previously tried: tanh, elu, relu, softplus.
        self.act_func = F.leaky_relu
        self.linear1 = nn.Linear(32 * 7 * 7, 512)
        self.critic_linear = nn.Linear(512, 1)
        # Dispatch on the class *name* so gym need not be imported here.
        if action_space.__class__.__name__ == "Discrete":
            num_outputs = action_space.n
            self.dist = Categorical(512, num_outputs)
        elif action_space.__class__.__name__ == "Box":
            num_outputs = action_space.shape[0]
            self.dist = DiagGaussian(512, num_outputs)
        else:
            # Fallback: interpret action_space itself as the action count.
            self.dist = Categorical(512, action_space)
        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Apply ``weights_init`` everywhere, then scale trunk weights by the relu gain."""
        self.apply(weights_init)
        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)
        # Small initial mean for a continuous (Gaussian) action head.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Conv trunk; returns the *pre-activation* 512-d linear1 features."""
        x = self.conv1(inputs / 255.0)
        x = self.act_func(x)
        x = self.conv2(x)
        x = self.act_func(x)
        x = self.conv3(x)
        x = self.act_func(x)
        x = x.view(-1, 32 * 7 * 7)
        x = self.linear1(x)
        return x

    def predict_for_action(self, inputs):
        """Activate encoded features for the action head."""
        for_action = self.act_func(inputs)
        return for_action

    def predict_for_value(self, inputs):
        """Activate encoded features and produce the scalar state value."""
        x = self.act_func(inputs)
        for_value = self.critic_linear(x)
        return for_value

    def forward(self, inputs):
        """Return ``(value, actor_features)`` for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        for_value = self.predict_for_value(x)
        return for_value, for_action

    def action_dist(self, inputs):
        """Return action probabilities for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        dist = self.dist.action_probs(for_action)
        return dist

    def action_logdist(self, inputs):
        """Return action log-probabilities for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        dist = self.dist.action_logprobs(for_action)
        return dist

    def act(self, inputs, deterministic=False):
        """Sample an action; returns (value, action, action_log_probs, dist_entropy)."""
        value, x_action = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            x_action, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy
class CNNPolicy(nn.Module):
    """Actor-critic CNN policy with a discrete (Categorical) action head.

    Conv trunk -> 512-unit linear layer -> scalar value head and
    Categorical action head.  ``encode`` returns *pre-activation* linear
    features; each head applies the shared nonlinearity itself.  Inputs
    are NOT rescaled here (no /255) -- callers are expected to pass
    already-scaled tensors.

    Args:
        num_inputs: number of input image channels.
        action_size: number of discrete actions.
    """

    def __init__(self, num_inputs, action_size):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        # Shared nonlinearity for trunk and heads.
        # Alternatives previously tried: tanh, elu, relu, softplus.
        self.act_func = F.leaky_relu
        # Flattened conv-output size.  11264 is the hard-coded size used
        # with 6-channel inputs (presumably a different input resolution
        # -- TODO confirm against the caller); 32*7*7 matches 84x84 inputs.
        if num_inputs == 6:
            self.intermediate_size = 11264
        else:
            self.intermediate_size = 32 * 7 * 7
        self.linear1 = nn.Linear(self.intermediate_size, 512)
        self.critic_linear = nn.Linear(512, 1)
        num_outputs = action_size
        self.dist = Categorical(512, num_outputs)
        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Apply ``weights_init`` everywhere, then scale trunk weights by the relu gain."""
        self.apply(weights_init)
        relu_gain = nn.init.calculate_gain('relu')
        self.conv1.weight.data.mul_(relu_gain)
        self.conv2.weight.data.mul_(relu_gain)
        self.conv3.weight.data.mul_(relu_gain)
        self.linear1.weight.data.mul_(relu_gain)
        # Small initial mean if a continuous (Gaussian) head is ever used.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Conv trunk; returns the *pre-activation* 512-d linear1 features."""
        x = self.conv1(inputs)
        x = self.act_func(x)
        x = self.conv2(x)
        x = self.act_func(x)
        x = self.conv3(x)
        x = self.act_func(x)
        x = x.view(-1, self.intermediate_size)
        x = self.linear1(x)
        return x

    def predict_for_action(self, inputs):
        """Activate encoded features for the action head."""
        for_action = self.act_func(inputs)
        return for_action

    def predict_for_value(self, inputs):
        """Activate encoded features and produce the scalar state value."""
        x = self.act_func(inputs)
        for_value = self.critic_linear(x)
        return for_value

    def forward(self, inputs):
        """Return ``(value, actor_features)`` for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        for_value = self.predict_for_value(x)
        return for_value, for_action

    def action_dist(self, inputs):
        """Return action probabilities for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        return self.dist.action_probs(for_action)

    def action_logdist(self, inputs):
        """Return action log-probabilities for a batch of observations."""
        x = self.encode(inputs)
        for_action = self.predict_for_action(x)
        dist = self.dist.action_logprobs(for_action)
        return dist

    def act(self, inputs, deterministic=False):
        """Sample an action; returns (value, action, action_log_probs, dist_entropy)."""
        value, x_action = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            x_action, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy
class CNNPolicy(nn.Module):
    """Convolutional actor-critic policy.

    A three-layer conv trunk plus a 512-unit linear layer feeds two
    heads: ``critic_linear`` (scalar value) and ``self.dist`` (action
    distribution).  ``encode`` yields pre-activation features; both heads
    apply the shared nonlinearity before their own layers.  Note: inputs
    are used as-is, with no /255 rescaling.
    """

    def __init__(self, num_inputs, action_space):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        # One nonlinearity shared by the trunk and both heads.
        self.act_func = F.leaky_relu
        self.linear1 = nn.Linear(32 * 7 * 7, 512)
        self.critic_linear = nn.Linear(512, 1)
        # Compare by class name so this module works without importing gym.
        space_kind = action_space.__class__.__name__
        if space_kind == "Discrete":
            self.dist = Categorical(512, action_space.n)
        elif space_kind == "Box":
            self.dist = DiagGaussian(512, action_space.shape[0])
        else:
            # Fallback: action_space is assumed to be an action count.
            self.dist = Categorical(512, action_space)
        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Run ``weights_init`` over all submodules and rescale trunk weights."""
        self.apply(weights_init)
        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3, self.linear1):
            layer.weight.data.mul_(gain)
        # Keep the initial Gaussian mean small for continuous actions.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Conv trunk; returns pre-activation 512-d features."""
        h = self.act_func(self.conv1(inputs))
        h = self.act_func(self.conv2(h))
        h = self.act_func(self.conv3(h))
        return self.linear1(h.view(-1, 32 * 7 * 7))

    def predict_for_action(self, inputs):
        """Activated features for the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Scalar state-value estimate from encoded features."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return ``(value, actor_features)`` for a batch of observations."""
        features = self.encode(inputs)
        return self.predict_for_value(features), self.predict_for_action(features)

    def action_dist(self, inputs):
        """Action probabilities for a batch of observations."""
        actor_features = self.predict_for_action(self.encode(inputs))
        return self.dist.action_probs(actor_features)

    def action_logdist(self, inputs):
        """Action log-probabilities for a batch of observations."""
        actor_features = self.predict_for_action(self.encode(inputs))
        return self.dist.action_logprobs(actor_features)

    def act(self, inputs, deterministic=False):
        """Sample an action; returns (value, action, action_log_probs, dist_entropy)."""
        value, actor_features = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            actor_features, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy
class CNNPolicy(nn.Module):
    """Convolutional actor-critic policy with a Categorical action head.

    Conv trunk -> 512-unit linear -> value head and Categorical head.
    ``encode`` produces pre-activation features; each head applies the
    shared nonlinearity itself.  No input rescaling is performed here.
    """

    def __init__(self, num_inputs, action_size):
        super(CNNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(num_inputs, 32, 8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, 4, stride=2)
        self.conv3 = nn.Conv2d(64, 32, 3, stride=1)
        # One nonlinearity shared by the trunk and both heads.
        self.act_func = F.leaky_relu
        # Flattened conv-output size: 11264 for 6-channel inputs
        # (presumably a different input resolution -- TODO confirm),
        # otherwise 32*7*7 (84x84 inputs).
        self.intermediate_size = 11264 if num_inputs == 6 else 32 * 7 * 7
        self.linear1 = nn.Linear(self.intermediate_size, 512)
        self.critic_linear = nn.Linear(512, 1)
        self.dist = Categorical(512, action_size)
        self.train()
        self.reset_parameters()

    def reset_parameters(self):
        """Run ``weights_init`` over all submodules and rescale trunk weights."""
        self.apply(weights_init)
        gain = nn.init.calculate_gain('relu')
        for layer in (self.conv1, self.conv2, self.conv3, self.linear1):
            layer.weight.data.mul_(gain)
        # Keep the initial Gaussian mean small if a continuous head is used.
        if self.dist.__class__.__name__ == "DiagGaussian":
            self.dist.fc_mean.weight.data.mul_(0.01)

    def encode(self, inputs):
        """Conv trunk; returns pre-activation 512-d features."""
        h = self.act_func(self.conv1(inputs))
        h = self.act_func(self.conv2(h))
        h = self.act_func(self.conv3(h))
        return self.linear1(h.view(-1, self.intermediate_size))

    def predict_for_action(self, inputs):
        """Activated features for the action head."""
        return self.act_func(inputs)

    def predict_for_value(self, inputs):
        """Scalar state-value estimate from encoded features."""
        return self.critic_linear(self.act_func(inputs))

    def forward(self, inputs):
        """Return ``(value, actor_features)`` for a batch of observations."""
        features = self.encode(inputs)
        return self.predict_for_value(features), self.predict_for_action(features)

    def action_dist(self, inputs):
        """Action probabilities for a batch of observations."""
        actor_features = self.predict_for_action(self.encode(inputs))
        return self.dist.action_probs(actor_features)

    def action_logdist(self, inputs):
        """Action log-probabilities for a batch of observations."""
        actor_features = self.predict_for_action(self.encode(inputs))
        return self.dist.action_logprobs(actor_features)

    def act(self, inputs, deterministic=False):
        """Sample an action; returns (value, action, action_log_probs, dist_entropy)."""
        value, actor_features = self(inputs)
        action, action_log_probs, dist_entropy = self.dist.sample2(
            actor_features, deterministic=deterministic)
        return value, action, action_log_probs, dist_entropy