def get_loss(self):
    # Do not compute losses until enough episodes have been collected.
    if self.n_episodes <= self.episodes_before_train:
        return None

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(
        batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(
        one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # Actor loss: advantage-weighted policy gradient plus an entropy bonus.
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg

    # Critic loss: regress predicted values toward the observed returns.
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)

    combined_loss = {'actor_loss': actor_loss, 'critic_loss': critic_loss}
    return combined_loss
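
# get_loss() and the train() variants below rely on to_tensor_var,
# index_to_one_hot, and entropy, which are defined elsewhere in the repo.
# The stand-ins below are minimal sketches inferred from the call sites
# above; exact signatures, dtypes, and device handling are assumptions,
# not the repo's actual implementations.

import numpy as np
import torch as th


def to_tensor_var(x, use_cuda=True):
    # Convert an array-like to a float tensor, optionally moved to the GPU.
    tensor = th.as_tensor(np.asarray(x), dtype=th.float32)
    return tensor.cuda() if use_cuda else tensor


def index_to_one_hot(indices, dim):
    # Map integer action indices to one-hot rows of width `dim`.
    one_hot = np.zeros((len(indices), dim))
    one_hot[np.arange(len(indices)), np.asarray(indices, dtype=int)] = 1.0
    return one_hot


def entropy(probs):
    # Per-sample entropy of categorical distributions:
    # H(p) = -sum_i p_i * log(p_i) over the action dimension.
    return -th.sum(probs * th.log(probs + 1e-8), dim=-1)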
def train(self):
    # Do not train until enough episodes have been collected.
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(
        batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(
        one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # Update the actor network: advantage-weighted policy gradient
    # plus an entropy bonus.
    self.actor_optimizer.zero_grad()
    action_log_probs = self.actor(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)
    values = self.critic(states_var, actions_var)
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    actor_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.actor.parameters(), self.max_grad_norm)
    self.actor_optimizer.step()

    # Update the critic network: regress predicted values toward the
    # observed returns.
    self.critic_optimizer.zero_grad()
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    critic_loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.critic.parameters(), self.max_grad_norm)
    self.critic_optimizer.step()
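
# A self-contained sanity check of the actor-loss arithmetic used in train()
# above. With a single toy sample whose advantage is known, the policy-
# gradient loss reduces to -log pi(a|s) * advantage. All tensors here are
# illustrative values, not part of the repo's API.

import torch as th

log_probs = th.log(th.tensor([[0.7, 0.3]]))    # policy log-probabilities
one_hot_action = th.tensor([[1.0, 0.0]])       # the action taken (index 0)
reward = th.tensor([[1.5]])                    # observed return
value = th.tensor([[0.5]])                     # critic's value estimate

action_log_prob = th.sum(log_probs * one_hot_action, 1)  # log pi(a|s)
advantage = reward - value.detach()                      # A = r - V(s)
pg_loss = -th.mean(action_log_prob * advantage)

# Expected: -log(0.7) * (1.5 - 0.5) ~= 0.3567
assert th.isclose(pg_loss, -th.log(th.tensor(0.7)))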
def train(self):
    # Do not train until enough episodes have been collected.
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(
        batch.states, self.use_cuda).view(-1, self.n_agents, self.state_dim)
    actions_var = to_tensor_var(
        batch.actions, self.use_cuda).view(-1, self.n_agents, self.action_dim)
    rewards_var = to_tensor_var(
        batch.rewards, self.use_cuda).view(-1, self.n_agents, 1)
    # Joint views over all agents, used by the centralized critic.
    whole_states_var = states_var.view(-1, self.n_agents * self.state_dim)
    whole_actions_var = actions_var.view(-1, self.n_agents * self.action_dim)

    for agent_id in range(self.n_agents):
        # Update this agent's actor network.
        self.actor_optimizers[agent_id].zero_grad()
        action_log_probs = self.actors[agent_id](states_var[:, agent_id, :])
        entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
        action_log_probs = th.sum(
            action_log_probs * actions_var[:, agent_id, :], 1)
        if self.training_strategy == "cocurrent":
            # Each agent's critic sees only its own state and action.
            values = self.critics[agent_id](states_var[:, agent_id, :],
                                            actions_var[:, agent_id, :])
        elif self.training_strategy == "centralized":
            # The centralized critic sees the joint state and joint action.
            values = self.critics[agent_id](whole_states_var,
                                            whole_actions_var)
        advantages = rewards_var[:, agent_id, :] - values.detach()
        pg_loss = -th.mean(action_log_probs * advantages)
        actor_loss = pg_loss - entropy_loss * self.entropy_reg
        actor_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.actors[agent_id].parameters(),
                                     self.max_grad_norm)
        self.actor_optimizers[agent_id].step()

        # Update this agent's critic network.
        self.critic_optimizers[agent_id].zero_grad()
        target_values = rewards_var[:, agent_id, :]
        if self.critic_loss == "huber":
            critic_loss = nn.functional.smooth_l1_loss(values, target_values)
        else:
            critic_loss = nn.MSELoss()(values, target_values)
        critic_loss.backward()
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.critics[agent_id].parameters(),
                                     self.max_grad_norm)
        self.critic_optimizers[agent_id].step()
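
# Shape walkthrough for the two critic strategies in the multi-agent train()
# above, with toy dimensions. The "cocurrent" (per-agent) critic consumes a
# single agent's slice, while the centralized critic consumes the
# concatenation across all agents. The sizes below are illustrative only.

import torch as th

batch, n_agents, state_dim, action_dim = 32, 3, 5, 2
states_var = th.randn(batch, n_agents, state_dim)
actions_var = th.randn(batch, n_agents, action_dim)

# Per-agent critic input: one agent's slice.
agent_states = states_var[:, 0, :]      # shape (32, 5)
agent_actions = actions_var[:, 0, :]    # shape (32, 2)

# Centralized critic input: all agents flattened together.
whole_states = states_var.view(-1, n_agents * state_dim)      # (32, 15)
whole_actions = actions_var.view(-1, n_agents * action_dim)   # (32, 6)

assert agent_states.shape == (batch, state_dim)
assert whole_states.shape == (batch, n_agents * state_dim)
assert whole_actions.shape == (batch, n_agents * action_dim)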
def train(self):
    # Do not train until enough episodes have been collected.
    if self.n_episodes <= self.episodes_before_train:
        return

    batch = self.memory.sample(self.batch_size)
    states_var = to_tensor_var(
        batch.states, self.use_cuda).view(-1, self.state_dim)
    one_hot_actions = index_to_one_hot(batch.actions, self.action_dim)
    actions_var = to_tensor_var(
        one_hot_actions, self.use_cuda).view(-1, self.action_dim)
    rewards_var = to_tensor_var(batch.rewards, self.use_cuda).view(-1, 1)

    # Shared actor-critic forward pass.
    action_log_probs, values = self.actor_critic(states_var)
    entropy_loss = th.mean(entropy(th.exp(action_log_probs)))
    action_log_probs = th.sum(action_log_probs * actions_var, 1)

    # Fisher loss: every Ts steps, accumulate K-FAC curvature statistics
    # from the policy head and a noise-perturbed value head.
    if self.optimizer.steps % self.optimizer.Ts == 0:
        self.actor_critic.zero_grad()
        pg_fisher_loss = th.mean(action_log_probs)
        values_noise = to_tensor_var(
            np.random.randn(values.size()[0]), self.use_cuda)
        sample_values = (values + values_noise.view(-1, 1)).detach()
        if self.critic_loss == "huber":
            vf_fisher_loss = -nn.functional.smooth_l1_loss(
                values, sample_values)
        else:
            vf_fisher_loss = -nn.MSELoss()(values, sample_values)
        joint_fisher_loss = (pg_fisher_loss +
                             self.vf_fisher_coef * vf_fisher_loss)
        self.optimizer.acc_stats = True
        joint_fisher_loss.backward(retain_graph=True)
        self.optimizer.acc_stats = False

    self.optimizer.zero_grad()
    # Actor loss: advantage-weighted policy gradient plus an entropy bonus.
    advantages = rewards_var - values.detach()
    pg_loss = -th.mean(action_log_probs * advantages)
    actor_loss = pg_loss - entropy_loss * self.entropy_reg
    # Critic loss: regress predicted values toward the observed returns.
    target_values = rewards_var
    if self.critic_loss == "huber":
        critic_loss = nn.functional.smooth_l1_loss(values, target_values)
    else:
        critic_loss = nn.MSELoss()(values, target_values)
    loss = actor_loss + critic_loss
    loss.backward()
    if self.max_grad_norm is not None:
        nn.utils.clip_grad_norm_(self.actor_critic.parameters(),
                                 self.max_grad_norm)
    self.optimizer.step()
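
# The Fisher block in train() above treats the value head as a Gaussian and
# regresses predictions toward noise-perturbed copies of themselves, so the
# resulting gradients carry the curvature statistics K-FAC accumulates. The
# toy check below verifies the underlying identity: the gradient of
# MSE(values, (values + eps).detach()) w.r.t. values is 2 * (values -
# sample) / N. The optimizer attributes used above (steps, Ts, acc_stats)
# belong to the repo's K-FAC optimizer and are assumed, not redefined here.

import torch as th
import torch.nn as nn

values = th.randn(4, 1, requires_grad=True)
noise = th.randn(4, 1)
sample_values = (values + noise).detach()

loss = nn.MSELoss()(values, sample_values)
loss.backward()

# MSELoss averages over all 4 elements, so d(loss)/d(values) = 2*(v - s)/4.
assert th.allclose(values.grad, 2 * (values - sample_values).detach() / 4)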