def _optimize_td_loss1(self):
    if self._step < self.batch_size or self._step < self.initial_memory_threshold:
        return
    # Sample a batch from replay memory
    states, actions, rewards, next_states, terminals = self.replay_memory.sample(
        self.batch_size, random_machine=self.np_random)
    states = torch.from_numpy(states).to(self.device)  # convert numpy arrays to torch tensors
    actions_combined = torch.from_numpy(actions).to(
        self.device)  # make sure to separate actions and parameters
    d_actions = actions_combined[:, 0].long()
    actions = actions_combined[:, 1:4]
    action_parameters = actions_combined[:, 4:8]
    rewards = torch.from_numpy(rewards).to(self.device).squeeze()
    next_states = torch.from_numpy(next_states).to(self.device)
    terminals = torch.from_numpy(terminals).to(self.device).squeeze()

    # The actor network produces the discrete action, the actor_param network produces the
    # continuous action-parameters, and actor_param_critic is the critic for actor_param.
    # ---------------------- optimize Q-network, high level (actor) ----------------------
    # The actor is the upper-level Q-network.
    with torch.no_grad():
        pred_Q_a = self.actor_target(next_states)
        Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
        # Compute the TD error
        target = rewards + (1 - terminals) * self.gamma * Qprime

    # Compute current Q-values using policy network
    q_values = self.actor(states)
    y_predicted = q_values.gather(1, d_actions.view(-1, 1)).squeeze()
    y_expected = target
    loss_Q = self.loss_func(y_predicted, y_expected)

    self.actor_optimiser.zero_grad()
    loss_Q.backward()
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
    self.actor_optimiser.step()

    soft_update_target_network(self.actor, self.actor_target, self.tau_actor)
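
# NOTE: `soft_update_target_network` is used by the update methods in this file but is not
# defined here. The function below is a minimal sketch of the usual Polyak-averaging update,
# under the assumption that the helper takes (source_network, target_network, tau); the
# actual helper in this codebase may differ.
def _soft_update_target_network_sketch(source_network, target_network, tau):
    # target <- tau * source + (1 - tau) * target, applied parameter-wise
    for target_param, source_param in zip(target_network.parameters(),
                                          source_network.parameters()):
        target_param.data.copy_(tau * source_param.data + (1.0 - tau) * target_param.data)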
def _optimize_td_loss(self):
    if self._step < self.batch_size or self._step < self.initial_memory_threshold:
        return
    # Sample a batch from replay memory
    states, actions, rewards, next_states, terminals = self.replay_memory.sample(
        self.batch_size, random_machine=self.np_random)
    states = torch.from_numpy(states).to(self.device)
    actions_combined = torch.from_numpy(actions).to(
        self.device)  # make sure to separate actions and parameters
    actions = actions_combined[:, 0].long()
    action_parameters = actions_combined[:, 1:]
    rewards = torch.from_numpy(rewards).to(self.device).squeeze()
    next_states = torch.from_numpy(next_states).to(self.device)
    terminals = torch.from_numpy(terminals).to(self.device).squeeze()

    # ---------------------- optimize Q-network ----------------------
    with torch.no_grad():
        pred_next_action_parameters = self.actor_param_target.forward(next_states)
        pred_Q_a = self.actor_target(next_states, pred_next_action_parameters)
        Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
        # Compute the TD error
        target = rewards + (1 - terminals) * self.gamma * Qprime

    # Compute current Q-values using policy network
    q_values = self.actor(states, action_parameters)
    y_predicted = q_values.gather(1, actions.view(-1, 1)).squeeze()
    y_expected = target
    loss_Q = self.loss_func(y_predicted, y_expected)

    self.actor_optimiser.zero_grad()
    loss_Q.backward()
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
    self.actor_optimiser.step()

    # ---------------------- optimize actor ----------------------
    with torch.no_grad():
        action_params = self.actor_param(states)
    action_params.requires_grad = True
    assert (self.weighted ^ self.average ^ self.random_weighted) or \
        not (self.weighted or self.average or self.random_weighted)
    Q = self.actor(states, action_params)
    Q_val = Q
    if self.weighted:
        # approximate categorical probability density (i.e. counting)
        counts = Counter(actions.cpu().numpy())
        weights = torch.from_numpy(
            np.array([counts[a] / actions.shape[0] for a in range(self.num_actions)])
        ).float().to(self.device)
        Q_val = weights * Q
    elif self.average:
        Q_val = Q / self.num_actions
    elif self.random_weighted:
        weights = np.random.uniform(0, 1., self.num_actions)
        weights /= np.linalg.norm(weights)
        weights = torch.from_numpy(weights).float().to(self.device)
        Q_val = weights * Q
    if self.indexed:
        Q_indexed = Q_val.gather(1, actions.unsqueeze(1))
        Q_loss = torch.mean(Q_indexed)
    else:
        Q_loss = torch.mean(torch.sum(Q_val, 1))

    self.actor.zero_grad()
    Q_loss.backward()
    from copy import deepcopy
    delta_a = deepcopy(action_params.grad.data)
    # step 2
    action_params = self.actor_param(Variable(states))
    delta_a[:] = self._invert_gradients(delta_a,
                                        action_params,
                                        grad_type="action_parameters",
                                        inplace=True)
    if self.zero_index_gradients:
        delta_a[:] = self._zero_index_gradients(
            delta_a, batch_action_indices=actions, inplace=True)

    out = -torch.mul(delta_a, action_params)
    self.actor_param.zero_grad()
    out.backward(torch.ones(out.shape).to(self.device))
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor_param.parameters(), self.clip_grad)
    self.actor_param_optimiser.step()

    soft_update_target_network(self.actor, self.actor_target, self.tau_actor)
    soft_update_target_network(self.actor_param, self.actor_param_target, self.tau_actor_param)
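
# NOTE: `_invert_gradients` follows the inverting-gradients rule of Hausknecht & Stone (2016):
# a gradient that would push a bounded action-parameter towards its upper bound is scaled by
# the remaining headroom (p_max - p), and one pushing towards the lower bound by (p - p_min),
# both normalised by the range. The standalone function below is a sketch of that behaviour
# with explicit per-dimension bounds passed in as arguments; the method on this class may
# store and apply its bounds differently.
def _invert_gradients_sketch(grad, params, p_min, p_max):
    rnge = p_max - p_min
    increasing = (grad > 0).float()  # 1 where the gradient would increase the parameter
    return increasing * grad * (p_max - params) / rnge \
        + (1.0 - increasing) * grad * (params - p_min) / rnge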
def _optimize_td_loss(self):
    if self.replay_memory.nb_entries < self.batch_size or \
            self.replay_memory.nb_entries < self.initial_memory_threshold:
        return
    # Sample a batch from replay memory
    if self.n_step_returns:
        states, actions, rewards, next_states, terminals, n_step_returns = self.replay_memory.sample(
            self.batch_size, random_machine=self.np_random)
    else:
        states, actions, rewards, next_states, terminals = self.replay_memory.sample(
            self.batch_size, random_machine=self.np_random)
        n_step_returns = None

    states = torch.from_numpy(states).to(device)
    actions_combined = torch.from_numpy(actions).to(
        device)  # make sure to separate actions and action-parameters
    actions = actions_combined[:, 1:self.num_actions + 1]
    action_parameters = actions_combined[:, self.num_actions + 1:]
    rewards = torch.from_numpy(rewards).to(device)
    next_states = torch.from_numpy(next_states).to(device)
    terminals = torch.from_numpy(terminals).to(device)
    if self.n_step_returns:
        n_step_returns = torch.from_numpy(n_step_returns).to(device)

    # ---------------------- optimize critic ----------------------
    with torch.no_grad():
        pred_next_actions, pred_next_action_parameters = self.actor_target.forward(next_states)
        off_policy_next_val = self.critic_target.forward(
            next_states, pred_next_actions, pred_next_action_parameters)
        off_policy_target = rewards + (1 - terminals) * self.gamma * off_policy_next_val
        if self.n_step_returns:
            on_policy_target = n_step_returns
            target = self.beta * on_policy_target + (1. - self.beta) * off_policy_target
        else:
            target = off_policy_target

    y_expected = target
    y_predicted = self.critic.forward(states, actions, action_parameters)
    loss_critic = self.loss_func(y_predicted, y_expected)

    self.critic_optimiser.zero_grad()
    loss_critic.backward()
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), self.clip_grad)
    self.critic_optimiser.step()

    # ---------------------- optimise actor ----------------------
    # 1 - calculate gradients from critic
    with torch.no_grad():
        actions, action_params = self.actor(states)
        action_params = torch.cat((actions, action_params), dim=1)
    action_params.requires_grad = True
    Q_val = self.critic(states, action_params[:, :self.num_actions],
                        action_params[:, self.num_actions:]).mean()
    self.critic.zero_grad()
    Q_val.backward()
    from copy import deepcopy
    delta_a = deepcopy(action_params.grad.data)

    # 2 - apply inverting gradients and combine with gradients from actor
    actions, action_params = self.actor(Variable(states))
    action_params = torch.cat((actions, action_params), dim=1)
    delta_a[:, self.num_actions:] = self._invert_gradients(
        delta_a[:, self.num_actions:].cpu(),
        action_params[:, self.num_actions:].cpu(),
        grad_type="action_parameters",
        inplace=True)
    delta_a[:, :self.num_actions] = self._invert_gradients(
        delta_a[:, :self.num_actions].cpu(),
        action_params[:, :self.num_actions].cpu(),
        grad_type="actions",
        inplace=True)
    out = -torch.mul(delta_a, action_params)
    self.actor.zero_grad()
    out.backward(torch.ones(out.shape).to(device))
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
    self.actor_optimiser.step()

    soft_update_target_network(self.actor, self.actor_target, self.tau_actor)
    soft_update_target_network(self.critic, self.critic_target, self.tau_critic)
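
# NOTE: the critic target above mixes an off-policy one-step bootstrap with on-policy n-step
# returns: target = beta * G_n + (1 - beta) * (r + gamma * (1 - done) * Q'(s', a')). The
# standalone function below just restates that mixing in isolation; the argument names and
# any beta/gamma values passed to it are illustrative assumptions, not values taken from
# this codebase.
def _mixed_critic_target_sketch(rewards, terminals, next_q, n_step_returns, gamma, beta):
    off_policy_target = rewards + (1 - terminals) * gamma * next_q   # one-step bootstrap
    return beta * n_step_returns + (1.0 - beta) * off_policy_target  # beta-weighted mix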
def _optimize_td_loss(self):
    if self._step < self.batch_size or self._step < self.initial_memory_threshold:
        return
    # Sample a batch from replay memory
    states, actions, rewards, next_states, terminals = self.replay_memory.sample(
        self.batch_size, random_machine=self.np_random)
    states = torch.from_numpy(states).to(self.device)  # convert numpy arrays to torch tensors
    actions_combined = torch.from_numpy(actions).to(
        self.device)  # make sure to separate actions and parameters
    d_actions = actions_combined[:, 0].long()
    actions = actions_combined[:, 1:4]
    action_parameters = actions_combined[:, 4:8]
    rewards = torch.from_numpy(rewards).to(self.device).squeeze()
    next_states = torch.from_numpy(next_states).to(self.device)
    terminals = torch.from_numpy(terminals).to(self.device).squeeze()

    # The actor network produces the discrete action, the actor_param network produces the
    # continuous action-parameters, and actor_param_critic is the critic for actor_param.
    # ---------------------- optimize Q-network, high level (actor) ----------------------
    # The actor is the upper-level Q-network.
    with torch.no_grad():
        pred_Q_a = self.actor_target(next_states)
        Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
        # Compute the TD error
        target = rewards + (1 - terminals) * self.gamma * Qprime

    # Compute current Q-values using policy network
    q_values = self.actor(states)
    y_predicted = q_values.gather(1, d_actions.view(-1, 1)).squeeze()
    y_expected = target
    loss_Q = self.loss_func(y_predicted, y_expected)

    self.actor_optimiser.zero_grad()
    loss_Q.backward()
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), self.clip_grad)
    self.actor_optimiser.step()

    # ---------------------- the lower level can be viewed as a DDPG agent ----------------------
    # ---------------------- optimize actor_param (low level, continuous action-parameter network) ----------------------
    with torch.no_grad():
        pred_action = self.actor.forward(states)
        action_params = self.actor_param(states, pred_action)
    action_params.requires_grad = True
    assert (self.weighted ^ self.average ^ self.random_weighted) or \
        not (self.weighted or self.average or self.random_weighted)
    Q = self.actor_param_critic(states, pred_action, action_params)
    Q_val = Q
    # The actor_param loss: sum over dimensions, then average over the batch.
    Q_loss = torch.mean(torch.sum(Q_val, 1))
    self.actor_param_critic.zero_grad()
    Q_loss.backward()
    # Removing the inverting-gradients step below (taken from P-DQN) degrades performance
    # noticeably, to the point of doing worse than HHQN.
    from copy import deepcopy
    delta_a = deepcopy(action_params.grad.data)
    # step 2
    pred_action = self.actor.forward(Variable(states))
    action_params = self.actor_param(Variable(states), Variable(pred_action))
    delta_a[:] = self._invert_gradients(delta_a,
                                        action_params,
                                        grad_type="action_parameters",
                                        inplace=True)
    if self.zero_index_gradients:
        delta_a[:] = self._zero_index_gradients(
            delta_a, batch_action_indices=actions, inplace=True)

    out = -torch.mul(delta_a, action_params)
    self.actor_param.zero_grad()
    out.backward(torch.ones(out.shape).to(self.device))
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor_param.parameters(), self.clip_grad)
    self.actor_param_optimiser.step()

    # ---------------------- optimize Q-network, low level (critic) ----------------------
    with torch.no_grad():
        pred_next_action = self.actor_target.forward(next_states)
        next_action_params = self.actor_param_target(next_states, pred_next_action)
        pred_Q_a = self.actor_param_target_critic(next_states, pred_next_action,
                                                  next_action_params)
        Qprime = torch.max(pred_Q_a, 1, keepdim=True)[0].squeeze()
        # Compute the TD error
        target = rewards + (1 - terminals) * self.gamma * Qprime

    # Compute current Q-values using policy network
    q_values = self.actor_param_critic(states, actions, action_parameters)
    y_expected = target.unsqueeze(1)
    y_predicted = q_values
    loss_Q = self.loss_func(y_predicted, y_expected)

    self.actor_param_critic_optimiser.zero_grad()
    loss_Q.backward()
    if self.clip_grad > 0:
        torch.nn.utils.clip_grad_norm_(self.actor_param_critic.parameters(), self.clip_grad)
    self.actor_param_critic_optimiser.step()

    soft_update_target_network(self.actor, self.actor_target, self.tau_actor)
    soft_update_target_network(self.actor_param, self.actor_param_target, self.tau_actor_param)
    soft_update_target_network(self.actor_param_critic, self.actor_param_target_critic,
                               self.tau_actor_param_critic)
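
# NOTE: the `out = -torch.mul(delta_a, action_params); out.backward(torch.ones(out.shape))`
# pattern used in the updates above injects the externally computed (inverted) gradient
# delta_a into the parameter network: since d(out)/d(action_params) = -delta_a, it is
# equivalent to calling action_params.backward(gradient=-delta_a). The function below is a
# standalone illustration of that equivalence with a hypothetical network; it is not part of
# the agent.
def _inject_external_gradient_sketch(network, states, delta_a):
    action_params = network(states)
    out = -torch.mul(delta_a, action_params)  # element-wise product, negated for gradient ascent on Q
    out.backward(torch.ones_like(out))        # leaves -delta_a as the gradient flowing into `network`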