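# The classes below rely on names defined elsewhere in this module/package
# (AgentBase, Actor, ActorMPO, Critic, OrnsteinUhlenbeckNoise, gaussian_kl).
# The imports below are a minimal sketch of what this excerpt is assumed to
# need; adjust them to match the actual module layout.
from copy import deepcopy

import numpy as np
import torch
from scipy.optimize import minimize  # used by the MPO E-step dual below
from IPython import embed            # used only by the NaN debug hooks below
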
class AgentMPO(AgentBase):
    def __init__(self):
        super().__init__()
        self.epsilon_dual = 0.1
        self.epsilon_kl_mu = 0.01
        self.epsilon_kl_sigma = 0.01
        self.epsilon_kl = 0.01
        self.alpha = 10
        self.sample_a_num = 64
        self.lagrange_iteration_num = 5

    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.action_dim = action_dim
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.act = ActorMPO(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        # Lagrange multipliers: eta for the E-step dual, eta_kl_* for the KL trust regions
        self.eta = np.random.rand()
        self.eta_kl_mu = 0.
        self.eta_kl_sigma = 0.

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        action = self.act.get_action(states)[0]
        return action.cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        obj_critic = None
        for _ in range(int(target_step * repeat_times)):
            # Policy Evaluation
            with torch.no_grad():
                reward, mask, a, state, next_s = buffer.sample_batch(batch_size)

                pi_next_s = self.act_target.get_distribution(next_s)
                sampled_next_a = pi_next_s.sample((self.sample_a_num,)).transpose(0, 1)  # (B, M, da)
                expanded_next_s = next_s[:, None, :].expand(-1, self.sample_a_num, -1)  # (B, M, ds)

                expected_next_q = self.cri_target(
                    expanded_next_s.reshape(-1, state.shape[1]),
                    sampled_next_a.reshape(-1, a.shape[1]))
                expected_next_q = expected_next_q.reshape(batch_size, self.sample_a_num)
                expected_next_q = expected_next_q.mean(dim=1).unsqueeze(dim=1)
                q_label = reward + mask * expected_next_q

            q = self.cri(state, a)
            obj_critic = self.criterion(q, q_label)

            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            # Policy Improvement
            # Sample M additional actions for each state
            with torch.no_grad():
                pi_b = self.act_target.get_distribution(state)
                sampled_a = pi_b.sample((self.sample_a_num,))  # (M, B, da)
                expanded_s = state[None, ...].expand(self.sample_a_num, -1, -1)  # (M, B, ds)
                target_q = self.cri_target.forward(
                    expanded_s.reshape(-1, state.shape[1]),  # (M * B, ds)
                    sampled_a.reshape(-1, a.shape[1])  # (M * B, da)
                ).reshape(self.sample_a_num, batch_size)  # (M, B)
                target_q_np = target_q.cpu().numpy()

            # E step: solve the dual problem for the temperature eta
            def dual(eta):
                max_q = np.max(target_q_np, 0)
                return eta * self.epsilon_dual + np.mean(max_q) \
                    + eta * np.mean(np.log(np.mean(np.exp((target_q_np - max_q) / eta), axis=0)))

            bounds = [(1e-6, None)]
            res = minimize(dual, np.array([self.eta]), method='SLSQP', bounds=bounds)
            self.eta = res.x[0]

            qij = torch.softmax(target_q / self.eta, dim=0)  # (M, B)

            # M step: fit the parametric policy to the non-parametric weights qij
            pi = self.act.get_distribution(state)
            loss_p = torch.mean(
                qij * (pi.expand((self.sample_a_num, batch_size)).log_prob(sampled_a)  # (M, B)
                       # pi_b comes from the target policy (no_grad), so this term
                       # is constant w.r.t. the actor parameters
                       + pi_b.expand((self.sample_a_num, batch_size)).log_prob(sampled_a))  # (M, B)
            )
            # mean_loss_p.append((-loss_p).item())

            kl_mu, kl_sigma = gaussian_kl(mu_i=pi_b.loc, mu=pi.loc,
                                          A_i=pi_b.scale_tril, A=pi.scale_tril)

            if np.isnan(kl_mu.item()):
                print('kl_μ is nan')
                embed()
            if np.isnan(kl_sigma.item()):
                print('kl_Σ is nan')
                embed()

            # Gradient step on the KL Lagrange multipliers, clipped at zero
            self.eta_kl_mu -= self.alpha * (self.epsilon_kl_mu - kl_mu).detach().item()
            self.eta_kl_sigma -= self.alpha * (self.epsilon_kl_sigma - kl_sigma).detach().item()
            self.eta_kl_mu = 0.0 if self.eta_kl_mu < 0.0 else self.eta_kl_mu
            self.eta_kl_sigma = 0.0 if self.eta_kl_sigma < 0.0 else self.eta_kl_sigma

            obj_actor = -(loss_p
                          + self.eta_kl_mu * (self.epsilon_kl_mu - kl_mu)
                          + self.eta_kl_sigma * (self.epsilon_kl_sigma - kl_sigma))
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act, self.soft_update_tau)

        self.update_record(obj_a=obj_actor.item(), obj_c=obj_critic.item(),
                           loss_pi=loss_p.item(), est_q=q_label.mean().item(),
                           max_kl_mu=kl_mu.item(), max_kl_sigma=kl_sigma.item(),
                           eta=self.eta)
        return self.train_record
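
# gaussian_kl(...) is called in AgentMPO.update_net above but defined elsewhere
# in the repo. The sketch below is a hypothetical stand-in (hence the different
# name, so it does not shadow the real helper) showing what it is assumed to
# return: the decoupled mean and covariance KL terms between two multivariate
# Gaussians given their means and Cholesky factors (scale_tril), averaged over
# the batch.
def _gaussian_kl_sketch(mu_i, mu, A_i, A):
    """C_mu = KL(N(mu_i, S_i) || N(mu, S_i)), C_sigma = KL(N(mu_i, S_i) || N(mu_i, S)),
    where S = A @ A.T is the covariance rebuilt from the Cholesky factor A."""
    n = A.size(-1)
    mu_i = mu_i.unsqueeze(-1)  # (B, n, 1)
    mu = mu.unsqueeze(-1)      # (B, n, 1)
    sigma_i = A_i @ A_i.transpose(-2, -1)  # (B, n, n)
    sigma = A @ A.transpose(-2, -1)        # (B, n, n)
    sigma_i_inv = sigma_i.inverse()
    sigma_inv = sigma.inverse()

    # mean term: quadratic form of the mean shift under the old covariance
    diff = mu - mu_i
    inner_mu = (diff.transpose(-2, -1) @ sigma_i_inv @ diff).squeeze()  # (B,)

    # covariance term: log-det ratio, dimension, and trace
    trace = (sigma_inv @ sigma_i).diagonal(dim1=-2, dim2=-1).sum(-1)  # (B,)
    log_det_ratio = torch.logdet(sigma) - torch.logdet(sigma_i)      # (B,)
    inner_sigma = log_det_ratio - n + trace                          # (B,)

    c_mu = 0.5 * torch.mean(inner_mu)
    c_sigma = 0.5 * torch.mean(inner_sigma)
    return c_mu, c_sigma
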
class AgentDDPG(AgentBase):
    def __init__(self):
        super().__init__()
        self.ou_explore_noise = 0.3  # scale of the OU exploration noise
        self.ou_noise = None

    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=self.ou_explore_noise)
        # I don't recommend using OU-Noise
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)

        self.criterion = torch.nn.SmoothL1Loss(reduction='none' if if_per else 'mean')
        if if_per:
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.get_obj_critic = self.get_obj_critic_raw

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        action = self.act(states)[0].cpu().numpy()
        return (action + self.ou_noise()).clip(-1, 1)

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None  # placeholders for the post-loop record
        for _ in range(int(target_step * repeat_times)):
            obj_critic, state = self.get_obj_critic(buffer, batch_size)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            action_pg = self.act(state)  # actor action for the policy gradient
            obj_actor = -self.cri_target(state, action_pg).mean()
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act, self.soft_update_tau)

        self.update_record(obj_a=obj_actor.item(), obj_c=obj_critic.item())
        return self.train_record

    def get_obj_critic_raw(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = self.criterion(q_value, q_label)
        return obj_critic, state

    def get_obj_critic_per(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s, is_weights = buffer.sample_batch(batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = (self.criterion(q_value, q_label) * is_weights).mean()

        td_error = (q_label - q_value.detach()).abs()
        buffer.td_error_update(td_error)
        return obj_critic, state
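
# OrnsteinUhlenbeckNoise is used by AgentDDPG.init above but not shown in this
# section. The class below is a hypothetical sketch of a standard OU process
# (not necessarily the repo's implementation): zero-mean, temporally correlated
# noise following x_{t+1} = x_t + theta * (0 - x_t) * dt + sigma * sqrt(dt) * N(0, I),
# which is what makes consecutive exploration perturbations correlated.
class OrnsteinUhlenbeckNoiseSketch:
    def __init__(self, size, theta=0.15, sigma=0.3, dt=1e-2):
        self.size = size
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.x = np.zeros(size)

    def __call__(self) -> np.ndarray:
        # one Euler-Maruyama step of the OU stochastic differential equation
        noise = self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x = self.x + self.theta * (0.0 - self.x) * self.dt + noise
        return self.x

    def reset(self):
        # clear the internal state, e.g. at episode boundaries
        self.x = np.zeros(self.size)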