def init(self, net_dim, state_dim, action_dim, if_per=False): self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device) self.cri_target = deepcopy(self.cri) self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate) self.act = Actor(net_dim, state_dim, action_dim).to(self.device) self.act_target = deepcopy(self.act) self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate) self.criterion = torch.nn.SmoothL1Loss(reduction='none' if if_per else 'mean') if if_per: self.get_obj_critic = self.get_obj_critic_per else: self.get_obj_critic = self.get_obj_critic_raw
class AgentTD3(AgentDDPG):
    def __init__(self):
        super().__init__()
        self.explore_noise = 0.1  # standard deviation of explore noise
        self.policy_noise = 0.2  # standard deviation of policy noise
        self.update_freq = 2  # delay update frequency, for soft target update

    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = CriticTwin(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)

        self.criterion = torch.nn.SmoothL1Loss(reduction='none' if if_per else 'mean')
        if if_per:
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.get_obj_critic = self.get_obj_critic_raw

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        action = self.act(states)[0]
        action = (action + torch.randn_like(action) * self.explore_noise).clamp(-1, 1)
        return action.cpu().numpy()

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None
        for i in range(int(target_step * repeat_times)):
            obj_critic, state = self.get_obj_critic(buffer, batch_size)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            if i % self.update_freq == 0:  # delay update
                self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()  # obj_actor
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            if i % self.update_freq == 0:  # delay update
                self.soft_update(self.act_target, self.act, self.soft_update_tau)

        self.update_record(obj_a=obj_actor.item(), obj_c=obj_critic.item() / 2)
        return self.train_record

    def get_obj_critic_raw(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_a = self.act_target.get_action(next_s, self.policy_noise)  # policy noise
            next_q = torch.min(*self.cri_target.get_q1_q2(next_s, next_a))  # twin critics
            q_label = reward + mask * next_q
        q1, q2 = self.cri.get_q1_q2(state, action)
        obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)  # twin critics
        return obj_critic, state

    def get_obj_critic_per(self, buffer, batch_size):
        """Prioritized Experience Replay

        Contributor: Github GyChou
        """
        with torch.no_grad():
            reward, mask, action, state, next_s, is_weights = buffer.sample_batch(batch_size)
            next_a = self.act_target.get_action(next_s, self.policy_noise)  # policy noise
            next_q = torch.min(*self.cri_target.get_q1_q2(next_s, next_a))  # twin critics
            q_label = reward + mask * next_q
        q1, q2 = self.cri.get_q1_q2(state, action)
        obj_critic = ((self.criterion(q1, q_label) + self.criterion(q2, q_label)) * is_weights).mean()

        td_error = (q_label - torch.min(q1, q2).detach()).abs()
        buffer.td_error_update(td_error)
        return obj_critic, state
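# AgentTD3 and AgentDDPG call into Actor, Critic and CriticTwin network classes defined
# elsewhere in the library. The sketch below is only a minimal, hypothetical illustration
# of the interface the agents rely on: forward() returning an action in [-1, 1],
# get_action(state, noise_std) for TD3's target policy smoothing, and
# get_q1_q2(state, action) for clipped double-Q learning. Layer sizes and architecture
# are assumptions, not the library's actual definitions.
import torch
import torch.nn as nn


class ActorSketch(nn.Module):  # hypothetical stand-in for Actor
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(state_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                 nn.Linear(mid_dim, action_dim))

    def forward(self, state):
        return self.net(state).tanh()  # deterministic action in [-1, 1]

    def get_action(self, state, noise_std):
        # add Gaussian noise to the target action, then clamp back to the action range
        action = self.net(state).tanh()
        noise = torch.randn_like(action) * noise_std
        return (action + noise).clamp(-1.0, 1.0)


class CriticTwinSketch(nn.Module):  # hypothetical stand-in for CriticTwin
    def __init__(self, mid_dim, state_dim, action_dim):
        super().__init__()
        self.net_q1 = nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
                                    nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                    nn.Linear(mid_dim, 1))
        self.net_q2 = nn.Sequential(nn.Linear(state_dim + action_dim, mid_dim), nn.ReLU(),
                                    nn.Linear(mid_dim, mid_dim), nn.ReLU(),
                                    nn.Linear(mid_dim, 1))

    def forward(self, state, action):
        # single Q-value, used for the actor objective
        return self.net_q1(torch.cat((state, action), dim=1))

    def get_q1_q2(self, state, action):
        # both Q-values; the agent takes their minimum to reduce overestimation bias
        sa = torch.cat((state, action), dim=1)
        return self.net_q1(sa), self.net_q2(sa)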
class AgentDDPG(AgentBase):
    def __init__(self):
        super().__init__()
        self.ou_explore_noise = 0.3  # explore noise of action
        self.ou_noise = None

    def init(self, net_dim, state_dim, action_dim, if_per=False):
        self.ou_noise = OrnsteinUhlenbeckNoise(size=action_dim, sigma=self.ou_explore_noise)
        # I don't recommend using OU-Noise
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.cri = Critic(net_dim, state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)
        self.cri_optimizer = torch.optim.Adam(self.cri.parameters(), lr=self.learning_rate)

        self.act = Actor(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.act_optimizer = torch.optim.Adam(self.act.parameters(), lr=self.learning_rate)

        self.criterion = torch.nn.SmoothL1Loss(reduction='none' if if_per else 'mean')
        if if_per:
            self.get_obj_critic = self.get_obj_critic_per
        else:
            self.get_obj_critic = self.get_obj_critic_raw

    def select_action(self, state) -> np.ndarray:
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach_()
        action = self.act(states)[0].cpu().numpy()
        return (action + self.ou_noise()).clip(-1, 1)

    def update_net(self, buffer, target_step, batch_size, repeat_times) -> (float, float):
        buffer.update_now_len_before_sample()

        obj_critic = obj_actor = None  # just for print return
        for _ in range(int(target_step * repeat_times)):
            obj_critic, state = self.get_obj_critic(buffer, batch_size)
            self.cri_optimizer.zero_grad()
            obj_critic.backward()
            self.cri_optimizer.step()
            self.soft_update(self.cri_target, self.cri, self.soft_update_tau)

            q_value_pg = self.act(state)  # policy gradient
            obj_actor = -self.cri_target(state, q_value_pg).mean()  # obj_actor
            self.act_optimizer.zero_grad()
            obj_actor.backward()
            self.act_optimizer.step()
            self.soft_update(self.act_target, self.act, self.soft_update_tau)

        self.update_record(obj_a=obj_actor.item(), obj_c=obj_critic.item())
        return self.train_record

    def get_obj_critic_raw(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s = buffer.sample_batch(batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = self.criterion(q_value, q_label)
        return obj_critic, state

    def get_obj_critic_per(self, buffer, batch_size):
        with torch.no_grad():
            reward, mask, action, state, next_s, is_weights = buffer.sample_batch(batch_size)
            next_q = self.cri_target(next_s, self.act_target(next_s))
            q_label = reward + mask * next_q
        q_value = self.cri(state, action)
        obj_critic = (self.criterion(q_value, q_label) * is_weights).mean()

        td_error = (q_label - q_value.detach()).abs()
        buffer.td_error_update(td_error)
        return obj_critic, state
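# AgentDDPG builds its exploration noise via OrnsteinUhlenbeckNoise(size=action_dim, sigma=...)
# and samples it with self.ou_noise(). The class below is a minimal sketch of such a
# temporally correlated Ornstein-Uhlenbeck process; the theta/dt defaults and the exact
# update rule are assumptions, not necessarily what the library implements.
import numpy as np


class OrnsteinUhlenbeckNoiseSketch:  # hypothetical stand-in for OrnsteinUhlenbeckNoise
    def __init__(self, size, theta=0.15, sigma=0.3, dt=1e-2):
        self.size = size
        self.theta = theta
        self.sigma = sigma
        self.dt = dt
        self.x = np.zeros(size)  # internal state of the OU process

    def __call__(self) -> np.ndarray:
        # dx = -theta * x * dt + sigma * sqrt(dt) * N(0, 1)
        # Successive samples are correlated in time, unlike independent Gaussian noise,
        # which is why OU exploration was historically used with DDPG on continuous control.
        dx = -self.theta * self.x * self.dt \
             + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.size)
        self.x = self.x + dx
        return self.x


# Illustrative usage (hypothetical names):
#     noise = OrnsteinUhlenbeckNoiseSketch(size=action_dim, sigma=0.3)
#     action = (actor_output + noise()).clip(-1, 1)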