import numpy as np
import torch
from copy import deepcopy

# AgentBase, ActorSAC and CriticTwin are assumed to be defined earlier in this file;
# they are not shown in this section.


class AgentSAC(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.target_entropy = np.log(action_dim)
        self.alpha_log = torch.tensor((-np.log(action_dim) * np.e,), dtype=torch.float32,
                                      requires_grad=True, device=self.device)  # trainable parameter

        self.act = ActorSAC(net_dim, state_dim, action_dim).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = CriticTwin(int(net_dim * 1.25), state_dim, action_dim).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate * 0.75},
                                           {'params': self.cri.parameters(), 'lr': learning_rate * 1.25},
                                           {'params': (self.alpha_log,), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        actions = self.act.get_action(states)
        return actions.detach().cpu().numpy()

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        buffer.update__now_len__before_sample()

        alpha = self.alpha_log.exp().detach()
        obj_actor = obj_critic = None
        for _ in range(int(max_step * repeat_times)):
            '''objective of critic (loss function of critic)'''
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size)
                next_a, next_log_prob = self.act_target.get__action__log_prob(next_s)
                next_q = torch.min(*self.cri_target.get__q1_q2(next_s, next_a))
                q_label = reward + mask * (next_q + next_log_prob * alpha)
            q1, q2 = self.cri.get__q1_q2(state, action)
            obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)

            '''objective of alpha (temperature parameter automatic adjustment)'''
            action_pg, log_prob = self.act.get__action__log_prob(state)  # policy gradient
            obj_alpha = (self.alpha_log * (log_prob - self.target_entropy).detach()).mean()

            '''objective of actor'''
            alpha = self.alpha_log.exp().detach()
            with torch.no_grad():
                self.alpha_log[:] = self.alpha_log.clamp(-16, 2)
            obj_actor = -(torch.min(*self.cri_target.get__q1_q2(state, action_pg)) + log_prob * alpha).mean()

            '''united objective: one optimizer step updates critic, alpha and actor together'''
            obj_united = obj_critic + obj_alpha + obj_actor
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            soft_target_update(self.act_target, self.act)

        # return obj_actor.item(), obj_critic.item()
        return alpha.item(), obj_critic.item()
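# `soft_target_update` is called by both agents but is not defined in this section. If it is
# not already provided elsewhere in the file, a minimal Polyak-averaging helper like the one
# below would serve. The signature and the default tau value are assumptions for illustration,
# not the file's own definition.
def soft_target_update(target, current, tau=5e-3):
    """Soft update: target <- tau * current + (1 - tau) * target, parameter by parameter."""
    for tar_param, cur_param in zip(target.parameters(), current.parameters()):
        tar_param.data.copy_(cur_param.data * tau + tar_param.data * (1.0 - tau))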
class AgentModSAC(AgentSAC):  # Modified SAC using reliable_lambda and TTUR (Two Time-scale Update Rule)
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__(net_dim, state_dim, action_dim, learning_rate)
        self.act = ActorSAC(net_dim, state_dim, action_dim, if_use_dn=True).to(self.device)
        self.act_target = deepcopy(self.act)
        self.cri = CriticTwin(int(net_dim * 1.25), state_dim, action_dim, if_use_dn=True).to(self.device)
        self.cri_target = deepcopy(self.cri)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate},
                                           {'params': (self.alpha_log,), 'lr': learning_rate}])
        self.obj_c = (-np.log(0.5)) ** 0.5  # running critic-loss estimate, for reliable_lambda

    def update_policy(self, buffer, max_step, batch_size, repeat_times):
        """ModSAC (Modified SAC using reliable_lambda)
        1. reliable_lambda is computed from a running estimate of the critic's loss value.
        2. batch_size and the number of update steps grow as the replay buffer fills up.
        3. Auto-TTUR updates the actor a non-integer number of times per critic update.
        4. The critic's net_dim is slightly larger than the actor's.
        """
        buffer.update__now_len__before_sample()

        k = 1.0 + buffer.now_len / buffer.max_len
        batch_size_ = int(batch_size * k)
        train_steps = int(max_step * k * repeat_times)

        alpha = self.alpha_log.exp().detach()
        update_a = 0
        for update_c in range(1, train_steps):
            '''objective of critic (loss function of critic)'''
            with torch.no_grad():
                reward, mask, action, state, next_s = buffer.random_sample(batch_size_)
                next_a, next_log_prob = self.act_target.get__action__log_prob(next_s)
                next_q = torch.min(*self.cri_target.get__q1_q2(next_s, next_a))
                q_label = reward + mask * (next_q + next_log_prob * alpha)
            q1, q2 = self.cri.get__q1_q2(state, action)
            obj_critic = self.criterion(q1, q_label) + self.criterion(q2, q_label)
            self.obj_c = 0.995 * self.obj_c + 0.0025 * obj_critic.item()  # for reliable_lambda

            '''objective of alpha (temperature parameter automatic adjustment)'''
            a_noise_pg, log_prob = self.act.get__action__log_prob(state)  # policy gradient
            obj_alpha = (self.alpha_log * (log_prob - self.target_entropy).detach()).mean()

            '''objective of actor using reliable_lambda and TTUR (Two Time-scales Update Rule)'''
            reliable_lambda = np.exp(-self.obj_c ** 2)  # in (0, 1]; close to 1 when the critic loss is small
            # reliable_lambda -> 1: actor is updated nearly every critic step; -> 0: every other step
            if_update_a = update_a / update_c < 1 / (2 - reliable_lambda)
            if if_update_a:  # auto TTUR
                update_a += 1

                with torch.no_grad():
                    self.alpha_log[:] = self.alpha_log.clamp(-20, 2)
                alpha = self.alpha_log.exp().detach()

                obj_actor = -(torch.min(*self.cri_target.get__q1_q2(state, a_noise_pg)) + log_prob * alpha).mean()
                obj_united = obj_critic + obj_alpha + obj_actor
            else:
                obj_united = obj_critic + obj_alpha

            '''united objective'''
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

            soft_target_update(self.cri_target, self.cri)
            if if_update_a:
                soft_target_update(self.act_target, self.act)

        return alpha.item(), self.obj_c
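# A minimal usage sketch of the agents above. Only the agent classes defined in this section
# are taken as given: the `ReplayBuffer` helper, its `append_buffer` storage layout, and the
# pre-0.26 gym reset/step API are assumptions inferred from the calls made in `update_policy`.
if __name__ == '__main__':
    import gym

    env = gym.make('Pendulum-v1')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    gamma = 0.99

    agent = AgentModSAC(net_dim=2 ** 7, state_dim=state_dim, action_dim=action_dim)
    buffer = ReplayBuffer(max_len=2 ** 17, state_dim=state_dim, action_dim=action_dim)  # assumed helper class

    state = env.reset()
    for _ in range(2 ** 12):  # collect transitions with the stochastic policy
        action = agent.select_actions((state,))[0]
        next_state, reward, done, _ = env.step(action)
        mask = 0.0 if done else gamma  # `q_label` above expects gamma folded into the mask
        buffer.append_buffer(state, (reward, mask, *action))  # assumed storage layout
        state = env.reset() if done else next_state

    # one round of off-policy updates; ModSAC returns (alpha, running critic-loss estimate)
    alpha, obj_c = agent.update_policy(buffer, max_step=2 ** 10, batch_size=2 ** 8, repeat_times=1)
    print(f'alpha: {alpha:.3f}  critic loss (running): {obj_c:.3f}')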