import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from copy import deepcopy
from torch.distributions import Categorical, Normal

# ReplayBuffer, PriorityReplayBuffer, BasePolicy and the module-level `device`
# used below come from this repository's own modules (their imports are not shown here).


def __init__(self,
             actor_net,
             critic_net,
             buffer_size=1000,
             actor_learn_freq=1,
             target_update_freq=0,
             target_update_tau=5e-3,
             learning_rate=0.0001,
             discount_factor=0.99,
             batch_size=100,
             verbose=False):
    super().__init__()
    self.lr = learning_rate
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.ratio_clip = 0.2
    self.lam_entropy = 0.01
    self.adv_norm = True
    self.rew_norm = False
    self.schedule_clip = False
    self.schedule_adam = False

    self.actor_learn_freq = actor_learn_freq
    self.target_update_freq = target_update_freq
    self._gamma = discount_factor
    self._target = target_update_freq > 0
    self._update_iteration = 10
    self._sync_cnt = 0
    # self._learn_cnt = 0
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
    self._verbose = verbose
    self._batch_size = batch_size

    self.buffer = ReplayBuffer(buffer_size, replay=False)
    # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'
    self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.actor_eval = actor_net.to(self.device)
    self.critic_eval = critic_net.to(self.device)
    self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
    self.actor_eval.train()
    self.critic_eval.train()

    if self._target:
        self.actor_target = deepcopy(self.actor_eval)
        self.critic_target = deepcopy(self.critic_eval)
        self.actor_target.load_state_dict(self.actor_eval.state_dict())
        self.critic_target.load_state_dict(self.critic_eval.state_dict())
        self.actor_target.eval()
        self.critic_target.eval()

    self.criterion = nn.SmoothL1Loss()
def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=None  # default: auto_entropy_tuning
):
    super().__init__()
    self.lr = learning_rate
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.actor_learn_freq = actor_learn_freq
    self.target_update_freq = target_update_freq
    self._gamma = discount_factor
    self._target = target_update_freq > 0
    self._update_iteration = update_iteration
    self._sync_cnt = 0
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
    self._verbose = verbose
    self._batch_size = batch_size

    self.buffer = ReplayBuffer(buffer_size)  # off-policy

    self.actor_eval = model.policy_net.to(device).train()
    self.critic_eval = model.value_net.to(device).train()
    self.actor_target = self.copy_net(self.actor_eval)
    self.critic_target = self.copy_net(self.critic_eval)
    self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
    self.criterion = nn.SmoothL1Loss()

    self.act_dim = act_dim
    self.alpha = alpha
    self.auto_entropy_tuning = True
    if self.alpha:
        self.auto_entropy_tuning = False
        self.value_eval = model.v_net.to(device).train()
        self.value_target = self.copy_net(self.value_eval)
        self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)
    else:
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()
def __init__(self,
             actor_net,
             critic_net,
             buffer_size=1000,
             actor_learn_freq=1,
             target_update_freq=0,
             target_update_tau=5e-3,
             learning_rate=0.01,
             discount_factor=0.99,
             batch_size=100,
             verbose=False):
    super().__init__()
    self.lr = learning_rate
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.actor_learn_freq = actor_learn_freq
    self.target_update_freq = target_update_freq
    self._gamma = discount_factor
    self._target = target_update_freq > 0
    self._update_iteration = 10
    self._sync_cnt = 0
    # self._learn_cnt = 0
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
    self._verbose = verbose
    self._batch_size = batch_size

    self.replay_buffer = ReplayBuffer(buffer_size)
    # assert buffer.allow_replay, 'DDPG buffer must be replay buffer'

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.actor_eval = actor_net.to(self.device)    # pi(s)
    self.critic_eval = critic_net.to(self.device)  # Q(s, a)
    self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
    self.actor_eval.train()
    self.critic_eval.train()

    if self._target:
        self.actor_target = deepcopy(self.actor_eval)
        self.critic_target = deepcopy(self.critic_eval)
        self.actor_target.load_state_dict(self.actor_eval.state_dict())
        self.critic_target.load_state_dict(self.critic_eval.state_dict())
        self.actor_target.eval()
        self.critic_target.eval()

    self.criterion = nn.MSELoss()  # why mse?
def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=1,
        target_update_tau=0.01,
        # learning_rate=3e-3,
        actor_lr=1e-4,
        critic_lr=1e-3,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
):
    super().__init__()
    # self.lr = learning_rate
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.actor_learn_freq = actor_learn_freq
    self.target_update_freq = target_update_freq
    self._gamma = discount_factor
    self._update_iteration = update_iteration
    self._sync_cnt = 0
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
    self._verbose = verbose
    self._batch_size = batch_size

    self.buffer = ReplayBuffer(buffer_size)

    self.actor_eval = model.policy_net.to(device).train()
    self.critic_eval = model.value_net.to(device).train()
    self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=actor_lr)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=critic_lr)
    self.actor_target = self.copy_net(self.actor_eval)
    self.critic_target = self.copy_net(self.critic_eval)
    self.criterion = nn.MSELoss()  # why mse?

    self.noise_clip = 0.5
    self.noise_std = 0.2
def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=1.0,
):
    super().__init__()
    self.lr = learning_rate
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.actor_learn_freq = actor_learn_freq
    self.target_update_freq = target_update_freq
    self._gamma = discount_factor
    self._target = target_update_freq > 0
    self._update_iteration = update_iteration
    self._sync_cnt = 0
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
    self._verbose = verbose
    self._batch_size = batch_size

    self.buffer = ReplayBuffer(buffer_size)  # off-policy

    self.actor_eval = model.policy_net.to(device).train()
    self.critic_eval = model.value_net.to(device).train()
    self.value_eval = model.v_net.to(device).train()
    self.value_target = self.copy_net(self.value_eval)
    self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
    self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)
    self.criterion = nn.SmoothL1Loss()

    self.act_dim = act_dim
    self.alpha = alpha
def __init__(self,
             model,
             action_shape=0,
             buffer_size=1000,
             batch_size=100,
             target_update_freq=1,
             target_update_tau=1,
             learning_rate=0.01,
             discount_factor=0.99,
             verbose=False):
    super().__init__()
    self.lr = learning_rate
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.eps = np.finfo(np.float32).eps.item()
    self.tau = target_update_tau
    self.target_update_freq = target_update_freq
    # self.action_shape = action_shape
    self._gamma = discount_factor
    self._batch_size = batch_size
    self._verbose = verbose
    self._update_iteration = 10
    self._learn_cnt = 0
    self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
    self.rew_norm = True

    self.buffer = ReplayBuffer(buffer_size)
    # self.declare_networks()
    self.critic_eval = model.value_net.to(self.device).train()
    self.critic_target = self.copy_net(self.critic_eval)
    self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
    # self.critic_eval.train()
    self.criterion = nn.MSELoss()

    self.random_choose = 0
    self.sum_choose = 0
class SAC1(BasePolicy):  # pg_net + q_net + v_net
    def __init__(
            self,
            model,
            buffer_size=1e6,
            batch_size=256,
            policy_freq=2,
            tau=0.005,
            discount=0.99,
            policy_lr=3e-4,
            value_lr=3e-4,
            learn_iteration=1,
            verbose=False,
            act_dim=None,
            alpha=1.0,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size

        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.value_eval = model.v_net.to(device).train()
        self.value_target = self.copy_net(self.value_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(), lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(self.critic_eval.parameters(), lr=value_lr)
        self.value_eval_optim = torch.optim.Adam(self.value_eval.parameters(), lr=value_lr)
        self.criterion = nn.SmoothL1Loss()

        self.alpha = alpha
        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

    def learn(self):
        pg_loss, q_loss, v_loss = 0, 0, 0
        for _ in range(self.learn_iteration):
            batch = self.buffer.split_batch(self.batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)

            new_A, log_prob = self.actor_eval.evaluate(S)

            # V_value loss
            with torch.no_grad():
                new_q1_value, new_q2_value = self.critic_eval(S, new_A)
                next_value = torch.min(new_q1_value, new_q2_value) - self.alpha * log_prob
            value = self.value_eval(S)
            value_loss = self.criterion(value, next_value)

            # Soft q loss
            with torch.no_grad():
                target_value = self.value_target(S_)
                target_q_value = R + M * self.gamma * target_value.cpu()
                target_q_value = target_q_value.to(device)
            q1_value, q2_value = self.critic_eval(S, A)
            loss1 = self.criterion(q1_value, target_q_value)
            loss2 = self.criterion(q2_value, target_q_value)
            critic_loss = 0.5 * (loss1 + loss2)

            # update V
            self.value_eval_optim.zero_grad()
            value_loss.backward()
            self.value_eval_optim.step()

            # update soft Q
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            # update policy
            if self._learn_critic_cnt % self.policy_freq == 0:
                # policy loss
                actor_loss = (self.alpha * log_prob - torch.min(new_q1_value, new_q2_value)).mean()
                # actor_loss = (log_prob - torch.min(new_q1_value, new_q2_value).detach()).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1

                self.soft_sync_weight(self.value_target, self.value_eval, self.tau)

            pg_loss += actor_loss.item()
            q_loss += critic_loss.item()
            v_loss += value_loss.item()

        return pg_loss, q_loss, v_loss
class TD3(BasePolicy):
    def __init__(
            self,
            model,
            buffer_size=1000,
            actor_learn_freq=2,
            target_update_freq=1,
            target_update_tau=0.005,
            learning_rate=1e-4,
            discount_factor=0.99,
            batch_size=100,
            update_iteration=10,
            verbose=False,
            act_dim=None,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.act_dim = act_dim

        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.criterion = nn.MSELoss()  # why mse?

        self.noise_clip = 0.5
        self.noise_std = 0.2

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0

        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)  # [batch_size, S.feature_size]
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            if self._verbose:
                print(f'Shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}')

            A_noise = self.actor_target.action(S_, self.noise_std, self.noise_clip)
            with torch.no_grad():
                q1_next, q2_next = self.critic_target.twinQ(S_, A_noise)
                q_next = torch.min(q1_next, q2_next)
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            q1_eval, q2_eval = self.critic_eval.twinQ(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()

            loss_critic_avg += critic_loss.item()
            self._learn_critic_cnt += 1
            if self._verbose:
                print(f'=======Learn_Critic_Net, cnt{self._learn_critic_cnt}=======')

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                loss_actor_avg += actor_loss.item()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net, cnt{self._learn_actor_cnt}=======')

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of TD3, tau{self.tau}=======')
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg
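# TD3.learn above calls self.actor_target.action(S_, noise_std, noise_clip), a method of the
# repository's actor model that is not shown in this file. In TD3 this step is "target policy
# smoothing": add clipped Gaussian noise to the target action before evaluating the target
# critics. A minimal standalone sketch of that step, assuming actor(state) returns a
# deterministic action already scaled to [-1, 1]:
import torch


@torch.no_grad()
def smoothed_target_action(actor, state, noise_std=0.2, noise_clip=0.5):
    """Hypothetical helper: actor(state) plus clipped Gaussian noise, clamped to the action range."""
    action = actor(state)
    noise = (torch.randn_like(action) * noise_std).clamp(-noise_clip, noise_clip)
    return (action + noise).clamp(-1.0, 1.0)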
class SAC2(BasePolicy):  # pg_net + q_net + alpha
    def __init__(
            self,
            model,
            buffer_size=1e6,
            batch_size=256,
            policy_freq=2,
            tau=0.005,
            discount=0.99,
            policy_lr=3e-4,
            value_lr=3e-4,
            learn_iteration=1,
            verbose=False,
            act_dim=None,
    ):
        super().__init__()
        self.tau = tau
        self.gamma = discount
        self.policy_freq = policy_freq
        self.learn_iteration = learn_iteration
        self.verbose = verbose
        self.act_dim = act_dim
        self.batch_size = batch_size

        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)

        self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(), lr=policy_lr)
        self.critic_eval_optim = torch.optim.Adam(self.critic_eval.parameters(), lr=value_lr)
        self.criterion = nn.SmoothL1Loss()

        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=policy_lr)
        self.alpha = self.log_alpha.exp()

        self.eps = np.finfo(np.float32).eps.item()
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self.learn_iteration):
            batch = self.buffer.split_batch(self.batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            if self.verbose:
                print(f'shape S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}')

            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self.gamma * q_next.cpu()
                q_target = q_target.to(device)

            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.policy_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)

                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1

                # alpha loss
                alpha_loss = -(self.log_alpha * (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

        return pg_loss, q_loss, a_loss
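# The classes in this file call self.copy_net(...) and self.soft_sync_weight(...), which live on
# BasePolicy and are not shown here. The two standalone functions below are a minimal sketch of
# what those helpers do, inferred from how they are used (and from the older inits above that
# inline deepcopy + load_state_dict + eval); treat them as assumptions, not the repository's code.
from copy import deepcopy
import torch


def copy_net(net):
    """Build a frozen target copy of a network: deep copy, sync weights, switch to eval mode."""
    target = deepcopy(net)
    target.load_state_dict(net.state_dict())
    target.eval()
    return target


@torch.no_grad()
def soft_sync_weight(target, source, tau=0.005):
    """Polyak averaging: target <- (1 - tau) * target + tau * source."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_((1.0 - tau) * t_param.data + tau * s_param.data)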
class DDPG(BasePolicy):
    def __init__(
            self,
            model,
            buffer_size=1000,
            actor_learn_freq=1,
            target_update_freq=1,
            target_update_tau=0.005,
            learning_rate=1e-4,
            discount_factor=0.99,
            batch_size=100,
            update_iteration=10,
            verbose=False,
            act_dim=None,
            num_episodes=1000,
    ):
        super().__init__()
        self.lr = learning_rate
        self.end_lr = learning_rate * 0.1
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.schedule_adam = True

        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()   # pi(s)
        self.critic_eval = model.value_net.to(device).train()   # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.criterion = nn.MSELoss()  # why mse?

        self.act_dim = act_dim
        self.num_episodes = num_episodes

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0

        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)    # [batch_size, state_dim]
            # print(batch['a'])
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)  # [batch_size, act_dim]
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)       # [batch_size, 1]
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)       # [batch_size, 1]
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)  # [batch_size, state_dim]
            if self._verbose:
                print(f'Shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}')

            with torch.no_grad():
                q_next = self.critic_target(S_, self.actor_target(S_))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            q_eval = self.critic_eval(S, A)  # [batch_size, q_value_size]
            critic_loss = self.criterion(q_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()

            loss_critic_avg += critic_loss.item()
            self._learn_critic_cnt += 1
            if self._verbose:
                print(f'=======Learn_Critic_Net, cnt:{self._learn_critic_cnt}=======')

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                loss_actor_avg += actor_loss.item()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net, cnt:{self._learn_actor_cnt}=======')

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of DDPG, tau:{self.tau}=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        if self.schedule_adam:
            new_lr = self.lr + (self.end_lr - self.lr) / self.num_episodes \
                * self._learn_critic_cnt / self._update_iteration
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg
class OAC(BasePolicy):  # no value network
    def __init__(self,
                 model,
                 buffer_size=1000,
                 batch_size=100,
                 actor_learn_freq=1,
                 target_update_freq=5,
                 target_update_tau=0.01,
                 learning_rate=1e-3,
                 discount_factor=0.99,
                 verbose=False,
                 update_iteration=10,
                 act_dim=None):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size

        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.criterion = nn.SmoothL1Loss()

        self.act_dim = act_dim
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def choose_action(self, state, test=False, beta_UB=1.0, delta=1.0):
        # paper: Better Exploration with Optimistic Actor-Critic, NeurIPS 2019
        # pdf: https://arxiv.org/pdf/1910.12807.pdf
        # ref: https://github.com/microsoft/oac-explore/blob/master/optimistic_exploration.py
        # paper param: beta_UB=4.66 delta=23.53, env_name=humanoid
        state = torch.tensor(state, dtype=torch.float32, device=device)
        if test:
            self.actor_eval.eval()
            mean, log_std = self.actor_eval(state)
            return mean.detach().cpu().numpy()

        assert len(list(state.shape)) == 1  # not batch
        mu_T, log_std = self.actor_eval(state)
        std = torch.exp(log_std)
        # assert len(list(mu_T.shape)) == 1, mu_T
        # assert len(list(std.shape)) == 1
        mu_T.requires_grad_()

        curr_act = torch.tanh(mu_T).unsqueeze(0)  # action
        state = state.unsqueeze(0)
        q1, q2 = self.critic_target(state, curr_act)
        mu_q = (q1 + q2) / 2.0
        sigma_q = torch.abs(q1 - q2) / 2.0
        Q_UB = mu_q + beta_UB * sigma_q

        grad = torch.autograd.grad(Q_UB, mu_T)
        grad = grad[0]
        assert grad is not None
        assert mu_T.shape == grad.shape

        sigma_T = torch.pow(std, 2)
        denom = torch.sqrt(torch.sum(torch.mul(torch.pow(grad, 2), sigma_T))) + 10e-6
        mu_C = np.sqrt(2.0 * delta) * torch.mul(sigma_T, grad) / denom
        assert mu_C.shape == mu_T.shape

        mu_E = mu_T + mu_C
        assert mu_E.shape == std.shape

        normal = Normal(mu_E, std)
        z = normal.sample()
        action = torch.tanh(z).detach().cpu().numpy()
        return action

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, 1)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            # print (f'size S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}, W:{W.size()}')

            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            critic_loss = self.criterion(q1_eval, q_target) + self.criterion(q2_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)

                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(self.log_alpha * (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item() * 0.5
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        return pg_loss, q_loss, a_loss
class MSAC(BasePolicy):
    def __init__(
            self,
            model,
            action_dim=1,
            buffer_size=1000,
            batch_size=100,
            actor_learn_freq=1,
            target_update_freq=5,
            target_update_tau=0.1,
            # learning_rate=1e-3,
            actor_lr=1e-4,
            critic_lr=1e-3,
            discount_factor=0.99,
            verbose=False,
            update_iteration=10,
            use_priority=False,
            use_m=False,
            n_step=1,
    ):
        super().__init__()
        # self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size

        self.use_priority = use_priority
        self.use_dist = model.value_net.use_dist
        self.use_munchausen = use_m
        self.n_step = n_step

        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size, n_step=self.n_step)
        else:
            self.buffer = ReplayBuffer(buffer_size)  # off-policy

        if self.use_dist:
            assert model.value_net.num_atoms > 1
            # assert isinstance(model.value_net, CriticModelDist)
            self.v_min = model.value_net.v_min
            self.v_max = model.value_net.v_max
            self.num_atoms = model.value_net.num_atoms
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            self.support = torch.linspace(self.v_min, self.v_max, self.num_atoms)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=actor_lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=critic_lr)
        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim

        self.target_entropy = -torch.tensor(action_dim).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=actor_lr)
        self.alpha = self.log_alpha.exp()

    def _tensor(self, data, use_cuda=False):
        if np.array(data).ndim == 1:
            data = torch.tensor(data, dtype=torch.float32).view(-1, 1)
        else:
            data = torch.tensor(data, dtype=torch.float32)
        if use_cuda:
            data = data.to(device)
        return data

    def learn_critic_dist(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_act, next_log_pi = self.actor_target(next_obs)
            # q(s, a) changed to z(s, a) to describe a distribution
            p1_next, p2_next = self.critic_target.get_probs(next_obs, next_act)  # [batch_size, num_atoms]
            p_next = torch.stack([
                torch.where(z1.sum() < z2.sum(), z1, z2)
                for z1, z2 in zip(p1_next, p2_next)
            ])
            p_next -= (self.alpha * next_log_pi)

            Tz = rew.unsqueeze(1) + mask * self.support.unsqueeze(0)
            Tz = Tz.clamp(min=self.v_min, max=self.v_max)
            b = (Tz - self.v_min) / self.delta_z
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.num_atoms - 1)) * (l == u)] += 1

            m = obs.new_zeros(self._batch_size, self.num_atoms).cpu()
            p_next = p_next.cpu()
            # print (f'm device: {m.device}')
            # print (f'p_next device: {p_next.device}')
            offset = torch.linspace(0, ((self._batch_size - 1) * self.num_atoms),
                                    self._batch_size).unsqueeze(1).expand(
                                        self._batch_size, self.num_atoms).to(l)
            m.view(-1).index_add_(0, (l + offset).view(-1),
                                  (p_next * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1),
                                  (p_next * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)
            m = m.to(device)

        log_z1, log_z2 = self.critic_eval.get_probs(obs, act, log=True)
        loss1 = -(m * log_z1).sum(dim=1)
        loss2 = -(m * log_z2).sum(dim=1)
        return 0.5 * (loss1 + loss2)

    def learn_critic(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_A, next_log = self.actor_target.evaluate(next_obs)
            # print (f'nextA shape is {next_A.shape}')
            q1_next, q2_next = self.critic_target.twinQ(next_obs, next_A)
            # print (f'shape q1 {q1_next.shape}, q2 {q2_next.shape}, next_obs {next_obs.shape}, next_A {next_A.shape}')
            # q_next = torch.stack([torch.where(q1.sum() < q2.sum(), q1, q2) for q1, q2 in zip(q1_next, q2_next)])
            q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
            # print (f'q_next shape is {q_next.shape}')

            q_target = rew + mask * self._gamma * q_next.cpu()
            if self.use_priority:
                q_target = rew + mask * (self._gamma**self.n_step) * q_next.cpu()
            # print (f'q_target shape is {q_target.shape}')
            q_target = q_target.to(device)

        # q_loss
        q1_eval, q2_eval = self.critic_eval.twinQ(obs, act)
        criterion = nn.SmoothL1Loss(reduction='none')
        # print (f'q1_eval shape is {q1_eval.shape}')
        loss1 = criterion(q1_eval, q_target)
        loss2 = criterion(q2_eval, q_target)
        return 0.5 * (loss1 + loss2)

    def learn_actor_dist(self, obs):
        curr_act, curr_log = self.actor_eval.evaluate(obs)
        p1_next, p2_next = self.critic_eval.get_probs(obs, curr_act)
        p_next = torch.stack([
            torch.where(p1.sum() < p2.sum(), p1, p2)
            for p1, p2 in zip(p1_next, p2_next)
        ])
        num_atoms = torch.tensor(self.num_atoms, dtype=torch.float32, device=device)
        # actor_loss = p_next * num_atoms
        # actor_loss = torch.sum(actor_loss, dim=1)
        # actor_loss = -(actor_loss + self.alpha * curr_log).mean()
        actor_loss = (self.alpha * curr_log - p_next)
        actor_loss = torch.sum(actor_loss, dim=1)
        actor_loss = actor_loss.mean()
        return actor_loss, curr_log

    def learn_actor(self, obs):
        curr_act, curr_log = self.actor_eval.evaluate(obs)
        q1_next, q2_next = self.critic_eval.twinQ(obs, curr_act)
        q_next = torch.min(q1_next, q2_next)
        actor_loss = (self.alpha * curr_log - q_next).mean()
        return actor_loss, curr_log

    def get_munchausen_rew(self, obs, act, rew):
        self.m_alpha = 0.9
        self.m_tau = 0.03
        self.lo = -1
        mu, log_std = self.actor_eval(obs)
        std = log_std.exp()
        dist = Normal(mu, std)
        log_pi_a = self.m_tau * dist.log_prob(act).mean(1).unsqueeze(1).cpu()
        m_rew = rew + self.m_alpha * torch.clamp(log_pi_a, min=self.lo, max=0)
        return m_rew

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            if self.use_priority:
                S, A, R, S_, M, indices, weights = self.buffer.sample(self._batch_size)
                W = torch.tensor(weights, dtype=torch.float32, device=device).view(-1, 1)
            else:
                batch_split = self.buffer.split_batch(self._batch_size)
                S, A, M, R, S_ = (batch_split['s'], batch_split['a'], batch_split['m'],
                                  batch_split['r'], batch_split['s_'])
            # print ('after sampling from buffer!')

            R = torch.tensor(R, dtype=torch.float32).view(-1, 1)
            S = torch.tensor(S, dtype=torch.float32, device=device)
            # A = torch.tensor(A, dtype=torch.float32, device=device).view(-1, 1)
            A = torch.tensor(A, dtype=torch.float32, device=device).squeeze(1)
            # print (f'A shape {A.shape}')
            M = torch.tensor(M, dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(S_, dtype=torch.float32, device=device)

            # self.use_munchausen = True
            if self.use_munchausen:
                R = self.get_munchausen_rew(S, A, R)
            # print (f'shape S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}')

            if self.use_dist:
                # D = torch.from_numpy(np.array([1^int(mask.item()) for mask in M])).view(-1, 1)
                # print (f'size S:{S.shape}, A:{A.shape}, M:{M.shape}, R:{R.shape}, S_:{S_.shape}, D:{D.shape}')
                # assert 0
                batch_loss = self.learn_critic_dist(S, A, R, S_, M)
            else:
                batch_loss = self.learn_critic(S, A, R, S_, M)

            if self.use_priority:
                critic_loss = (W * batch_loss).mean()
                td_errors = batch_loss.detach().cpu().numpy().sum(1)
                # print(batch_loss)
                # print(td_errors)
                self.buffer.update_priorities(indices, np.abs(td_errors) + 1e-6)
            else:
                critic_loss = batch_loss.mean()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                if self.use_dist:
                    actor_loss, curr_log = self.learn_actor_dist(S)
                else:
                    actor_loss, curr_log = self.learn_actor(S)
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(self.log_alpha * (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        return pg_loss, q_loss, a_loss
class Rainbow(BasePolicy):  # option: double(done), dueling(todo), noisy(todo), n-step(todo)
    def __init__(self,
                 critic_net,
                 action_shape=0,
                 buffer_size=1000,
                 batch_size=100,
                 target_update_freq=1,
                 target_update_tau=1,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.epsilon = 0.5

        # tricks
        self.double_q = True
        self.dueling_q = True
        self.distributional_q = True
        self.prioritized_replay = True
        self.noisy_q = True
        self.n_step_td = True

        self.target_update_freq = target_update_freq
        self.action_shape = action_shape
        self._gamma = discount_factor
        self._batch_size = batch_size
        self._verbose = verbose
        self._update_iteration = 10
        self._learn_cnt = 0
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)
        self.rew_norm = True

        self.buffer = ReplayBuffer(buffer_size)

        self.critic_eval = critic_net.to(self.device)
        self.critic_target = deepcopy(self.critic_eval)
        self.critic_target.load_state_dict(self.critic_eval.state_dict())
        self.critic_eval.use_dueling = self.critic_target.use_dueling = self.dueling_q  # Dueling DQN
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.critic_eval.train()
        self.criterion = nn.MSELoss()

        self.random_choose = 0
        self.sum_choose = 0

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)
        q_values = self.critic_eval(state)
        action = q_values.argmax(dim=1).cpu().data.numpy()
        action = action[0] if self.action_shape == 0 else action.reshape(self.action_shape)  # return the argmax index

        if test:
            self.epsilon = 1.0
        if np.random.rand() >= self.epsilon:  # epsilon-greedy
            self.random_choose += 1
            action = np.random.randint(0, q_values.size()[-1])
            action = action if self.action_shape == 0 else action.reshape(self.action_shape)
        self.sum_choose += 1
        return action

    def learn(self):
        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(self._batch_size)  # s, a, r, s_
            S = torch.tensor(batch_split['s'], dtype=torch.float32, device=self.device)
            A = torch.tensor(batch_split['a'], dtype=torch.float32, device=self.device).view(-1, 1)
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'], dtype=torch.float32, device=self.device)
            # print (f'SIZE S {S.size()}, A {A.size()}, M {M.size()}, R {R.size()}, S_ {S_.size()}')

            if self.rew_norm:
                R = self._normalized(R, self.eps)

            with torch.no_grad():
                get_action_net = self.critic_eval if self.double_q else self.critic_target  # Double DQN
                argmax_action = get_action_net(S_).max(dim=1, keepdim=True)[1]
                q_next = self.critic_target(S_).gather(1, argmax_action.type(torch.long))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(self.device)

            q_eval = self.critic_eval(S).gather(1, A.type(torch.long))
            critic_loss = self.criterion(q_eval, q_target)

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_cnt += 1

            if self._learn_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of DQN=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
class PPO(BasePolicy):  # option: double
    def __init__(
            self,
            model,
            buffer_size=1000,
            actor_learn_freq=1,
            target_update_freq=0,
            target_update_tau=5e-3,
            learning_rate=0.0001,
            discount_factor=0.99,
            gae_lamda=0.95,  # td
            batch_size=100,
            verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau

        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = False  # normalize advantage, default=False
        self.rew_norm = False  # normalize reward, default=False
        self.schedule_clip = False
        self.schedule_adam = False

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._gae_lam = gae_lamda
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)

        self.buffer = ReplayBuffer(buffer_size, replay=False)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        # self.actor_eval.train()
        # self.critic_eval.train()

        if self._target:
            self.actor_target = self.copy_net(self.actor_eval)
            self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.SmoothL1Loss()

    def learn(self, i_episode=0, num_episode=100):
        if not self.buffer.is_full():
            print(f'Waiting for a full buffer: {len(self.buffer)}/{self.buffer.capacity()} ', end='\r')
            return 0, 0

        loss_actor_avg = 0
        loss_critic_avg = 0

        memory_split = self.buffer.split(self.buffer.all_memory())
        S = torch.tensor(memory_split['s'], dtype=torch.float32, device=device)
        A = torch.tensor(memory_split['a'], dtype=torch.float32, device=device).view(-1, 1)
        S_ = torch.tensor(memory_split['s_'], dtype=torch.float32, device=device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        Log = torch.tensor(memory_split['l'], dtype=torch.float32, device=device).view(-1, 1)
        # print (f'Size S {S.size()}, A {A.size()}, S_ {S_.size()}, R {R.size()}, Log {Log.size()}')
        # print (f'S {S}, A {A}, S_ {S_}, R {R}, Log {Log}')

        with torch.no_grad():
            v_evals = self.critic_eval(S).cpu().numpy()
            end_v_eval = self.critic_eval(S_[-1]).cpu().numpy()

        rewards = self._normalized(R, self.eps).numpy() if self.rew_norm else R.numpy()
        # rewards = rewards.cpu().numpy()
        adv_gae_td = self.GAE(rewards, v_evals, next_v_eval=end_v_eval,
                              gamma=self._gamma, lam=self._gae_lam)  # td_error adv
        advantage = torch.from_numpy(adv_gae_td).to(device).unsqueeze(-1)
        advantage = self._normalized(advantage, 1e-10) if self.adv_norm else advantage

        # indices = [i for i in range(len(self.buffer))]
        for _ in range(self._update_iteration):
            v_eval = self.critic_eval(S)
            v_target = advantage + v_eval.detach()
            critic_loss = self.criterion(v_eval, v_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                # actor_core
                mu, sigma = self.actor_eval(S)
                dist = Normal(mu, sigma)
                new_log_prob = dist.log_prob(A)

                pg_ratio = torch.exp(new_log_prob - Log)  # size = [batch_size, 1]
                clipped_pg_ratio = torch.clamp(pg_ratio, 1.0 - self.ratio_clip, 1.0 + self.ratio_clip)
                surrogate_loss = -torch.min(pg_ratio * advantage,
                                            clipped_pg_ratio * advantage).mean()
                # policy entropy
                loss_entropy = -torch.mean(torch.exp(new_log_prob) * new_log_prob)

                actor_loss = surrogate_loss - self.lam_entropy * loss_entropy
                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print(f'=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print(f'=======Soft_sync_weight of PPO=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        self.buffer.clear()
        assert self.buffer.is_empty()

        # update param
        ep_ratio = 1 - (i_episode / num_episode)
        if self.schedule_clip:
            self.ratio_clip = 0.2 * ep_ratio
        if self.schedule_adam:
            new_lr = self.lr * ep_ratio
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        print(f'critic_cnt {self._learn_critic_cnt}, actor_cnt {self._learn_actor_cnt}')
        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration

        return loss_actor_avg, loss_critic_avg
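# PPO, A2CPolicy and A2C all call self.GAE(...) from BasePolicy, which is not shown in this file.
# The function below is a minimal sketch of a standard generalized-advantage-estimation recursion
# with the same signature as the call sites (rewards, v_evals, next_v_eval, masks, gamma, lam);
# treat the body as an assumption, not the repository's actual implementation. With lam=1 and
# masks cutting bootstrapping at terminals it reduces to the Monte-Carlo advantage used by A2C.
import numpy as np


def GAE(rewards, v_evals, next_v_eval=0, masks=None, gamma=0.99, lam=0.95):
    """A_t = sum_k (gamma * lam)^k * delta_{t+k}, delta_t = r_t + gamma * V(s_{t+1}) * m_t - V(s_t)."""
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
    v_evals = np.asarray(v_evals, dtype=np.float32).reshape(-1)
    masks = np.ones_like(rewards) if masks is None else np.asarray(masks, dtype=np.float32).reshape(-1)
    values = np.append(v_evals, np.float32(np.ravel(next_v_eval)[0]))  # bootstrap with V(s_T)
    adv = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        adv[t] = gae
    return adv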
class A2CPolicy(BasePolicy):  # option: double
    def __init__(self,
                 actor_net,
                 critic_net,
                 buffer_size=1000,
                 actor_learn_freq=1,
                 target_update_freq=0,
                 target_update_tau=5e-3,
                 learning_rate=0.01,
                 discount_factor=0.99,
                 gae_lamda=1,
                 verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.gae_lamda = gae_lamda

        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._sync_cnt = 0
        # self._learn_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose

        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())
            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        if test:
            self.actor_eval.eval()
            return Categorical(self.actor_eval(state)).sample().item(), 0

        dist = self.actor_eval(state)
        m = Categorical(dist)
        action = m.sample()
        log_prob = m.log_prob(action)
        state_value = self.critic_eval(state)
        return action.item(), log_prob

    def learn(self):
        memory_split = self.buffer.split(self.buffer.all_memory())  # s, r, l, m
        S = torch.tensor(memory_split['s'], dtype=torch.float32, device=self.device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        M = torch.tensor(memory_split['m'], dtype=torch.float32).view(-1, 1)
        Log = torch.stack(memory_split['l']).view(-1, 1)

        v_eval = self.critic_eval(S)
        v_evals = v_eval.detach().cpu().numpy()
        rewards = R.numpy()
        masks = M.numpy()
        adv_gae_mc = self.GAE(rewards, v_evals, next_v_eval=0, masks=masks,
                              gamma=self._gamma, lam=self.gae_lamda)  # MC adv
        advantage = torch.from_numpy(adv_gae_mc).to(self.device).reshape(-1, 1)
        v_target = advantage + v_eval.detach()

        # critic_core
        critic_loss = self.criterion(v_eval, v_target)
        self.critic_eval_optim.zero_grad()
        critic_loss.backward()
        self.critic_eval_optim.step()
        self._learn_critic_cnt += 1

        if self._learn_critic_cnt % self.actor_learn_freq == 0:
            # actor_core
            actor_loss = (-Log * advantage).sum()
            self.actor_eval.train()
            self.actor_eval_optim.zero_grad()
            actor_loss.backward()
            self.actor_eval_optim.step()
            self._learn_actor_cnt += 1

        if self._target:
            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self._verbose:
                    print(f'=======Soft_sync_weight of AC=======')
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)
                self._sync_cnt += 1

        self.buffer.clear()
        assert self.buffer.is_empty()

    def process(self, **kwargs):
        self.buffer.append(**kwargs)
def __init__(
        self,
        model,
        buffer_size=1e6,
        batch_size=256,
        policy_freq=2,
        tau=0.005,
        discount=0.99,
        policy_lr=3e-4,
        value_lr=3e-4,
        learn_iteration=1,
        verbose=False,
        act_dim=None,
        n_step=1,
        use_munchausen=False,
        use_priority=False,
        use_dist_q=False,
        use_PAL=False,
):
    super().__init__()
    self.tau = tau
    self.gamma = discount
    self.policy_freq = policy_freq
    self.learn_iteration = learn_iteration
    self.verbose = verbose
    self.act_dim = act_dim
    self.batch_size = batch_size

    self.use_dist_q = use_dist_q
    self.use_priority = use_priority
    self.use_munchausen = use_munchausen
    self.use_PAL = use_PAL
    assert not (self.use_priority and self.use_PAL)

    self.buffer = ReplayBuffer(buffer_size)
    if self.use_priority:
        self.buffer = PriorityReplayBuffer(buffer_size, gamma=discount, n_step=n_step)

    self.actor_eval = model.policy_net.to(device).train()
    self.critic_eval = model.value_net.to(device).train()
    self.actor_target = self.copy_net(self.actor_eval)
    self.critic_target = self.copy_net(self.critic_eval)
    self.actor_eval_optim = torch.optim.Adam(self.actor_eval.parameters(), lr=policy_lr)
    self.critic_eval_optim = torch.optim.Adam(self.critic_eval.parameters(), lr=value_lr)
    self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim

    self.target_entropy = -torch.tensor(1).to(device)
    self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
    self.alpha_optim = torch.optim.Adam([self.log_alpha], lr=policy_lr)
    self.alpha = self.log_alpha.exp()

    self.eps = np.finfo(np.float32).eps.item()
    self._learn_critic_cnt = 0
    self._learn_actor_cnt = 0
class A2C(BasePolicy): #option: double def __init__( self, model, buffer_size=1000, learning_rate=1e-3, discount_factor=0.99, gae_lamda=1, # mc verbose=False, num_episodes=1000, ): super().__init__() self.lr = learning_rate self.end_lr = self.lr * 0.1 self.eps = np.finfo(np.float32).eps.item() self._gamma = discount_factor self._gae_lamda = gae_lamda # default: 1, MC self._learn_cnt = 0 self._verbose = verbose self.schedule_adam = True self.buffer = ReplayBuffer(buffer_size, replay=False) self.actor_eval = model.policy_net.to(device).train() self.critic_eval = model.value_net.to(device).train() self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr) self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr) self.criterion = nn.SmoothL1Loss() self.num_episodes = num_episodes def learn(self): pg_loss, v_loss = 0, 0 mem = self.buffer.split(self.buffer.all_memory()) # s, r, l, m S = torch.tensor(mem['s'], dtype=torch.float32, device=device) R = torch.tensor(mem['r'], dtype=torch.float32).view(-1, 1) M = torch.tensor(mem['m'], dtype=torch.float32).view(-1, 1) # Log = torch.stack(list(mem['l'])).view(-1, 1) Log = torch.stack(mem['l']).view(-1, 1) v_eval = self.critic_eval(S) v_evals = v_eval.detach().cpu().numpy() rewards = R.numpy() masks = M.numpy() adv_gae_mc = self.GAE(rewards, v_evals, next_v_eval=0, masks=masks, gamma=self._gamma, lam=self._gae_lamda) # MC adv advantage = torch.from_numpy(adv_gae_mc).to(device).reshape(-1, 1) # critic_core v_target = advantage + v_eval.detach() critic_loss = self.criterion(v_eval, v_target) # actor_core actor_loss = (-Log * advantage).sum() self.critic_eval_optim.zero_grad() critic_loss.backward() self.critic_eval_optim.step() self.actor_eval_optim.zero_grad() actor_loss.backward() self.actor_eval_optim.step() v_loss += critic_loss.item() pg_loss += actor_loss.item() self._learn_cnt += 1 self.buffer.clear() assert self.buffer.is_empty() if self.schedule_adam: new_lr = self.lr + (self.end_lr - self.lr) / self.num_episodes * self._learn_cnt # set learning rate # ref: https://stackoverflow.com/questions/48324152/ for g in self.actor_eval_optim.param_groups: g['lr'] = new_lr for g in self.critic_eval_optim.param_groups: g['lr'] = new_lr return pg_loss, v_loss
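# --- GAE reference sketch (added; BasePolicy.GAE itself is not shown here) ----
# One plausible implementation of the GAE call used in A2C.learn() above: with
# lam=1 the advantages reduce to Monte-Carlo returns minus the value baseline,
# with lam=0 to one-step TD errors. Shapes follow the numpy arrays built in learn().
import numpy as np

def _gae_example(rewards, values, next_value=0.0, masks=None, gamma=0.99, lam=1.0):
    rewards = np.asarray(rewards, dtype=np.float32).flatten()
    values = np.append(np.asarray(values, dtype=np.float32).flatten(), next_value)
    masks = np.ones_like(rewards) if masks is None else np.asarray(masks, dtype=np.float32).flatten()
    advantages = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t] - values[t]
        gae = delta + gamma * lam * masks[t] * gae
        advantages[t] = gae
    return advantages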
class DDPG(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        actor_learn_freq=1,
        target_update_freq=0,
        target_update_tau=1,
        learning_rate=1e-3,
        discount_factor=0.99,
        batch_size=100,
        update_iteration=10,
        verbose=False,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)

        self.actor_eval = model.policy_net.to(device).train()    # pi(s)
        self.critic_eval = model.value_net.to(device).train()    # Q(s, a)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)

        if self._target:
            self.actor_target = self.copy_net(self.actor_eval)
            self.critic_target = self.copy_net(self.critic_eval)

        self.criterion = nn.MSELoss()  # plain MSE on the one-step Bellman error, as in the original DDPG

    def learn(self):
        loss_actor_avg, loss_critic_avg = 0, 0
        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(self._batch_size)
            S = torch.tensor(batch_split['s'], dtype=torch.float32, device=device)   # [batch_size, S.feature_size]
            A = torch.tensor(batch_split['a'], dtype=torch.float32, device=device).view(-1, 1)  # [batch_size, 1]
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'], dtype=torch.float32, device=device)

            with torch.no_grad():
                q_next = self.critic_eval(S_, self.actor_eval(S_))
                if self._target:
                    q_next = self.critic_target(S_, self.actor_target(S_))
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            q_eval = self.critic_eval(S, A)  # [batch_size, q_value_size]
            critic_loss = self.criterion(q_eval, q_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean()
                loss_actor_avg += actor_loss.item()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print('=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print('=======Soft_sync_weight of DDPG=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration
        return loss_actor_avg, loss_critic_avg
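# --- Soft target update sketch (added; BasePolicy.soft_sync_weight is not shown) ---
# The Polyak-averaging form it presumably implements:
#     theta_target <- tau * theta_eval + (1 - tau) * theta_target
import torch

@torch.no_grad()
def _soft_sync_weight_example(target_net, eval_net, tau=0.01):
    for t_param, e_param in zip(target_net.parameters(), eval_net.parameters()):
        t_param.data.copy_(tau * e_param.data + (1.0 - tau) * t_param.data)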
class SACV(BasePolicy):
    def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        update_iteration=10,
        verbose=False,
        use_priority=False,
        act_dim=None,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.use_priority = use_priority
        self.use_dist = model.value_net.use_dist

        if self.use_priority:
            self.buffer = PriorityReplayBuffer(buffer_size)
        else:
            self.buffer = ReplayBuffer(buffer_size)  # off-policy

        if self.use_dist:
            assert model.value_net.num_atoms > 1
            # assert isinstance(model.value_net, CriticModelDist)
            self.v_min = model.value_net.v_min
            self.v_max = model.value_net.v_max
            self.num_atoms = model.value_net.num_atoms
            self.delta_z = (self.v_max - self.v_min) / (self.num_atoms - 1)
            self.support = torch.linspace(self.v_min, self.v_max, self.num_atoms)

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.criterion = nn.SmoothL1Loss(reduction='none')  # keep batch dim

        self.act_dim = act_dim
        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def _tensor(self, data, use_cuda=False):
        if np.array(data).ndim == 1:
            data = torch.tensor(data, dtype=torch.float32).view(-1, 1)
        else:
            data = torch.tensor(data, dtype=torch.float32)
        if use_cuda:
            data = data.to(device)
        return data

    def learn_dist(self, obs, act, rew, next_obs, mask):
        with torch.no_grad():
            next_act, next_log_pi = self.actor_target(next_obs)
            # q(s, a) changes to z(s, a) to describe a distribution
            z1_next, z2_next = self.critic_target.get_probs(next_obs, next_act)  # [batch_size, num_atoms]
            p_next = torch.stack([
                torch.where(z1.sum() < z2.sum(), z1, z2)
                for z1, z2 in zip(z1_next, z2_next)
            ])
            p_next -= (self.alpha * next_log_pi)

            Tz = rew.unsqueeze(1) + mask * self.support.unsqueeze(0)
            Tz = Tz.clamp(min=self.v_min, max=self.v_max)
            b = (Tz - self.v_min) / self.delta_z
            l, u = b.floor().to(torch.int64), b.ceil().to(torch.int64)
            l[(u > 0) * (l == u)] -= 1
            u[(l < (self.num_atoms - 1)) * (l == u)] += 1

            m = obs.new_zeros(self._batch_size, self.num_atoms).cpu()
            p_next = p_next.cpu()
            offset = torch.linspace(0, ((self._batch_size - 1) * self.num_atoms),
                                    self._batch_size).unsqueeze(1).expand(
                                        self._batch_size, self.num_atoms).to(l)
            m.view(-1).index_add_(0, (l + offset).view(-1),
                                  (p_next * (u.float() - b)).view(-1))  # m_l = m_l + p(s_t+n, a*)(u - b)
            m.view(-1).index_add_(0, (u + offset).view(-1),
                                  (p_next * (b - l.float())).view(-1))  # m_u = m_u + p(s_t+n, a*)(b - l)
            m = m.to(device)

        log_z1, log_z2 = self.critic_eval.get_probs(obs, act, log=True)
        loss1 = -(m * log_z1).sum(dim=1)
        loss2 = -(m * log_z2).sum(dim=1)
        batch_loss = 0.5 * (loss1 + loss2)
        return batch_loss

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            if self.use_priority:
                # s_{t}, n-step_rewards, s_{t+n}
                tree_idxs, S, A, R, S_, M, weights = self.buffer.sample(self._batch_size)
                W = torch.tensor(weights, dtype=torch.float32, device=device).view(-1, 1)
            else:
                batch_split = self.buffer.split_batch(self._batch_size)
                S, A, M, R, S_ = (batch_split['s'], batch_split['a'], batch_split['m'],
                                  batch_split['r'], batch_split['s_'])

            if self.act_dim is None:
                # infer the action dimension from the sampled batch
                self.act_dim = A.shape[-1]
                self.target_entropy = -torch.tensor(self.act_dim).to(device)

            R = torch.tensor(R, dtype=torch.float32).view(-1, 1)
            S = torch.tensor(S, dtype=torch.float32, device=device)
            A = torch.tensor(A, dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(M, dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(S_, dtype=torch.float32, device=device)

            if self.use_dist:
                batch_loss = self.learn_dist(S, A, R, S_, M)
            else:
                with torch.no_grad():
                    next_A, next_log = self.actor_target.evaluate(S_)
                    q1_next, q2_next = self.critic_target(S_, next_A)
                    q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                    q_target = R + M * self._gamma * q_next.cpu()
                    q_target = q_target.to(device)
                # q_loss
                q1_eval, q2_eval = self.critic_eval(S, A)
                loss1 = self.criterion(q1_eval, q_target)
                loss2 = self.criterion(q2_eval, q_target)
                batch_loss = 0.5 * (loss1 + loss2)

            if self.use_priority:
                critic_loss = (W * batch_loss).mean()
                self.buffer.update_priorities(
                    tree_idxs, np.abs(batch_loss.detach().cpu().numpy()) + 1e-6)
            else:
                critic_loss = batch_loss.mean()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                if self.use_dist:
                    z1_next, z2_next = self.critic_eval.get_probs(S, curr_A)
                    p_next = torch.stack([
                        torch.where(z1.sum() < z2.sum(), z1, z2)
                        for z1, z2 in zip(z1_next, z2_next)
                    ])
                    num_atoms = torch.tensor(self.num_atoms, dtype=torch.float32, device=device)
                    # actor_loss = p_next * num_atoms
                    # actor_loss = torch.sum(actor_loss, dim=1)
                    # actor_loss = -(actor_loss + self.alpha * curr_log).mean()
                    actor_loss = (self.alpha * curr_log - p_next)
                    actor_loss = torch.sum(actor_loss, dim=1)
                    actor_loss = actor_loss.mean()
                else:
                    q1_next, q2_next = self.critic_eval(S, curr_A)
                    q_next = torch.min(q1_next, q2_next)
                    # pg_loss
                    actor_loss = (self.alpha * curr_log - q_next).mean()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(self.log_alpha *
                               (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        return pg_loss, q_loss, a_loss
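# --- Categorical projection sketch (added; restates the index_add_ block in
# learn_dist above with explicit loops). Each target atom Tz = r + m * z_j is
# split between its two neighbouring support atoms; as in learn_dist, the mask
# is assumed to already carry the discount/done factor. Assumed shapes:
# next_probs [batch, num_atoms], rewards/masks [batch], support [num_atoms].
import torch

def _project_distribution_example(next_probs, rewards, masks, support, v_min, v_max):
    batch_size, num_atoms = next_probs.shape
    delta_z = (v_max - v_min) / (num_atoms - 1)
    m = torch.zeros(batch_size, num_atoms)
    for i in range(batch_size):
        for j in range(num_atoms):
            tz = (rewards[i] + masks[i] * support[j]).clamp(v_min, v_max)
            b = (tz - v_min) / delta_z
            l, u = int(b.floor()), int(b.ceil())
            if l == u:        # target falls exactly on an atom
                m[i, l] += next_probs[i, j]
            else:             # split the probability between the two neighbours
                m[i, l] += next_probs[i, j] * (u - b)
                m[i, u] += next_probs[i, j] * (b - l)
    return m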
class DDPGPolicy(BasePolicy): def __init__(self, actor_net, critic_net, buffer_size=1000, actor_learn_freq=1, target_update_freq=0, target_update_tau=5e-3, learning_rate=0.01, discount_factor=0.99, batch_size=100, verbose=False): super().__init__() self.lr = learning_rate self.eps = np.finfo(np.float32).eps.item() self.tau = target_update_tau self.actor_learn_freq = actor_learn_freq self.target_update_freq = target_update_freq self._gamma = discount_factor self._target = target_update_freq > 0 self._update_iteration = 10 self._sync_cnt = 0 # self._learn_cnt = 0 self._learn_critic_cnt = 0 self._learn_actor_cnt = 0 self._verbose = verbose self._batch_size = batch_size self.replay_buffer = ReplayBuffer(buffer_size) # assert buffer.allow_replay, 'DDPG buffer must be replay buffer' self.device = torch.device( "cuda" if torch.cuda.is_available() else "cpu") self.actor_eval = actor_net.to(self.device) # pi(s) self.critic_eval = critic_net.to(self.device) # Q(s, a) self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr) self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr) self.actor_eval.train() self.critic_eval.train() if self._target: self.actor_target = deepcopy(self.actor_eval) self.critic_target = deepcopy(self.critic_eval) self.actor_target.load_state_dict(self.actor_eval.state_dict()) self.critic_target.load_state_dict(self.critic_eval.state_dict()) self.actor_target.eval() self.critic_target.eval() self.criterion = nn.MSELoss() # why mse? def choose_action(self, state, test=False): state = torch.tensor(state, dtype=torch.float32, device=self.device) if test: self.actor_eval.eval() action = self.actor_eval(state) # out = tanh(x) action = action.clamp(-1, 1) return action.item() def learn(self): loss_actor_avg = 0 loss_critic_avg = 0 for _ in range(self._update_iteration): memory_batch = self.replay_buffer.random_sample(self._batch_size) batch_split = self.replay_buffer.split(memory_batch) S = torch.tensor( batch_split['s'], dtype=torch.float32, device=self.device) # [batch_size, S.feature_size] A = torch.tensor(batch_split['a'], dtype=torch.float32, device=self.device).unsqueeze( -1) # [batch_size, 1] S_ = torch.tensor(batch_split['s_'], dtype=torch.float32, device=self.device) R = torch.tensor(batch_split['r'], dtype=torch.float32, device=self.device).unsqueeze(-1) with torch.no_grad(): q_target = self.critic_eval(S_, self.actor_eval(S_)) if self._target: q_target = self.critic_target(S_, self.actor_target(S_)) q_target = R + self._gamma * q_target print( f'SIZE S {S.size()}, A {A.size()}, S_ {S_.size()}, R {R.size()}' ) q_eval = self.critic_eval(S, A) # [batch_size, q_value_size] critic_loss = self.criterion(q_eval, q_target) loss_critic_avg += critic_loss.item() self.critic_eval_optim.zero_grad() critic_loss.backward() self.critic_eval_optim.step() self._learn_critic_cnt += 1 if self._learn_critic_cnt % self.actor_learn_freq == 0: actor_loss = -self.critic_eval(S, self.actor_eval(S)).mean() loss_actor_avg += actor_loss.item() self.actor_eval_optim.zero_grad() actor_loss.backward() self.actor_eval_optim.step() self._learn_actor_cnt += 1 if self._verbose: print(f'=======Learn_Actort_Net=======') if self._target: if self._learn_critic_cnt % self.target_update_freq == 0: if self._verbose: print(f'=======Soft_sync_weight of DDPG=======') self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau) self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau) loss_actor_avg /= (self._update_iteration / self.actor_learn_freq) 
loss_critic_avg /= self._update_iteration return loss_actor_avg, loss_critic_avg def process(self, **kwargs): self.replay_buffer.append(**kwargs)
class SAC(BasePolicy):  # combines SAC1 and SAC2
    def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=1e-2,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
        act_dim=None,
        alpha=None  # default: auto_entropy_tuning
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.criterion = nn.SmoothL1Loss()

        self.act_dim = act_dim
        self.alpha = alpha
        self.auto_entropy_tuning = True
        if self.alpha:
            # fixed temperature: fall back to the SAC-v1 style state-value network
            self.auto_entropy_tuning = False
            self.value_eval = model.v_net.to(device).train()
            self.value_target = self.copy_net(self.value_eval)
            self.value_eval_optim = optim.Adam(self.value_eval.parameters(), lr=self.lr)
        else:
            self.target_entropy = -torch.tensor(1).to(device)
            self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
            self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
            self.alpha = self.log_alpha.exp()

    def learn(self):
        pg_loss, q_loss, loss = 0, 0, 0  # `loss` collects the alpha loss or the value loss
        for _ in range(self._update_iteration):
            batch = self.buffer.split_batch(self._batch_size)
            if self.act_dim is None:
                self.act_dim = np.array(batch['a']).shape[-1]
                if self.auto_entropy_tuning:
                    self.target_entropy = -torch.tensor(self.act_dim).to(device)

            S = torch.tensor(batch['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch['a'], dtype=torch.float32, device=device).view(-1, self.act_dim)
            M = torch.tensor(batch['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch['s_'], dtype=torch.float32, device=device)
            if self._verbose:
                print(f'shape S:{S.size()}, A:{A.size()}, M:{M.size()}, R:{R.size()}, S_:{S_.size()}')

            if self.auto_entropy_tuning:
                with torch.no_grad():
                    next_A, next_log = self.actor_target.evaluate(S_)
                    q1_next, q2_next = self.critic_target(S_, next_A)
                    q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
            else:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                # v_loss
                with torch.no_grad():
                    q1, q2 = self.critic_eval(S, curr_A)
                    v_target = torch.min(q1, q2) - self.alpha * curr_log
                    q_next = self.value_target(S_)
                v_eval = self.value_eval(S)
                value_loss = self.criterion(v_eval, v_target)
                # update V
                self.value_eval_optim.zero_grad()
                value_loss.backward()
                self.value_eval_optim.step()

            q_target = R + M * self._gamma * q_next.cpu()
            q_target = q_target.to(device)
            q1_eval, q2_eval = self.critic_eval(S, A)
            loss1 = self.criterion(q1_eval, q_target)
            loss2 = self.criterion(q2_eval, q_target)
            critic_loss = 0.5 * (loss1 + loss2)
            # update soft Q
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                if self.auto_entropy_tuning:
                    curr_A, curr_log = self.actor_eval.evaluate(S)
                    q1_next, q2_next = self.critic_eval(S, curr_A)
                    q_eval_next = torch.min(q1_next, q2_next)
                    # alpha loss
                    alpha_loss = -(self.log_alpha *
                                   (curr_log + self.target_entropy).detach()).mean()
                    self.alpha_optim.zero_grad()
                    alpha_loss.backward()
                    self.alpha_optim.step()
                    self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())
                else:
                    q_eval_next = torch.min(q1, q2)
                # pg_loss
                actor_loss = (self.alpha * curr_log - q_eval_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                if self.auto_entropy_tuning:
                    self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)
                else:
                    self.soft_sync_weight(self.value_target, self.value_eval, self.tau)

            q_loss += critic_loss.item()
            pg_loss += actor_loss.item()
            if self.auto_entropy_tuning:
                loss += alpha_loss.item()
            else:
                loss += value_loss.item()
        return pg_loss, q_loss, loss
class SAC2(BasePolicy):  # no value network
    def __init__(
        self,
        model,
        buffer_size=1000,
        batch_size=100,
        actor_learn_freq=1,
        target_update_freq=5,
        target_update_tau=0.01,
        learning_rate=1e-3,
        discount_factor=0.99,
        verbose=False,
        update_iteration=10,
    ):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = update_iteration
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size)  # off-policy

        self.actor_eval = model.policy_net.to(device).train()
        self.critic_eval = model.value_net.to(device).train()
        self.actor_target = self.copy_net(self.actor_eval)
        self.critic_target = self.copy_net(self.critic_eval)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.criterion = nn.SmoothL1Loss()

        self.target_entropy = -torch.tensor(1).to(device)
        self.log_alpha = torch.zeros(1, requires_grad=True, device=device)
        self.alpha_optim = optim.Adam([self.log_alpha], lr=self.lr)
        self.alpha = self.log_alpha.exp()

    def learn(self):
        pg_loss, q_loss, a_loss = 0, 0, 0
        for _ in range(self._update_iteration):
            batch_split = self.buffer.split_batch(self._batch_size)
            S = torch.tensor(batch_split['s'], dtype=torch.float32, device=device)
            A = torch.tensor(batch_split['a'], dtype=torch.float32, device=device).view(-1, 1)
            M = torch.tensor(batch_split['m'], dtype=torch.float32).view(-1, 1)
            R = torch.tensor(batch_split['r'], dtype=torch.float32).view(-1, 1)
            S_ = torch.tensor(batch_split['s_'], dtype=torch.float32, device=device)

            with torch.no_grad():
                next_A, next_log = self.actor_target.evaluate(S_)
                q1_next, q2_next = self.critic_target(S_, next_A)
                q_next = torch.min(q1_next, q2_next) - self.alpha * next_log
                q_target = R + M * self._gamma * q_next.cpu()
                q_target = q_target.to(device)

            # q_loss
            q1_eval, q2_eval = self.critic_eval(S, A)
            critic_loss = self.criterion(q1_eval, q_target) + self.criterion(q2_eval, q_target)
            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            actor_loss = torch.tensor(0)
            alpha_loss = torch.tensor(0)
            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                curr_A, curr_log = self.actor_eval.evaluate(S)
                q1_next, q2_next = self.critic_eval(S, curr_A)
                q_next = torch.min(q1_next, q2_next)
                # pg_loss
                actor_loss = (self.alpha * curr_log - q_next).mean()
                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()

                # alpha loss
                alpha_loss = -(self.log_alpha *
                               (curr_log + self.target_entropy).detach()).mean()
                self.alpha_optim.zero_grad()
                alpha_loss.backward()
                self.alpha_optim.step()
                self.alpha = float(self.log_alpha.exp().detach().cpu().numpy())

            q_loss += critic_loss.item() * 0.5
            pg_loss += actor_loss.item()
            a_loss += alpha_loss.item()

            if self._learn_critic_cnt % self.target_update_freq == 0:
                self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)
        return pg_loss, q_loss, a_loss
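# --- Automatic temperature tuning (added sketch; restates the alpha update used
# in the SAC variants above). log_alpha is trained so the policy entropy tracks
# target_entropy (typically -act_dim): when the current entropy -E[log pi] drops
# below the target, alpha is pushed up and the entropy bonus gets more weight.
import torch

def _alpha_update_example(log_probs, target_entropy, log_alpha, alpha_optim):
    alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
    alpha_optim.zero_grad()
    alpha_loss.backward()
    alpha_optim.step()
    return log_alpha.exp().item()  # new alpha fed back into the actor/critic losses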
class PPOPolicy(BasePolicy):  # option: double
    def __init__(self, actor_net, critic_net, buffer_size=1000, actor_learn_freq=1,
                 target_update_freq=0, target_update_tau=5e-3, learning_rate=0.0001,
                 discount_factor=0.99, batch_size=100, verbose=False):
        super().__init__()
        self.lr = learning_rate
        self.eps = np.finfo(np.float32).eps.item()
        self.tau = target_update_tau
        self.ratio_clip = 0.2
        self.lam_entropy = 0.01
        self.adv_norm = True
        self.rew_norm = False
        self.schedule_clip = False
        self.schedule_adam = False
        self.actor_learn_freq = actor_learn_freq
        self.target_update_freq = target_update_freq
        self._gamma = discount_factor
        self._target = target_update_freq > 0
        self._update_iteration = 10
        self._sync_cnt = 0
        self._learn_critic_cnt = 0
        self._learn_actor_cnt = 0
        self._verbose = verbose
        self._batch_size = batch_size
        self.buffer = ReplayBuffer(buffer_size, replay=False)
        # assert not self.buffer.allow_replay, 'PPO buffer cannot be replay buffer'
        self._normalized = lambda x, e: (x - x.mean()) / (x.std() + e)

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.actor_eval = actor_net.to(self.device)
        self.critic_eval = critic_net.to(self.device)
        self.actor_eval_optim = optim.Adam(self.actor_eval.parameters(), lr=self.lr)
        self.critic_eval_optim = optim.Adam(self.critic_eval.parameters(), lr=self.lr)
        self.actor_eval.train()
        self.critic_eval.train()

        if self._target:
            self.actor_target = deepcopy(self.actor_eval)
            self.critic_target = deepcopy(self.critic_eval)
            self.actor_target.load_state_dict(self.actor_eval.state_dict())
            self.critic_target.load_state_dict(self.critic_eval.state_dict())
            self.actor_target.eval()
            self.critic_target.eval()

        self.criterion = nn.SmoothL1Loss()

    def choose_action(self, state, test=False):
        state = torch.tensor(state, dtype=torch.float32, device=self.device)
        if test:
            self.actor_eval.eval()
        with torch.no_grad():
            mu, sigma = self.actor_eval(state)
            dist = Normal(mu, sigma)
            action = dist.sample()
            action = action.clamp(-2, 2)
            log_prob = dist.log_prob(action)
        assert abs(action.item()) <= 2, f'ERROR: action out of {action}'
        return action.item(), log_prob.item()

    def get_batchs_indices(self, buffer_size, batch_size, replace=True, batch_num=None):
        indices = [i for i in range(buffer_size)]
        if replace:
            # sampling with replacement across batches (each batch itself is drawn without duplicates)
            if not batch_num:
                batch_num = round(buffer_size / batch_size + 0.5) * 2
            return [
                np.random.choice(indices, batch_size, replace=False)
                for _ in range(batch_num)
            ]
        else:
            # sampling without replacement: shuffle once and slice into batches
            np.random.shuffle(indices)
            return [
                indices[i:i + batch_size]
                for i in range(0, buffer_size, batch_size)
            ]

    def learn(self, i_episode=0, num_episode=100):
        if not self.buffer.is_full():
            print(f'Waiting for a full buffer: {len(self.buffer)}/{self.buffer.capacity()}', end='\r')
            return 0, 0

        loss_actor_avg = 0
        loss_critic_avg = 0

        memory_split = self.buffer.split(self.buffer.all_memory())
        S = torch.tensor(memory_split['s'], dtype=torch.float32, device=self.device)
        A = torch.tensor(memory_split['a'], dtype=torch.float32, device=self.device).view(-1, 1)
        S_ = torch.tensor(memory_split['s_'], dtype=torch.float32, device=self.device)
        R = torch.tensor(memory_split['r'], dtype=torch.float32).view(-1, 1)
        Log = torch.tensor(memory_split['l'], dtype=torch.float32, device=self.device).view(-1, 1)

        with torch.no_grad():
            v_evals = self.critic_eval(S).cpu().numpy()
            end_v_eval = self.critic_eval(S_[-1]).cpu().numpy()

        rewards = self._normalized(R, self.eps).numpy() if self.rew_norm else R.numpy()
        adv_gae_td = self.GAE(rewards, v_evals, next_v_eval=end_v_eval,
                              gamma=self._gamma, lam=0)  # td_error adv
        advantage = torch.from_numpy(adv_gae_td).to(self.device).unsqueeze(-1)
        advantage = self._normalized(advantage, 1e-10) if self.adv_norm else advantage

        for _ in range(self._update_iteration):
            v_eval = self.critic_eval(S)
            v_target = advantage + v_eval.detach()
            critic_loss = self.criterion(v_eval, v_target)
            loss_critic_avg += critic_loss.item()

            self.critic_eval_optim.zero_grad()
            critic_loss.backward()
            self.critic_eval_optim.step()
            self._learn_critic_cnt += 1

            if self._learn_critic_cnt % self.actor_learn_freq == 0:
                # actor_core
                mu, sigma = self.actor_eval(S)
                dist = Normal(mu, sigma)
                new_log_prob = dist.log_prob(A)
                pg_ratio = torch.exp(new_log_prob - Log)  # size = [batch_size, 1]
                clipped_pg_ratio = torch.clamp(pg_ratio, 1.0 - self.ratio_clip, 1.0 + self.ratio_clip)
                surrogate_loss = -torch.min(pg_ratio * advantage,
                                            clipped_pg_ratio * advantage).mean()
                # policy entropy
                loss_entropy = -torch.mean(torch.exp(new_log_prob) * new_log_prob)
                actor_loss = surrogate_loss - self.lam_entropy * loss_entropy
                loss_actor_avg += actor_loss.item()

                self.actor_eval_optim.zero_grad()
                actor_loss.backward()
                self.actor_eval_optim.step()
                self._learn_actor_cnt += 1
                if self._verbose:
                    print('=======Learn_Actor_Net=======')

            if self._target:
                if self._learn_critic_cnt % self.target_update_freq == 0:
                    if self._verbose:
                        print('=======Soft_sync_weight of PPO=======')
                    self.soft_sync_weight(self.critic_target, self.critic_eval, self.tau)
                    self.soft_sync_weight(self.actor_target, self.actor_eval, self.tau)

        self.buffer.clear()
        assert self.buffer.is_empty()

        # update param
        ep_ratio = 1 - (i_episode / num_episode)
        if self.schedule_clip:
            self.ratio_clip = 0.2 * ep_ratio
        if self.schedule_adam:
            new_lr = self.lr * ep_ratio
            # set learning rate
            # ref: https://stackoverflow.com/questions/48324152/
            for g in self.actor_eval_optim.param_groups:
                g['lr'] = new_lr
            for g in self.critic_eval_optim.param_groups:
                g['lr'] = new_lr

        if self._verbose:
            print(f'critic_cnt {self._learn_critic_cnt}, actor_cnt {self._learn_actor_cnt}')
        loss_actor_avg /= (self._update_iteration / self.actor_learn_freq)
        loss_critic_avg /= self._update_iteration
        return loss_actor_avg, loss_critic_avg

    def process(self, **kwargs):
        self.buffer.append(**kwargs)
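# --- Clipped surrogate sketch (added; restates the actor_core block above) ----
# With r = exp(new_log_prob - old_log_prob), PPO maximizes
# min(r * A, clip(r, 1 - eps, 1 + eps) * A), so once the ratio leaves the clip
# range on the favourable side the sample contributes no further gradient.
import torch

def _ppo_clip_example(new_log_prob, old_log_prob, advantage, ratio_clip=0.2):
    ratio = torch.exp(new_log_prob - old_log_prob)
    clipped = torch.clamp(ratio, 1.0 - ratio_clip, 1.0 + ratio_clip)
    return -torch.min(ratio * advantage, clipped * advantage).mean()  # loss to minimize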