import numpy as np
import torch

# AgentBase, ActorPPO and CriticAdv are defined elsewhere in the repository.


class AgentPPO(AgentBase):
    def __init__(self):
        super().__init__()
        self.ratio_clip = 0.3  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.01  # could be 0.02
        self.lambda_gae_adv = 0.98  # could be 0.95~0.99, GAE (Generalized Advantage Estimation, ICLR 2016)
        self.if_use_gae = False  # whether to use Generalized Advantage Estimation
        self.if_on_policy = True  # AgentPPO is an on-policy DRL algorithm
        self.noise = None
        self.optimizer = None
        self.compute_reward = None  # set in init() to compute_reward_adv or compute_reward_gae

    def init(self, net_dim, state_dim, action_dim):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.compute_reward = self.compute_reward_gae if self.if_use_gae else self.compute_reward_adv

        self.act = ActorPPO(net_dim, state_dim, action_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = torch.nn.SmoothL1Loss()  # critic loss; needed by update_net (the variant below sets it explicitly as well)
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': self.learning_rate},
                                           {'params': self.cri.parameters(), 'lr': self.learning_rate}])

    def select_action(self, state) -> tuple:
        states = torch.as_tensor((state,), dtype=torch.float32, device=self.device).detach()
        actions, noises = self.act.get_action_noise(states)
        return actions[0].cpu().numpy(), noises[0].cpu().numpy()

    def explore_env(self, env, buffer, target_step, reward_scale, gamma) -> int:
        buffer.empty_buffer_before_explore()  # NOTICE! necessary for on-policy

        actual_step = 0
        while actual_step < target_step:
            state = env.reset()
            for _ in range(env.max_step):
                action, noise = self.select_action(state)
                next_state, reward, done, _ = env.step(np.tanh(action))
                actual_step += 1

                other = (reward * reward_scale, 0.0 if done else gamma, *action, *noise)
                buffer.append_buffer(state, other)
                if done:
                    break
                state = next_state
        return actual_step

    def update_net(self, buffer, _target_step, batch_size, repeat_times=4) -> (float, float):
        buffer.update_now_len_before_sample()
        buf_len = buffer.now_len  # assert buf_len >= _target_step

        with torch.no_grad():  # compute returns and advantages for the stored trajectory (no gradients needed)
            buf_reward, buf_mask, buf_action, buf_noise, buf_state = buffer.sample_all()

            bs = 2 ** 10  # set a smaller 'bs: batch size' when out of GPU memory
            buf_value = torch.cat([self.cri(buf_state[i:i + bs]) for i in range(0, buf_state.size(0), bs)], dim=0)
            buf_logprob = -(buf_noise.pow(2) * 0.5 + self.act.a_std_log + self.act.sqrt_2pi_log).sum(1)
            buf_r_sum, buf_advantage = self.compute_reward(buf_len, buf_reward, buf_mask, buf_value)
            del buf_reward, buf_mask, buf_noise

        obj_critic = None
        for _ in range(int(repeat_times * buf_len / batch_size)):  # PPO: surrogate objective of Trust Region
            indices = torch.randint(buf_len, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            new_logprob = self.act.compute_logprob(state, action)  # log prob of the stored actions under the current policy
            ratio = (new_logprob - logprob).exp()
            obj_surrogate1 = advantage * ratio
            obj_surrogate2 = advantage * ratio.clamp(1 - self.ratio_clip, 1 + self.ratio_clip)
            obj_surrogate = -torch.min(obj_surrogate1, obj_surrogate2).mean()
            obj_entropy = (new_logprob.exp() * new_logprob).mean()  # policy entropy term
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy

            value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
            obj_critic = self.criterion(value, r_sum)

            obj_united = obj_actor + obj_critic / (r_sum.std() + 1e-5)
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()
        return obj_actor.item(), obj_critic.item()

    def compute_reward_adv(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum

        pre_r_sum = 0  # reward sum of the following step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
        buf_advantage = buf_r_sum - (buf_mask * buf_value.squeeze(1))
        buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
        return buf_r_sum, buf_advantage

    def compute_reward_gae(self, buf_len, buf_reward, buf_mask, buf_value) -> (torch.Tensor, torch.Tensor):
        buf_r_sum = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # reward sum
        buf_advantage = torch.empty(buf_len, dtype=torch.float32, device=self.device)  # advantage value

        pre_r_sum = 0  # reward sum of the following step
        pre_advantage = 0  # advantage value of the following step
        for i in range(buf_len - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]

            buf_advantage[i] = buf_reward[i] + buf_mask[i] * (pre_advantage - buf_value[i])
            pre_advantage = buf_value[i] + buf_advantage[i] * self.lambda_gae_adv

        buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
        return buf_r_sum, buf_advantage
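# The reverse recursion shared by compute_reward_adv above and by compute_reward in
# the simpler AgentPPO variant below (which drops the GAE option): each step's
# discounted return is reward + mask * pre_r_sum, with mask = 0.0 if done else gamma,
# i.e. the second element of the `other` tuple written during exploration.
# Minimal, self-contained sketch on a toy 4-step episode (the tensors are made up):
import torch

gamma = 0.99
rewards = torch.tensor([1.0, 1.0, 1.0, 1.0])   # toy rewards
dones = torch.tensor([0.0, 0.0, 0.0, 1.0])     # episode ends at the last step
masks = (1.0 - dones) * gamma                  # mask = 0.0 if done else gamma

r_sum = torch.empty_like(rewards)
pre_r_sum = 0.0
for i in range(len(rewards) - 1, -1, -1):          # walk the trajectory backwards
    r_sum[i] = rewards[i] + masks[i] * pre_r_sum   # discounted return of step i
    pre_r_sum = r_sum[i]

print(r_sum)  # tensor([3.9404, 2.9701, 1.9900, 1.0000])
# Subtracting the critic's value estimate from r_sum (and normalizing) yields the
# advantage fed to the clipped surrogate objective.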
class AgentPPO(AgentBase):
    def __init__(self):
        super().__init__()
        self.clip = 0.3  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.01  # larger lambda_entropy means more exploration
        self.noise = None
        self.optimizer = None
        self.if_on_policy = True

    def init(self, net_dim, state_dim, action_dim):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.act = ActorPPO(net_dim, state_dim, action_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': self.learning_rate},
                                           {'params': self.cri.parameters(), 'lr': self.learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        a_noise, noise = self.act.get_action_noise(states)
        return a_noise.detach().cpu().numpy(), noise.detach().cpu().numpy()

    def store_transition(self, env, buffer, target_step, reward_scale, gamma):
        buffer.empty_memories__before_explore()  # NOTICE! necessary for on-policy
        # assert target_step == buffer.max_len - max_step

        actual_step = 0
        while actual_step < target_step:
            state = env.reset()
            for _ in range(env.max_step):
                action, noise = self.select_actions((state,))
                action = action[0]
                noise = noise[0]

                next_state, reward, done, _ = env.step(np.tanh(action))
                actual_step += 1

                other = (reward * reward_scale, 0.0 if done else gamma, *action, *noise)
                buffer.append_buffer(state, other)
                if done:
                    break
                state = next_state
        return actual_step

    def update_net(self, buffer, _max_step, batch_size, repeat_times=8):
        buffer.update__now_len__before_sample()
        max_memo = buffer.now_len

        with torch.no_grad():  # Trajectory using reverse reward
            buf_reward, buf_mask, buf_action, buf_noise, buf_state = buffer.sample_for_ppo()

            bs = 2 ** 10  # set a smaller 'bs: batch size' when out of GPU memory
            buf_value = torch.cat([self.cri(buf_state[i:i + bs]) for i in range(0, buf_state.size(0), bs)], dim=0)
            buf_logprob = -(buf_noise.pow(2) * 0.5 + self.act.a_std_log + self.act.sqrt_2pi_log).sum(1)
            buf_r_sum, buf_advantage = self.compute_reward(buffer, max_memo, buf_reward, buf_mask, buf_value)
            del buf_reward, buf_mask, buf_noise

        obj_critic = None
        for _ in range(int(repeat_times * max_memo / batch_size)):  # PPO: Surrogate objective of Trust Region
            indices = torch.randint(max_memo, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            logprob = buf_logprob[indices]
            advantage = buf_advantage[indices]

            new_logprob = self.act.compute_logprob(state, action)  # it is obj_actor
            ratio = (new_logprob - logprob).exp()
            obj_surrogate1 = advantage * ratio
            obj_surrogate2 = advantage * ratio.clamp(1 - self.clip, 1 + self.clip)
            obj_surrogate = -torch.min(obj_surrogate1, obj_surrogate2).mean()
            obj_entropy = (new_logprob.exp() * new_logprob).mean()  # policy entropy
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy

            value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
            obj_critic = self.criterion(value, r_sum)

            obj_united = obj_actor + obj_critic / (r_sum.std() + 1e-5)
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()
        return self.act.a_std_log.mean().item(), obj_critic.item()

    def compute_reward(self, buffer, max_memo, buf_reward, buf_mask, buf_value):
        buf_r_sum = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # reward sum

        pre_r_sum = 0  # reward sum of previous step
        for i in range(max_memo - 1, -1, -1):
            buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
            pre_r_sum = buf_r_sum[i]
        buf_advantage = buf_r_sum - (buf_mask * buf_value.squeeze(1))
        buf_advantage = (buf_advantage - buf_advantage.mean()) / (buf_advantage.std() + 1e-5)
        return buf_r_sum, buf_advantage
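# The per-sample actor loss built inside update_net, isolated on toy tensors so the
# effect of clipping is easy to inspect. This is an illustrative sketch only: the
# numbers below are made up, and `clip` / `lambda_entropy` mirror the defaults above.
import torch

clip = 0.3
lambda_entropy = 0.01

advantage = torch.tensor([1.0, -1.0, 2.0])       # toy advantages
logprob = torch.tensor([-1.0, -1.2, -0.8])       # log prob under the old policy
new_logprob = torch.tensor([-0.5, -1.0, -1.5])   # log prob under the current policy

ratio = (new_logprob - logprob).exp()                              # pi_new(a|s) / pi_old(a|s)
obj_surrogate1 = advantage * ratio                                 # unclipped surrogate
obj_surrogate2 = advantage * ratio.clamp(1 - clip, 1 + clip)       # clipped surrogate
obj_surrogate = -torch.min(obj_surrogate1, obj_surrogate2).mean()  # maximize => minimize the negative

obj_entropy = (new_logprob.exp() * new_logprob).mean()  # (p * log p) term, penalizes low entropy
obj_actor = obj_surrogate + obj_entropy * lambda_entropy

print(ratio)      # tensor([1.6487, 1.2214, 0.4966])
print(obj_actor)  # the scalar the optimizer minimizes for the actor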