import numpy as np
import torch
import torch.nn as nn

# `AgentBase`, `ActorPPO`, and `CriticAdv` are defined elsewhere in this repository.
class AgentPPO(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.01  # larger lambda_entropy means more exploration

        self.act = ActorPPO(net_dim, state_dim, action_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        a_noise, noise = self.act.get__action_noise(states)
        return a_noise.detach().cpu().numpy(), noise.detach().cpu().numpy()

    def update_buffer(self, env, buffer, max_step, reward_scale, gamma):
        buffer.empty_memories__before_explore()  # on-policy: discard stale trajectories

        step_counter = 0
        target_step = buffer.max_len - max_step
        while step_counter < target_step:
            state = env.reset()
            for _ in range(max_step):
                action, noise = self.select_actions((state,))
                action = action[0]
                noise = noise[0]

                next_state, reward, done, _ = env.step(np.tanh(action))
                step_counter += 1

                other = (reward * reward_scale, 0.0 if done else gamma, *action, *noise)
                buffer.append_memo(state, other)
                if done:
                    break
                state = next_state
        return step_counter

    def update_policy(self, buffer, _max_step, batch_size, repeat_times=8):
        buffer.update__now_len__before_sample()
        max_memo = buffer.now_len

        with torch.no_grad():  # compute the discounted reward sum by traversing the trajectory in reverse
            buf_reward, buf_mask, buf_action, buf_noise, buf_state = buffer.sample_for_ppo()

            bs = 2 ** 10  # set a smaller 'bs: batch size' when out of GPU memory
            buf_value = torch.cat([self.cri(buf_state[i:i + bs])
                                   for i in range(0, buf_state.size(0), bs)], dim=0)
            buf_log_prob = -(buf_noise.pow(2) * 0.5 + self.act.a_std_log + self.act.sqrt_2pi_log).sum(1)

            buf_r_sum = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # reward sum
            pre_r_sum = 0  # reward sum of previous step
            for i in range(max_memo - 1, -1, -1):
                buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
                pre_r_sum = buf_r_sum[i]

            buf_advantage = buf_r_sum - (buf_mask * buf_value).squeeze(1)
            buf_advantage = buf_advantage / (buf_advantage.std() + 1e-5)
            del buf_reward, buf_mask, buf_noise

        obj_actor = obj_critic = None
        for _ in range(int(repeat_times * max_memo / batch_size)):
            # PPO: surrogate objective of Trust Region
            indices = torch.randint(max_memo, size=(batch_size,), requires_grad=False, device=self.device)

            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            log_prob = buf_log_prob[indices]
            advantage = buf_advantage[indices]

            new_log_prob = self.act.compute__log_prob(state, action)  # log prob under the current policy
            ratio = (new_log_prob - log_prob).exp()
            obj_surrogate1 = advantage * ratio
            obj_surrogate2 = advantage * ratio.clamp(1 - self.clip, 1 + self.clip)
            obj_surrogate = -torch.min(obj_surrogate1, obj_surrogate2).mean()
            obj_entropy = (new_log_prob.exp() * new_log_prob).mean()  # policy entropy
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy

            value = self.cri(state).squeeze(1)  # critic network predicts the reward_sum (Q value) of state
            obj_critic = self.criterion(value, r_sum)

            obj_united = obj_actor + obj_critic / (r_sum.std() + 1e-5)
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()
        return obj_actor.item(), obj_critic.item()
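# --- Usage sketch (illustrative; not part of the original file) --------------
# Shows how `update_buffer` and `update_policy` are meant to interleave in an
# on-policy training loop. `gym.make('Pendulum-v0')` (old gym API, matching the
# 4-tuple `env.step` above) and `BufferTupleOnline` are assumptions for this
# example: any buffer exposing max_len / empty_memories__before_explore /
# append_memo / update__now_len__before_sample / now_len / sample_for_ppo works.
def demo_train_agent_ppo():
    import gym
    env = gym.make('Pendulum-v0')
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]

    agent = AgentPPO(net_dim=2 ** 8, state_dim=state_dim, action_dim=action_dim)
    buffer = BufferTupleOnline(max_len=2 ** 12)  # hypothetical on-policy buffer

    for epoch in range(2 ** 6):
        # on-policy: refill the buffer from scratch, then run several
        # gradient epochs over the freshly collected trajectories
        agent.update_buffer(env, buffer, max_step=200, reward_scale=2 ** -2, gamma=0.99)
        obj_a, obj_c = agent.update_policy(buffer, 200, batch_size=2 ** 8)
        print(f'epoch {epoch:3d}  obj_actor {obj_a:8.3f}  obj_critic {obj_c:8.3f}')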
class AgentGaePPO(AgentBase):
    def __init__(self, state_dim, action_dim, net_dim, learning_rate=1e-4):
        AgentBase.__init__(self)

        self.act = ActorPPO(state_dim, action_dim, net_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)  # plan to detach() here
        a_noise, noise = self.act.get__a_noisy__noise(states)
        return a_noise.detach().cpu().numpy(), noise.detach().cpu().numpy()

    def update_buffer(self, env, buffer, max_step, reward_scale, gamma):
        buffer.empty_memories__before_explore()  # on-policy: discard stale trajectories

        step_counter = 0
        target_step = buffer.max_len - max_step
        while step_counter < target_step:
            state = env.reset()
            for _ in range(max_step):
                action, noise = self.select_actions((state,))
                action = action[0]
                noise = noise[0]

                next_state, reward, done, _ = env.step(np.tanh(action))
                step_counter += 1

                buffer.append_memo((reward * reward_scale, 0.0 if done else gamma,
                                    *state, *action, *noise))
                if done:
                    break
                state = next_state
        return step_counter

    def update_policy(self, buffer, _max_step, batch_size, repeat_times=8):
        buffer.update__now_len__before_sample()

        clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        lambda_adv = 0.98  # lambda of GAE; 0.99 was found unstable here
        lambda_entropy = 0.01  # could be 0.02

        max_memo = buffer.now_len
        all_reward, all_mask, all_state, all_action, all_noise = buffer.all_sample()

        all__new_v = list()
        all_log_prob = list()
        with torch.no_grad():
            b_size = 2 ** 10  # set a smaller batch size when out of GPU memory
            a_std_log__sqrt_2pi_log = self.act.a_std_log + self.act.sqrt_2pi_log
            for i in range(0, all_state.size(0), b_size):
                new_v = self.cri(all_state[i:i + b_size])
                all__new_v.append(new_v)

                log_prob = -(all_noise[i:i + b_size].pow(2) / 2 + a_std_log__sqrt_2pi_log).sum(1)
                all_log_prob.append(log_prob)

            all__new_v = torch.cat(all__new_v, dim=0)
            all_log_prob = torch.cat(all_log_prob, dim=0)

        # compute old policy value and GAE advantage by traversing the trajectory in reverse
        all__delta = torch.empty(max_memo, dtype=torch.float32, device=self.device)
        all__old_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # old policy value
        all__adv_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # advantage value

        prev_old_v = 0  # old q value
        prev_new_v = 0  # new q value
        prev_adv_v = 0  # advantage q value
        for i in range(max_memo - 1, -1, -1):  # could be more elegant
            all__delta[i] = all_reward[i] + all_mask[i] * prev_new_v - all__new_v[i]
            all__old_v[i] = all_reward[i] + all_mask[i] * prev_old_v
            all__adv_v[i] = all__delta[i] + all_mask[i] * prev_adv_v * lambda_adv

            prev_old_v = all__old_v[i]
            prev_new_v = all__new_v[i]
            prev_adv_v = all__adv_v[i]

        all__adv_v = (all__adv_v - all__adv_v.mean()) / (all__adv_v.std() + 1e-5)  # advantage_norm

        actor_obj = critic_obj = None
        for _ in range(int(repeat_times * max_memo / batch_size)):
            indices = torch.randint(max_memo, size=(batch_size,), device=self.device)

            state = all_state[indices]
            action = all_action[indices]
            advantage = all__adv_v[indices]
            old_value = all__old_v[indices].unsqueeze(1)
            old_log_prob = all_log_prob[indices]

            new_log_prob = self.act.compute__log_prob(state, action)  # log prob under the current policy
            new_value = self.cri(state)

            critic_obj = self.criterion(new_value, old_value) / (old_value.std() + 1e-5)

            ratio = torch.exp(new_log_prob - old_log_prob)
            surrogate_obj0 = advantage * ratio  # surrogate objective of TRPO
            surrogate_obj1 = advantage * ratio.clamp(1 - clip, 1 + clip)
            surrogate_obj = -torch.min(surrogate_obj0, surrogate_obj1).mean()
            loss_entropy = (torch.exp(new_log_prob) * new_log_prob).mean()  # policy entropy
            actor_obj = surrogate_obj + loss_entropy * lambda_entropy

            united_obj = actor_obj + critic_obj
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
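# --- Worked GAE sketch (illustrative; not part of the original file) ---------
# Re-states the backward recursion from AgentGaePPO.update_policy on plain 1-D
# tensors so it can be checked by hand:
#   delta_t = r_t + mask_t * V(s_{t+1}) - V(s_t)
#   adv_t   = delta_t + mask_t * lambda_adv * adv_{t+1}
# where mask_t = 0 at episode end and gamma otherwise (as stored by
# update_buffer). The function name and the flat-tensor interface are
# assumptions made for this example only.
def gae_reference(reward, mask, value, lambda_adv=0.98):
    n = reward.shape[0]
    delta = torch.empty(n)
    adv = torch.empty(n)

    prev_new_v = 0.0  # value of the step after the trajectory end
    prev_adv_v = 0.0
    for i in range(n - 1, -1, -1):  # traverse the trajectory in reverse
        delta[i] = reward[i] + mask[i] * prev_new_v - value[i]
        adv[i] = delta[i] + mask[i] * prev_adv_v * lambda_adv
        prev_new_v = value[i]
        prev_adv_v = adv[i]
    return (adv - adv.mean()) / (adv.std() + 1e-5)  # advantage_norm, as above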