Example #1
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.01  # larger lambda_entropy means more exploration

        self.act = ActorPPO(net_dim, state_dim, action_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])
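This __init__ and the ones below assume an AgentBase parent that at least supplies self.device; a minimal stand-in (hypothetical; the real base class surely defines more) could look like:

import torch

class AgentBase:  # minimal stand-in for the base class assumed by these snippets
    def __init__(self):
        # pick the GPU when available, otherwise fall back to CPU
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.act = None  # actor network, set by the subclass
        self.cri = None  # critic network, set by the subclass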
Example #2
    def __init__(self, state_dim, action_dim, net_dim, learning_rate=1e-4):
        AgentBase.__init__(self)

        self.act = ActorPPO(state_dim, action_dim, net_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam(
            [{'params': self.act.parameters(), 'lr': learning_rate},
             {'params': self.cri.parameters(), 'lr': learning_rate}],
            lr=learning_rate)  # redundant default; the per-group 'lr' entries take precedence
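Adam's default lr argument is superseded by the per-group 'lr' entries, so the trailing keyword is harmless. A quick self-contained check of this behavior (plain PyTorch, nothing assumed beyond the snippet):

import torch

net_a = torch.nn.Linear(4, 2)
net_b = torch.nn.Linear(4, 1)
optimizer = torch.optim.Adam([{'params': net_a.parameters(), 'lr': 1e-4},
                              {'params': net_b.parameters(), 'lr': 1e-4}],
                             lr=1e-3)  # default; overridden by the group entries
print([group['lr'] for group in optimizer.param_groups])  # [0.0001, 0.0001]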
Example #3
import numpy as np
import torch

class AgentPPO(AgentBase):
    def __init__(self, net_dim, state_dim, action_dim, learning_rate=1e-4):
        super().__init__()
        self.clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        self.lambda_entropy = 0.01  # larger lambda_entropy means more exploration

        self.act = ActorPPO(net_dim, state_dim, action_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = torch.nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([{'params': self.act.parameters(), 'lr': learning_rate},
                                           {'params': self.cri.parameters(), 'lr': learning_rate}])

    def select_actions(self, states):  # states = (state, ...)
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)
        a_noise, noise = self.act.get__action_noise(states)
        return a_noise.detach().cpu().numpy(), noise.detach().cpu().numpy()

    def update_buffer(self, env, buffer, max_step, reward_scale, gamma):
        buffer.empty_memories__before_explore()

        step_counter = 0
        target_step = buffer.max_len - max_step
        while step_counter < target_step:
            state = env.reset()
            for _ in range(max_step):
                action, noise = self.select_actions((state,))
                action = action[0]
                noise = noise[0]

                next_state, reward, done, _ = env.step(np.tanh(action))
                step_counter += 1

                other = (reward * reward_scale, 0.0 if done else gamma, *action, *noise)
                buffer.append_memo(state, other)
                if done:
                    break
                state = next_state
        return step_counter

    def update_policy(self, buffer, _max_step, batch_size, repeat_times=8):
        buffer.update__now_len__before_sample()
        max_memo = buffer.now_len

        with torch.no_grad():  # scan the stored trajectory in reverse to accumulate discounted returns
            buf_reward, buf_mask, buf_action, buf_noise, buf_state = buffer.sample_for_ppo()

            bs = 2 ** 10  # mini-batch size for the value pass; reduce it when GPU memory runs out
            buf_value = torch.cat([self.cri(buf_state[i:i + bs]) for i in range(0, buf_state.size(0), bs)], dim=0)
            buf_log_prob = -(buf_noise.pow(2) * 0.5 + self.act.a_std_log + self.act.sqrt_2pi_log).sum(1)

            buf_r_sum = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # reward sum
            pre_r_sum = 0  # reward sum of the following step (the loop runs in reverse)
            for i in range(max_memo - 1, -1, -1):
                buf_r_sum[i] = buf_reward[i] + buf_mask[i] * pre_r_sum
                pre_r_sum = buf_r_sum[i]
            buf_advantage = buf_r_sum - (buf_mask * buf_value).squeeze(1)
            buf_advantage = buf_advantage / (buf_advantage.std() + 1e-5)

            del buf_reward, buf_mask, buf_noise

        obj_actor = obj_critic = None
        for _ in range(int(repeat_times * max_memo / batch_size)):  # PPO: clipped surrogate objective (trust-region style)
            indices = torch.randint(max_memo, size=(batch_size,), requires_grad=False, device=self.device)
            state = buf_state[indices]
            action = buf_action[indices]
            r_sum = buf_r_sum[indices]
            log_prob = buf_log_prob[indices]
            advantage = buf_advantage[indices]

            new_log_prob = self.act.compute__log_prob(state, action)  # log-prob of the stored actions under the current policy
            ratio = (new_log_prob - log_prob).exp()
            obj_surrogate1 = advantage * ratio
            obj_surrogate2 = advantage * ratio.clamp(1 - self.clip, 1 + self.clip)
            obj_surrogate = -torch.min(obj_surrogate1, obj_surrogate2).mean()
            obj_entropy = (new_log_prob.exp() * new_log_prob).mean()  # negative policy entropy; minimizing it encourages exploration
            obj_actor = obj_surrogate + obj_entropy * self.lambda_entropy

            value = self.cri(state).squeeze(1)  # the critic predicts the discounted return (state value)
            obj_critic = self.criterion(value, r_sum)

            obj_united = obj_actor + obj_critic / (r_sum.std() + 1e-5)
            self.optimizer.zero_grad()
            obj_united.backward()
            self.optimizer.step()

        return obj_actor.item(), obj_critic.item()
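A rough usage sketch for this class, assuming a Gym-style environment and a replay buffer exposing the methods called above; the buffer constructor shown here is hypothetical:

import gym

env = gym.make('Pendulum-v0')
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

agent = AgentPPO(net_dim=2 ** 8, state_dim=state_dim, action_dim=action_dim)
buffer = BufferForPPO(max_len=2 ** 12, state_dim=state_dim, action_dim=action_dim)  # hypothetical constructor

for _ in range(2 ** 6):  # training epochs: explore, then fit actor and critic
    agent.update_buffer(env, buffer, max_step=200, reward_scale=2 ** -2, gamma=0.99)
    obj_a, obj_c = agent.update_policy(buffer, 200, batch_size=2 ** 8)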
Example #4
import numpy as np
import torch
import torch.nn as nn

class AgentGaePPO(AgentBase):
    def __init__(self, state_dim, action_dim, net_dim, learning_rate=1e-4):
        AgentBase.__init__(self)

        self.act = ActorPPO(state_dim, action_dim, net_dim).to(self.device)
        self.cri = CriticAdv(state_dim, net_dim).to(self.device)

        self.criterion = nn.SmoothL1Loss()
        self.optimizer = torch.optim.Adam([
            {
                'params': self.act.parameters(),
                'lr': learning_rate
            },
            {
                'params': self.cri.parameters(),
                'lr': learning_rate
            },
        ],
                                          lr=learning_rate)

    def select_actions(self, states):
        states = torch.as_tensor(states, dtype=torch.float32, device=self.device)  # plan to detach() here
        a_noise, noise = self.act.get__a_noisy__noise(states)
        return a_noise.detach().cpu().numpy(), noise.detach().cpu().numpy()

    def update_buffer(self, env, buffer, max_step, reward_scale, gamma):
        buffer.empty_memories__before_explore()

        step_counter = 0
        target_step = buffer.max_len - max_step
        while step_counter < target_step:
            state = env.reset()
            for _ in range(max_step):
                action, noise = self.select_actions((state, ))
                action = action[0]
                noise = noise[0]

                next_state, reward, done, _ = env.step(np.tanh(action))
                step_counter += 1

                buffer.append_memo((reward * reward_scale, 0.0 if done else gamma,
                                    *state, *action, *noise))
                if done:
                    break
                state = next_state
        return step_counter

    def update_policy(self, buffer, _max_step, batch_size, repeat_times=8):
        buffer.update__now_len__before_sample()

        clip = 0.25  # ratio.clamp(1 - clip, 1 + clip)
        lambda_adv = 0.98  # lambda for GAE; 0.99 was found unstable here
        lambda_entropy = 0.01  # could be 0.02

        max_memo = buffer.now_len
        all_reward, all_mask, all_state, all_action, all_noise = buffer.all_sample()

        all__new_v = list()
        all_log_prob = list()
        with torch.no_grad():
            b_size = 2 ** 10
            a_std_log__sqrt_2pi_log = self.act.a_std_log + self.act.sqrt_2pi_log
            for i in range(0, all_state.size(0), b_size):
                new_v = self.cri(all_state[i:i + b_size])
                all__new_v.append(new_v)

                log_prob = -(all_noise[i:i + b_size].pow(2) / 2 + a_std_log__sqrt_2pi_log).sum(1)
                all_log_prob.append(log_prob)

            all__new_v = torch.cat(all__new_v, dim=0)
            all_log_prob = torch.cat(all_log_prob, dim=0)

        all__delta = torch.empty(max_memo, dtype=torch.float32, device=self.device)
        all__old_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # old policy value
        all__adv_v = torch.empty(max_memo, dtype=torch.float32, device=self.device)  # advantage value

        prev_old_v = 0  # discounted return of the following step
        prev_new_v = 0  # critic value of the following step
        prev_adv_v = 0  # advantage of the following step
        for i in range(max_memo - 1, -1, -1):  # reverse scan over the trajectory
            all__delta[i] = all_reward[i] + all_mask[i] * prev_new_v - all__new_v[i]
            all__old_v[i] = all_reward[i] + all_mask[i] * prev_old_v
            all__adv_v[i] = all__delta[i] + all_mask[i] * prev_adv_v * lambda_adv

            prev_old_v = all__old_v[i]
            prev_new_v = all__new_v[i]
            prev_adv_v = all__adv_v[i]
        all__adv_v = (all__adv_v - all__adv_v.mean()) / (all__adv_v.std() + 1e-5)  # advantage normalization

        actor_obj = critic_obj = None
        for _ in range(int(repeat_times * max_memo / batch_size)):
            indices = torch.randint(max_memo, size=(batch_size,), device=self.device)

            state = all_state[indices]
            action = all_action[indices]
            advantage = all__adv_v[indices]
            old_value = all__old_v[indices].unsqueeze(1)
            old_log_prob = all_log_prob[indices]

            new_log_prob = self.act.compute__log_prob(state, action)  # log-prob of the stored actions under the current policy
            new_value = self.cri(state)
            critic_obj = self.criterion(new_value, old_value) / (old_value.std() + 1e-5)

            ratio = torch.exp(new_log_prob - old_log_prob)
            surrogate_obj0 = advantage * ratio  # unclipped surrogate objective, as in TRPO
            surrogate_obj1 = advantage * ratio.clamp(1 - clip, 1 + clip)
            surrogate_obj = -torch.min(surrogate_obj0, surrogate_obj1).mean()
            loss_entropy = (torch.exp(new_log_prob) * new_log_prob).mean()  # negative policy entropy
            actor_obj = surrogate_obj + loss_entropy * lambda_entropy

            united_obj = actor_obj + critic_obj
            self.optimizer.zero_grad()
            united_obj.backward()
            self.optimizer.step()

        self.obj_a = actor_obj.item()
        self.obj_c = critic_obj.item()
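The reverse scan in update_policy above implements generalized advantage estimation, GAE(lambda). Extracted into a standalone helper for clarity (hypothetical name, same arithmetic; masks hold 0.0 at episode ends and gamma otherwise):

import torch

def compute_gae(rewards, masks, values, lambda_adv=0.98):
    # rewards, masks, values: 1-D tensors of equal length;
    # masks[i] is 0.0 if step i ended the episode, else gamma.
    n = rewards.shape[0]
    returns = torch.empty(n, dtype=torch.float32)
    advantages = torch.empty(n, dtype=torch.float32)
    prev_return = prev_value = prev_adv = 0.0
    for i in range(n - 1, -1, -1):  # reverse scan, as in update_policy above
        delta = rewards[i] + masks[i] * prev_value - values[i]
        returns[i] = rewards[i] + masks[i] * prev_return
        advantages[i] = delta + masks[i] * lambda_adv * prev_adv
        prev_return, prev_value, prev_adv = returns[i], values[i], advantages[i]
    advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-5)
    return returns, advantages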