Code Example #1
File: ppo.py  Project: wuyou33/quadrotor_control_ppo
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

# ActorCritic (and the Memory rollout buffer passed into the methods below)
# are defined elsewhere in this project.


class PPO:
    def __init__(self, device, state_dim, action_dim, action_std, lr, betas,
                 gamma, K_epochs, eps_clip):
        self.lr = lr
        self.device = device
        self.betas = betas
        self.gamma = gamma
        self.eps_clip = eps_clip
        self.K_epochs = K_epochs

        self.policy = ActorCritic(state_dim, action_dim, action_std).to(device)
        #self.optimizer = RAdam(self.policy.parameters(), lr=lr, betas=betas)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)

        self.policy_old = ActorCritic(state_dim, action_dim,
                                      action_std).to(device)
        self.policy_old.load_state_dict(self.policy.state_dict())

        self.MseLoss = nn.MSELoss()

    def select_action(self, state, memory):
        if np.any(np.isnan(state)):
            print('in select action: state is nan', state)
        state = torch.FloatTensor(state.reshape(1, -1)).to(self.device)
        return self.policy_old.act(state, memory).cpu().data.numpy().flatten()

    def update(self, memory):
        # Monte Carlo estimate of rewards:
        rewards = []
        discounted_reward = 0
        for reward, is_terminal in zip(reversed(memory.rewards),
                                       reversed(memory.is_terminals)):
            if is_terminal:
                discounted_reward = 0
            discounted_reward = reward + (self.gamma * discounted_reward)
            rewards.insert(0, discounted_reward)

        # Normalizing the rewards:
        rewards = torch.tensor(rewards).to(self.device)
        rewards = (rewards - rewards.mean()) / (rewards.std() + 1e-5)

        # convert list to tensor
        old_states_ = torch.squeeze(
            torch.stack(memory.states).to(self.device)).detach()
        old_actions_ = torch.squeeze(
            torch.stack(memory.actions).to(self.device)).detach()
        old_logprobs_ = torch.squeeze(torch.stack(memory.logprobs)).to(
            self.device).detach()

        batch_size = old_states_.shape[0]
        mini_batch_size = batch_size // 8  # 64

        # Optimize policy for K epochs:
        for _ in range(self.K_epochs):
            # Evaluating old actions and values :
            for i in range(batch_size // mini_batch_size):
                rand_ids = np.random.randint(0, batch_size, mini_batch_size)
                old_states = old_states_[rand_ids, :]
                old_actions = old_actions_[rand_ids, :]
                old_logprobs = old_logprobs_[rand_ids, :]
                rewards_batch = rewards[rand_ids]

                logprobs, state_values, dist_entropy = self.policy.evaluate(
                    old_states, old_actions)

                # Finding the ratio (pi_theta / pi_theta__old):
                ratios = torch.exp(logprobs - old_logprobs.detach())

                # Finding the clipped surrogate loss:
                advantages = rewards_batch - state_values.detach()
                advantages = advantages.reshape((-1, 1))
                surr1 = ratios * advantages
                # Clip the ratio to [1 - eps_clip, 1 + eps_clip] as in the PPO paper.
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages

                surr = -torch.min(surr1, surr2).mean()
                w_crit_loss = 1
                loss = surr + w_crit_loss * (rewards_batch -
                                             state_values).pow(2).mean()  # - 0.01 * dist_entropy

                # take gradient step
                self.optimizer.zero_grad()
                loss.backward()  # loss is already a scalar
                self.optimizer.step()

        # Copy new weights into old policy:
        self.policy_old.load_state_dict(self.policy.state_dict())
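
For context, here is a minimal, hypothetical driver loop for this class. The Memory container (with the field names that update() reads), the Gym environment, and all hyperparameter values are assumptions chosen for illustration; only the PPO API above comes from the original file, and policy_old.act() is assumed to append state/action/logprob to the memory, as the project's ActorCritic does.

import gym
import torch


class Memory:
    """Rollout storage with the fields PPO.select_action/update expect."""
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


# Placeholder environment (not the quadrotor env); classic Gym API assumed,
# i.e. reset() -> obs and step() -> (obs, reward, done, info).
env = gym.make("Pendulum-v1")
device = torch.device("cpu")
ppo = PPO(device,
          state_dim=env.observation_space.shape[0],
          action_dim=env.action_space.shape[0],
          action_std=0.5, lr=3e-4, betas=(0.9, 0.999),
          gamma=0.99, K_epochs=10, eps_clip=0.2)
memory = Memory()

step, update_every = 0, 2048
for episode in range(500):
    state = env.reset()
    done = False
    while not done:
        # policy_old.act() is expected to append state/action/logprob to memory.
        action = ppo.select_action(state, memory)
        state, reward, done, _ = env.step(action)
        memory.rewards.append(reward)
        memory.is_terminals.append(done)
        step += 1
        if step % update_every == 0:
            ppo.update(memory)
            memory.clear()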
Code Example #2
import torch
import torch.nn as nn

# ActorCritic and the rollout buffer used below are defined elsewhere in the source project.


class PPO(nn.Module):
    def __init__(self,
                 state_dim,
                 action_dim,
                 eps=0.2,
                 gamma=0.99,
                 lambda_=0.95,
                 K_epoch=80,
                 batch_size=64):
        super(PPO, self).__init__()
        self.eps = eps
        self.gamma = gamma
        self.lambda_ = lambda_
        self.K_epoch = K_epoch
        self.batch_size = batch_size

        self.model = ActorCritic(state_dim, action_dim)
        self.model_old = ActorCritic(state_dim, action_dim)
        for param in self.model_old.parameters():
            param.requires_grad = False
        self.copy_weights()

    def forward(self, x):
        self.pi, self.v = self.model_old(x)

        return self.pi, self.v

    def copy_weights(self):
        self.model_old.load_state_dict(self.model.state_dict())

    def update(self, buffer, optimizer):
        self.model.train()
        self.model_old.eval()
        self.advantage_fcn(buffer.data)

        batch_loss, batch_clip_loss, batch_vf_loss = [], [], []
        for epoch in range(self.K_epoch):
            for state, action, next_s, reward, log_prob_old, entropy, advantage in buffer.get_data(
                    self.batch_size):
                pi, v = self.model(state)
                log_prob_pi = pi.log_prob(action)

                prob_ratio = torch.exp(log_prob_pi - log_prob_old)

                first_term = prob_ratio * advantage
                second_term = self.clip_by_value(prob_ratio) * advantage
                loss_clip = (torch.min(first_term, second_term)).mean()

                # One-step TD target computed with the frozen old network.
                _, v_next = self.model_old(next_s)
                v_target = reward + self.gamma * v_next
                # Squared-error value loss: (v(s_t) - v_target)**2
                loss_vf = ((v - v_target)**2).mean()

                # Maximize the clipped objective while minimizing the value loss.
                loss = -(loss_clip - loss_vf)
                # Alternative with entropy bonus: -(loss_clip - 0.5*loss_vf + 0.01*entropy.mean())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                batch_loss.append(loss.detach().numpy())
                batch_clip_loss.append(loss_clip.detach().numpy())
                batch_vf_loss.append(loss_vf.detach().numpy())

        self.copy_weights()
        buffer.reset()

    def advantage_fcn(self, buffer, normalize=True):
        # Generalized advantage estimation: accumulate the TD residuals
        # delta_t = r_t + gamma*V(s_{t+1}) - V(s_t) backwards through the rollout.
        _, v_st1 = self.model(torch.stack(buffer['next_s']))
        _, v_s = self.model(torch.stack(buffer['s']))
        deltas = torch.stack(buffer['r']) + self.gamma * v_st1 - v_s

        advantage, temp = [], 0
        idxs = torch.tensor(range(len(deltas) - 1, -1, -1))  #reverse
        reverse_deltas = deltas.index_select(0, idxs)
        for delta_t in reverse_deltas:
            temp = delta_t + self.lambda_ * self.gamma * temp
            advantage.append(temp)

        advantage = torch.as_tensor(advantage[::-1])  #re-reverse
        if normalize:
            advantage = (advantage - advantage.mean()) / advantage.std()

        buffer['advantage'] = advantage.unsqueeze(1)

    def clip_by_value(self, x):
        return x.clamp(1 - self.eps, 1 + self.eps)  # clamp(min, max)
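
The update loop above implies a particular buffer interface. The sketch below spells out one possible shape for it; the class name, field names, and sampling scheme are assumptions inferred from how update() and advantage_fcn() index into the buffer, not code from the original project.

import torch


class RolloutBuffer:
    # Minimal buffer matching the access pattern in PPO.update/advantage_fcn:
    # a dict of per-step tensors in .data plus mini-batch iteration via get_data().
    def __init__(self):
        self.reset()

    def reset(self):
        self.data = {'s': [], 'a': [], 'next_s': [], 'r': [],
                     'log_prob': [], 'entropy': [], 'advantage': None}

    def store(self, s, a, next_s, r, log_prob, entropy):
        self.data['s'].append(s)
        self.data['a'].append(a)
        self.data['next_s'].append(next_s)
        self.data['r'].append(torch.as_tensor([r], dtype=torch.float32))
        self.data['log_prob'].append(log_prob.detach())
        self.data['entropy'].append(entropy.detach())

    def get_data(self, batch_size):
        # Yield shuffled mini-batches in the tuple order PPO.update unpacks.
        s = torch.stack(self.data['s'])
        a = torch.stack(self.data['a'])
        next_s = torch.stack(self.data['next_s'])
        r = torch.stack(self.data['r'])
        log_prob = torch.stack(self.data['log_prob'])
        entropy = torch.stack(self.data['entropy'])
        adv = self.data['advantage']  # filled in by PPO.advantage_fcn
        perm = torch.randperm(len(self.data['s']))
        for start in range(0, len(perm), batch_size):
            idx = perm[start:start + batch_size]
            yield (s[idx], a[idx], next_s[idx], r[idx],
                   log_prob[idx], entropy[idx], adv[idx])


# Hypothetical use: collect a rollout with the old policy (ppo.forward),
# fill the buffer via buffer.store(...), then run an update:
#   ppo = PPO(state_dim, action_dim)
#   optimizer = torch.optim.Adam(ppo.model.parameters(), lr=3e-4)
#   ppo.update(buffer, optimizer)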