Code example #1
import numpy as np
import torch
import torch.nn.functional as F

# ActorCritic and OUNoise are assumed to be defined elsewhere in the project.


class Agent(object):
    def __init__(self,
                 action_size,
                 state_size,
                 fc_sizes=None,
                 actor_fc_sizes=[256, 128, 64],
                 critic_fc_sizes=[256, 128, 64],
                 gamma=0.99,
                 gae_tau=0.2,
                 tau=0.001,
                 lr=5e-4,
                 num_agents=1):

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")

        self.ac_target = ActorCritic(state_size,
                                     action_size,
                                     fc_sizes=fc_sizes,
                                     actor_fc_sizes=actor_fc_sizes,
                                     critic_fc_sizes=critic_fc_sizes).to(
                                         self.device)
        self.ac_local = ActorCritic(state_size,
                                    action_size,
                                    fc_sizes=fc_sizes,
                                    actor_fc_sizes=actor_fc_sizes,
                                    critic_fc_sizes=critic_fc_sizes).to(
                                        self.device)
        self.optimizer_actor = torch.optim.Adam(
            self.ac_local.actor.parameters(), lr=1e-4, weight_decay=0)
        self.optimizer_critic = torch.optim.Adam(
            self.ac_local.critic.parameters(), lr=1e-3, weight_decay=0.01)

        # Noise process: one independent OUNoise instance per agent
        # (a list comprehension is used so the processes are not shared).
        self.num_agents = num_agents
        self.noise = [OUNoise(action_size, np.random.randint(0, 5000))
                      for _ in range(num_agents)]

        # Configuration constants.
        self.gamma = gamma
        self.gae_tau = gae_tau
        self.tau = tau

        self.hardUpdate()

    def act(self, state, add_noise=True):
        self.ac_local.actor.eval()
        with torch.no_grad():
            actions = self.ac_local.actor(state)
        self.ac_local.actor.train()
        actions_np = []
        for agent_index in range(self.num_agents):
            action = actions[agent_index].cpu().numpy()
            if add_noise:
                action = action + self.noise[agent_index].sample()
            # Clip after adding exploration noise so actions stay in [-1, 1].
            action = np.clip(action, -1, 1)
            actions_np.append(action)
        return actions_np

    def learn(self, exp_replay, sample_train=10, batch_size=256):
        for _ in range(sample_train):
            state, action, reward, next_state, done = exp_replay.sample(
                batch_size)

            # Build the TD target from the target networks; no gradients are
            # needed through this computation.
            with torch.no_grad():
                action_next = self.ac_target.actor(next_state)
                next_value = self.ac_target.critic(next_state, action_next)
                q_target = reward + self.gamma * next_value * (1 - done)

            # Critic update: minimise the TD error of the local critic.
            q_expected = self.ac_local.critic(state, action)
            loss_critic = F.mse_loss(q_expected, q_target)

            self.optimizer_critic.zero_grad()
            loss_critic.backward()
            torch.nn.utils.clip_grad_norm_(self.ac_local.critic.parameters(),
                                           1)
            self.optimizer_critic.step()

            # Actor update: maximise the local critic's estimate of the
            # actor's actions.
            action_pred = self.ac_local.actor(state)
            loss_actor = -self.ac_local.critic(state, action_pred).mean()

            self.optimizer_actor.zero_grad()
            loss_actor.backward()
            torch.nn.utils.clip_grad_norm_(self.ac_local.actor.parameters(), 1)
            self.optimizer_actor.step()

        self.softUpdate()

    def softUpdate(self):
        # Polyak-average the local weights into the target networks.
        if self.ac_target.fc_common is not None:
            for target_param, local_param in zip(
                    self.ac_target.fc_common.parameters(),
                    self.ac_local.fc_common.parameters()):
                target_param.data.copy_(self.tau * local_param.data +
                                        (1.0 - self.tau) * target_param.data)
        for target_param, local_param in zip(self.ac_target.actor.parameters(),
                                             self.ac_local.actor.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
        for target_param, local_param in zip(
                self.ac_target.critic.parameters(),
                self.ac_local.critic.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def hardUpdate(self):
        # Copy the local weights into the target networks verbatim.
        if self.ac_target.fc_common is not None:
            for target_param, local_param in zip(
                    self.ac_target.fc_common.parameters(),
                    self.ac_local.fc_common.parameters()):
                target_param.data.copy_(local_param.data)
        for target_param, local_param in zip(self.ac_target.actor.parameters(),
                                             self.ac_local.actor.parameters()):
            target_param.data.copy_(local_param.data)
        for target_param, local_param in zip(
                self.ac_target.critic.parameters(),
                self.ac_local.critic.parameters()):
            target_param.data.copy_(local_param.data)
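
The Agent above depends on an ActorCritic network, an OUNoise process, and an experience-replay buffer that are not shown here. The following is a minimal, hypothetical sketch of how such an agent might be driven; the gym-style env, the ReplayBuffer class, and every size and episode count below are assumptions for illustration, not part of the original code.

# Hypothetical driver loop for the Agent above (env, ReplayBuffer, and the
# sizes below are assumed for illustration only).
import numpy as np
import torch

agent = Agent(action_size=4, state_size=33, num_agents=1)
replay = ReplayBuffer(buffer_size=int(1e5))  # assumed replay-buffer helper

for episode in range(200):
    state = env.reset()  # assumed gym-like environment
    for t in range(1000):
        state_t = torch.from_numpy(
            np.asarray(state, dtype=np.float32).reshape(1, -1)).to(agent.device)
        actions = agent.act(state_t, add_noise=True)
        next_state, reward, done, _ = env.step(actions[0])
        replay.add(state, actions[0], reward, next_state, done)
        state = next_state
        if len(replay) > 256:
            agent.learn(replay, sample_train=10, batch_size=256)
        if done:
            break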
Code example #2
import torch
import torch.optim as optim
from torch.distributions import Normal
from torch.utils.data import DataLoader, TensorDataset

# discounted_rewards is assumed to be defined elsewhere in the project;
# the ActorCritic class is passed in as a constructor argument.


class PPO:
    def __init__(self, ActorCritic, state_dim, action_dim, action_std, lr,
                 betas, gamma, K_epochs, batch_size, eps_clip, device, env):
        self.lr = lr  # Adam learning rate
        self.betas = betas  # Adam beta coefficients
        self.gamma = gamma  # discount for the Monte Carlo return
        self.eps_clip = eps_clip  # PPO surrogate clipping range
        self.K_epochs = K_epochs  # optimisation epochs per update
        self.batch_size = batch_size
        self.device = device
        self.action_std = action_std
        self.env = env

        self.old_policy = ActorCritic(state_dim, action_dim).to(device)
        self.new_policy = ActorCritic(state_dim, action_dim).to(device)

        self.optimizer = optim.Adam(self.new_policy.parameters(),
                                    lr=lr,
                                    betas=betas)
        # Initialise the old policy with the same weights as the new policy.
        self.old_policy.load_state_dict(self.new_policy.state_dict())

    def sample_action_old_policy(self, state, sample_batch):
        state = torch.Tensor(state.reshape(1, -1)).squeeze().to(self.device)
        # No gradients are needed while collecting rollouts with the old policy.
        with torch.no_grad():
            action_mu = self.old_policy.actor(state)
            # Normal takes a standard deviation as its scale parameter.
            dist = Normal(action_mu, self.action_std)
            action = dist.sample()
            action = torch.clamp(action, self.env.action_space.low[0],
                                 self.env.action_space.high[0])
            logprob = dist.log_prob(action)

        # Record the observation, action, and log-probability in the batch.
        sample_batch.states.append(state)
        sample_batch.actions.append(action)
        sample_batch.logprobs.append(logprob)
        return action

    def update(self, sample_batch):

        # Monte Carlo estimate of the discounted returns.
        rewards = discounted_rewards(sample_batch.rewards,
                                     sample_batch.is_terminals,
                                     self.device).unsqueeze(1).detach()
        # Unpack the stored states, actions, and log-probabilities.
        old_states = torch.stack(sample_batch.states).to(self.device).detach()
        old_actions = torch.stack(sample_batch.actions).to(
            self.device).detach()
        old_logprobs = torch.stack(sample_batch.logprobs).to(
            self.device).detach()

        dataset = TensorDataset(rewards, old_states, old_actions, old_logprobs)
        dataloader = DataLoader(dataset=dataset,
                                batch_size=self.batch_size,
                                shuffle=True)

        for epoch in range(self.K_epochs):
            # Iterate over the rollout in shuffled mini-batches.
            for batch_data in dataloader:
                (batch_rewards, batch_old_states, batch_old_actions,
                 batch_old_logprobs) = batch_data
                # Evaluate the stored states and actions under the new policy.
                new_action_mu = self.new_policy.actor(batch_old_states)
                # Normal takes a standard deviation as its scale parameter.
                new_dist = Normal(new_action_mu, self.action_std)

                new_logprobs = new_dist.log_prob(batch_old_actions)
                new_dist_entropy = new_dist.entropy()
                new_state_values = self.new_policy.critic(batch_old_states)

                # Probability ratio between the new and old policies.
                ratios = torch.exp(new_logprobs - batch_old_logprobs)
                # Clipped surrogate objective, value loss, and entropy bonus.
                # Advantages are detached so the policy term does not
                # backpropagate through the critic.
                advantages = batch_rewards - new_state_values.detach()

                surr1 = ratios * advantages
                surr2 = torch.clamp(ratios, 1 - self.eps_clip,
                                    1 + self.eps_clip) * advantages
                loss = -torch.min(surr1, surr2) + 0.5 * (
                    new_state_values -
                    batch_rewards)**2 - 0.01 * new_dist_entropy

                # Gradient step on the mean loss over the mini-batch.
                self.optimizer.zero_grad()
                loss.mean().backward()
                # Gradient clipping could be added here if training is unstable,
                # e.g. torch.nn.utils.clip_grad_norm_(self.new_policy.parameters(), 10).
                self.optimizer.step()

        # Copy the weights of the new policy into the old policy.
        self.old_policy.load_state_dict(self.new_policy.state_dict())
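
PPO.update above relies on a discounted_rewards helper and a sample_batch container whose implementations are not shown. Below is a minimal sketch of what they might look like, under the assumption that discounted_rewards computes normalised Monte Carlo returns that reset at episode boundaries; the SampleBatch class name, the default gamma, and the normalisation constant are illustrative assumptions.

# Hypothetical versions of the helpers used by PPO; the project's real
# implementations are not shown above, so these are assumptions.
import torch


class SampleBatch:
    # Plain container for one rollout of experience.
    def __init__(self):
        self.states, self.actions, self.logprobs = [], [], []
        self.rewards, self.is_terminals = [], []

    def clear(self):
        self.__init__()


def discounted_rewards(rewards, is_terminals, device, gamma=0.99):
    # Monte Carlo returns, reset at episode boundaries and then normalised.
    returns, running = [], 0.0
    for reward, terminal in zip(reversed(rewards), reversed(is_terminals)):
        if terminal:
            running = 0.0
        running = reward + gamma * running
        returns.insert(0, running)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    return (returns - returns.mean()) / (returns.std() + 1e-7)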