Example #1
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils import clip_grad_norm_

# The Actor and Critic network classes, along with the hyperparameter
# constants used below (LR_ACTOR, LR_CRITIC, WEIGHT_DECAY, EPSILON,
# EPSILON_DECAY, BUFFER_SIZE, BATCH_SIZE, GAMMA, RHO, LEARN_EVERY,
# LEARN_NUMBER), come from the surrounding project; illustrative stand-ins
# for the constants, OUNoise and ReplayBuffer are sketched after the example.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class DDPGAgent:
    def __init__(self, state_dim, action_dim, random_seed):
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.seed = random_seed
        random.seed(random_seed)

        # Actor network with its target network
        self.actor_local = Actor(state_dim, action_dim, random_seed).to(device)
        self.actor_target = Actor(state_dim, action_dim,
                                  random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network with its target network
        self.critic_local = Critic(state_dim, action_dim,
                                   random_seed).to(device)
        self.critic_target = Critic(state_dim, action_dim,
                                    random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise
        self.noise = OUNoise(action_dim, random_seed)
        self.epsilon = EPSILON

        # Replay memory
        self.memory = ReplayBuffer(action_dim, BUFFER_SIZE, BATCH_SIZE,
                                   random_seed)

    def step(self, state, action, reward, next_state, done, timestamp):
        """Save experience in replay memory, and use random sample from memory to learn."""
        # Save experience
        self.memory.add(state, action, reward, next_state, done)
        # Learn (if there are enough samples in memory)
        if len(self.memory) > BATCH_SIZE and timestamp % LEARN_EVERY == 0:
            for _ in range(LEARN_NUMBER):
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Return actions for given state from current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples."""
        states, actions, rewards, next_states, dones = experiences

        #   UPDATE CRITIC   #
        actions_next = self.actor_target(next_states.to(device))
        Q_targets_next = self.critic_target(next_states.to(device),
                                            actions_next.to(device))
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic_local.parameters(),
                        1)  # Clip gradients when updating the critic network
        self.critic_optimizer.step()

        #   UPDATE ACTOR   #
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        #   UPDATE TARGET NETWORKS   #
        self.soft_update(self.critic_local, self.critic_target, RHO)
        self.soft_update(self.actor_local, self.actor_target, RHO)

        #   UPDATE EPSILON AND NOISE   #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def soft_update(self, local_model, target_model, rho):
        """Soft update model parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(rho * target_param.data +
                                    (1.0 - rho) * local_param.data)
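
The example above leans on several names defined elsewhere in its project: the hyperparameter constants, the OUNoise exploration process, and the ReplayBuffer. The block below is an illustrative stand-in, not the original module: the constant values, the mu/theta/sigma defaults, and the class bodies are assumptions chosen to match the interfaces the agent calls (OUNoise(size, seed) with sample()/reset(), ReplayBuffer(action_dim, buffer_size, batch_size, seed) with add()/sample()/__len__()), reusing the imports and device from the example.

import copy
from collections import deque, namedtuple

# Illustrative hyperparameter values; the original project defines its own.
BUFFER_SIZE = int(1e6)   # replay buffer size
BATCH_SIZE = 128         # minibatch size
GAMMA = 0.99             # discount factor
RHO = 0.995              # soft-update rate (target keeps 99.5% of its weights)
LR_ACTOR = 1e-4          # actor learning rate
LR_CRITIC = 1e-3         # critic learning rate
WEIGHT_DECAY = 0.0       # L2 weight decay for the critic optimizer
EPSILON = 1.0            # initial scale of the exploration noise
EPSILON_DECAY = 0.999    # noise-scale decay applied after each learning step
LEARN_EVERY = 20         # learn every N environment steps
LEARN_NUMBER = 10        # gradient updates per learning step


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.default_rng(seed)
        self.reset()

    def reset(self):
        """Reset the internal state to the long-running mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Advance the process one step and return the new state as a noise sample."""
        dx = (self.theta * (self.mu - self.state)
              + self.sigma * self.rng.standard_normal(len(self.state)))
        self.state = self.state + dx
        return self.state


class ReplayBuffer:
    """Fixed-size buffer of experience tuples with uniform random sampling."""

    def __init__(self, action_dim, buffer_size, batch_size, seed):
        self.action_dim = action_dim
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Store a single transition."""
        self.memory.append(
            self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Return a uniformly sampled minibatch as float tensors on `device`."""
        batch = random.sample(self.memory, k=self.batch_size)

        def stack(field, dtype=np.float32):
            return torch.from_numpy(
                np.vstack([getattr(e, field) for e in batch]).astype(dtype)
            ).float().to(device)

        return (stack("state"), stack("action"), stack("reward"),
                stack("next_state"), stack("done", dtype=np.uint8))

    def __len__(self):
        return len(self.memory)
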
Example #2
# As in Example #1, this agent assumes the imports above plus module-level
# constants EPSILON, EPSILON_DECAY, GAMMA and CLIPGRAD, and Actor, Critic and
# OUNoise classes defined elsewhere in the project.
class DDPGAgent:
    def __init__(self,
                 output_dim,
                 input_dim,
                 name,
                 hidden=256,
                 lr_actor=1.0e-3,
                 lr_critic=1.0e-3,
                 tau=1.0e-2,
                 seed=10):
        super(DDPGAgent, self).__init__()

        self.seed = seed
        self.actor = Actor(input_dim, hidden, output_dim, seed).to(device)
        self.critic = Critic(input_dim=input_dim,
                             action_dim=output_dim,
                             hidden=hidden,
                             seed=seed,
                             output_dim=1).to(device)
        self.target_actor = Actor(input_dim, hidden, output_dim,
                                  seed).to(device)
        self.target_critic = Critic(input_dim=input_dim,
                                    action_dim=output_dim,
                                    hidden=hidden,
                                    seed=seed,
                                    output_dim=1).to(device)
        self.name = name
        self.noise = OUNoise(output_dim, seed)
        self.tau = tau
        self.epsilon = EPSILON
        self.gamma = GAMMA
        self.clipgrad = CLIPGRAD
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

    def act(self, state, add_noise=True):
        """Return actions for given state from current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).cpu().squeeze(0).data.numpy()
        self.actor.train()
        if add_noise:
            action += self.noise.sample() * self.epsilon
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        states, actions, rewards, next_states, dones = experiences

        #   UPDATE CRITIC   #
        actions_next = self.target_actor(next_states.to(device))
        Q_targets_next = self.target_critic(next_states.to(device),
                                            actions_next.to(device))
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)

        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        clip_grad_norm_(self.critic.parameters(), self.clipgrad)
        self.critic_optimizer.step()

        #   UPDATE ACTOR   #
        actions_pred = self.actor(states)
        actor_loss = -self.critic(states, actions_pred).mean()

        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        # Actor gradients are left unclipped here; uncomment to clip them too:
        # clip_grad_norm_(self.actor.parameters(), self.clipgrad)
        self.actor_optimizer.step()

        #   UPDATE TARGET NETWORKS   #
        self.soft_update(self.critic, self.target_critic)
        self.soft_update(self.actor, self.target_actor)

        #   UPDATE EPSILON AND NOISE   #
        self.epsilon *= EPSILON_DECAY
        self.noise.reset()

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
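
Unlike Example #1, this agent keeps no replay memory of its own: learn() expects the caller to pass an already-batched (states, actions, rewards, next_states, dones) tuple of tensors, a pattern commonly seen when an external (possibly shared) buffer feeds the agent. The loop below is only a sketch of how it might be driven: the gym environment, the episode count, the buffer sizes and the ReplayBuffer stand-in from after Example #1 are all assumptions, and it uses the classic Gym API in which reset() returns the observation and step() returns a 4-tuple.

import gym

# Illustrative continuous-control task and placeholder sizes.
env = gym.make("Pendulum-v1")
state_dim = env.observation_space.shape[0]
action_dim = env.action_space.shape[0]

agent = DDPGAgent(output_dim=action_dim, input_dim=state_dim, name="agent_0")
buffer = ReplayBuffer(action_dim, buffer_size=int(1e6), batch_size=128, seed=10)

for _ in range(200):
    state = env.reset()   # classic Gym API: reset() returns the observation
    agent.reset()         # restart the exploration noise process
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        buffer.add(state, action, reward, next_state, done)
        if len(buffer) > buffer.batch_size:
            agent.learn(buffer.sample())   # caller supplies the sampled batch
        state = next_state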