Example #1
import os
import random

import numpy as np
import torch
import torch.nn.functional as F
from torch.optim import Adam

# NOTE: project-specific classes used below (Config, CnnDQN, ReplayBuffer,
# PrioritizedReplayBuffer, LinearSchedule, get_class_attr_val) are assumed to
# be provided by the surrounding project.


class CnnDDQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        if self.config.prioritized_replay:
            self.buffer = PrioritizedReplayBuffer(
                self.config.max_buff,
                alpha=self.config.prioritized_replay_alpha)
            prioritized_replay_beta_iters = (
                self.config.prioritized_replay_beta_iters)
            if prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = self.config.frames
            self.beta_schedule = LinearSchedule(
                prioritized_replay_beta_iters,
                initial_p=self.config.prioritized_replay_beta0,
                final_p=1.0)
        else:
            self.buffer = ReplayBuffer(self.config.max_buff)
            self.beta_schedule = None

        self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
        self.target_model = CnnDQN(self.config.state_shape,
                                   self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.model_optim = Adam(self.model.parameters(),
                                lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None: epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            with torch.no_grad():
                q_value = self.model(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):

        if self.config.prioritized_replay:
            experience = self.buffer.sample(self.config.batch_size,
                                            beta=self.beta_schedule.value(fr))
            (s0, a, r, s1, done, weights, batch_idxes) = experience
        else:
            s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)
            weights, batch_idxes = np.ones_like(r), None

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)
        weights = torch.tensor(weights, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()
            weights = weights.cuda()

        q_values = self.model(s0)
        next_q_values = self.model(s1)
        next_q_state_values = self.target_model(s1)

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_state_values.gather(
            1,
            next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # TD error between the current estimate and the target; its magnitude
        # is used below to update the replay priorities.
        td_errors = q_value - expected_q_value
        # Detach expected_q_value so no gradients flow through the target.
        loss = F.smooth_l1_loss(q_value,
                                expected_q_value.detach(),
                                reduction='none')
        loss = (loss * weights).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        if self.config.prioritized_replay:
            new_priorities = (np.abs(td_errors.detach().cpu().numpy())
                              + self.config.prioritized_replay_eps)
            self.buffer.update_priorities(batch_idxes, new_priorities)

        if fr % self.config.update_tar_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def cuda(self):
        self.model.cuda()
        self.target_model.cuda()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_model(self, output, name=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, name))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")

    def save_checkpoint(self, fr, output):
        checkpath = output + '/checkpoint_model'
        os.makedirs(checkpath, exist_ok=True)
        torch.save({
            'frames': fr,
            'model': self.model.state_dict()
        }, '%s/checkpoint_fr_%d.tar' % (checkpath, fr))

    def load_checkpoint(self, model_path):
        checkpoint = torch.load(model_path)
        fr = checkpoint['frames']
        self.model.load_state_dict(checkpoint['model'])
        self.target_model.load_state_dict(checkpoint['model'])
        return fr
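
The class above is only the agent; for context, here is a minimal training-loop sketch showing how it might be driven. The Gym-style `env`, the epsilon schedule, and the extra Config fields used here (epsilon_max, eps_decay, checkpoint_interval, output_dir) are illustrative assumptions, not part of the original snippet.

# Minimal training-loop sketch (assumed interface, for illustration only).
# `env` is assumed to follow the Gym reset()/step() API, and `config` is
# assumed to provide the extra fields referenced below.
def train(agent, env, config):
    state = env.reset()
    for fr in range(1, config.frames + 1):
        # Linear epsilon decay from epsilon_max down to epsilon_min (assumed fields).
        epsilon = max(
            config.epsilon_min,
            config.epsilon_max - fr *
            (config.epsilon_max - config.epsilon_min) / config.eps_decay)

        action = agent.act(state, epsilon)
        next_state, reward, done, _ = env.step(action)
        # The buffer's add() signature is assumed to match the OpenAI-baselines
        # style (obs, action, reward, next_obs, done).
        agent.buffer.add(state, action, reward, next_state, float(done))
        state = env.reset() if done else next_state

        # Start learning once enough transitions have been collected.
        if fr > config.batch_size:
            agent.learning(fr)

        if fr % config.checkpoint_interval == 0:
            agent.save_checkpoint(fr, config.output_dir)
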
Example #2
import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# NOTE: the hyperparameter constants (BUFFER_SIZE, BATCH_SIZE, THRESHOLD,
# GAMMA, TAU, LEARNING_RATE), the torch `device`, and the network/buffer
# classes (QNetwork, DuelingQNetwork, ReplayBuffer, PrioritizedReplayBuffer)
# are assumed to be defined elsewhere in the project.


class Agent:
    """ Class implementation of a so-called "intelligent" agent.
        This agent interacts with and learns from the environment.
    """

    double_dqn = False
    """ True for the Double-DQN method.
    """

    dueling_network = False
    """ True for the Dueling Network (DN) method.
    """

    prioritized_replay = False
    """ True for the Prioritized Replay memory buffer.
    """
    def __init__(self,
                 state_size,
                 action_size,
                 seed,
                 lr_decay=0.9999,
                 double_dqn=False,
                 dueling_network=False,
                 prioritized_replay=False):
        """ Initialize an Agent instance.
        
        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            lr_decay (float): Multiplicative factor of learning rate decay
            double_dqn (bool): Toggle for using the Double-DQN method
            dueling_network (bool): Toggle for using the Dueling Network (DN) method
            prioritized_replay (bool): Toggle for using the Prioritized Replay method
        """

        # Set the parameters.
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.double_dqn = double_dqn
        self.dueling_network = dueling_network
        self.prioritized_replay = prioritized_replay

        # Q-Network hidden layers.
        hidden_layers = [128, 32]

        # Use the Dueling Network (DN) method.
        if self.dueling_network:

            # DN requires a hidden state value.
            hidden_state_value = [64, 32]

            self.qnetwork_local = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target = DuelingQNetwork(
                state_size, action_size, seed, hidden_layers,
                hidden_state_value).to(device)
            self.qnetwork_target.eval()

        else:  # Use the Deep Q-Network (DQN) method.

            self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                           hidden_layers).to(device)
            self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                            hidden_layers).to(device)
            self.qnetwork_target.eval()

        # Optimize using Adam.
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=LEARNING_RATE)
        self.lr_scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, lr_decay)

        # Use the Prioritized Replay memory buffer if enabled.
        if self.prioritized_replay:

            self.memory = PrioritizedReplayBuffer(action_size,
                                                  BUFFER_SIZE,
                                                  BATCH_SIZE,
                                                  seed,
                                                  device,
                                                  alpha=0.6,
                                                  beta=0.4,
                                                  beta_scheduler=1.0)

        else:  # Use the Replay memory buffer instead.
            self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE,
                                       seed, device)

        # Initialize the time-step counter (cycles modulo THRESHOLD).
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        """ Update the network on each step.

        Params
        ======
            state (array_like): Current state
        """

        # Save experience in replay memory.
        self.memory.add(state, action, reward, next_state, done)

        # Advance the time-step counter; learning is triggered every THRESHOLD steps.
        self.t_step = (self.t_step + 1) % THRESHOLD

        if self.t_step == 0:  # Every THRESHOLD-th step.

            # If enough samples are available in memory, get random subset and learn.
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """ Return the actions for a given state as per current policy.
        
        Params
        ======
            state (array_like): Current state
            eps (float): Epsilon (ε), for epsilon-greedy action selection
        """

        # Epsilon-greedy action selection.
        if random.random() > eps:

            state = torch.from_numpy(state).float().unsqueeze(0).to(device)

            self.qnetwork_local.eval()

            with torch.no_grad():
                action_values = self.qnetwork_local(state)

            # Switch the network back to training mode.
            self.qnetwork_local.train()

            # Return the action.
            return np.argmax(action_values.cpu().data.numpy())

        else:  # Return a random action.
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """ Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): Batch of (s, a, r, s', done, w) tensors
            gamma (float): Discount factor
        """

        # Set the parameters.
        states, actions, rewards, next_states, dones, w = experiences

        # Compute and minimize the loss.
        with torch.no_grad():

            if self.double_dqn:  # Use the Double-DQN method.

                # Action selection with the local network:
                # compute the Q-values for each of the next_states ...
                next_action_rewards_local = self.qnetwork_local(next_states)

                # ... and pick the action with the highest Q-value.
                greedy_actions_local = next_action_rewards_local.max(
                    dim=1, keepdim=True)[1]

                # Action evaluation with the target network:
                # compute the target Q-values for each of the next_states ...
                next_action_rewards_target = self.qnetwork_target(next_states)

                # ... and read off the target Q-value of the action chosen
                # by the local network.
                target_rewards = next_action_rewards_target.gather(
                    1, greedy_actions_local)

            else:  # Use the fixed Q-targets method.

                # Compute the target Q-values for each of the next_states.
                next_action_rewards = self.qnetwork_target(next_states)

                # Take the maximum target Q-value over the next actions.
                target_rewards = next_action_rewards.max(dim=1,
                                                         keepdim=True)[0]

            # Calculate the discounted target rewards.
            target_rewards = rewards + (gamma * target_rewards * (1 - dones))

        # Compute the Q-values for each of the states;
        # shape: [batch_size, action_size].
        expected_action_rewards = self.qnetwork_local(states)

        # Gather the Q-value of the action actually taken;
        # shape: [batch_size, 1].
        expected_rewards = expected_action_rewards.gather(1, actions)

        # If the Prioritized Replay memory buffer is enabled.
        if self.prioritized_replay:
            # TD error between the discounted target and the current estimate.
            td_error = target_rewards - expected_rewards

            with torch.no_grad():
                # Use the absolute TD error as the new priorities.
                self.memory.update_priorities(td_error.abs().squeeze())

            # Weight the squared TD error by the importance-sampling weights
            # to correct the bias introduced by prioritized sampling.
            loss = (w * td_error.squeeze().pow(2)).mean()

        else:  # Calculate the loss.
            loss = F.mse_loss(expected_rewards, target_rewards)

        # Perform the back-propagation.
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.lr_scheduler.step()

        # Update the target network.
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """ Soft update model parameters:
            θ_target = τ * θ_local + (1 - τ) * θ_target.

        Params
        ======
            local_model (PyTorch model): Weights will be copied from
            target_model (PyTorch model): Weights will be copied to
            tau (float): Interpolation parameter 
        """

        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1. - tau) * target_param.data)
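
Similarly, a minimal episode-loop sketch for Example #2 is given below. The Gym-style `env`, the epsilon-decay schedule, and the seed/episode counts are illustrative assumptions; the constants the class relies on (BUFFER_SIZE, BATCH_SIZE, THRESHOLD, GAMMA, TAU, LEARNING_RATE, device) must already be defined, as noted above.

# Minimal episode-loop sketch (assumed interface, for illustration only).
def train(env, n_episodes=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    # Gym-style spaces are assumed for sizing the networks.
    agent = Agent(state_size=env.observation_space.shape[0],
                  action_size=env.action_space.n,
                  seed=0,
                  double_dqn=True,
                  dueling_network=True,
                  prioritized_replay=True)

    scores, eps = [], eps_start
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        score, done = 0.0, False
        while not done:
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            # Store the transition and (every THRESHOLD steps) learn from a batch.
            agent.step(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        eps = max(eps_end, eps * eps_decay)  # decay exploration over episodes
        scores.append(score)
    return scores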