Example #1
import torch
import torch.nn.functional as F

# DQN and DQN_Agent are defined elsewhere in the same project.

class Double_DQN_Agent(DQN_Agent):
    '''
    Double DQN agent. Sources:
    https://towardsdatascience.com/double-deep-q-networks-905dd8325412
    https://www.freecodecamp.org/news/improvements-in-deep-q-learning-dueling-double-dqn-prioritized-experience-replay-and-fixed-58b130cc5682/

    Denote 'policy net' (forward + backward) and 'target net' (forward only).
    During optimization we do 3 forward passes:
    1) Policy net: Q(state, action) to get the Q values of the (state, action) pairs.
    2) Policy net: Q(next_state, .) to select next_action as the argmax over actions.
    3) Target net: Q(next_state, next_action) to evaluate the selected next actions.
    ***NOTICE: some sources swap the networks, using the target net in step 2) and the policy net in step 3).
    Then compute the loss and do a backward step on the policy net only.
    Finally, copy the weights from the policy net into the target net with Polyak averaging
    (see the sketch after this example).
    '''
    def __init__(
        self,
        state_size,
        n_actions,
        args,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__(state_size, n_actions, args, device=device)
        self.target_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def _compute_loss(self, state_batch, action_batch, next_states_batch,
                      reward_batch):
        # Q{policy net}(s, a)
        state_action_q_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # argmax{a'} Q{policy net}(s', a')
        next_state_actions = torch.argmax(self.policy_net(next_states_batch),
                                          dim=1).unsqueeze(1)

        # Q{target net}(s', argmax{a'} Q{policy net}(s', a'))
        next_state_q_values = self.target_net(next_states_batch).gather(
            1, next_state_actions).detach()

        # Q* = discount * Q(s', argmax(..)) + R
        expected_state_action_values = (next_state_q_values *
                                        self.discount) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_q_values,
                                expected_state_action_values)
        return loss
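
The docstring above mentions copying the policy-net weights into the target net with Polyak averaging, but that update step is not part of this snippet. A minimal sketch of such a soft update, assuming a mixing coefficient tau (the function name and the default value are illustrative, not taken from this project):

import torch

def polyak_update(policy_net, target_net, tau=0.005):
    # Soft update: target <- tau * policy + (1 - tau) * target.
    # `tau` is an illustrative hyperparameter, not taken from the snippet above.
    with torch.no_grad():
        for target_param, policy_param in zip(target_net.parameters(),
                                              policy_net.parameters()):
            target_param.mul_(1.0 - tau)
            target_param.add_(tau * policy_param)

Called after each optimizer step on the policy net, this keeps the target net slowly tracking the policy net instead of copying it all at once.
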
Example #2
class Fixed_Q_Targets_Agent(DQN_Agent):
    '''
    This agent implements the Fixed Q-Targets algorithm. There are two deep networks:
    Policy network - predicts the Q value of a given action in a given state, i.e. Q(s, a).
    Target network - predicts the Q values of the actions in the next state, i.e. max_a' Q(s', a'),
                     used in the loss calculation (a sketch of a typical target-net update follows this example).
    '''
    def __init__(
        self,
        state_size,
        n_actions,
        args,
        device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        super().__init__(state_size, n_actions, args, device=device)
        self.target_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

    def _compute_loss(self, state_batch, action_batch, next_states_batch,
                      reward_batch):
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) = max_a' Q(s', a') for all next states, based on
        # the "older" target_net; the best value per state is selected with
        # max(1)[0]. detach() keeps gradients from flowing into the target net.
        next_state_values = self.target_net(next_states_batch).max(
            1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values.unsqueeze(1) *
                                        self.discount) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)
        return loss
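
As in the Double DQN example, this snippet initializes the target net from the policy net but does not show how it is refreshed afterwards. In the Fixed Q-Targets scheme the target net is typically re-synchronized with a periodic hard copy of the policy-net weights; a minimal sketch, where the step counter and target_update_interval are illustrative names, not taken from this code:

def maybe_sync_target(agent, step, target_update_interval=1000):
    # Hard update: every `target_update_interval` optimization steps,
    # overwrite the target net with the current policy-net weights.
    # The interval is an illustrative hyperparameter.
    if step % target_update_interval == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
        agent.target_net.eval()

The practical difference from the Double DQN agent above sits entirely in _compute_loss: here the target net both selects and evaluates the next action via max(1)[0], while the Double DQN agent selects with the policy net and evaluates with the target net, which reduces overestimation of Q values.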