Example #1
class DQN_Agent:
    """
    PyTorch Implementation of DQN/DDQN.
    """
    def __init__(self, state_size, action_size, args):
        """
        Initialize a DQN/DDQN Agent.
        """

        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.state_size = state_size
        self.framework = args.framework
        self.eval = args.eval
        self.agent_count = 1
        self.learn_rate = args.learn_rate
        self.batch_size = args.batch_size
        self.buffer_size = args.buffer_size
        self.C = args.C
        self._epsilon = args.epsilon
        self.epsilon_decay = args.epsilon_decay
        self.epsilon_min = args.epsilon_min

        self.gamma = 0.99
        self.rollout = args.rollout
        self.tau = args.tau
        self.momentum = 1
        self.l2_decay = 0.0001
        self.update_type = "hard"

        self.t_step = 0
        self.episode = 0
        self.seed = 0

        # Set up memory buffers
        if args.prioritized_experience_replay:
            self.memory = PERBuffer(self.buffer_size, self.batch_size,
                                    args.framestack, self.device, args.alpha,
                                    args.beta)
            self.criterion = WeightedLoss()
        else:
            self.memory = ReplayBuffer(self.device, self.buffer_size,
                                       self.gamma, self.rollout)

        #                    Initialize Q networks                         #
        self.q = self._make_model(state_size, action_size, args.pixels)
        self.q_target = self._make_model(state_size, action_size, args.pixels)
        self._hard_update(self.q, self.q_target)
        self.q_optimizer = self._set_optimizer(self.q.parameters(),
                                               lr=self.learn_rate,
                                               decay=self.l2_decay,
                                               momentum=self.momentum)

        self.new_episode()

    @property
    def epsilon(self):
        """
        Anneals epsilon toward the specified minimum every time this property
        is accessed, so the policy always retains some exploration noise.
        """

        self._epsilon = max(self.epsilon_min, self.epsilon_decay**self.t_step)
        return self._epsilon

    def act(self, state, eval=False, pretrain=False):
        """
        Selects an action using an epsilon-greedy policy.
        Always acts greedily when evaluating; acts randomly while pretraining.
        """

        if not pretrain and (eval or np.random.random() > self.epsilon):
            state = state.to(self.device)
            with torch.no_grad():
                action_values = self.q(state).detach().cpu()
            action = action_values.argmax(dim=1).unsqueeze(0).numpy()
        else:
            action = np.random.randint(self.action_size, size=(1, 1))
        return action.astype(np.int64)

    def step(self, state, action, reward, next_state, pretrain=False):
        """
        Add the current SARS' tuple into the short term memory, then learn
        """

        # Current SARS' stored in short term memory, then stacked for NStep
        experience = (state, action, reward, next_state)
        if self.rollout == 1:
            self.memory.store_trajectory(state, torch.from_numpy(action),
                                         torch.tensor([reward]), next_state)
        else:
            self.memory.store_experience(experience)
        self.t_step += 1

        # Learn every step once pretraining is complete
        if not pretrain:
            self.learn()

    def learn(self):
        """
        Samples a batch from replay memory and trains the deep Q-network.
        Supports both DQN and DDQN target computation.
        """

        # Sample from replay buffer, REWARDS are sum of (ROLLOUT - 1) timesteps
        # Already calculated before storing in the replay buffer.
        # NEXT_STATES are ROLLOUT steps ahead of STATES
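        # With those conventions, the target computed below is the n-step
        # bootstrapped return:
        #   target = sum_{k=0}^{rollout-1} gamma^k * r_{t+k}
        #            + gamma^rollout * max_a Q_target(s_{t+rollout}, a)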
        batch, is_weights, tree_idx = self.memory.sample(self.batch_size)
        states, actions, rewards, next_states, terminal_mask = batch

        q_values = torch.zeros(self.batch_size).to(self.device)
        if self.framework == 'DQN':
            # Max predicted Q values for the next states from the target model
            q_values[terminal_mask] = self.q_target(next_states).detach().max(
                dim=1)[0]

        if self.framework == 'DDQN':
            # Get the maximizing ACTION under the active network, then
            # evaluate that action's value under q_target

            # Max valued action under active network
            max_actions = self.q(next_states).detach().argmax(1).unsqueeze(1)
            # Use the active network action to get the value of the stable
            # target network
            q_values[terminal_mask] = self.q_target(
                next_states).detach().gather(1, max_actions).squeeze(1)

        targets = rewards + (self.gamma**self.rollout * q_values)

        targets = targets.unsqueeze(1)
        values = self.q(states).gather(1, actions)

        # Huber loss provides better results than MSE
        if is_weights is None:
            loss = F.smooth_l1_loss(values, targets)

        # Compute Huber loss manually to utilize is_weights with prioritization
        else:
            loss, td_errors = self.criterion.huber(values, targets, is_weights)
            self.memory.batch_update(tree_idx, td_errors)

        # Perform gradient descent
        self.q_optimizer.zero_grad()
        loss.backward()
        self.q_optimizer.step()

        self._update_networks()
        self.loss = loss.item()

    def initialize_memory(self, pretrain_length, env):
        """
        Fills the replay buffer with PRETRAIN_LENGTH experiences
        before training begins.
        """

        if len(self.memory) >= pretrain_length:
            print("Memory already filled, length: {}".format(len(self.memory)))
            return

        print("Initializing memory buffer.")

        while True:
            done = False
            env.reset()
            state = env.state
            while not done:
                action = self.act(state, pretrain=True)
                next_state, reward, done = env.step(action)
                if done:
                    next_state = None

                self.step(state, action, reward, next_state, pretrain=True)
                state = next_state

                if self.t_step % 50 == 0 or len(
                        self.memory) >= pretrain_length:
                    print("Taking pretrain step {}... memory filled: {}/{}".format(
                        self.t_step, len(self.memory), pretrain_length))
                if len(self.memory) >= pretrain_length:
                    print("Done!")
                    self.t_step = 0
                    self._epsilon = 1
                    return

    def new_episode(self):
        """
        Handle any cleanup or steps to begin a new episode of training.
        """

        self.memory.init_n_step()
        self.episode += 1

    def _update_networks(self):
        """
        Updates the target network using either DDPG-style soft updates (with
        parameter TAU) or a DQN/D4PG-style hard update every C timesteps.
        """

        if self.update_type == "soft":
            self._soft_update(self.q, self.q_target)
        elif self.t_step % self.C == 0:
            self._hard_update(self.q, self.q_target)

    def _soft_update(self, active, target):
        """
        Slowly updates the target network with every-step partial copies of the
        active network's parameters, modulated by parameter TAU.
        """

        for t_param, param in zip(target.parameters(), active.parameters()):
            t_param.data.copy_(self.tau * param.data +
                               (1 - self.tau) * t_param.data)

    def _hard_update(self, active, target):
        """
        Fully copies parameters from the active network to the target network.
        Used in conjunction with a parameter "C" that sets how many timesteps
        pass between these hard updates.
        """

        target.load_state_dict(active.state_dict())

    def _set_optimizer(self, params, lr, decay, momentum, optimizer="Adam"):
        """
        Sets the optimizer based on command line choice. Defaults to Adam.
        """

        if optimizer == "RMSprop":
            return optim.RMSprop(params, lr=lr, momentum=momentum)
        elif optimizer == "SGD":
            return optim.SGD(params, lr=lr, momentum=momentum)
        else:
            return optim.Adam(params, lr=lr, weight_decay=decay)

    def _make_model(self, state_size, action_size, use_cnn):
        """
        Sets up the network model based on whether state data or pixel data is
        provided.
        """

        if use_cnn:
            return QCNNetwork(state_size, action_size,
                              self.seed).to(self.device)
        else:
            return QNetwork(state_size, action_size, self.seed).to(self.device)
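
Below is a minimal, hypothetical usage sketch for this first example. The SimpleNamespace argument container, the hyperparameter values, the assumed environment API (reset(), .state, and step(action) returning (next_state, reward, done)), and the train() helper are illustrations only; they are not part of the original example.

from types import SimpleNamespace

# Hypothetical argument container; the real values come from a command-line parser.
args = SimpleNamespace(
    framework="DDQN", eval=False, learn_rate=5e-4, batch_size=64,
    buffer_size=100_000, C=1000, epsilon=1.0, epsilon_decay=0.9999,
    epsilon_min=0.05, rollout=1, tau=1e-3, pixels=False,
    prioritized_experience_replay=False)


def train(env, agent, num_episodes=500):
    """Hypothetical loop; `env` is assumed to expose reset(), .state, and
    step(action) -> (next_state, reward, done), as the agent code implies."""
    agent.initialize_memory(pretrain_length=agent.batch_size, env=env)
    for _ in range(num_episodes):
        env.reset()
        state, done = env.state, False
        while not done:
            action = agent.act(state)
            next_state, reward, done = env.step(action)
            agent.step(state, action, reward, None if done else next_state)
            state = next_state
        agent.new_episode()


agent = DQN_Agent(state_size=8, action_size=4, args=args)
# train(my_env, agent)  # my_env: whichever environment wrapper is in use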
Example #2
class Agent():
    """Interacts with and learns from the environment."""
    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            framework (str): 'DQN' or 'DDQN'
            buffer_type (str): 'ReplayBuffer' or 'PER_ReplayBuffer'
        """
        self.state_size = state_size
        self.action_size = action_size
        random.seed(seed)
        self.seed = seed
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size,
                                       seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size,
                                        seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        # PER_ReplayBuffer signature: (device, buffer_size, batch_size, alpha, beta)
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        elif self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                if self.buffer_type == 'ReplayBuffer':
                    experiences = self.memory.sample()
                    is_weights = None
                    idxs = None
                elif self.buffer_type == 'PER_ReplayBuffer':
                    experiences, is_weights, idxs = self.memory.sample()
                    self.criterion = WeightedLoss()
                self.learn(experiences, is_weights, idxs, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.
        
        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()  # switch back to training mode

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, idxs, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples 
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences
        if self.framework == 'DQN':
            # Max predicted Q values for the next states from the target
            # network, which is updated more slowly than the local network
            Q_targets_next = self.qnetwork_target(next_states).detach().max(
                1)[0].unsqueeze(1)
        if self.framework == "DDQN":
            # Choose the maximizing action with the local network, then
            # evaluate that action's value with the target network
            max_actions = self.qnetwork_local(next_states).detach().argmax(
                1).unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, max_actions)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        #------------------------------------------------------------------------------
        # Huber loss provides better results than MSE
        if is_weights is None:
            loss = F.smooth_l1_loss(Q_expected, Q_targets)

        # Compute Huber loss manually to utilize is_weights with prioritization
        else:
            loss, td_errors = self.criterion.huber(Q_expected, Q_targets,
                                                   is_weights)
            self.memory.batch_update(idxs, td_errors)

        # Perform gradient descent
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
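
A similarly hedged usage sketch for this second example follows. It assumes the module-level constants used above (BUFFER_SIZE, BATCH_SIZE, LR, GAMMA, TAU, UPDATE_EVERY, ALPHA, BETA, device) are defined in the same file, and that a classic Gym environment with the old (obs, reward, done, info) step API is used; the LunarLander-v2 choice and the epsilon schedule are assumptions for illustration.

import gym

# LunarLander-v2 is assumed here only because it matches an 8-dim state / 4 actions.
env = gym.make("LunarLander-v2")
agent = Agent(state_size=8, action_size=4, seed=0,
              framework="DDQN", buffer_type="ReplayBuffer")

eps = 1.0
for episode in range(1000):
    state = env.reset()
    done = False
    while not done:
        action = agent.act(state, eps)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
    eps = max(0.01, eps * 0.995)  # anneal exploration over episodes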