Example 1
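A note on dependencies: this example uses NumPy as np and PyTorch as T, plus two project-specific helpers, ReplayBuffer and DeepQNetwork, that are defined elsewhere. A minimal import sketch follows; the module paths are assumptions, not part of the original snippet.

# Assumed imports (module paths are hypothetical)
import numpy as np
import torch as T

from replay_buffer import ReplayBuffer        # project helper, defined elsewhere
from deep_q_network import DeepQNetwork       # project helper, defined elsewhere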
class DDQNAgent():
    """
        A Double DQN agent has two networks: a local network and a target network.
        The local network is trained every iteration and is used to select actions.
        The target network is periodically overwritten with a copy of the local network's weights.

        Without this separation, the same network would both make the predictions and supply
        the Bellman targets used to compute the loss, so the targets would shift on every update.
        Keeping a separate, slowly-updated target network stabilizes learning.
    """
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                    mem_size, batch_size, eps_min = 0.01, eps_dec = 5e-7,
                    replace = 10_000):
        self.gamma = gamma #used to discount future rewards
        self.epsilon = epsilon #used for epsilon-greedy action choosing algo.
        self.lr = lr #learning rate: how large a step the optimizer takes
        self.n_actions = n_actions #number of actions available to our agent in its environment
        self.action_space = [i for i in range(n_actions)]#list comprehension to create array of indices of possible actions to choose from
        self.input_dims = input_dims #the dimensions of our input as defined by the agent's environment
        self.mem_size = mem_size #maximum amount of memories to store
        self.batch_size = batch_size #mini-batch size to sample from memory.
        self.eps_min = eps_min #smallest possible epsilon value for our agent
        self.eps_dec = eps_dec #how much to decrease epsilon each iteration
        self.replace_after = replace #how many learning steps between copying the local network's weights into the target network
        self.steps = 0 #iteration counter for use with replace_after

        #create a ReplayBuffer to store our memories, also used to sample a mini-batch
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.Q_local = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims = self.input_dims)
        self.Q_target = DeepQNetwork(self.lr, self.n_actions,
                                    input_dims = self.input_dims)

    def store_memory(self, state, action, reward, state_, done):
        """
            Save a new memory to our ReplayBuffer
        """
        self.memory.store_memory(state, action, reward, state_, done)

    def sample_batch(self):
        """
            Pull a stochastic mini-batch from our ReplayBuffer
        """
        state, action, reward, state_, done = \
                            self.memory.sample_batch(self.batch_size)

        states = T.tensor(state).to(self.Q_local.device)
        actions = T.tensor(action).to(self.Q_local.device)
        rewards = T.tensor(reward).to(self.Q_local.device)
        states_ = T.tensor(state_).to(self.Q_local.device)
        dones = T.tensor(done).to(self.Q_local.device)

        return states, actions, rewards, states_, dones


    def choose_action(self, observation):
        """
            Choose an action from our action space using an epsilon-greedy algorithm:
            with probability epsilon we EXPLORE, otherwise we EXPLOIT.

            Exploiting chooses the best known action (confidence).

            Exploring chooses a random action, which may expose the agent to new
            information to learn from.
        """
        if np.random.random() > self.epsilon:#epsilon-greedy (EXPLOIT)
            state = T.tensor([observation], dtype = T.float).to(self.Q_local.device)
            actions = self.Q_local.forward(state)
            action = T.argmax(actions).item()#.item() converts the one-element index tensor to a Python int
        else:#(EXPLORE)
            action = np.random.choice(self.action_space)#choose random action from our action space

        return action

    def replace_target_network(self):
        """
            Every replace_after learning steps, copy the local network's
            weights into the target network.
        """
        if self.replace_after is not None and \
                    self.steps % self.replace_after == 0:
            self.Q_target.load_state_dict(self.Q_local.state_dict())

    def decrement_epsilon(self):
        """
            decrease epsilon, but not below eps_min
        """
        self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)

    def learn(self):
        """
            Main part of our agent.

            First we zero the optimizer's gradients so gradients from previous
            batches do not accumulate.
            Then we sample a stochastic mini-batch from our ReplayBuffer.

            Then we compute predictions and targets for that mini-batch, calculate
            the loss, back-propagate, and step the optimizer.

            Finally, we decrement our epsilon and begin the cycle of (SEE->DO->LEARN) once again.
        """
        if self.memory.mem_cntr < self.batch_size:#if we don't have a full batch of memories yet, skip learning
            return

        self.Q_local.optimizer.zero_grad()#zero the optimizer's gradients so they don't accumulate across batches

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_batch()

        indices = np.arange(self.batch_size)

        q_pred = self.Q_local.forward(states)[indices, actions]#Q-values of the actions actually taken
        q_next = self.Q_target.forward(states_)#target network's estimates for the next states
        q_eval = self.Q_local.forward(states_)#local network's estimates, used only to pick the greedy next actions

        max_actions = T.argmax(q_eval, dim = 1)
        q_next[dones] = 0.0#terminal next states are worth zero

        q_target = rewards + self.gamma*q_next[indices, max_actions]#Bellman target: actions picked by Q_local, valued by Q_target
        loss = self.Q_local.loss(q_target, q_pred).to(self.Q_local.device)
        loss.backward()#back-propagation

        self.Q_local.optimizer.step()
        self.steps += 1

        self.decrement_epsilon()

    def save_agent(self):
        self.Q_local.save_model('local')
        self.Q_target.save_model('target')

    def load_agent(self):
        self.Q_local.load_model('local')
        self.Q_target.load_model('target')
Example 2
class Agent():
    def __init__(self,
                 input_dims,
                 n_actions,
                 lr,
                 mem_size,
                 batch_size,
                 epsilon,
                 gamma=0.99,
                 eps_dec=5e-7,
                 eps_min=0.01,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        self.lr = lr
        self.batch_size = batch_size
        self.input_dims = input_dims
        self.n_actions = n_actions
        self.gamma = gamma
        self.epsilon = epsilon
        self.eps_dec = eps_dec
        self.eps_min = eps_min
        self.replace = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + " " + self.algo +
                                   "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # Wrap the observation in a list so the tensor gets a batch dimension;
            # the convolutional network expects input of shape (batch_size, *input_dims).
            state = T.tensor([observation],
                             dtype=T.float).to(self.q_eval.device)
            q_values = self.q_eval.forward(state)
            action = T.argmax(q_values).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, resulted_state, done):
        self.memory.store_transition(state, action, reward, resulted_state,
                                     done)

    def sample_memory(self):
        state, action, reward, resulted_state, done = self.memory.sample_buffer(
            self.batch_size)
        state = T.tensor(state).to(self.q_eval.device)
        reward = T.tensor(reward).to(self.q_eval.device)
        done = T.tensor(done).to(self.q_eval.device)
        action = T.tensor(action).to(self.q_eval.device)
        resulted_state = T.tensor(resulted_state).to(self.q_eval.device)

        return state, reward, done, action, resulted_state

    def replace_target_network(self):
        if self.learn_step_counter % self.replace == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon -= self.eps_dec
        else:
            self.epsilon = self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        state, reward, done, action, resulted_state = self.sample_memory()

        indexes = np.arange(self.batch_size, dtype=np.longlong)
        action = action.long()
        done = done.bool()

        # forward() returns shape (batch_size, n_actions); indexing with
        # [indexes, action] keeps only the Q-value of the action actually taken,
        # giving a tensor of shape (batch_size,).
        prediction = self.q_eval.forward(state)[indexes, action]

        next_result = self.q_next.forward(resulted_state).max(dim=1)[0]
        next_result[done] = 0.0  # for terminal states, target should be reward
        target = reward + self.gamma * next_result

        loss = self.q_eval.loss(target, prediction).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()
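
One difference worth noting: Example 2's learn() builds a standard DQN target by taking the max over the target network's own estimates, while Example 1 builds a Double DQN target in which the local/online network picks the greedy next action and the target network evaluates it. Below is a small sketch of the Double DQN target using Example 2's names; it is a hypothetical helper, not code from either example, and it assumes done is already a bool tensor as in Example 2's learn().

def double_dqn_target(q_eval, q_next_net, resulted_state, reward, done, gamma):
    # Online network (q_eval) selects the greedy next action...
    max_actions = q_eval.forward(resulted_state).argmax(dim=1)
    # ...and the target network (q_next_net) evaluates that action.
    q_next = q_next_net.forward(resulted_state)
    indexes = T.arange(resulted_state.shape[0])
    next_result = q_next[indexes, max_actions]
    next_result[done] = 0.0  # terminal states contribute only the immediate reward
    return reward + gamma * next_result

This mirrors the q_eval / argmax / q_next indexing that Example 1 performs inline in its learn() method.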