Example #1
0
class DQNAgent():
    """Deep Q-learning agent with epsilon-greedy exploration and a target network.

    The agent owns a ``ReplayBuffer`` and two ``DeepQNetwork`` instances:
    ``q_eval``, whose optimizer is stepped in :meth:`learn`, and ``q_next``,
    which only ever receives copies of ``q_eval``'s weights and supplies the
    bootstrap values for the learning target.
    """

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        """Store the hyper-parameters and build the replay buffer and both networks.

        :param gamma: discount factor multiplied with the next-state value.
        :param epsilon: initial exploration threshold; a random action is taken
            whenever a uniform sample is not greater than it.
        :param lr: learning rate forwarded to both ``DeepQNetwork`` instances.
        :param n_actions: number of discrete actions.
        :param input_dims: observation dimensions forwarded to buffer and networks.
        :param mem_size: first argument of ``ReplayBuffer``.
        :param batch_size: number of transitions sampled per learning step.
        :param eps_min: floor that ``epsilon`` is reset to once it is reached.
        :param eps_dec: amount subtracted from ``epsilon`` per learning step.
        :param replace: ``q_next`` is synced whenever the learn-step counter is
            a multiple of this value.
        :param algo: algorithm label; must be a ``str`` because it is
            concatenated into the network names (the ``None`` default raises
            ``TypeError``).
        :param env_name: environment label; same ``str`` requirement as ``algo``.
        :param chkpt_dir: directory forwarded to both networks.
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = list(range(self.n_actions))
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        # NOTE(review): env_name and algo are joined without a separator here.
        # Left unchanged on purpose so the names handed to DeepQNetwork
        # (presumably used for checkpoint files -- confirm) stay the same.
        self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_eval', chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_next', chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        """Return an action index chosen epsilon-greedily for ``observation``.

        :param observation: a single observation; it is wrapped in a list so
            the network receives a batch of size one.
        :return: the arg-max action of ``q_eval`` or a random member of
            ``action_space``.
        """
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            # Pure inference: no autograd graph is needed to pick an action.
            with T.no_grad():
                actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        """Forward one transition to the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        """Sample ``batch_size`` transitions and return them as device tensors.

        :return: tuple ``(states, actions, rewards, states_, dones)`` where
            every element is a tensor on ``q_eval.device``.
        """
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)
        device = self.q_eval.device
        states = T.tensor(state).to(device)
        rewards = T.tensor(reward).to(device)
        dones = T.tensor(done).to(device)
        actions = T.tensor(action).to(device)
        states_ = T.tensor(new_state).to(device)
        # Return the converted tensor; the raw numpy ``done`` array used to be
        # returned here by mistake.
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        """Copy ``q_eval``'s weights into ``q_next`` every ``replace_target_cnt`` learn steps."""
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Subtract ``eps_dec`` from ``epsilon`` while it is above ``eps_min``, else reset it to ``eps_min``."""
        self.epsilon = self.epsilon - self.eps_dec if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        """Ask both networks to save a checkpoint."""
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        """Ask both networks to load their checkpoint."""
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        """Run one optimisation step on ``q_eval`` from a sampled batch.

        Does nothing until the replay buffer's ``mem_cntr`` has reached
        ``batch_size``. Otherwise the target ``r + gamma * max_a' Q_next(s', a')``
        is built (with the bootstrap term zeroed for terminal transitions),
        the loss is back-propagated, and the step counter and epsilon are
        updated.
        """
        if self.memory.mem_cntr < self.batch_size:
            return
        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()
        states, actions, rewards, states_, dones = self.sample_memory()
        # Pair each batch row with the action that was actually taken.
        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        # Boolean mask so the assignment is a mask-select whatever dtype the
        # buffer stores the terminal flags in.
        q_next[dones.bool()] = 0.0
        q_target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
class DQNAgent(object):
    """Agent that learns action values with an online and a target Q-network.

    ``q_eval`` is the network being optimised; ``q_next`` is refreshed from
    ``q_eval`` on a fixed cadence and provides the next-state values used in
    the learning target. Experience is kept in a ``ReplayBuffer``.
    """

    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 chkpt_dir='tmp/dqn'):
        """
        Record the hyper-parameters, then create the buffer and both networks.

        :param gamma: discount factor used in the learning target.
        :param epsilon: starting exploration threshold.
        :param lr: learning rate handed to the networks.
        :param n_actions: size of the discrete action space.
        :param input_dims: observation dimensions handed to buffer and networks.
        :param mem_size: first argument of ``ReplayBuffer``.
        :param batch_size: transitions sampled per call to ``learn``.
        :param eps_min: value epsilon settles on.
        :param eps_dec: per-step epsilon decrement.
        :param replace: target-network sync period, in learn steps.
        :param algo: label concatenated into the network names (must be ``str``).
        :param env_name: label concatenated into the network names (must be ``str``).
        :param chkpt_dir: directory handed to the networks.
        """
        # Discounting and optimisation settings
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        # Problem dimensions
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        # Exploration schedule
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        # Target sync cadence and naming
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        # Bookkeeping
        self.action_space = list(range(n_actions))
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        def build_network(suffix):
            # Both networks share every argument except the name suffix.
            return DeepQNetwork(self.lr,
                                self.n_actions,
                                input_dims=self.input_dims,
                                name=self.env_name + '_' + self.algo + suffix,
                                chkpt_dir=self.chkpt_dir)

        self.q_eval = build_network('_q_eval')
        self.q_next = build_network('_q_next')

    def choose_action(self, observation):
        """
        Pick an action epsilon-greedily.

        :param observation: one observation; wrapped in a list so the network
            sees a batch of size one.
        :return: a random entry of ``action_space`` when exploring, otherwise
            the index of the largest ``q_eval`` output.
        """
        exploit = np.random.random() > self.epsilon
        if not exploit:
            return np.random.choice(self.action_space)

        net = self.q_eval
        batch_of_one = T.tensor([observation], dtype=T.float).to(net.device)
        return T.argmax(net.forward(batch_of_one)).item()

    def store_transition(self, state, action, reward, state_, done):
        """Hand a single transition over to the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        """
        Draw ``batch_size`` transitions and convert each field to a tensor.

        :return: ``(states, actions, rewards, states_, dones)``, all located
            on ``q_eval.device``.
        """
        target_device = self.q_eval.device
        sampled = self.memory.sample_buffer(self.batch_size)
        # sample_buffer yields (state, action, reward, new_state, done).
        states, actions, rewards, states_, dones = (
            T.tensor(field).to(target_device) for field in sampled)
        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        """Sync ``q_next`` with ``q_eval`` when the step counter hits the period."""
        due = self.learn_step_counter % self.replace_target_cnt == 0
        if due:
            weights = self.q_eval.state_dict()
            self.q_next.load_state_dict(weights)

    def decrement_epsilon(self):
        """Lower epsilon by ``eps_dec`` while above ``eps_min``; otherwise pin it to ``eps_min``."""
        if self.epsilon > self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def save_models(self):
        """Checkpoint the online network, then the target network."""
        for net in (self.q_eval, self.q_next):
            net.save_checkpoint()

    def load_models(self):
        """Restore the online network, then the target network."""
        for net in (self.q_eval, self.q_next):
            net.load_checkpoint()

    def learn(self):
        """
        Perform one gradient step on ``q_eval`` using a sampled batch.

        Returns immediately while the buffer's ``mem_cntr`` is below
        ``batch_size``.
        """
        if self.memory.mem_cntr < self.batch_size:
            return

        online = self.q_eval

        # Clear stale gradients, then sync the target network if it is due.
        online.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        # Select, row by row, the value of the action that was taken.
        rows = np.arange(self.batch_size)
        q_pred = online.forward(states)[rows, actions]

        # Best next-state value according to the target network, with
        # terminal transitions contributing nothing.
        q_next, _ = self.q_next.forward(states_).max(dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next

        loss = online.loss(q_target, q_pred).to(online.device)
        loss.backward()
        online.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()
class DQNAgent():
    """DQN agent: replay buffer, online network ``q_eval`` and target network ``q_next``.

    Actions are chosen epsilon-greedily from ``q_eval``; ``q_next`` is
    overwritten with ``q_eval``'s weights every ``replace`` learning steps.
    """

    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 chkpt_dir,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None):
        """Keep the hyper-parameters and construct the buffer and both networks.

        Args:
            gamma: discount factor for the bootstrapped target.
            epsilon: initial exploration threshold.
            lr: learning rate passed to each network.
            n_actions: number of discrete actions.
            input_dims: observation dimensions passed to buffer and networks.
            mem_size: first argument of ``ReplayBuffer``.
            batch_size: number of transitions sampled per learning step.
            chkpt_dir: directory passed to each network.
            eps_min: floor for epsilon.
            eps_dec: amount removed from epsilon per learning step.
            replace: learn-step period between target-network syncs.
            algo: label used in the network names; must be a ``str``.
            env_name: label used in the network names; must be a ``str``.
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        # One entry per action index: 0 .. n_actions - 1.
        self.action_space = list(range(self.n_actions))
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        eval_name = self.env_name + '_' + self.algo + '_q_eval'
        self.q_eval = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=eval_name,
                                   chkpt_dir=self.chkpt_dir)
        next_name = self.env_name + '_' + self.algo + '_q_next'
        self.q_next = DeepQNetwork(self.lr,
                                   self.n_actions,
                                   input_dims=self.input_dims,
                                   name=next_name,
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        """Return a greedy action from ``q_eval`` or, when exploring, a random one."""
        if np.random.random() > self.epsilon:
            # The observation is wrapped in a list to form a batch of one.
            state = T.tensor([observation], dtype=T.float)
            state = state.to(self.q_eval.device)
            q_values = self.q_eval.forward(state)
            return T.argmax(q_values).item()
        return np.random.choice(self.action_space)

    def store_transition(self, state, action, reward, state_, done):
        """Record one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        """Sample a batch and return its five fields as tensors on ``q_eval.device``."""
        device = self.q_eval.device

        def as_tensor(array):
            return T.tensor(array).to(device)

        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)
        return (as_tensor(state), as_tensor(action), as_tensor(reward),
                as_tensor(new_state), as_tensor(done))

    def replace_target_network(self):
        """Load ``q_eval``'s state dict into ``q_next`` on every ``replace_target_cnt``-th step."""
        if self.learn_step_counter % self.replace_target_cnt != 0:
            return
        self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Reduce epsilon by ``eps_dec`` while it exceeds ``eps_min``; otherwise set it to ``eps_min``."""
        above_floor = self.epsilon > self.eps_min
        self.epsilon = (self.epsilon - self.eps_dec) if above_floor else self.eps_min

    def save_models(self):
        """Save a checkpoint of ``q_eval`` and then ``q_next``."""
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        """Load the checkpoint of ``q_eval`` and then ``q_next``."""
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        """Take one optimisation step on ``q_eval`` once enough transitions are stored."""
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        # Row i of the network output is indexed with the i-th taken action,
        # leaving one value per sampled transition.
        batch_index = np.arange(self.batch_size)
        all_q = self.q_eval.forward(states)
        q_pred = all_q[batch_index, actions]

        # Largest target-network value per next state.
        q_next, _ = self.q_next.forward(states_).max(dim=1)

        # Terminal transitions get no bootstrapped value.
        terminal = dones.bool()
        q_next[terminal] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred)
        loss = loss.to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()
class DoubleDQNAgent():
    """Double-DQN agent with epsilon-greedy exploration.

    ``q_net`` is trained online and also selects the next action used in the
    learning target; ``target_net`` evaluates that action and is refreshed
    from ``q_net`` every ``replace_target_count`` learning steps. Experience
    is kept in a ``ReplayMemory``.
    """

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 memory_size, batch_size, algo, env_name, checkpoint_dir,
                 epsilon_min=0.01, epsilon_decay=5e-7, replace_target_count=1000):
        """Store the hyper-parameters and build the replay memory and both networks.

        :param gamma: discount factor applied to the next-state value.
        :param epsilon: initial exploration threshold.
        :param lr: learning rate passed to both networks.
        :param n_actions: number of discrete actions.
        :param input_dims: observation dimensions passed to memory and networks.
        :param memory_size: first argument of ``ReplayMemory``.
        :param batch_size: number of transitions sampled per learning step.
        :param algo: label concatenated into the network names (``str``).
        :param env_name: label concatenated into the network names (``str``).
        :param checkpoint_dir: directory passed to both networks.
        :param epsilon_min: floor that epsilon settles on.
        :param epsilon_decay: amount subtracted from epsilon per learning step.
        :param replace_target_count: learn-step period between target syncs.
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.algo = algo
        self.env_name = env_name
        # Honour the caller's floor; it used to be hard-coded to 0.01 here.
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_target_count = replace_target_count
        self.checkpoint_dir = checkpoint_dir
        self.action_space = list(range(self.n_actions))
        self.learn_step_counter = 0

        self.memory = ReplayMemory(memory_size, input_dims, n_actions)

        self.q_net = DeepQNetwork(self.lr, self.n_actions,
                                  name=self.env_name+'_'+self.algo+'_q_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)
        self.target_net = DeepQNetwork(self.lr, self.n_actions,
                                       name=self.env_name+'_'+self.algo+'_target_net',
                                       input_dims=self.input_dims,
                                       checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        """Return an action index chosen epsilon-greedily for ``observation``.

        :param observation: a single observation; wrapped in a list so the
            network receives a batch of size one.
        :return: the arg-max action of ``q_net`` or a random member of
            ``action_space``.
        """
        if np.random.random() > self.epsilon:
            observation = T.tensor([observation], dtype=T.float).to(self.q_net.device)
            # Pure inference: no autograd graph is needed to pick an action.
            with T.no_grad():
                actions = self.q_net(observation)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def remember(self, state, action, reward, next_state, done):
        """Forward one transition to the replay memory."""
        self.memory.remember(state, action, reward, next_state, done)

    def sample_memory(self):
        """Sample ``batch_size`` transitions and return them as tensors on ``q_net.device``.

        :return: tuple ``(states, actions, rewards, next_states, dones)``.
        """
        states, actions, rewards, next_states, dones = \
            self.memory.sample_buffer(self.batch_size)

        device = self.q_net.device
        states = T.tensor(states).to(device)
        actions = T.tensor(actions).to(device)
        rewards = T.tensor(rewards).to(device)
        next_states = T.tensor(next_states).to(device)
        dones = T.tensor(dones).to(device)

        return states, actions, rewards, next_states, dones

    def replace_target_network(self):
        """Copy ``q_net``'s weights into ``target_net`` every ``replace_target_count`` learn steps."""
        if self.learn_step_counter % self.replace_target_count == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

    def decrement_epsilon(self):
        """Subtract ``epsilon_decay`` while epsilon is above ``epsilon_min``, else reset it to ``epsilon_min``."""
        self.epsilon = self.epsilon - self.epsilon_decay \
            if self.epsilon > self.epsilon_min else self.epsilon_min

    def learn(self):
        """Run one optimisation step on ``q_net`` from a sampled batch.

        Does nothing until the memory's ``memory_counter`` has reached
        ``batch_size``. The target uses the action that ``q_net`` rates
        highest in the next state, valued by ``target_net``; rows belonging to
        terminal transitions are zeroed first.
        """
        if self.memory.memory_counter < self.batch_size:
            return

        self.q_net.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, next_states, dones = self.sample_memory()

        q_prediction = self.q_net(states)
        target_predictions = self.target_net(next_states)
        # Boolean row mask so the assignment is a mask-select whatever dtype
        # the memory stores the terminal flags in.
        target_predictions[dones.bool()] = 0.0

        # Pair each batch row with the action that was actually taken.
        indices = np.arange(self.batch_size)
        q_value = q_prediction[indices, actions]

        # Action selection by the online network, evaluation by the target.
        t_actions = T.argmax(self.q_net(next_states), dim=1)
        target_value = rewards + self.gamma * target_predictions[indices, t_actions]

        loss = self.q_net.loss(q_value, target_value).to(self.q_net.device)
        loss.backward()
        self.q_net.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save_models(self):
        """Ask both networks to save a checkpoint."""
        self.q_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        """Ask both networks to load their checkpoint."""
        self.q_net.load_checkpoint()
        self.target_net.load_checkpoint()
Example #5
0
class Agent(object):
    """Epsilon-greedy agent for networks that return ``(value, advantage)`` pairs.

    ``q_eval`` is optimised in :meth:`learn`; ``q_next`` is a periodically
    refreshed copy that supplies the next-state values. In :meth:`learn` the
    two network outputs are combined as ``V + (A - mean(A))`` to obtain
    per-action values.
    """

    def __init__(self,
                 gamma,
                 epsilon,
                 lr,
                 n_actions,
                 input_dims,
                 mem_size,
                 batch_size,
                 eps_min=0.01,
                 eps_dec=5e-7,
                 replace=1000,
                 algo=None,
                 env_name=None,
                 checkpoint_dir='tmp/dqn'):
        """Keep the hyper-parameters and create the buffer and both networks.

        :param gamma: discount factor for the bootstrapped target.
        :param epsilon: initial exploration threshold.
        :param lr: learning rate passed to each network.
        :param n_actions: number of discrete actions.
        :param input_dims: observation dimensions passed to buffer and networks.
        :param mem_size: first argument of ``ReplayBuffer``.
        :param batch_size: transitions sampled per learning step.
        :param eps_min: floor for epsilon.
        :param eps_dec: amount removed from epsilon per learning step.
        :param replace: target sync period in learn steps; ``None`` disables syncing.
        :param algo: label used in the network names; must be a ``str``.
        :param env_name: label used in the network names; must be a ``str``.
        :param checkpoint_dir: directory passed to each network.
        """
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_counter = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = list(range(self.n_actions))
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        def make_network(suffix):
            # The two networks differ only in the suffix of their name.
            return DeepQNetwork(self.lr,
                                self.n_actions,
                                input_dims=self.input_dims,
                                name=self.env_name + '_' + self.algo + suffix,
                                checkpoint_dir=self.checkpoint_dir)

        self.q_eval = make_network("_q_eval")
        self.q_next = make_network("_q_next")

    def store_transition(self, state, action, reward, resulted_state, done):
        """Pass one transition on to the replay buffer."""
        self.memory.store_transition(state, action, reward, resulted_state,
                                     done)

    def sample_memory(self):
        """Sample a batch and return its fields as tensors on ``q_eval.device``.

        :return: ``(states, actions, rewards, resulted_states, dones)``.
        """
        device = self.q_eval.device
        state, action, reward, resulted_state, done = \
            self.memory.sample_buffer(self.batch_size)

        converted = [T.tensor(field).to(device)
                     for field in (state, action, reward, resulted_state, done)]
        states, actions, rewards, resulted_states, dones = converted

        return states, actions, rewards, resulted_states, dones

    def choose_action(self, observation):
        """Return the arg-max of the advantage output, or a random action when exploring.

        :param observation: one observation; wrapped in a list so the network
            receives a leading batch dimension of one.
        """
        if not np.random.random() > self.epsilon:
            return np.random.choice(self.action_space)

        state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
        # Only the advantage output is needed to rank the actions.
        _, advantages = self.q_eval.forward(state)
        return T.argmax(advantages).item()

    def replace_target_network(self):
        """Sync ``q_next`` from ``q_eval`` when a period is set and the step counter is a multiple of it."""
        period = self.replace_target_counter
        if period is None:
            return
        if self.learn_step_counter % period == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        """Reduce epsilon by ``eps_dec`` while above ``eps_min``; otherwise pin it to ``eps_min``."""
        self.epsilon = (self.epsilon - self.eps_dec
                        if self.epsilon > self.eps_min else self.eps_min)

    def learn(self):
        """Take one optimisation step on ``q_eval`` once ``mem_counter`` reaches ``batch_size``."""
        if self.memory.mem_counter < self.batch_size:
            return

        def combine(value, advantage):
            # Q = V + (A - mean over actions of A)
            centred = advantage - advantage.mean(dim=1, keepdim=True)
            return T.add(value, centred)

        online = self.q_eval
        online.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, resulted_states, dones = self.sample_memory()

        indexes = np.arange(self.batch_size)

        # Value of the action actually taken in each sampled state.
        V_states, A_states = online.forward(states)
        q_pred = combine(V_states, A_states)[indexes, actions]

        # Best value reachable from each resulting state, per the target
        # network; terminal transitions contribute nothing.
        V_resulted_states, A_resulted_states = self.q_next.forward(
            resulted_states)
        q_next, _ = combine(V_resulted_states, A_resulted_states).max(dim=1)
        q_next[dones] = 0.0

        target = rewards + self.gamma * q_next

        loss = online.loss(target, q_pred).to(online.device)
        loss.backward()
        online.optimizer.step()

        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        """Save a checkpoint of ``q_eval`` and then ``q_next``."""
        for network in (self.q_eval, self.q_next):
            network.save_checkpoint()

    def load_models(self):
        """Load the checkpoint of ``q_eval`` and then ``q_next``."""
        for network in (self.q_eval, self.q_next):
            network.load_checkpoint()