import numpy as np
import torch as T

# ReplayBuffer and DeepQNetwork are assumed to be defined elsewhere in the project.


class DQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        # Online network (updated every step) and target network (synced periodically)
        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        # Epsilon-greedy action selection
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
class DQNAgent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        """
        Choose an action with an epsilon-greedy policy.

        :param observation: state features as provided by the gym environment.
        :return: action
        """
        if np.random.random() > self.epsilon:
            # Convert the state to a PyTorch tensor and send it to q_eval.device
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            # Get action values from the q_eval network
            actions = self.q_eval.forward(state)
            # Pick the action with the highest value
            action = T.argmax(actions).item()
        else:
            # Select a random action from the action space
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        # If the memory counter has not reached the batch size, simply return
        if self.memory.mem_cntr < self.batch_size:
            return

        # Reset gradients for the main network's optimizer
        self.q_eval.optimizer.zero_grad()

        # Update the target network weights every n steps
        self.replace_target_network()

        # Sample environment transitions from the replay buffer
        states, actions, rewards, states_, dones = self.sample_memory()

        # Get Q(s,a) for the actions performed by the agent.
        # Because we process a batch of states, the output of forward is indexed
        # by the batch indices (0 to batch_size - 1) paired with the index of the
        # action taken in each state.
        indices = np.arange(self.batch_size)
        q_pred = self.q_eval.forward(states)[indices, actions]

        # Get max Q(s', a') from the target network
        q_next = self.q_next.forward(states_).max(dim=1)[0]
        # Set Q(s', a') to zero for terminal states
        q_next[dones] = 0.0

        # Compute the target as r + gamma * max Q(s', a')
        q_target = rewards + self.gamma * q_next

        # Compute the loss tensor and move it to q_eval.device
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)

        # Backpropagate the loss and optimize the network parameters
        loss.backward()
        self.q_eval.optimizer.step()

        # Increment the training step counter
        self.learn_step_counter += 1

        # Decrement epsilon for epsilon-greedy action selection
        self.decrement_epsilon()
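# A tiny, self-contained illustration (not part of the classes above) of the batch
# indexing used for q_pred in learn(): q[indices, actions] pairs the two index
# tensors elementwise, so row i contributes Q[i, actions[i]], i.e. the Q-value of
# the action actually taken in transition i. Values below are made up.
import torch as T

q = T.tensor([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])      # shape (batch_size=2, n_actions=3)
actions = T.tensor([2, 0])           # action taken in each transition
indices = T.arange(2)                # batch indices 0..batch_size-1
print(q[indices, actions])           # tensor([3., 4.])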
class DQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, chkpt_dir, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None):
        self.gamma = gamma                  # 0.99
        self.epsilon = epsilon              # 1.0
        self.lr = lr                        # 0.0001
        self.n_actions = n_actions          # 6
        self.input_dims = input_dims        # (4, 84, 84)
        self.batch_size = batch_size        # 32
        self.eps_min = eps_min              # 0.1
        self.eps_dec = eps_dec              # 1e-05
        self.replace_target_cnt = replace   # 1000
        self.algo = algo                    # 'DQNAgent'
        self.env_name = env_name            # 'PongNoFrameskip-v4'
        self.chkpt_dir = chkpt_dir          # .\\models\\
        self.action_space = [i for i in range(self.n_actions)]  # [0, 1, 2, 3, 4, 5]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_eval',
                                   chkpt_dir=self.chkpt_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + '_q_next',
                                   chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            actions = self.q_eval.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = self.memory.sample_buffer(
            self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            # load_state_dict and state_dict are built-in methods of torch.nn.Module
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()

        indices = np.arange(self.batch_size)
        # self.q_eval.forward(states).shape = (32, 6), q_pred.shape = (32,)
        q_pred = self.q_eval.forward(states)[indices, actions]
        # self.q_next.forward(states_).shape = (32, 6), q_next.shape = (32,)
        q_next = self.q_next.forward(states_).max(dim=1)[0]

        # Zero out Q(s', a') for terminal states: the expected future return
        # after a terminal state is 0
        temp_dones = dones.bool()
        q_next[temp_dones] = 0.0

        q_target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()
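# A minimal training-loop sketch showing how the DQNAgent above is typically driven;
# it is not part of the original class. `make_env` is a hypothetical preprocessing
# wrapper (frame skip, grayscale, 84x84 resize, 4-frame stack), the classic gym
# step() API returning a 4-tuple is assumed, and mem_size / the episode count are
# illustrative values.
import numpy as np

if __name__ == '__main__':
    env = make_env('PongNoFrameskip-v4')    # hypothetical wrapper, not defined here
    agent = DQNAgent(gamma=0.99, epsilon=1.0, lr=0.0001,
                     n_actions=env.action_space.n,
                     input_dims=env.observation_space.shape,
                     mem_size=50000, batch_size=32, chkpt_dir='models/',
                     eps_min=0.1, eps_dec=1e-5, replace=1000,
                     algo='DQNAgent', env_name='PongNoFrameskip-v4')

    scores = []
    for episode in range(500):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            # Epsilon-greedy action, environment step, store transition, learn
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            score += reward
            agent.store_transition(observation, action, reward,
                                   observation_, int(done))
            agent.learn()
            observation = observation_
        scores.append(score)
        print('episode', episode, 'score %.1f' % score,
              'avg score %.1f' % np.mean(scores[-100:]),
              'epsilon %.2f' % agent.epsilon)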
class DoubleDQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, memory_size,
                 batch_size, algo, env_name, checkpoint_dir, epsilon_min=0.01,
                 epsilon_decay=5e-7, replace_target_count=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.algo = algo
        self.env_name = env_name
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_target_count = replace_target_count
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(memory_size, input_dims, n_actions)

        self.q_net = DeepQNetwork(self.lr, self.n_actions,
                                  name=self.env_name+'_'+self.algo+'_q_net',
                                  input_dims=self.input_dims,
                                  checkpoint_dir=self.checkpoint_dir)
        self.target_net = DeepQNetwork(self.lr, self.n_actions,
                                       name=self.env_name+'_'+self.algo+'_target_net',
                                       input_dims=self.input_dims,
                                       checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            observation = T.tensor([observation],
                                   dtype=T.float).to(self.q_net.device)
            actions = self.q_net(observation)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def remember(self, state, action, reward, next_state, done):
        self.memory.remember(state, action, reward, next_state, done)

    def sample_memory(self):
        states, actions, rewards, next_states, dones = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states).to(self.q_net.device)
        actions = T.tensor(actions).to(self.q_net.device)
        rewards = T.tensor(rewards).to(self.q_net.device)
        next_states = T.tensor(next_states).to(self.q_net.device)
        dones = T.tensor(dones).to(self.q_net.device)

        return states, actions, rewards, next_states, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_count == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.epsilon_decay \
            if self.epsilon > self.epsilon_min else self.epsilon_min

    def learn(self):
        if self.memory.memory_counter < self.batch_size:
            return

        self.q_net.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, next_states, dones = self.sample_memory()

        q_prediction = self.q_net(states)                    # (batch_size, n_actions)
        target_predictions = self.target_net(next_states)    # (batch_size, n_actions)
        target_predictions[dones] = 0.0

        indices = np.arange(self.batch_size)
        q_value = q_prediction[indices, actions]

        # Double DQN: select the next action with the online network,
        # evaluate it with the target network
        t_actions = T.argmax(self.q_net(next_states), dim=1)
        target_value = rewards + self.gamma * target_predictions[indices, t_actions]

        loss = self.q_net.loss(q_value, target_value).to(self.q_net.device)
        loss.backward()
        self.q_net.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        self.q_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        self.q_net.load_checkpoint()
        self.target_net.load_checkpoint()
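# A small, self-contained sketch (not part of the agents above) contrasting the
# vanilla DQN target used by the DQNAgent classes with the Double DQN target
# computed in DoubleDQNAgent.learn(): Double DQN selects the greedy next action
# with the online network but evaluates it with the target network, which reduces
# the overestimation bias of taking a max over noisy Q-estimates. Function names
# and argument shapes are assumptions for illustration only.
import torch as T


def dqn_target(rewards, dones, gamma, target_q_next):
    # target_q_next: (batch, n_actions) Q-values of next states from the target net;
    # dones: bool tensor of shape (batch,)
    q_next = target_q_next.max(dim=1)[0]
    q_next[dones] = 0.0
    return rewards + gamma * q_next


def double_dqn_target(rewards, dones, gamma, online_q_next, target_q_next):
    # online_q_next / target_q_next: (batch, n_actions) Q-values of next states
    indices = T.arange(rewards.shape[0])
    a_star = T.argmax(online_q_next, dim=1)       # action selection: online net
    q_next = target_q_next[indices, a_star]       # action evaluation: target net
    q_next[dones] = 0.0
    return rewards + gamma * q_next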
class Agent(object):
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
                 batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
                 algo=None, env_name=None, checkpoint_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_counter = replace
        self.algo = algo
        self.env_name = env_name
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + "_q_eval",
                                   checkpoint_dir=self.checkpoint_dir)
        self.q_next = DeepQNetwork(self.lr, self.n_actions,
                                   input_dims=self.input_dims,
                                   name=self.env_name + '_' + self.algo + "_q_next",
                                   checkpoint_dir=self.checkpoint_dir)

    def store_transition(self, state, action, reward, resulted_state, done):
        self.memory.store_transition(state, action, reward, resulted_state, done)

    def sample_memory(self):
        state, action, reward, resulted_state, done = self.memory.sample_buffer(
            self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        resulted_states = T.tensor(resulted_state).to(self.q_eval.device)

        return states, actions, rewards, resulted_states, dones

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            # Wrap the observation in a list because the convolutional network
            # expects an input tensor of shape (batch_size, *input_dims)
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            # The dueling network returns (value, advantages); the argmax of the
            # advantages is also the argmax of the Q-values
            _, advantages = self.q_eval.forward(state)
            action = T.argmax(advantages).item()
        else:
            action = np.random.choice(self.action_space)
        return action

    def replace_target_network(self):
        if self.replace_target_counter is not None and \
                self.learn_step_counter % self.replace_target_counter == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        if self.epsilon > self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def learn(self):
        if self.memory.mem_counter < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()
        self.replace_target_network()

        states, actions, rewards, resulted_states, dones = self.sample_memory()

        indexes = np.arange(self.batch_size)

        # Aggregate the dueling streams: Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a))
        V_states, A_states = self.q_eval.forward(states)
        q_pred = T.add(V_states,
                       (A_states - A_states.mean(dim=1, keepdim=True)))[indexes, actions]

        V_resulted_states, A_resulted_states = self.q_next.forward(resulted_states)
        q_next = T.add(V_resulted_states,
                       (A_resulted_states -
                        A_resulted_states.mean(dim=1, keepdim=True))).max(dim=1)[0]
        q_next[dones] = 0.0

        target = rewards + self.gamma * q_next
        loss = self.q_eval.loss(target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1
        self.decrement_epsilon()

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()
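# A small, self-contained sketch (independent of Agent above) of the dueling
# aggregation used in Agent.learn(): the value and advantage streams are combined
# as Q(s,a) = V(s) + (A(s,a) - mean_a A(s,a)). Subtracting the mean advantage keeps
# V and A identifiable, since adding a constant to A and subtracting it from V
# would otherwise leave Q unchanged. The function name and shapes are assumptions.
import torch as T


def dueling_q_values(V, A):
    # V: (batch, 1) state values, A: (batch, n_actions) advantages
    return V + (A - A.mean(dim=1, keepdim=True))


# Example: q = dueling_q_values(T.zeros(32, 1), T.randn(32, 6))  # shape (32, 6)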