import os

import numpy as np
import torch as T
from torch.utils.tensorboard import SummaryWriter

# ReplayBuffer, DeepQNetwork and DuelingQNetwork come from this project's own
# replay-buffer and network modules (their import paths are omitted here).


class DeepQAgent():
    def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
                 mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
                 epsilon_min=0.1, checkpoint_dir='temp/dqn'):
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = 1.0
        self.mem_counter = 0
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir

        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)
        self.target_network = DeepQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        # Linearly anneal epsilon down to its floor value.
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        # Epsilon-greedy action selection with the online (learning) network.
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = T.tensor(np.array([obs]),
                             dtype=T.float).to(self.learning_network.device)
            returns_for_actions = self.learning_network.forward(state)
            action = T.argmax(returns_for_actions).item()
        return action

    def learn(self):
        # Wait until at least one mini-batch of transitions has been stored.
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        # Q-values of the actions actually taken.
        q_pred = self.learning_network.forward(states)[indices, actions]
        # Greedy bootstrap value from the target network:
        # dim=1 takes the max over actions, [0] takes the values rather than the indices.
        q_next = self.target_network.forward(new_states).max(dim=1)[0]
        q_next[dones] = 0.0

        targets = rewards + self.gamma * q_next
        cost = self.learning_network.loss(q_pred, targets)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        # Periodically sync the target network with the learning network.
        if self.copy_counter % 4 == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
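# The agents in this file assume a ReplayBuffer with a small interface: a
# constructor taking mem_size/state_shape/num_actions, store(), and
# sample(batch_size) returning numpy arrays. The class below is only a sketch
# of that assumed interface, not the repo's actual implementation; its name
# and internal layout are placeholders.
class ReplayBufferSketch():
    def __init__(self, mem_size, state_shape, num_actions):
        # num_actions is kept only to mirror the constructor call used above.
        self.mem_size = mem_size
        self.mem_counter = 0
        self.states = np.zeros((mem_size, *state_shape), dtype=np.float32)
        self.new_states = np.zeros((mem_size, *state_shape), dtype=np.float32)
        self.actions = np.zeros(mem_size, dtype=np.int64)
        self.rewards = np.zeros(mem_size, dtype=np.float32)
        self.dones = np.zeros(mem_size, dtype=bool)

    def store(self, obs, action, reward, new_obs, done):
        # Overwrite the oldest transition once the buffer is full.
        idx = self.mem_counter % self.mem_size
        self.states[idx] = obs
        self.actions[idx] = action
        self.rewards[idx] = reward
        self.new_states[idx] = new_obs
        self.dones[idx] = done
        self.mem_counter += 1

    def sample(self, batch_size):
        # Sample uniformly from the filled portion of the buffer.
        max_mem = min(self.mem_counter, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        return (self.states[batch], self.actions[batch], self.rewards[batch],
                self.new_states[batch], self.dones[batch])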
class DuelingDQAgent():
    def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
                 mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
                 epsilon=1.0, replace=1000, epsilon_min=0.1,
                 checkpoint_dir='temp/dqn/duelingdqn'):
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.mem_counter = 0
        self.copy_counter = 0
        self.replace_target_cnt = replace
        self.checkpoint_dir = checkpoint_dir

        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DuelingQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)
        self.target_network = DuelingQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

    def decrement_epsilon(self):
        # Linearly anneal epsilon down to its floor value.
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        # Epsilon-greedy action selection with the online (learning) network.
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = T.tensor(np.array([obs]),
                             dtype=T.float).to(self.learning_network.device)
            returns_for_actions = self.learning_network.forward(state)
            action = T.argmax(returns_for_actions).item()
        return action

    def learn(self):
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        # Select the next action with the online (learning) network...
        q_next = self.learning_network.forward(new_states)
        actions_selected = T.argmax(q_next, dim=1)
        # ...and evaluate that action's return with the target network.
        q_eval = self.target_network.forward(new_states)
        q_eval[dones] = 0.0

        q_target = rewards + self.gamma * q_eval[indices, actions_selected]
        cost = self.learning_network.loss(q_pred, q_target)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        # Periodically sync the target network with the learning network.
        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
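# DuelingQNetwork is assumed to implement the dueling architecture: separate
# value and advantage streams combined as Q(s, a) = V(s) + A(s, a) - mean_a A(s, a).
# The sketch below only illustrates that combination; the layer sizes, class
# name and checkpoint handling are assumptions, not this repo's actual network.
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class DuelingQNetworkSketch(nn.Module):
    def __init__(self, lr, num_actions, input_dims, name, checkpoint_dir):
        super().__init__()
        self.checkpoint_file = os.path.join(checkpoint_dir, name)
        self.fc1 = nn.Linear(*input_dims, 128)
        self.value_stream = nn.Linear(128, 1)                 # state value V(s)
        self.advantage_stream = nn.Linear(128, num_actions)   # advantages A(s, a)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        value = self.value_stream(x)
        advantage = self.advantage_stream(x)
        # Subtracting the mean advantage keeps V and A identifiable.
        return value + advantage - advantage.mean(dim=1, keepdim=True)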
class DoubleDQAgent():
    def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
                 mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
                 epsilon=1.0, replace=1000, epsilon_min=0.1,
                 checkpoint_dir='results\\doubledqn'):
        self.lr = lr
        self.gamma = gamma
        self.obs_dims = obs_dims
        self.num_actions = num_actions
        self.mini_batchsize = mini_batchsize
        self.epsilon_min = epsilon_min
        self.epsilon_dec = epsilon_dec
        self.epsilon = epsilon
        self.replace_target_cnt = replace
        self.mem_counter = 0
        self.copy_counter = 0
        self.checkpoint_dir = checkpoint_dir

        self.memories = ReplayBuffer(mem_size=mem_size,
                                     state_shape=self.obs_dims,
                                     num_actions=self.num_actions)
        self.action_space = [i for i in range(self.num_actions)]

        self.learning_network = DeepQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_learning',
            checkpoint_dir=self.checkpoint_dir)
        self.target_network = DeepQNetwork(
            lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
            name=env_name + '_' + algo_name + '_target',
            checkpoint_dir=self.checkpoint_dir)

        self.loss_value = 0
        self.writer = SummaryWriter(os.path.join(self.checkpoint_dir, 'logs'))

    def decrement_epsilon(self):
        # Linearly anneal epsilon down to its floor value.
        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon - self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def store_memory(self, obs, action, reward, new_obs, done):
        self.memories.store(obs, action, reward, new_obs, done)
        self.mem_counter += 1

    def sample_memory(self):
        states, actions, rewards, new_states, dones = self.memories.sample(
            self.mini_batchsize)
        states = T.tensor(states).to(self.target_network.device)
        actions = T.tensor(actions).to(self.target_network.device)
        rewards = T.tensor(rewards).to(self.target_network.device)
        new_states = T.tensor(new_states).to(self.target_network.device)
        dones = T.tensor(dones).to(self.target_network.device)
        return states, actions, rewards, new_states, dones

    def get_action(self, obs):
        # Epsilon-greedy action selection with the online (learning) network.
        if np.random.random() < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            state = T.tensor(np.array([obs]),
                             dtype=T.float).to(self.learning_network.device)
            returns_for_actions = self.learning_network.forward(state)
            action = T.argmax(returns_for_actions).item()
        return action

    def learn(self):
        if self.mem_counter < self.mini_batchsize:
            return

        self.learning_network.optimizer.zero_grad()
        states, actions, rewards, new_states, dones = self.sample_memory()

        indices = np.arange(self.mini_batchsize)
        q_pred = self.learning_network.forward(states)[indices, actions]

        # Double DQN: select the next action with the online (learning) network...
        q_next = self.learning_network.forward(new_states)
        actions_selected = T.argmax(q_next, dim=1)
        # ...and evaluate that action with the target network.
        q_eval = self.target_network.forward(new_states)
        q_eval[dones] = 0.0

        q_target = rewards + self.gamma * q_eval[indices, actions_selected]
        cost = self.learning_network.loss(q_pred, q_target)
        cost.backward()
        self.learning_network.optimizer.step()
        self.decrement_epsilon()

        # Periodically sync the target network with the learning network.
        if self.copy_counter % self.replace_target_cnt == 0:
            self.copy_target_network()
        self.copy_counter += 1

        # Keep the latest TD loss as a plain float for logging.
        self.loss_value = cost.item()

    def log(self, num_episode):
        # Track how far the learning network has drifted from the target network.
        diff = 0
        for p_learning, p_target in zip(self.learning_network.parameters(),
                                        self.target_network.parameters()):
            p_learning = p_learning.data.cpu()
            p_target = p_target.data.cpu()
            diff += T.sum(p_learning - p_target)
        self.writer.add_scalar("td_error", self.loss_value, num_episode)
        self.writer.add_scalar("learning_target_diff", diff, num_episode)
        return diff

    def copy_target_network(self):
        self.target_network.load_state_dict(self.learning_network.state_dict())

    def save_models(self):
        self.learning_network.save()
        self.target_network.save()

    def load_models(self):
        self.learning_network.load()
        self.target_network.load()
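# A minimal usage sketch of the agent interface above. The environment id,
# hyperparameter values and episode count are placeholders, and the classic
# gym API is assumed (reset() returning only the observation, step() returning
# four values); none of these come from this repo.
if __name__ == '__main__':
    import gym

    env = gym.make('CartPole-v1')
    agent = DoubleDQAgent(lr=1e-4, gamma=0.99,
                          obs_dims=env.observation_space.shape,
                          num_actions=env.action_space.n,
                          mem_size=50000, mini_batchsize=32,
                          epsilon_dec=1e-4,
                          env_name='CartPole-v1', algo_name='DoubleDQN')

    for episode in range(500):
        obs = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.get_action(obs)
            new_obs, reward, done, info = env.step(action)
            agent.store_memory(obs, action, reward, new_obs, done)
            agent.learn()
            score += reward
            obs = new_obs
        agent.log(episode)
        print(f'episode {episode}, score {score:.1f}, epsilon {agent.epsilon:.3f}')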