import numpy as np
import torch as T

# DuelingDeepQNetwork, the replay memory classes, and the Agent base class used by the
# first agent are assumed to be defined in the accompanying modules of this project.


class DuelingDQNAgent(Agent):
    def __init__(self, *args, **kwargs):
        super(DuelingDQNAgent, self).__init__(*args, **kwargs)

        self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name + '_' + self.algo + '_q_eval',
                                          checkpoint_dir=self.checkpoint_dir)
        self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name + '_' + self.algo + '_q_next',
                                          checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        # Epsilon-greedy: act greedily w.r.t. the advantage stream, otherwise explore.
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            action = T.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, new_states, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_next, A_s_next = self.q_next.forward(new_states)

        # Dueling aggregation: Q(s, a) = V(s) + (A(s, a) - mean_a' A(s, a'))
        q_pred = T.add(V_s, (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions]
        q_next = T.add(V_s_next,
                       (A_s_next - A_s_next.mean(dim=1, keepdim=True))).max(dim=1)[0]

        q_next[dones] = 0.0
        q_target = rewards + self.gamma * q_next

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_cntr += 1

        self.decrement_epsilon()
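# All three agents rely on a DuelingDeepQNetwork that exposes .device, .optimizer,
# .loss, save_checkpoint()/load_checkpoint(), and a forward pass returning separate
# value and advantage streams. The class below is a minimal sketch of that interface
# for flat observation vectors; the fully connected body, layer size, RMSprop
# optimizer, and MSE loss are assumptions, not the original network definition
# (the DuelingDDQNAgent variant passes the same directory as chkpt_dir).
import os

import torch.nn as nn
import torch.optim as optim


class DuelingDeepQNetwork(nn.Module):
    def __init__(self, lr, n_actions, name, input_dims, checkpoint_dir):
        super(DuelingDeepQNetwork, self).__init__()
        self.checkpoint_file = os.path.join(checkpoint_dir, name)

        # Shared body followed by separate value (V) and advantage (A) heads.
        self.fc1 = nn.Linear(*input_dims, 512)
        self.V = nn.Linear(512, 1)
        self.A = nn.Linear(512, n_actions)

        self.optimizer = optim.RMSprop(self.parameters(), lr=lr)
        self.loss = nn.MSELoss()
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        flat = T.relu(self.fc1(state))
        V = self.V(flat)   # state value, shape [batch, 1]
        A = self.A(flat)   # per-action advantages, shape [batch, n_actions]
        return V, A

    def save_checkpoint(self):
        T.save(self.state_dict(), self.checkpoint_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.checkpoint_file))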
class DuelingDDQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims,
                 mem_size, batch_size, eps_min=0.01, eps_dec=5e-7,
                 replace=1000, algo=None, env_name=None, chkpt_dir='tmp/dqn'):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.replace_target_cnt = replace
        self.algo = algo
        self.env_name = env_name
        self.chkpt_dir = chkpt_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

        self.q_eval = DuelingDeepQNetwork(self.lr, self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name+'_'+self.algo+'_q_eval',
                                          chkpt_dir=self.chkpt_dir)
        self.q_next = DuelingDeepQNetwork(self.lr, self.n_actions,
                                          input_dims=self.input_dims,
                                          name=self.env_name+'_'+self.algo+'_q_next',
                                          chkpt_dir=self.chkpt_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            state = T.tensor([observation], dtype=T.float).to(self.q_eval.device)
            _, advantage = self.q_eval.forward(state)
            action = T.argmax(advantage).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def store_transition(self, state, action, reward, state_, done):
        self.memory.store_transition(state, action, reward, state_, done)

    def sample_memory(self):
        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(state).to(self.q_eval.device)
        actions = T.tensor(action).to(self.q_eval.device)
        rewards = T.tensor(reward).to(self.q_eval.device)
        states_ = T.tensor(new_state).to(self.q_eval.device)
        dones = T.tensor(done).to(self.q_eval.device)

        return states, actions, rewards, states_, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_cnt == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.eps_dec \
            if self.epsilon > self.eps_min else self.eps_min

    def save_models(self):
        self.q_eval.save_checkpoint()
        self.q_next.save_checkpoint()

    def load_models(self):
        self.q_eval.load_checkpoint()
        self.q_next.load_checkpoint()

    def learn(self):
        if self.memory.mem_cntr < self.batch_size:
            return

        self.q_eval.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, states_, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        V_s, A_s = self.q_eval.forward(states)
        V_s_, A_s_ = self.q_next.forward(states_)
        V_s_eval, A_s_eval = self.q_eval.forward(states_)

        q_pred = T.add(V_s, (A_s - A_s.mean(dim=1, keepdim=True)))[indices, actions]
        q_next = T.add(V_s_, (A_s_ - A_s_.mean(dim=1, keepdim=True)))
        q_eval = T.add(V_s_eval, (A_s_eval - A_s_eval.mean(dim=1, keepdim=True)))

        # Double DQN: the online network selects the greedy next action,
        # the target network evaluates it.
        max_actions = T.argmax(q_eval, dim=1)
        q_next[dones] = 0.0

        q_target = rewards + self.gamma * q_next[indices, max_actions]

        loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
        loss.backward()
        self.q_eval.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()
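# The standalone agents also assume an experience replay class with a transition
# counter, a store method, and uniform sampling. The numpy-array buffer below is a
# minimal sketch of that interface, matching the calls DuelingDDQNAgent makes on
# ReplayBuffer (mem_cntr, store_transition, sample_buffer); the array names are
# assumptions, since the original class is not shown. The ReplayMemory used by the
# next class exposes the same idea under remember/memory_counter.
class ReplayBuffer():
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.new_state_memory = np.zeros((self.mem_size, *input_shape), dtype=np.float32)
        self.action_memory = np.zeros(self.mem_size, dtype=np.int64)
        self.reward_memory = np.zeros(self.mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.mem_size, dtype=np.bool_)

    def store_transition(self, state, action, reward, state_, done):
        # Overwrite the oldest transition once the buffer is full.
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.new_state_memory[index] = state_
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.terminal_memory[index] = done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        # Sample uniformly from the filled portion of the buffer.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size, replace=False)

        states = self.state_memory[batch]
        actions = self.action_memory[batch]
        rewards = self.reward_memory[batch]
        states_ = self.new_state_memory[batch]
        dones = self.terminal_memory[batch]

        return states, actions, rewards, states_, dones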
class DuelingDQNAgent():
    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, memory_size,
                 batch_size, algo, env_name, checkpoint_dir, epsilon_min=0.01,
                 epsilon_decay=5e-7, replace_target_count=1000):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.algo = algo
        self.env_name = env_name
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.replace_target_count = replace_target_count
        self.checkpoint_dir = checkpoint_dir
        self.action_space = [i for i in range(self.n_actions)]
        self.learn_step_counter = 0

        self.memory = ReplayMemory(memory_size, input_dims, n_actions)

        self.q_net = DuelingDeepQNetwork(self.lr, self.n_actions,
                                         name=self.env_name + '_' + self.algo + '_q_net',
                                         input_dims=self.input_dims,
                                         checkpoint_dir=self.checkpoint_dir)
        self.target_net = DuelingDeepQNetwork(self.lr, self.n_actions,
                                              name=self.env_name + '_' + self.algo + '_target_net',
                                              input_dims=self.input_dims,
                                              checkpoint_dir=self.checkpoint_dir)

    def choose_action(self, observation):
        if np.random.random() > self.epsilon:
            observation = T.tensor([observation], dtype=T.float).to(self.q_net.device)
            _, A = self.q_net(observation)
            action = T.argmax(A, dim=1).item()
        else:
            action = np.random.choice(self.action_space)

        return action

    def remember(self, state, action, reward, next_state, done):
        self.memory.remember(state, action, reward, next_state, done)

    def sample_memory(self):
        states, actions, rewards, next_states, dones = \
            self.memory.sample_buffer(self.batch_size)

        states = T.tensor(states).to(self.q_net.device)
        actions = T.tensor(actions).to(self.q_net.device)
        rewards = T.tensor(rewards).to(self.q_net.device)
        next_states = T.tensor(next_states).to(self.q_net.device)
        dones = T.tensor(dones).to(self.q_net.device)

        return states, actions, rewards, next_states, dones

    def replace_target_network(self):
        if self.learn_step_counter % self.replace_target_count == 0:
            self.target_net.load_state_dict(self.q_net.state_dict())

    def decrement_epsilon(self):
        self.epsilon = self.epsilon - self.epsilon_decay \
            if self.epsilon > self.epsilon_min else self.epsilon_min

    def learn(self):
        if self.memory.memory_counter < self.batch_size:
            return

        self.q_net.optimizer.zero_grad()

        self.replace_target_network()

        states, actions, rewards, next_states, dones = self.sample_memory()
        indices = np.arange(self.batch_size)

        V_pred, A_pred = self.q_net(states)
        V_target, A_target = self.target_net(next_states)

        # Combine the value and advantage streams first, then index/reduce,
        # so both q_value and target_predictions have shape [batch_size].
        q_value = T.add(
            V_pred, (A_pred - A_pred.mean(dim=1, keepdim=True)))[indices, actions]
        target_predictions = T.add(
            V_target, (A_target - A_target.mean(dim=1, keepdim=True))).max(dim=1)[0]

        target_predictions[dones] = 0.0
        target_value = rewards + self.gamma * target_predictions

        loss = self.q_net.loss(q_value, target_value).to(self.q_net.device)
        loss.backward()
        self.q_net.optimizer.step()
        self.learn_step_counter += 1

        self.decrement_epsilon()

    def save_models(self):
        self.q_net.save_checkpoint()
        self.target_net.save_checkpoint()

    def load_models(self):
        self.q_net.load_checkpoint()
        self.target_net.load_checkpoint()
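# A minimal training-loop sketch showing how any of the agents above is driven,
# here using DuelingDDQNAgent. The environment id, episode count, hyperparameters,
# and the classic gym reset/step API (4-tuple step return) are assumptions for
# illustration only, not settings taken from the original code.
import gym

if __name__ == '__main__':
    env = gym.make('LunarLander-v2')
    agent = DuelingDDQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4,
                             n_actions=env.action_space.n,
                             input_dims=env.observation_space.shape,
                             mem_size=50000, batch_size=32,
                             algo='DuelingDDQNAgent', env_name='LunarLander-v2')

    scores = []
    for episode in range(500):
        observation = env.reset()
        done = False
        score = 0
        while not done:
            action = agent.choose_action(observation)
            observation_, reward, done, info = env.step(action)
            agent.store_transition(observation, action, reward, observation_, done)
            agent.learn()          # no-op until the buffer holds a full batch
            observation = observation_
            score += reward
        scores.append(score)
        print('episode', episode, 'score %.1f' % score,
              'avg %.1f' % np.mean(scores[-100:]), 'epsilon %.3f' % agent.epsilon)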