import numpy as np
import torch as T

# ReplayBuffer, PrioritizedReplayBuffer and DeepQNetwork are assumed to be
# defined elsewhere in this repository.


class DDQNAgent(object):

    def __init__(self, gamma, epsilon, lr, n_actions, input_dims, action_joint_dim,
                 mem_size, batch_size, eps_min, eps_dec, replace, prioritized=False,
                 prob_alpha=0.6, beta=0.4, beta_increment=1e-4, temperature=0.1, tau=1e-5):
        """ Double Deep Q-Learning Agent class.
        -----
        Args:
            gamma: Discount factor for the reward. 0 indicates myopic behaviour, 1 indicates far-sighted behaviour.
            epsilon: Exploration/exploitation rate. 0 indicates full exploitation.
            lr: Learning rate. The bigger 'lr', the bigger the step along the gradient of the loss.
            n_actions: Number of possible actions.
            input_dims: Dimensions of the state (typically an image). The channel goes first: (CHANN, HEIGHT, WIDTH).
            action_joint_dim: Number of joints for the multi-agent case. Normally the number of agents.
            mem_size: Size of the replay buffer memory.
            batch_size: Number of past experiences used for training the Q-network.
            eps_min: Minimum value for the exploration rate.
            eps_dec: Epsilon decay applied every epoch.
            replace: Number of epochs between replacing the target network with the behavioral network.
            prioritized: Whether to use a prioritized replay buffer (PER).
            prob_alpha: PER priority exponent (alpha).
            beta: PER importance-sampling exponent (beta).
            beta_increment: Increment applied to beta at every step.
            temperature: Temperature of the softmax policy.
            tau: Coefficient for the (optional) soft target-network update.
        ------
        """
        # Training hyperparameters #
        self.gamma = gamma
        self.epsilon = epsilon
        self.beta = beta
        self.beta_increment = beta_increment
        self.prob_alpha = prob_alpha
        self.lr = lr
        self.n_actions = n_actions
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.eps_min = eps_min
        self.eps_dec = eps_dec
        self.update_target_count = replace
        self.action_space = [i for i in range(n_actions)]
        self.action_joint_dim = action_joint_dim
        self.prioritized = prioritized
        self.temperature = temperature
        self.tau = tau
        self.mem_size = mem_size

        if not self.prioritized:
            self.memory = ReplayBuffer(mem_size, input_dims, action_joint_dim)
        else:
            self.memory = PrioritizedReplayBuffer(mem_size, input_dims, action_joint_dim, self.prob_alpha)

        # Behavioral (q_eval) and target (q_next) networks #
        self.q_eval = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_eval.cuda()
        self.q_next = DeepQNetwork(self.lr, num_agents=action_joint_dim, action_size=n_actions, input_size=input_dims)
        self.q_next.cuda()

    def reset_memory(self):
        self.memory = ReplayBuffer(self.mem_size, self.input_dims, self.action_joint_dim)

    def store_transition(self, state, action, reward, next_state, done):
        """ Store a (s,a,r,s') transition into the replay buffer.
        ------
        Args:
            state: State of the experience.
            action: Action joint performed in the given 'state'.
            reward: 1D reward array obtained due to (s,a). One component for every agent.
            next_state: The next state produced, given (s,a).
            done: Whether the state is terminal. Normally 0 because the task is non-episodic.
        ------
        """
        # Store the (s,a,r,s') tuple plus the done flag in memory #
        # They are saved as numpy arrays and converted to tensors when sampled #
        self.memory.store_transition(state, action, reward, next_state, done)

    def sample_memory(self):
        """ Extract 'self.batch_size' experiences (s,a,r,s') from the replay memory.
        ------
        Returns:
            The *BATCH_SIZE* (s,a,r,s') experiences.
        ------
        """
        # Sample a batch of experiences #
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device=self.q_eval.device)
            rewards = T.tensor(reward, device=self.q_eval.device)
            dones = T.tensor(done, device=self.q_eval.device)
            actions = T.tensor(action, device=self.q_eval.device)
            next_state = T.tensor(new_state, device=self.q_eval.device)

        return states, actions, rewards, next_state, dones

    def prioritized_sample_memory(self):
        """ Extract 'self.batch_size' experiences (s,a,r,s') from the replay memory,
        together with their buffer indices and importance-sampling weights.
        ------
        Returns:
            The *BATCH_SIZE* (s,a,r,s') experiences, their indices and weights.
        ------
        """
        # Sample a batch of experiences #
        state, action, reward, new_state, done, indices, weight = self.memory.sample_buffer(self.batch_size, self.beta)

        with T.no_grad():
            # Convert them to PyTorch tensors #
            states = T.tensor(state, device=self.q_eval.device)
            rewards = T.tensor(reward, device=self.q_eval.device)
            dones = T.tensor(done, device=self.q_eval.device)
            actions = T.tensor(action, device=self.q_eval.device)
            next_state = T.tensor(new_state, device=self.q_eval.device)
            weights = T.tensor(weight, device=self.q_eval.device)

        return states, actions, rewards, next_state, dones, indices, weights

    # Epsilon-greedy policy #
    def choose_action(self, observation, mode='egreedy'):
        """ Action-selection policy. With 'egreedy' it explores/exploits with probability epsilon/(1-epsilon);
        with 'softmax' it samples from a Boltzmann distribution over the Q-values.
        ----
        Args:
            observation: The state (typically an image). Must be a matrix with shape (N_CHANN, HEIGHT, WIDTH).
        Returns:
            An action joint (1D array) with the selected actions.
        ------
        """
        if mode == 'egreedy':
            if np.random.random() > self.epsilon:
                with T.no_grad():
                    state = T.tensor([observation], dtype=T.float, device=self.q_eval.device)
                    Q = self.q_eval.forward(state)
                    action_joint = []
                    # In e-greedy, when exploiting, a = argmax_a(Q) #
                    for i in range(self.action_joint_dim):
                        action_joint.append(T.argmax(Q.narrow(1, i * self.n_actions, self.n_actions)).item())
                return action_joint
            else:
                action_joint = np.random.choice(self.action_space, size=self.action_joint_dim)
                return action_joint
        elif mode == 'softmax':
            with T.no_grad():
                state = T.tensor([observation], dtype=T.float, device=self.q_eval.device)
                Q = self.q_eval.forward(state)
                action_joint = []
                # Softmax (Boltzmann) policy #
                for i in range(self.action_joint_dim):
                    probs = T.softmax(Q.narrow(1, i * self.n_actions, self.n_actions) / self.temperature, dim=1)
                    categ = T.distributions.Categorical(probs)
                    action_joint.append(categ.sample().item())
            return action_joint
        else:
            raise ValueError('ERROR. Choose a valid policy mode: (egreedy/softmax)')

    # This method is used outside the class so DDQN and DQN can be alternated #
    def replace_target_network(self, epoch):
        """ Copy the behavioral network weights into the target network.
        -----
        Args:
            epoch: The current epoch.
        ------
        """
        # Alternative: soft (Polyak) update of the target network using self.tau #
        # if epoch % self.update_target_count == 0:
        #     for target_param, param in zip(self.q_next.parameters(), self.q_eval.parameters()):
        #         target_param.data.copy_(target_param.data * (1.0 - self.tau) + param.data * self.tau)

        if epoch % self.update_target_count == 0:
            self.q_next.load_state_dict(self.q_eval.state_dict())

    def learn(self, mask=None):
        """ Learning function. It predicts the return (target network) and takes a gradient-descent step.
        The Q-values are computed with temporal difference and the gradients are accumulated across the agents.
        ------
        """
        # If there are fewer stored experiences than the batch size, skip training #
        if self.memory.mem_cntr < self.batch_size:
            return

        if mask is None:
            mask = np.zeros(shape=self.action_joint_dim)

        self.q_eval.optimizer.zero_grad()
        self.q_next.optimizer.zero_grad()

        if not self.prioritized:
            states, actions, rewards, next_states, dones = self.sample_memory()
        else:
            states, actions, rewards, next_states, dones, batches, weights = self.prioritized_sample_memory()
            prior = T.tensor(np.zeros(shape=batches.shape), device=self.q_eval.device)

        indices = np.arange(self.batch_size)

        Q_pred = self.q_eval(states)
        Q_next = self.q_next(next_states)
        Q_eval = self.q_eval(next_states)

        for i in range(self.action_joint_dim):
            if mask[i] == 1:
                # If the mask is 1, the agent does not learn at all #
                continue

            q_pred = Q_pred.narrow(1, i * self.n_actions, self.n_actions)[indices, actions[:, i]]
            max_actions = T.argmax(Q_eval.narrow(1, i * self.n_actions, self.n_actions), dim=1).detach()
            # TARGET IS DETACHED, ITS PARAMETERS ARE NOT SUBJECT TO TRAINING #
            q_target = rewards[indices, i] + self.gamma * Q_next.narrow(1, i * self.n_actions, self.n_actions)[indices, max_actions].detach()

            if not self.prioritized:
                loss = self.q_eval.loss(q_target, q_pred).to(self.q_eval.device)
                loss.backward(retain_graph=True)
            else:
                loss = self.q_eval.loss2(q_target, q_pred).to(self.q_eval.device) * weights
                prior += loss.data.detach()
                loss = loss.mean()
                loss.backward(retain_graph=True)

        self.q_eval.optimizer.step()

        if self.prioritized:
            self.memory.update_priorities(batches, prior.cpu().numpy())

    def decrement_epsilon(self):
        """ Decrement 'self.epsilon' by 'self.eps_dec', clipping its minimum to 'self.eps_min'.
        ------
        """
        if self.epsilon >= self.eps_min:
            self.epsilon = self.epsilon - self.eps_dec
        else:
            self.epsilon = self.eps_min

    def increment_beta(self):
        if self.beta >= 1:
            self.beta = 1
        else:
            self.beta += self.beta_increment

    def decrement_temperature(self):
        if self.temperature < 0.005:
            self.temperature = 0.005
        else:
            self.temperature = self.temperature - 2e-4
import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import Variable

# Net, ConvNet, ReplayBuffer, PrioritizedReplayBuffer and the upper-case
# hyperparameter constants (USE_CNN, USE_GPU, MEMORY_MODE, MEMORY_CAPACITY,
# PER_ALPHA, PER_EPSILON, LR, N_ACTIONS, TARGET_REPLACE_ITER, BATCH_SIZE,
# GAMMA, DOUBLE, EVAL_PATH, TARGET_PATH) are assumed to be defined elsewhere
# in this repository.


class DQN(object):

    def __init__(self):
        if USE_CNN:
            if USE_GPU:
                self.eval_net, self.target_net = ConvNet().cuda(), ConvNet().cuda()
            else:
                self.eval_net, self.target_net = ConvNet(), ConvNet()
        else:
            if USE_GPU:
                self.eval_net, self.target_net = Net().cuda(), Net().cuda()
            else:
                self.eval_net, self.target_net = Net(), Net()

        self.learn_step_counter = 0  # for target updating
        self.memory_counter = 0

        # Create the replay buffer
        if MEMORY_MODE == 'PER':
            self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY, alpha=PER_ALPHA)
        else:
            self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)

    def choose_action(self, x, EPSILON):
        if USE_GPU:
            x = Variable(torch.FloatTensor(x)).cuda()
        else:
            x = Variable(torch.FloatTensor(x))
        # input only one sample
        if np.random.uniform() < EPSILON:  # greedy
            actions_value = self.eval_net.forward(x.unsqueeze(0))
            if USE_GPU:
                action = torch.argmax(actions_value).data.cpu().numpy()  # return the argmax
            else:
                action = torch.argmax(actions_value).data.numpy()  # return the argmax
        else:  # random
            action = np.random.randint(0, N_ACTIONS)
        return action

    def store_transition(self, s, a, r, s_, done):
        self.memory_counter += 1
        self.replay_buffer.add(s, a, r, s_, float(done))

    def learn(self, beta):
        # target parameter update
        if self.learn_step_counter % TARGET_REPLACE_ITER == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1

        # Minimize the Bellman-equation error on a batch sampled from the replay buffer.
        if MEMORY_MODE == 'PER':
            experience = self.replay_buffer.sample(BATCH_SIZE, beta=beta)
            (b_state_memory, b_action_memory, b_reward_memory,
             b_next_state_memory, b_done, b_weights, b_idxes) = experience
        else:
            b_state_memory, b_action_memory, b_reward_memory, b_next_state_memory, b_done = \
                self.replay_buffer.sample(BATCH_SIZE)
            b_weights, b_idxes = np.ones_like(b_reward_memory), None

        if USE_GPU:
            b_s = Variable(torch.FloatTensor(b_state_memory)).cuda()
            b_a = Variable(torch.LongTensor(b_action_memory)).cuda()
            b_r = Variable(torch.FloatTensor(b_reward_memory)).cuda()
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory)).cuda()
            b_d = Variable(torch.FloatTensor(b_done)).cuda()
            b_w = Variable(torch.FloatTensor(b_weights)).cuda()
        else:
            b_s = Variable(torch.FloatTensor(b_state_memory))
            b_a = Variable(torch.LongTensor(b_action_memory))
            b_r = Variable(torch.FloatTensor(b_reward_memory))
            b_s_ = Variable(torch.FloatTensor(b_next_state_memory))
            b_d = Variable(torch.FloatTensor(b_done))
            b_w = Variable(torch.FloatTensor(b_weights))

        # q_eval w.r.t. the action taken in the experience
        q_eval = self.eval_net(b_s).gather(1, b_a.unsqueeze(1)).view(-1)  # shape (batch,)
        if DOUBLE:
            # Double DQN: the online network picks the action, the target network evaluates it
            _, best_actions = self.eval_net.forward(b_s_).detach().max(1)
            q_next = self.target_net(b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (1. - b_d) * q_next.gather(1, best_actions.unsqueeze(1)).squeeze(1)  # shape (batch,)
        else:
            q_next = self.target_net(b_s_).detach()  # detach from graph, don't backpropagate
            q_target = b_r + GAMMA * (1. - b_d) * q_next.max(1)[0]  # shape (batch,)

        loss = F.smooth_l1_loss(q_eval, q_target, reduction='none')
        loss = torch.mean(b_w * loss)  # importance-sampling weights (all ones without PER)
        td_error = (q_target - q_eval).data.cpu().numpy()

        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.eval_net.parameters(), 10.)
        self.optimizer.step()

        if MEMORY_MODE == 'PER':
            new_priorities = np.abs(td_error) + PER_EPSILON
            self.replay_buffer.update_priorities(b_idxes, new_priorities)

    def save_model(self):
        # save evaluation network and target network simultaneously
        self.eval_net.save(EVAL_PATH)
        self.target_net.save(TARGET_PATH)

    def load_model(self):
        # load evaluation network and target network simultaneously
        self.eval_net.load(EVAL_PATH)
        self.target_net.load(TARGET_PATH)
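
# ---------------------------------------------------------------------------
# Illustrative sketch only (not from the original repository): driving the DQN
# agent above with a gym-style environment. `env`, `max_episodes`, `max_steps`
# and the epsilon/beta values are hypothetical assumptions; note that in this
# class EPSILON is the probability of acting greedily.
# ---------------------------------------------------------------------------
def run_dqn_example(env, max_episodes=500, max_steps=200):
    dqn = DQN()
    epsilon = 0.9  # probability of a greedy action (this class's convention)
    beta = 0.4     # PER importance-sampling exponent; usually annealed toward 1.0
    for episode in range(max_episodes):
        s = env.reset()
        for _ in range(max_steps):
            a = dqn.choose_action(s, epsilon)
            s_, r, done, _info = env.step(a)
            dqn.store_transition(s, a, r, s_, done)
            # Only learn once the buffer holds at least one full batch #
            if dqn.memory_counter > BATCH_SIZE:
                dqn.learn(beta)
            s = s_
            if done:
                break
    dqn.save_model()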