import random

import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim

# Qnetwork, Duel_Qnetwork, Replay_buffer, device and the hyperparameter constants
# (BUFFER_SIZE, BATCH_SIZE, LOCAL_UPDATE, TAU) are assumed to be defined elsewhere
# in this module.


class DQNAgent():
    def __init__(self, state_size, action_size, double=False, duel=False):
        self.state_size = state_size
        self.action_size = action_size
        self.discounted_factor = 0.99
        self.learning_rate = 0.001
        self.double = double

        # Define model (dueling architecture if requested)
        if duel:
            self.local_model = Duel_Qnetwork(state_size, action_size).to(device)
            self.target_model = Duel_Qnetwork(state_size, action_size).to(device)
        else:
            self.local_model = Qnetwork(state_size, action_size).to(device)
            self.target_model = Qnetwork(state_size, action_size).to(device)

        # Define optimizer
        self.optimizer = optim.Adam(self.local_model.parameters(),
                                    lr=self.learning_rate)

        # Define replay buffer
        self.buffer = Replay_buffer(action_size,
                                    buffer_size=BUFFER_SIZE,
                                    batch_size=BATCH_SIZE)

        # Step counters for local-model and target-model updates
        self.t_step = 0
        self.target_update_t = 0

    def get_action(self, state, eps=0.0):
        """state (numpy.ndarray)"""
        state = torch.from_numpy(state.reshape(1, self.state_size)).float().to(device)
        self.local_model.eval()
        with torch.no_grad():
            action_values = self.local_model(state)
        self.local_model.train()

        # Epsilon-greedy policy
        if random.random() < eps:
            return np.random.randint(self.action_size)
        return int(np.argmax(action_values.cpu().data.numpy()))

    def append_sample(self, state, action, reward, next_state, done):
        self.buffer.add(state, action, reward, next_state, done)
        self.t_step += 1
        if self.t_step % LOCAL_UPDATE == 0:
            # Learn only if there are enough experiences in the buffer
            if len(self.buffer) > BATCH_SIZE:
                experiences = self.buffer.sample()
                self.learn(experiences)
                # self.target_update_t += 1
                # if self.target_update_t % TARGET_UPDATE == 0:
                self.soft_target_model_update(TAU)

    def learn(self, experiences):
        """experiences: tuple of tensors (states, actions, rewards, next_states, dones)"""
        states, actions, rewards, next_states, dones = experiences

        pred_q = self.local_model(states).gather(1, actions)

        if self.double:
            # Double DQN: the local model selects the next action,
            # the target model evaluates it
            _, argmax_actions = torch.max(
                self.local_model(next_states).detach(), 1, keepdim=True)
            pred_next_q = self.target_model(next_states).detach().gather(
                1, argmax_actions)
        else:
            pred_next_q, _ = torch.max(
                self.target_model(next_states).detach(), 1, keepdim=True)

        target_q = rewards + (1 - dones) * self.discounted_factor * pred_next_q

        loss = F.mse_loss(pred_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def soft_target_model_update(self, tau):
        """Soft update: θ_target = τ*θ_local + (1 - τ)*θ_target"""
        for target_param, local_param in zip(self.target_model.parameters(),
                                             self.local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
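# A minimal sketch of how DQNAgent might be driven against a Gym-style
# environment. The helper name, environment id, episode count and epsilon
# schedule are illustrative assumptions (classic Gym API: reset() returns a
# state, step() returns a 4-tuple), not part of the original code.
def train_dqn_agent(n_episodes=500, eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    import gym

    env = gym.make("LunarLander-v2")  # any discrete-action environment works
    agent = DQNAgent(state_size=env.observation_space.shape[0],
                     action_size=env.action_space.n,
                     double=True)

    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state, score, done = env.reset(), 0.0, False
        while not done:
            action = agent.get_action(state, eps)
            next_state, reward, done, _ = env.step(action)
            # append_sample both stores the transition and triggers learning
            agent.append_sample(state, action, reward, next_state, done)
            state, score = next_state, score + reward
        scores.append(score)
        eps = max(eps_end, eps * eps_decay)  # decay exploration over episodes
    return scores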
# ReplayBuffer and UPDATE_EVERY are likewise assumed to be defined elsewhere.
class Agent:
    def __init__(self, state_size, action_size, gamma=0.99, lr=5e-4,
                 buffer_size=int(1e5), batch_size=64, tau=1e-3):
        # Define local and target networks
        self.qnet_local = Qnetwork(state_size, action_size).to(device)
        self.qnet_target = Qnetwork(state_size, action_size).to(device)
        # Set local and target parameters equal to each other
        self.soft_update(tau=1.0)

        # Experience replay buffer
        self.memory = ReplayBuffer(buffer_size, batch_size)

        # Hyperparameters and counters
        self.state_size = state_size
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.lr = lr
        self.tau = tau
        self.t_step = 0

        # Optimizer
        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr)

    def step(self, state, action, reward, next_state, done):
        """Save the step info in the memory buffer and perform a learning iteration.

        Input  : state, action, reward, next_state, done : non-batched numpy arrays
        Output : none
        """
        # Add sample to the memory buffer
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps, once the buffer has enough samples
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Perform a learning iteration on a sampled experience batch.

        Input  : experiences : tuple from the memory buffer
                 (states, actions, rewards, next_states, dones, wj, choose),
                 e.g. states.shape = [N, state_size]
        Output : none
        """
        states, actions, rewards, next_states, dones, wj, choose = experiences

        # Set optimizer gradients to zero
        self.optimizer.zero_grad()

        # Predicted action value
        q_pred = self.qnet_local.forward(states).gather(1, actions)

        # Target action value using double DQN (https://arxiv.org/abs/1509.06461):
        # the local network selects the next action, the target network evaluates it
        next_action_local = self.qnet_local.forward(next_states).max(1)[1]
        q_target = rewards + self.gamma * (1 - dones) * (
            self.qnet_target.forward(next_states).detach()[
                range(self.batch_size), next_action_local].unsqueeze(1))

        # Compute the TD error
        td_error = q_target - q_pred

        # Update the TD errors (priorities) in the replay buffer
        self.memory.update_td_error(choose,
                                    td_error.detach().cpu().numpy().squeeze())

        # Importance-sampling-weighted squared TD error
        loss = ((wj * td_error) ** 2).mean()

        # Run backprop and an optimizer step
        loss.backward()
        self.optimizer.step()

        # Run a soft update of the target network
        self.soft_update(self.tau)

    def act(self, state, eps=0.):
        """Return the local model's epsilon-greedy action for the given state.

        Input  : state : [state_size]
        Output : action : scalar, as the action space is discrete with dim = 1
        """
        # Convert the numpy array to a torch tensor
        state = torch.from_numpy(state).float().unsqueeze(dim=0).to(device)
        self.qnet_local.eval()  # put the net in eval mode
        with torch.no_grad():
            max_action = np.argmax(self.qnet_local(state)[0].cpu().data.numpy())
        self.qnet_local.train()  # put the net back in train mode

        # Epsilon-greedy policy: explore with probability eps
        if np.random.rand() < eps:
            return np.random.randint(self.action_size)
        return max_action

    def soft_update(self, tau):
        """Soft update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target."""
        for target_param, local_param in zip(self.qnet_target.parameters(),
                                             self.qnet_local.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
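# A sketch of the replay-buffer interface that Agent.learn relies on: sample()
# must return a 7-tuple ending in importance-sampling weights `wj` and the
# sampled indices `choose`, and update_td_error() must accept those indices plus
# the new TD errors. The class below (the name UniformReplayBuffer is ours) is a
# simplified uniform-sampling stand-in, not the original implementation; a real
# prioritized buffer would sample in proportion to the stored TD errors.
from collections import deque, namedtuple

Experience = namedtuple("Experience",
                        ["state", "action", "reward", "next_state", "done"])


class UniformReplayBuffer:
    def __init__(self, buffer_size, batch_size):
        self.memory = deque(maxlen=buffer_size)
        self.td_errors = deque(maxlen=buffer_size)
        self.batch_size = batch_size

    def add(self, state, action, reward, next_state, done):
        self.memory.append(Experience(state, action, reward, next_state, done))
        self.td_errors.append(1.0)  # new samples get a default priority

    def sample(self):
        # Uniform sampling stand-in; `choose` carries the sampled indices so that
        # Agent.learn can write the updated TD errors back via update_td_error
        choose = np.random.choice(len(self.memory), self.batch_size, replace=False)
        batch = [self.memory[i] for i in choose]
        states = torch.from_numpy(np.vstack([e.state for e in batch])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in batch])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in batch])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in batch])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in batch]).astype(np.uint8)).float().to(device)
        wj = torch.ones(self.batch_size, 1).to(device)  # importance weights (all 1 here)
        return states, actions, rewards, next_states, dones, wj, choose

    def update_td_error(self, choose, td_errors):
        # Store the absolute TD error of each sampled transition as its priority
        for i, err in zip(choose, np.atleast_1d(td_errors)):
            self.td_errors[i] = abs(float(err))

    def __len__(self):
        return len(self.memory)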