class Agent_DQN:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        self.input_channels = 3 if 'SpaceInvaders' in args.env_id else 4
        self.num_actions = self.env.action_space.n

        # if testing, simply load the model we have trained
        if args.test_dqn:
            self.load(args.model)
            self.online_net.eval()
            self.target_net.eval()
            return

        # DQN variants setting
        self.prioritized = args.prioritized
        self.double = args.double
        self.n_steps = args.n_steps
        self.noise_linear = args.noise_linear

        if self.prioritized:
            self.memory = PrioritizedReplayBuffer(10000, alpha=0.6)
            self.beta_schedule = LinearSchedule(args.num_timesteps,
                                                initial_p=0.4,
                                                final_p=1.0)
            # element-wise loss so per-sample importance weights and priorities can be applied
            self.criterion = nn.MSELoss(reduce=False)
        else:
            self.memory = ReplayBuffer(10000)
            self.criterion = nn.MSELoss()

        if args.atari:
            DQN = DQN_Atari
            input_feature = self.input_channels
        else:
            DQN = DQN_Simple
            input_feature = env.observation_space.shape[0]

        # build target, online network
        self.target_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.target_net = self.target_net.cuda() if use_cuda else self.target_net
        self.online_net = DQN(input_feature,
                              self.num_actions,
                              dueling=args.dueling,
                              noise_linear=args.noise_linear)
        self.online_net = self.online_net.cuda() if use_cuda else self.online_net

        # discounted reward
        self.GAMMA = 0.99

        # exploration setting
        self.exploration = LinearSchedule(
            schedule_timesteps=int(0.1 * args.num_timesteps),
            initial_p=1.0,
            final_p=0.05)

        # training settings
        self.train_freq = 4
        self.learning_start = 10000
        self.batch_size = args.batch_size
        self.num_timesteps = args.num_timesteps
        self.display_freq = args.display_freq
        self.save_freq = args.save_freq
        self.target_update_freq = args.target_update_freq
        self.optimizer = optim.RMSprop(self.online_net.parameters(), lr=1e-4)

        # global status
        self.episodes_done = 0
        self.steps = 0

    def make_action(self, observation, test=True):
        return self.act(observation, test)

    def save(self, save_path):
        print('save model to', save_path)
        torch.save(self.online_net, save_path + '_online')
        torch.save(self.target_net, save_path + '_target')

    def load(self, load_path):
        if use_cuda:
            self.online_net = torch.load(load_path + '_online')
            self.target_net = torch.load(load_path + '_target')
        else:
            self.online_net = torch.load(
                load_path + '_online', map_location=lambda storage, loc: storage)
            self.target_net = torch.load(
                load_path + '_target', map_location=lambda storage, loc: storage)

    def act(self, state, test=False):
        sample = random.random()
        if test:
            eps_threshold = 0.01
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
            state = state.cuda() if use_cuda else state
        else:
            eps_threshold = self.exploration.value(self.steps)

        if sample > eps_threshold:
            action = self.online_net(
                Variable(state, volatile=True).type(FloatTensor)).data.max(1)[1].view(1, 1)
        else:
            action = LongTensor([[random.randrange(self.num_actions)]])
        return action if not test else action[0, 0]

    def reset_noise(self):
        assert self.noise_linear == True
        self.online_net.reset_noise()
        self.target_net.reset_noise()

    def update(self):
        if self.prioritized:
            batch, weight, batch_idxes = self.memory.sample(
                self.batch_size, beta=self.beta_schedule.value(self.steps))
            weight_batch = Variable(Tensor(weight)).squeeze()
        else:
            batch = self.memory.sample(self.batch_size)

        # Compute a mask of non-final states and concatenate the batch elements
        non_final_mask = ByteTensor(
            tuple(map(lambda s: s is not None, batch.next_state)))
        # We don't want to backprop through the expected action values, and volatile
        # will save us from temporarily changing the model parameters'
        # requires_grad to False!
        non_final_next_states = Variable(torch.cat(
            [s for s in batch.next_state if s is not None]), volatile=True)
        state_batch = Variable(torch.cat(batch.state))
        action_batch = Variable(torch.cat(batch.action))
        reward_batch = Variable(torch.cat(batch.reward))

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken
        state_action_values = self.online_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        next_state_values = Variable(torch.zeros(self.batch_size).type(Tensor))
        q_next = self.target_net(non_final_next_states)
        if self.double:
            # Double DQN: select actions with the online net, evaluate them with the target net
            _, best_actions = self.online_net(non_final_next_states).max(1)
            next_state_values[non_final_mask] = q_next.gather(
                1, best_actions.unsqueeze(1)).squeeze(1)
        else:
            next_state_values[non_final_mask] = q_next.max(1)[0]
        # Now, we don't want to mess up the loss with a volatile flag, so let's
        # clear it. After this, we'll just end up with a Variable that has
        # requires_grad=False
        next_state_values.volatile = False
        # Compute the expected Q values
        expected_state_action_values = (
            next_state_values * (self.GAMMA**self.n_steps)) + reward_batch

        # Compute loss
        if self.prioritized:
            loss = self.criterion(state_action_values, expected_state_action_values)
            loss = torch.mul(loss, weight_batch)
            new_priorities = np.abs(loss.cpu().data.numpy()) + 1e-6
            self.memory.update_priorities(batch_idxes, new_priorities)
            loss = loss.mean()
        else:
            loss = self.criterion(state_action_values, expected_state_action_values)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        return loss.data[0]

    def process_state(self, state):
        state = np.array(state)
        if self.args.atari:
            # map shape: (84, 84, 4) --> (1, 4, 84, 84)
            state = torch.from_numpy(state).permute(2, 0, 1).unsqueeze(0)
        else:
            state = torch.Tensor(state).unsqueeze(0)
        return state.cuda() if use_cuda else state

    def train(self):
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while True:
            if self.noise_linear:
                self.reset_noise()
            state = self.process_state(self.env.reset())
            done = False
            episode_duration = 0
            while not done:
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward
                reward = Tensor([reward])

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # store the transition in memory
                self.memory.push(state, action, next_state, reward)

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the online network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print('Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d' %
                      (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps),
                       total_reward / self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq, self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')

    def nsteps_train(self):
        '''Training procedure for multi-step learning.'''
        total_reward = 0
        loss = 0
        # set training mode
        self.online_net.train()
        while True:
            if self.noise_linear:
                self.reset_noise()
            state_buffer = deque()   # store states for future use
            action_buffer = deque()  # store actions for future use
            reward_buffer = deque()  # store rewards for future use
            nstep_reward = 0         # accumulate the n-step discounted reward

            state = self.process_state(self.env.reset())
            state_buffer.append(state)
            done = False
            episode_duration = 0

            # run n-1 steps
            for _ in range(1, self.n_steps):
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                next_state = self.process_state(next_state)
                if done:
                    next_state = None
                state_buffer.append(next_state)
                action_buffer.append(action)
                nstep_reward = nstep_reward * self.GAMMA + reward
                reward_buffer.append(reward)
                state = next_state
                episode_duration += 1

            while not done:
                # select and perform action
                action = self.act(state)
                next_state, reward, done, _ = self.env.step(action[0, 0])
                total_reward += reward

                # process new state
                next_state = self.process_state(next_state)
                if done:
                    next_state = None

                # save new state, action, reward
                state_buffer.append(next_state)
                action_buffer.append(action)
                reward_buffer.append(reward)
                nstep_reward = nstep_reward * self.GAMMA + reward

                # store the transition in memory
                self.memory.push(state_buffer.popleft(), action_buffer.popleft(),
                                 next_state, Tensor([nstep_reward]))

                # update n-step reward
                nstep_reward -= (self.GAMMA**(self.n_steps - 1)) * reward_buffer.popleft()

                # move to the next state
                state = next_state

                # Perform one step of the optimization (on the online network)
                if self.steps > self.learning_start and self.steps % self.train_freq == 0:
                    loss = self.update()
                    if self.noise_linear:
                        self.reset_noise()

                # update target network
                if self.steps > self.learning_start and self.steps % self.target_update_freq == 0:
                    self.target_net.load_state_dict(self.online_net.state_dict())

                if self.steps % self.save_freq == 0:
                    self.save('dqn.cpt')

                self.steps += 1
                episode_duration += 1

            if self.episodes_done % self.display_freq == 0:
                print('Episode: %d | Steps: %d/%d | Exploration: %f | Avg reward: %f | loss: %f | Episode Duration: %d' %
                      (self.episodes_done, self.steps, self.num_timesteps,
                       self.exploration.value(self.steps),
                       total_reward / self.display_freq, loss, episode_duration))
                writer.add_scalar('reward', total_reward / self.display_freq, self.steps)
                total_reward = 0

            self.episodes_done += 1
            if self.steps > self.num_timesteps:
                break
        self.save('dqn_final.model')
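
# --- Usage sketch (illustrative, not part of the original module) ---------------------
# A minimal example of wiring Agent_DQN to an environment and to the argparse-style
# settings its __init__ reads (env_id, test_dqn, model, prioritized, double, dueling,
# noise_linear, n_steps, atari, batch_size, num_timesteps, display_freq, save_freq,
# target_update_freq). The flag defaults below and the assumption that gym.make returns
# an environment already wrapped to produce 84x84x4 stacked frames are guesses for
# illustration only.
if __name__ == '__main__':
    import argparse
    import gym

    parser = argparse.ArgumentParser()
    parser.add_argument('--env_id', default='SpaceInvaders-v0')
    parser.add_argument('--atari', action='store_true', default=True)
    parser.add_argument('--test_dqn', action='store_true')
    parser.add_argument('--model', default='dqn_final.model')
    parser.add_argument('--prioritized', action='store_true')
    parser.add_argument('--double', action='store_true')
    parser.add_argument('--dueling', action='store_true')
    parser.add_argument('--noise_linear', action='store_true')
    parser.add_argument('--n_steps', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--num_timesteps', type=int, default=3000000)
    parser.add_argument('--display_freq', type=int, default=10)
    parser.add_argument('--save_freq', type=int, default=200000)
    parser.add_argument('--target_update_freq', type=int, default=1000)
    args = parser.parse_args()

    # assumed: the environment is wrapped for frame preprocessing / stacking
    env = gym.make(args.env_id)
    agent = Agent_DQN(args, env)
    if args.n_steps > 1:
        agent.nsteps_train()  # multi-step return variant
    else:
        agent.train()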
class DynaQAgent(mp.Process):
    def __init__(self, id, env, state_size, action_size, n_episodes, lr, gamma,
                 global_network, target_network, q, max_t=1000,
                 eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        super(DynaQAgent, self).__init__()
        self.id = id
        self.env = env
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.q = q
        self.local_memory = ReplayBuffer(self.action_size, BUFFER_SIZE, BATCH_SIZE)
        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.experience = namedtuple(
            "Experience",
            field_names=["state", "action", "reward", "next_state", "done"])
        self.global_network = global_network
        self.target_network = target_network
        self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5)
        self.scores_window = deque(maxlen=100)  # last 100 scores

    def act(self, state, eps=0.):
        if random.random() > eps:
            # Turn the state into a tensor
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            with torch.no_grad():
                # Greedy choice based on the shared global network
                action_values = self.global_network(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)
        # Increment local timer
        self.t_step += 1
        if self.t_step > BATCH_SIZE:
            experiences = self.local_memory.sample(BATCH_SIZE)
            self.learn(experiences)

        # TODO: Better way to do this??
        if self.q[0].empty() and np.mean(self.scores_window) < 180:
            experiences = self.local_memory.sample(BATCH_SIZE)
            self.q[0].put(experiences[0].detach().share_memory_())
            self.q[1].put(experiences[1].detach().share_memory_())
            self.q[2].put(experiences[2].detach().share_memory_())
            self.q[3].put(experiences[3].detach().share_memory_())
            self.q[4].put(experiences[4].detach().share_memory_())

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.target_network(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.global_network(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.target_network, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def get_experience_as_tensor(self, e):
        states = torch.from_numpy(np.vstack([e.state])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def get_action_values(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        with torch.no_grad():
            action_values = self.target_network(state)
        return action_values.cpu().data.numpy()[0]

    def get_delta(self, state, action, next_state, reward):
        priority = reward + self.gamma * np.max(
            self.get_action_values(next_state)) - self.get_action_values(state)[action]
        return priority

    def run(self):
        scores = []
        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(self.max_t):
                action = self.act(state, eps)
                # if do_render:
                #     self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            self.scores_window.append(score)  # save most recent score
            scores.append(score)              # keep the full score history
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print('\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                      .format(self.id, i_episode, np.mean(self.scores_window)) +
                      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if i_episode % 100 == 0:
                print('\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                      .format(self.id, i_episode, np.mean(self.scores_window)) +
                      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(self.scores_window) >= 200.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, np.mean(self.scores_window)))
                break
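
# --- Launch sketch (illustrative, not part of the original module) --------------------
# A minimal example of spawning several DynaQAgent workers that share one global/target
# network pair via torch.multiprocessing, plus a tuple of five queues (states, actions,
# rewards, next_states, dones) matching how self.q[0]..self.q[4] are filled in step().
# QNetwork, the environment name, the worker count, and the hyperparameters are
# assumptions for illustration only; the 200-point "solved" threshold in run() suggests
# a LunarLander-style task.
if __name__ == '__main__':
    import gym
    import torch.multiprocessing as mp

    env_name = 'LunarLander-v2'  # assumed
    probe_env = gym.make(env_name)
    state_size = probe_env.observation_space.shape[0]
    action_size = probe_env.action_space.n

    global_network = QNetwork(state_size, action_size)  # assumed network class
    target_network = QNetwork(state_size, action_size)
    global_network.share_memory()  # Hogwild!-style shared parameters
    target_network.share_memory()

    q = tuple(mp.Queue() for _ in range(5))  # one queue per experience field

    workers = [
        DynaQAgent(i, gym.make(env_name), state_size, action_size,
                   n_episodes=2000, lr=5e-4, gamma=0.99,
                   global_network=global_network,
                   target_network=target_network, q=q)
        for i in range(4)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()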
class DQNAgent(mp.Process):
    def __init__(self, id, env, do_render, state_size, action_size, n_episodes, lr,
                 gamma, update_every, global_network, target_network, max_t=1000,
                 eps_start=1.0, eps_end=0.01, eps_decay=0.995):
        super(DQNAgent, self).__init__()
        self.id = id
        self.env = env
        self.do_render = do_render
        self.state_size = state_size
        self.action_size = action_size
        self.n_episodes = n_episodes
        self.gamma = gamma
        self.update_every = update_every
        self.local_memory = ReplayBuffer(env.action_space.n, BUFFER_SIZE, BATCH_SIZE)
        self.global_network = global_network
        self.qnetwork_target = target_network
        self.optimizer = optim.SGD(self.global_network.parameters(), lr=lr, momentum=.5)
        self.t_step = 0
        self.max_t = max_t
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

    def act(self, state, eps=0.):
        if random.random() > eps:
            state = torch.from_numpy(state).float().unsqueeze(0).to(device)
            with torch.no_grad():
                action_values = self.global_network(state)
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.local_memory.add(state, action, reward, next_state, done)
        # Increment local timer
        self.t_step += 1
        # Learn every UPDATE_EVERY time steps.
        if self.t_step % self.update_every == 0:
            # If enough samples are available in memory, get random subset and learn
            if self.t_step > BATCH_SIZE:
                experiences = self.local_memory.sample(BATCH_SIZE)
                self.learn(experiences)

    def compute_loss(self, experiences):
        states, actions, rewards, next_states, dones = experiences
        # Get max predicted Q values (for next states) from target model
        Q_targets_next = self.qnetwork_target.forward(next_states).detach().max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        # Q_expected = self.qnetwork_local(states).gather(1, actions)
        Q_expected = self.global_network.forward(states).gather(1, actions)
        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        return loss

    def learn(self, experiences):
        loss = self.compute_loss(experiences)
        # Update gradients per the Hogwild! algorithm
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.soft_update(self.global_network, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)

    def run(self):
        scores = []
        scores_window = deque(maxlen=100)  # last 100 scores
        eps = self.eps_start  # initialize epsilon
        start_time = time.time()
        for i_episode in range(1, self.n_episodes + 1):
            state = self.env.reset()
            score = 0
            for t in range(self.max_t):
                action = self.act(state, eps)
                if self.do_render:
                    self.env.render()
                next_state, reward, done, _ = self.env.step(action)
                self.step(state, action, reward, next_state, done)
                state = next_state
                score += reward
                if done:
                    break
            scores_window.append(score)  # save most recent score
            scores.append(score)
            eps = max(self.eps_end, self.eps_decay * eps)  # decrease epsilon
            elapsed_time = time.time() - start_time
            if self.id == 0:
                print('\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                      .format(self.id, i_episode, np.mean(scores_window)) +
                      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if i_episode % 100 == 0:
                print('\rThread: {}, Episode {}\tAverage Score: {:.2f}, Runtime: '
                      .format(self.id, i_episode, np.mean(scores_window)) +
                      time.strftime("%H:%M:%S", time.gmtime(elapsed_time)))
            if np.mean(scores_window) >= 200.0:
                print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                      .format(i_episode - 100, np.mean(scores_window)))
                torch.save(self.global_network.state_dict(), 'checkpoint.pth')
                break
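
# --- Launch sketch (illustrative, not part of the original module) --------------------
# DQNAgent workers follow the same Hogwild!-style pattern as the DynaQAgent example
# above, but take do_render and update_every and no experience queues. QNetwork, the
# environment name, the worker count, and the hyperparameters below are assumptions for
# illustration only.
if __name__ == '__main__':
    import gym
    import torch.multiprocessing as mp

    env_name = 'LunarLander-v2'  # assumed target environment
    probe_env = gym.make(env_name)
    state_size = probe_env.observation_space.shape[0]
    action_size = probe_env.action_space.n

    global_network = QNetwork(state_size, action_size)  # assumed network class
    target_network = QNetwork(state_size, action_size)
    global_network.share_memory()  # shared parameters for lock-free updates
    target_network.share_memory()

    workers = [
        DQNAgent(i, gym.make(env_name), do_render=False,
                 state_size=state_size, action_size=action_size,
                 n_episodes=2000, lr=5e-4, gamma=0.99, update_every=4,
                 global_network=global_network, target_network=target_network)
        for i in range(4)
    ]
    for w in workers:
        w.start()
    for w in workers:
        w.join()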