# Standard-library and pysc2 imports needed by this agent. The feature/action
# constants (_PLAYER_RELATIVE, _NO_OP, _SELECT_ARMY, ...), the available_actions
# list, and the ReplayMemory and Model classes are project-specific and assumed
# to be defined elsewhere in this module.
import pickle
import random
from collections import deque

import numpy as np
from pysc2.agents import base_agent
from pysc2.lib import actions


class MoveToBeacon(base_agent.BaseAgent):
    """An agent specifically for solving the MoveToBeacon map."""

    def __init__(self):
        super(MoveToBeacon, self).__init__()
        self.num_actions = len(available_actions)
        self.input_flat = 84 * 84  # Size of the screen
        self.wh = 84
        # Minimap sizes
        self.mm_input_flat = 64 * 64
        self.mm_wh = 64
        self.batch_size = 32
        self.max_memory_size = 2000
        self.gamma = .99
        self.learning_rate = 1e-4
        self.epsilon = 1.
        self.final_epsilon = .05
        self.epsilon_decay = 0.999
        self.total_rewards = deque(maxlen=100)
        self.current_reward = 0
        self.actions_taken = np.zeros(self.num_actions)
        self.rewards = []
        self.total_actions = []
        self.memory = ReplayMemory(self.num_actions, self.batch_size,
                                   self.max_memory_size, self.gamma)
        self.model = Model(self.wh, self.input_flat, self.mm_wh, self.mm_input_flat,
                           1, self.num_actions, self.learning_rate, self.memory)
        # If a saved model was restored, start at the final (exploitation) epsilon.
        if self.model.loaded_model:
            self.epsilon = 0.05

    def step(self, obs):
        # Current observable state
        screen_player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        current_state = screen_player_relative.flatten()
        mm_player_relative = obs.observation['minimap'][_MM_PLAYER_RELATIVE]
        minimap_state = mm_player_relative.flatten()
        army_state = obs.observation['screen'][_SELECT].flatten()
        # army_selected = np.array([1]) if 1 in obs.observation['screen'][_SELECT] else np.array([0])

        # Complete the previous transition with the state it led to, then train.
        if len(self.memory.memory) > 0:
            self.memory.update([current_state, minimap_state, army_state])
            self.model.train()

        super(MoveToBeacon, self).step(obs)

        legal_actions = obs.observation['available_actions']
        if random.random() < self.epsilon:
            # Explore: pick a random legal action.
            action = random.choice(legal_actions)
            action = available_actions.index(action)
        else:
            # Exploit: pick the highest-valued legal action from the network.
            # feed_dict = {self.model.screen_input: [current_state],
            #              self.model.minimap_input: [minimap_state],
            #              self.model.army_input: [army_selected]}
            feed_dict = {self.model.army_input: [army_state]}
            output = self.model.session.run(self.model.output, feed_dict)[0]
            # Mask out actions that are not currently available.
            output = [value if action in legal_actions else -9e10
                      for action, value in zip(available_actions, output)]
            action = np.argmax(output)

        self.actions_taken[int(action)] += 1
        self.total_actions.append(action)
        # print('Action taken: {}'.format(action))

        reward = obs.reward
        self.current_reward += reward
        if obs.last():
            self.total_rewards.append(self.current_reward)
            self.rewards.append(self.current_reward)
            self.current_reward = 0
            if self.episodes % 100 == 0 and self.episodes > 0:
                self.model.save()
                print('Highest: {} | Lowest: {} | Average: {}'.format(
                    max(self.total_rewards), min(self.total_rewards),
                    np.mean(self.total_rewards)))
                print(self.actions_taken)
            if self.episodes % 1000 == 0 and self.episodes > 0:
                pickle.dump(self.total_actions,
                            open('/home/rob/Documents/uni/fyp/sc2/actions8.pkl', 'wb'))
                pickle.dump(self.rewards,
                            open('/home/rob/Documents/uni/fyp/sc2/rewards8.pkl', 'wb'))
                exit(0)

        if self.epsilon > self.final_epsilon:
            self.epsilon = self.epsilon * self.epsilon_decay

        self.memory.add([current_state, minimap_state, army_state], action, reward, obs.last())
        # self.model.train()

        # Translate the chosen action index into a pysc2 FunctionCall.
        if available_actions[action] == _NO_OP:
            return actions.FunctionCall(_NO_OP, [])
        elif available_actions[action] == _SELECT_ARMY:
            return actions.FunctionCall(_SELECT_ARMY, [_SELECT_ALL])
        elif available_actions[action] == _ATTACK_SCREEN \
                or available_actions[action] == _MOVE_SCREEN \
                or available_actions[action] == _PATROL_SCREEN \
                or available_actions[action] == _SMART_SCREEN:
            # This is the scripted one: target the centre of the neutral (beacon) pixels.
            neutral_y, neutral_x = (screen_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED, target])
        elif available_actions[action] == _STOP_QUICK:
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED])
        elif available_actions[action] == _HOLD_POSITION_QUICK:
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED])
        elif available_actions[action] == _ATTACK_MINIMAP \
                or available_actions[action] == _MOVE_MINIMAP \
                or available_actions[action] == _PATROL_MINIMAP \
                or available_actions[action] == _SMART_MINIMAP:
            neutral_y, neutral_x = (mm_player_relative == _PLAYER_NEUTRAL).nonzero()
            target = [int(neutral_x.mean()), int(neutral_y.mean())]
            return actions.FunctionCall(available_actions[action], [_NOT_QUEUED, target])
        else:
            return actions.FunctionCall(_NO_OP, [])
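# Usage sketch (not part of the original listing): a pysc2 agent class like the one
# above is normally launched through the bundled agent runner. Assuming this class
# lives in a module called `move_to_beacon_agent` (placeholder name), it could be
# started from the shell with something like:
#
#   python -m pysc2.bin.agent --map MoveToBeacon --agent move_to_beacon_agent.MoveToBeacon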
# PyTorch / Gym imports needed by this agent. The DQN network, the ReplayMemory
# buffer, and the wrap_dqn preprocessing wrapper are project-specific and assumed
# to be defined elsewhere (or imported from the project's own modules).
import os
import time

import gym
import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from IPython import display


class Agent(object):
    ''' Implements training and testing methods '''

    def __init__(self, skip=True, episodic=True):
        self.env = wrap_dqn(gym.make('BreakoutDeterministic-v4'), skip, episodic)
        self.num_actions = self.env.action_space.n
        self.dqn = DQN(self.num_actions).cuda()
        self.target_dqn = DQN(self.num_actions).cuda()
        self.buffer = ReplayMemory(200000)
        self.gamma = 0.99
        self.optimizer = optim.RMSprop(self.dqn.parameters(), lr=0.00025,
                                       eps=0.001, alpha=0.95)
        self.out_dir = '/scratch/ab8084/atari/saved/'
        if not os.path.exists(self.out_dir):
            os.makedirs(self.out_dir)
        self.reward_episodes = []
        self.lengths_episodes = []
        self.benchmark = -10000

    def to_var(self, x):
        ''' Converts torch tensor x to a torch Variable on the GPU '''
        return Variable(x).cuda()

    def predict_q_values(self, states):
        '''
        Computes Q values of states by passing them through the behaviour network
        states: numpy array, shape is (batch_size, frames, width, height)
        returns: Q values, shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.dqn(states)
        return actions

    def predict_q_target_values(self, states):
        '''
        Computes Q values of next states by passing them through the target network
        states: numpy array, shape is (batch_size, frames, width, height)
        returns: Q values, shape is (batch_size, num_actions)
        '''
        states = self.to_var(torch.from_numpy(states).float())
        actions = self.target_dqn(states)
        return actions

    def select_action(self, state, epsilon):
        ''' Epsilon-greedy action selection '''
        choice = np.random.choice([0, 1], p=(epsilon, (1 - epsilon)))
        if choice == 0:
            return np.random.choice(range(self.num_actions))
        else:
            state = np.expand_dims(state, 0)
            actions = self.predict_q_values(state)
            return np.argmax(actions.data.cpu().numpy())

    def update(self, states, targets, actions):
        '''
        Calculates the loss and updates the weights of the behaviour network using backprop
        states: numpy array, shape is (batch_size, frames, width, height)
        actions: numpy array, shape is (batch_size,)
        targets: numpy array, shape is (batch_size,)
        '''
        targets = self.to_var(torch.unsqueeze(torch.from_numpy(targets).float(), -1))
        actions = self.to_var(torch.unsqueeze(torch.from_numpy(actions).long(), -1))
        predicted_values = self.predict_q_values(states)
        # Only the Q values of the actions actually taken contribute to the loss.
        affected_values = torch.gather(predicted_values, 1, actions)
        loss = F.smooth_l1_loss(affected_values, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def calculate_q_targets(self, next_states, rewards, dones):
        '''
        Calculates targets r + gamma * max_a Q_target(s', a) from the target network
        next_states: numpy array, shape is (batch_size, frames, width, height)
        rewards: numpy array, shape is (batch_size,)
        dones: numpy array, shape is (batch_size,)
        '''
        dones_mask = (dones == 1)
        predicted_q_target_values = self.predict_q_target_values(next_states)
        next_max_q_values = np.max(predicted_q_target_values.data.cpu().numpy(), axis=1)
        next_max_q_values[dones_mask] = 0  # no bootstrap value for terminal states
        q_targets = rewards + self.gamma * next_max_q_values
        return q_targets

    def sync_target_network(self):
        ''' Copies weights from the behaviour network to the target network '''
        primary_params = list(self.dqn.parameters())
        target_params = list(self.target_dqn.parameters())
        for i in range(0, len(primary_params)):
            target_params[i].data[:] = primary_params[i].data[:]

    def play(self, episodes):
        ''' Plays greedily (epsilon = 0) for the given number of episodes, rendering each frame '''
        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            plt.imshow(state)
            plt.axis('off')
            plt.show()
            while not done:
                action = self.select_action(state, 0)
                state, reward, done, _ = self.env.step(action)
                display.clear_output(wait=True)
                plt.imshow(self.env.render(mode='rgb_array'))
                plt.axis('off')
                plt.show()
                time.sleep(0.03)

    def close_env(self):
        ''' Clean up '''
        self.env.close()

    def get_epsilon(self, total_steps, max_epsilon_steps, epsilon_start, epsilon_final):
        ''' Linearly anneals epsilon from epsilon_start down to epsilon_final '''
        return max(epsilon_final, epsilon_start - total_steps / max_epsilon_steps)

    def save_final_model(self):
        ''' Saves the final model to disk '''
        filename = '{}/final_model_breakout_skipTrue.pth'.format(self.out_dir)
        torch.save({
            'model_state_dict': self.dqn.state_dict(),
            'lengths_episodes': self.lengths_episodes,
            'rewards_episodes': self.reward_episodes,
            'benchmark': self.benchmark
        }, filename)

    def load_model(self, filename):
        '''
        Loads a model from disk
        filename: model filename
        '''
        try:
            checkpoint = torch.load(
                '/scratch/ab8084/atari/saved/final_model_breakout_skipTrue.pth')
            self.dqn.load_state_dict(checkpoint['model_state_dict'])
            self.benchmark = checkpoint['benchmark']
        except Exception:
            # Fall back to a checkpoint that contains only the state_dict.
            self.dqn.load_state_dict(torch.load(filename))
        self.sync_target_network()

    def train(self, replay_buffer_fill_len, batch_size, episodes, stop_reward,
              max_epsilon_steps, epsilon_start, epsilon_final, sync_target_net_freq):
        '''
        replay_buffer_fill_len: how many elements the replay buffer should contain before training starts
        batch_size: batch size
        episodes: how many episodes (max. value) to iterate
        stop_reward: running reward value to be reached; upon reaching it the training is stopped
        max_epsilon_steps: maximum number of epsilon annealing steps
        epsilon_start: start epsilon value
        epsilon_final: final epsilon value, effectively a lower limit
        sync_target_net_freq: how often to sync the behaviour and target networks
        '''
        start_time = time.time()
        print('Start training at: ' + time.asctime(time.localtime(start_time)))

        total_steps = 0
        running_episode_reward = 0

        print('Populating Replay Buffer')
        print('\n')
        state = self.env.reset()
        for i in range(replay_buffer_fill_len):
            done = False
            action = self.select_action(state, 0.05)
            next_state, reward, done, _ = self.env.step(action)
            self.buffer.add(state, action, reward, done, next_state)
            state = next_state
            if done:
                state = self.env.reset()

        print('Replay Buffer populated with {} transitions, starting training...'
              .format(self.buffer.count()))
        print('\n')

        for i in range(1, episodes + 1):
            done = False
            state = self.env.reset()
            episode_reward = 0
            episode_length = 0

            while not done:
                if (total_steps % sync_target_net_freq) == 0:
                    print('synchronizing target network...')
                    # print('\n')
                    self.sync_target_network()

                epsilon = self.get_epsilon(total_steps, max_epsilon_steps,
                                           epsilon_start, epsilon_final)
                action = self.select_action(state, epsilon)
                next_state, reward, done, _ = self.env.step(action)
                self.buffer.add(state, action, reward, done, next_state)

                s_batch, a_batch, r_batch, d_batch, next_s_batch = self.buffer.sample(batch_size)
                q_targets = self.calculate_q_targets(next_s_batch, r_batch, d_batch)
                self.update(s_batch, q_targets, a_batch)

                state = next_state
                total_steps += 1
                episode_length += 1
                episode_reward += np.sign(reward)

            self.reward_episodes.append(episode_reward)
            self.lengths_episodes.append(episode_length)
            running_episode_reward = running_episode_reward * 0.9 + 0.1 * episode_reward

            if (i % 1000) == 0 or (running_episode_reward > stop_reward):
                print('global step: {}'.format(total_steps),
                      ' | episode: {}'.format(i),
                      ' | mean episode_length: {}'.format(np.mean(self.lengths_episodes[-1000:])),
                      ' | mean episode reward: {}'.format(np.mean(self.reward_episodes[-1000:])))
                # self.lengths_episodes = []
                # self.reward_episodes = []
                # print('episode: {}'.format(i))
                # print('current epsilon: {}'.format(round(epsilon, 2)))
                # print('mean episode_length: {}'.format(np.mean(lengths_episodes[-50:])))
                # print('mean episode reward: {}'.format(np.mean(reward_episodes[-50:])))
                # print('\n')

            if episode_reward > self.benchmark:
                print('global step: {}'.format(total_steps),
                      ' | episode: {}'.format(i),
                      ' | episode_length: {}'.format(episode_length),
                      ' | episode reward: {}'.format(episode_reward))
                self.benchmark = episode_reward
                self.save_final_model()

            if running_episode_reward > stop_reward:
                print('stop reward reached!')
                print('saving final model...')
                print('\n')
                # self.save_final_model()
                break

        print('Finish training at: ' + time.asctime(time.localtime(time.time())))
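# Usage sketch (not part of the original listing): the hyperparameter values below are
# illustrative assumptions in the spirit of the standard DQN-on-Atari setup, not values
# taken from this project. The constructor and train() signature match the class above.
if __name__ == '__main__':
    agent = Agent(skip=True, episodic=True)
    agent.train(replay_buffer_fill_len=10000,
                batch_size=32,
                episodes=100000,
                stop_reward=40,
                max_epsilon_steps=1000000,
                epsilon_start=1.0,
                epsilon_final=0.05,
                sync_target_net_freq=10000)
    agent.close_env()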