def eval_greedy(self, ale, agent, epoch):
    # With a stochastic environment (action_repeat_prob > 0), delegate to the
    # stochastic evaluation routine instead.
    if self.action_repeat_prob > 0:
        return self.eval_greedy_stochastic_env(ale, agent, epoch)
    sequences = self.sequences
    ale.reset_action_seed()
    episode_rewards = []
    episode_num = 0
    for sequence in sequences:
        ale.reset_game()
        preprocessor = Preprocessor()
        state = State(self.hist_len)
        episode_frames = 0
        episode_reward = 0
        # Replay the fixed starting action sequence to reach a distinct start state
        for i in range(len(sequence)):
            ale.act(sequence[i])
            preprocessor.add(ale.getScreenRGB())
            if (i + 1) % self.action_repeat == 0:
                state.add_frame(preprocessor.preprocess())
        lives = ale.lives()
        while not (ale.game_over() or (self.cap_eval_episodes and episode_frames >= self.eval_max_frames)):
            # The random state doesn't matter since eps=0.0
            action = agent.eGreedy_action(state.get_state(), 0.0, np.random.RandomState(4))
            reward = 0
            for i in range(self.action_repeat):
                reward += ale.act(action)
                preprocessor.add(ale.getScreenRGB())
                episode_frames += 1
            img = preprocessor.preprocess()
            state.add_frame(img)
            episode_reward += reward
            # On a life loss, sweep through the actions to get the game going again
            if ale.lives() < lives:
                perform_action_sweep(ale, preprocessor, state)
                lives = ale.lives()
        self.eval_output_file.write("Episode " + str(episode_num) + " reward is " + str(episode_reward) + "\n")
        episode_rewards.append(episode_reward)
        episode_num += 1
    avg_reward = float(sum(episode_rewards)) / float(len(sequences))
    self.log_eval(avg_reward, epoch)
    return avg_reward
def eval_greedy_stochastic_env(self, ale, agent, epoch):
    self.eval_output_file.write("Performing Stochastic Evaluation...\n")
    episode_rewards = []
    for episode_num in range(len(self.sequences)):
        ale.reset_game()
        ale.set_action_seed(episode_num)
        preprocessor = Preprocessor()
        state = State(self.hist_len)
        # Seed the frame stack before the agent starts acting
        for _ in range(2):
            preprocessor.add(ale.getScreenRGB())
            state.add_frame(preprocessor.preprocess())
        episode_frames = 0
        episode_reward = 0
        lives = ale.lives()
        while not (ale.game_over() or (self.cap_eval_episodes and episode_frames >= self.eval_max_frames)):
            action = agent.eGreedy_action(state.get_state(), 0.0)
            reward = 0
            for i in range(self.action_repeat):
                reward += ale.act(action)
                preprocessor.add(ale.getScreenRGB())
                episode_frames += 1
            img = preprocessor.preprocess()
            state.add_frame(img)
            episode_reward += reward
            # On a life loss, sweep through the actions to get the game going again
            if ale.lives() < lives:
                perform_action_sweep(ale, preprocessor, state)
                lives = ale.lives()
        self.eval_output_file.write("Episode " + str(episode_num) + " reward is " + str(episode_reward) + "\n")
        episode_rewards.append(episode_reward)
    avg_reward = float(sum(episode_rewards)) / float(len(self.sequences))
    self.log_eval(avg_reward, epoch)
    return avg_reward
def eval(self, ale, agent):
    action_set = ale.getMinimalActionSet()
    rewards = []
    # Run 100 evaluation episodes
    for i in range(100):
        ale.reset_game()
        preprocessor = Preprocessor()
        state = State(self.hist_len)
        steps = 0
        # Start each episode with up to 30 no-ops to vary the initial state
        utils.perform_no_ops(ale, 30, preprocessor, state)
        episode_reward = 0
        while not ale.game_over() and steps < self.time_limit:
            # Act epsilon-greedily with eps = 0.01
            if np.random.uniform() < 0.01:
                action = np.random.choice(action_set)
            else:
                action = agent.get_action(state.get_state())
            for _ in range(self.action_repeat):
                episode_reward += ale.act(action)
                preprocessor.add(ale.getScreenRGB())
            state.add_frame(preprocessor.preprocess())
            steps += 1
        print "Episode " + str(i) + " reward is " + str(episode_reward)
        rewards.append(episode_reward)
    print "Mean reward is: " + str(np.mean(rewards))
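# For reference, a minimal sketch of what utils.perform_no_ops (used above and
# in train below) is assumed to do; this is an illustration, not the repo's
# actual implementation. Following the common DQN evaluation protocol, it
# executes a random number of no-op actions so that episodes start from varied
# initial states, then seeds the frame stack.
import numpy as np

def perform_no_ops_sketch(ale, no_op_max, preprocessor, state, rnd=np.random):
    # Take between 1 and no_op_max no-op actions (action 0 in the ALE)
    for _ in range(rnd.randint(1, no_op_max + 1)):
        ale.act(0)
        preprocessor.add(ale.getScreenRGB())
    # Seed the agent's state with the resulting preprocessed frame
    state.add_frame(preprocessor.preprocess())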
def train(training_frames, learning_rate, alpha, min_squared_gradient, minibatch_size,
          replay_capacity, hist_len, tgt_update_freq, discount, act_rpt, upd_freq,
          init_epsilon, fin_epsilon, fin_exp, replay_start_size, no_op_max,
          death_ends_episode, ale_seed, eval_freq, nature, checkpoint_frequency,
          checkpoint_dir, repeat_action_probability, rnd_no_op, rnd_exp,
          rnd_act_repeat, rnd_buffer_sample, rom, evaluator):
    # Create ALE object
    ale = ALEInterfaceWrapper(repeat_action_probability, rnd_act_repeat)
    # Set the random seed for the ALE
    ale.setInt('random_seed', ale_seed)
    # Load the ROM file
    ale.loadROM(rom)
    # Initialize epsilon
    epsilon = init_epsilon
    # How much epsilon decreases at each time step (the annealing amount)
    epsilon_delta = (init_epsilon - fin_epsilon) / fin_exp
    print "Minimal action set is:"
    print ale.getMinimalActionSet()
    # Create DQN agent
    agent = DQN(ale.getMinimalActionSet().tolist(), learning_rate, alpha,
                min_squared_gradient, nature, checkpoint_frequency, checkpoint_dir,
                epsilon, hist_len, discount, rnd_exp, rnd_buffer_sample)
    # Initial evaluation
    evaluator.evaluate(agent, 0)
    # Initialize replay memory to capacity replay_capacity
    replay_memory = ReplayMemory(replay_capacity, hist_len)
    timestep = 0
    episode_num = 1
    # Main training loop
    while timestep < training_frames:
        # Create a state variable of size hist_len
        state = State(hist_len)
        preprocessor = Preprocessor()
        # Perform a random number of no-ops to start the episode
        utils.perform_no_ops(ale, no_op_max, preprocessor, state, rnd_no_op)
        total_reward = 0
        lives = ale.lives()
        episode_done = False
        time_since_term = 0
        # Episode loop
        while not episode_done:
            if timestep % checkpoint_frequency == 0:
                epoch = timestep / checkpoint_frequency
                agent.checkpoint_network(epoch)
            action = agent.get_action(state.get_state())
            reward = 0
            # Skip frames by repeating the action
            for i in range(act_rpt):
                reward = reward + ale.act(action)
                # Add the raw screens to the frame stack
                preprocessor.add(ale.getScreenRGB())
            # Increment episode reward before clipping the reward for training
            total_reward += reward
            reward = np.clip(reward, -1, 1)
            # Get the preprocessed new frame
            img = preprocessor.preprocess()
            state.add_frame(img)
            episode_done = ale.game_over() or (ale.lives() < lives and death_ends_episode)
            # Store the transition
            replay_memory.add_item(img, action, reward, episode_done, time_since_term)
            # Training: we only train once the buffer has filled to size replay_start_size
            if timestep > replay_start_size:
                # Anneal epsilon
                epsilon = max(epsilon - epsilon_delta, fin_epsilon)
                agent.set_epsilon(epsilon)
                if timestep % eval_freq == 0:
                    evaluator.evaluate(agent, timestep / eval_freq)
                    ale.reset_game()
                    # Break the loop and start a new episode after eval;
                    # this can help prevent getting stuck in episodes
                    episode_done = True
                if timestep % upd_freq == 0:
                    agent.train(replay_memory, minibatch_size)
            timestep = timestep + 1
            time_since_term += 1
            # Inconsistency in the DeepMind code versus the paper: in the code they
            # update the target network every tgt_update_freq actions; in the paper
            # they say to do it every tgt_update_freq parameter updates.
            if timestep % tgt_update_freq == 1:
                print "Copying network..."
                agent.copy_network()
                print "Done copying."
        log(episode_num, total_reward, timestep)
        # If the game is over, reset and start a new episode;
        # otherwise continue with a new life
        if ale.game_over():
            ale.reset_game()
            episode_num = episode_num + 1
        if timestep == training_frames:
            evaluator.evaluate(agent, training_frames / eval_freq)
            agent.checkpoint_network(training_frames / checkpoint_frequency)
    print "Finished training at timestep " + str(timestep)
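# Illustrative invocation of train() (a sketch, not part of the original
# source). The numeric hyperparameters below follow the Nature DQN paper
# (Mnih et al., 2015); the ROM path, seeds, checkpoint directory, and
# evaluator are placeholders, and the rnd_* arguments are assumed to be numpy
# RandomState objects based on how they are used above.
if __name__ == '__main__':
    train(training_frames=50000000,  # 50M agent steps = 200M frames at act_rpt=4
          learning_rate=0.00025, alpha=0.95, min_squared_gradient=0.01,
          minibatch_size=32, replay_capacity=1000000, hist_len=4,
          tgt_update_freq=10000, discount=0.99, act_rpt=4, upd_freq=4,
          init_epsilon=1.0, fin_epsilon=0.1, fin_exp=1000000,
          replay_start_size=50000, no_op_max=30, death_ends_episode=True,
          ale_seed=123, eval_freq=250000, nature=True,
          checkpoint_frequency=250000, checkpoint_dir='./checkpoints',
          repeat_action_probability=0.0,
          rnd_no_op=np.random.RandomState(1),
          rnd_exp=np.random.RandomState(2),
          rnd_act_repeat=np.random.RandomState(3),
          rnd_buffer_sample=np.random.RandomState(4),
          rom='breakout.bin',
          evaluator=evaluator)  # evaluator construction omitted; placeholder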