def evaluate(self, env=None, num_episodes=None):
    """
    Evaluates the policy, mirroring the training procedure.
    """
    # log our activity only if this is the default call
    save_paths = False
    if num_episodes is None:
        self.logger.info("Evaluating...")
    else:
        save_paths = True

    # argument defaults
    if num_episodes is None:
        num_episodes = self.config.num_episodes_test

    if env is None:
        env = self.env
        bfs_len = self.bfs_len
    else:
        bfs_len = env.get_bfs_length()

    # replay memory to play
    if self.config.use_memory:
        replay_buffer = ReplayBuffer(
            self.config.buffer_size,
            self.config.state_history,
            memory_size=self.config.memory_unit_size)
    else:
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)

    rewards = []
    steps = []

    for i in range(num_episodes):
        total_reward = 0
        state = env.reset()
        count = 0
        while True:
            if self.config.render_test:
                env.render()

            # store last state in buffer
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            if self.config.use_memory:
                prev_memory = replay_buffer.encode_recent_memory()
                action, bottom_q, top_q, next_memory = self.get_action_with_memory(
                    q_input, prev_memory)
                next_memory = np.squeeze(next_memory)
            else:
                action = self.get_action(q_input)

            if i == 0 and self.config.use_memory:
                with open(self.config.output_path + 'eval_example_log.txt',
                          'a') as of:
                    of.write('State = {}\n'.format(env.cur_state))
                    of.write('Taking action = {}\n'.format(action))
                    of.write('prev_memory = {}\n'.format(prev_memory[0, :6]))
                    of.write('next_memory = {}\n'.format(next_memory[:6]))
                    of.write('bottom_q_values = {}\n'.format(bottom_q))
                    of.write('top_q_values = {}\n'.format(top_q))
                    of.write('\n')

            if save_paths:
                with open(self.config.output_path + 'path_log.txt',
                          'a') as of:
                    of.write("(s, a) = ({}, {})\n".format(
                        env.cur_state, action))
                    of.write('\n')

            # perform action in env
            new_state, reward, done, info = env.step(action)

            # store in replay memory
            replay_buffer.store_effect(idx, action, reward, done)
            if self.config.use_memory:
                replay_buffer.store_memory(idx, next_memory)
            state = new_state
            count += 1

            # count reward
            total_reward += reward

            if done:
                if save_paths:
                    with open(self.config.output_path + 'path_log.txt',
                              'a') as of:
                        of.write('\n')
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)
        if total_reward <= 0:
            steps.append(np.nan)  # episode failed; exclude from length stats
        else:
            steps.append(count)

    steps = np.array(steps) - bfs_len  # adjust for shortest possible path
    avg_reward = np.mean(rewards)
    avg_length = np.nanmean(steps)
    sigma_length = np.sqrt(np.nanvar(steps) / len(steps))
    percent_completed = np.count_nonzero(~np.isnan(steps)) / float(len(steps))
    sigma_reward = np.sqrt(np.var(rewards) / len(rewards))

    if num_episodes > 1:
        msg = ("Average reward: {:04.2f} +/- {:04.2f}, "
               "Percent completed: {:04.2f}, "
               "Average length: {:04.2f} +/- {:04.2f}, n = {}").format(
                   avg_reward, sigma_reward, percent_completed,
                   avg_length, sigma_length, len(rewards))
        self.logger.info(msg)

    return avg_reward, percent_completed, avg_length
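
# --- Illustrative sketch (not part of the original file) --------------------
# Both methods here rely on a ReplayBuffer extended with per-step memory
# slots (store_memory / encode_recent_memory). That implementation is not
# shown in this section; the class below is a minimal, self-contained
# assumption about the interface, with hypothetical names and shapes.
import numpy as np


class MemoryReplayBufferSketch(object):
    """Hypothetical memory-slot extension of a frame replay buffer."""

    def __init__(self, size, memory_size):
        self.size = size
        self.next_idx = 0  # advanced by store_frame (omitted in this sketch)
        # one recurrent-memory vector per buffered transition
        self.memory = np.zeros((size, memory_size), dtype=np.float32)

    def store_memory(self, idx, memory_vec):
        # persist the memory vector emitted while acting at step idx
        self.memory[idx] = memory_vec

    def encode_recent_memory(self):
        # memory written at the previous step, batched to shape (1, M),
        # consistent with the prev_memory[0, :6] indexing used above
        prev = (self.next_idx - 1) % self.size
        return self.memory[prev][None, :]
# ----------------------------------------------------------------------------
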
def train(self, exp_schedule, lr_schedule):
    """
    Performs training of Q

    Args:
        exp_schedule: Exploration instance s.t.
            exp_schedule.get_action(best_action) returns an action
        lr_schedule: Schedule for the learning rate
    """
    # initialize replay buffer and variables
    if self.config.use_memory:
        replay_buffer = ReplayBuffer(
            self.config.buffer_size,
            self.config.state_history,
            memory_size=self.config.memory_unit_size)
    else:
        replay_buffer = ReplayBuffer(self.config.buffer_size,
                                     self.config.state_history)
    rewards = deque(maxlen=self.config.num_episodes_test)
    max_q_values = deque(maxlen=1000)
    q_values = deque(maxlen=1000)
    self.init_averages()

    t = last_eval = last_record = 0  # time control of number of steps
    scores_eval = []  # list of scores computed at iteration time
    scores_eval += [self.evaluate()[0]]

    prog = Progbar(target=self.config.nsteps_train)
    evaluation_result_list = []
    oos_evaluation_result_list = []

    # interact with environment
    prev_time = time.time()
    while t < self.config.nsteps_train:
        total_reward = 0
        state = self.env.reset()
        while True:
            t += 1
            last_eval += 1
            last_record += 1
            if self.config.render_train:
                self.env.render()

            # replay memory stuff
            idx = replay_buffer.store_frame(state)
            q_input = replay_buffer.encode_recent_observation()

            if self.config.use_memory:
                prev_memory = replay_buffer.encode_recent_memory()
                best_action, q_vals, _, next_memory = self.get_best_action_with_memory(
                    q_input, prev_memory)
                next_memory = np.squeeze(next_memory)
            else:
                best_action, q_vals = self.get_best_action(q_input)

            # choose action according to current Q and exploration
            action = exp_schedule.get_action(best_action)

            # store q values (local name q_vals avoids shadowing the
            # q_values deque defined above)
            max_q_values.append(max(q_vals))
            q_values += list(q_vals)

            # perform action in env
            new_state, reward, done, info = self.env.step(action)

            # store the transition
            replay_buffer.store_effect(idx, action, reward, done)
            if self.config.use_memory:
                replay_buffer.store_memory(idx, next_memory)
            state = new_state

            # perform a training step
            loss_eval, grad_eval = self.train_step(t, replay_buffer,
                                                   lr_schedule.epsilon)

            # logging stuff
            time_log_freq = 1000
            if t % time_log_freq == 0:
                with open(self.config.output_path + 'time_log.txt',
                          'a') as of:
                    of.write('{}\n'.format(time.time() - prev_time))
                    of.write('\n')
                prev_time = time.time()

            if ((t > self.config.learning_start)
                    and (t % self.config.log_freq == 0)
                    and (t % self.config.learning_freq == 0)):
                self.update_averages(rewards, max_q_values, q_values,
                                     scores_eval)
                exp_schedule.update(t)
                lr_schedule.update(t)
                if len(rewards) > 0:
                    prog.update(
                        t + 1,
                        exact=[("Loss", loss_eval),
                               ("Avg_R", self.avg_reward),
                               ("Max_R", np.max(rewards)),
                               ("eps", exp_schedule.epsilon),
                               ("Grads", grad_eval),
                               ("Max_Q", self.max_q),
                               ("lr", lr_schedule.epsilon)])
            elif (t < self.config.learning_start) and (
                    t % self.config.log_freq == 0):
                sys.stdout.write("\rPopulating the memory {}/{}...".format(
                    t, self.config.learning_start))
                sys.stdout.flush()

            # count reward
            total_reward += reward
            if done or t >= self.config.nsteps_train:
                break

        # updates to perform at the end of an episode
        rewards.append(total_reward)

        if (t > self.config.learning_start) and (
                last_eval > self.config.eval_freq):
            # evaluate our policy
            last_eval = 0
            print("")
            score, complete, length = self.evaluate()
            if complete > 0:
                evaluation_result_list += [(score, complete, length)]
            if score > self.config.extended_eval_threshold:
                self.logger.info('Extended in-sample evaluation...')
                self.evaluate(num_episodes=1000)
                for _ in range(10):
                    self.logger.info('Extended out-of-sample evaluation...')
                    oos_result = self.evaluate(
                        EnvMaze(n=self.config.maze_size), num_episodes=100)
                    oos_evaluation_result_list += [oos_result]
            scores_eval += [score]

        if (t > self.config.learning_start) and self.config.record and (
                last_record > self.config.record_freq):
            self.logger.info("Recording...")
            last_record = 0
            self.record()

    # last words
    self.logger.info("- Training done.")
    self.save()
    scores_eval += [self.evaluate()[0]]
    export_plot(scores_eval, "Scores", self.config.plot_output)
    return evaluation_result_list, oos_evaluation_result_list
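
# --- Illustrative sketch (not part of the original file) --------------------
# train() only relies on a small duck-typed interface from its schedules:
# exp_schedule needs get_action(best_action), update(t), and .epsilon;
# lr_schedule needs update(t) and .epsilon. The class below is a minimal
# stand-in satisfying that interface, assuming a linear epsilon-greedy
# schedule; it is not the repo's actual exploration class.
import random


class EpsilonGreedySketch(object):
    """Hypothetical linear epsilon-greedy exploration schedule."""

    def __init__(self, num_actions, eps_begin, eps_end, nsteps):
        self.num_actions = num_actions
        self.eps_begin = eps_begin
        self.eps_end = eps_end
        self.nsteps = nsteps
        self.epsilon = eps_begin

    def update(self, t):
        # interpolate linearly from eps_begin to eps_end over nsteps
        frac = min(float(t) / self.nsteps, 1.0)
        self.epsilon = self.eps_begin + frac * (self.eps_end - self.eps_begin)

    def get_action(self, best_action):
        # with probability epsilon, explore uniformly at random
        if random.random() < self.epsilon:
            return random.randrange(self.num_actions)
        return best_action


# Usage would mirror train()'s expectations, e.g.:
#     exp_schedule = EpsilonGreedySketch(num_actions, 1.0, 0.01, 100000)
#     lr_schedule = ...  # same interface: update(t) and .epsilon
#     in_sample, out_of_sample = model.train(exp_schedule, lr_schedule)
# ----------------------------------------------------------------------------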