def training_loop(self, is_testing, reward_from_agent=True):
    start_time = time.time()

    if is_testing:
        rp.report("\n\n> Playing")
        max_episodes = self.max_test_episodes
        max_steps = self.max_steps_testing
        current_episodes = self.curr_playing_episodes
    else:
        rp.report("> Training")
        max_episodes = self.max_training_episodes
        max_steps = self.max_steps_training
        current_episodes = self.curr_training_episodes

    if self.logger.ep_count == 0 or is_testing:
        self.logger = Logger(
            max_episodes,
            self.agent.__class__.__name__,
            self.agent.model.__class__.__name__,
            self.agent.model,
            self.agent.action_wrapper.__class__.__name__,
            self.agent.action_wrapper.get_action_space_dim(),
            self.agent.action_wrapper.get_named_actions(),
            self.agent.state_builder.__class__.__name__,
            self.agent.reward_builder.__class__.__name__,
            self.env.__class__.__name__,
            log_actions=self.log_actions,
            episode_batch_avg_calculation=self.episode_batch_avg_calculation,
            rolling_avg_window_size=self.rolling_avg_window_size)

    while current_episodes < max_episodes:
        current_episodes += 1
        self.env.start()

        if is_testing:
            self.curr_playing_episodes = current_episodes
        else:
            self.curr_training_episodes = current_episodes

        # Reset the environment
        obs = self.env.reset()
        step_reward = 0
        done = False

        # Pass the episode number to the agent reset so it can be forwarded to the model reset,
        # allowing the model to track the episode number and decide whether to decay the
        # learning rate, depending on the currently selected strategy.
        self.agent.reset(current_episodes)

        ep_reward = 0
        victory = False

        ep_actions = np.zeros(self.agent.action_wrapper.get_action_space_dim())

        self.logger.record_episode_start()

        for step in range(max_steps):
            # Choose an action and pass it to env.step() in order to act on the environment
            action = self.agent.step(obs, done, is_testing)
            # Take the action (a) and observe the outcome state (s') and reward (r)
            obs, default_reward, done = self.env.step(action)

            # Check whether this is the last step of the episode
            is_last_step = step == max_steps - 1
            done = done or is_last_step

            # Decide whether to pass the agent the reward from its reward builder
            # or the environment's default reward
            if reward_from_agent:
                step_reward = self.agent.get_reward(obs, default_reward, done)
            else:
                step_reward = default_reward

            # Make the agent learn
            if not is_testing:
                self.agent.learn(obs, step_reward, done)

            # Add the step reward to the episode's total reward
            ep_reward += step_reward

            ep_actions[self.agent.previous_action] += 1

            if done:
                victory = default_reward == 1
                agent_info = {
                    "Learning rate": self.agent.model.learning_rate,
                    "Gamma": self.agent.model.gamma,
                    "Epsilon": self.agent.model.epsilon_greedy,
                }
                self.logger.record_episode(ep_reward, victory, step + 1,
                                           agent_info, ep_actions)
                break

        self.logger.log_ep_stats()

        # Check whether the user wants to pause training and test the agent
        if (not is_testing) and self.do_reward_test \
                and current_episodes % self.episode_batch_avg_calculation == 0:
            self.test_agent()

        # If this is not a test (evaluation), saving is enabled and the current episode is a
        # multiple of save_every, save the model and generate graphs
        if (not is_testing) and self.enable_save and current_episodes > 0 \
                and current_episodes % self.save_every == 0:
            self.save(self.full_save_path)

            # If tests were run during training, save all their loggers for further detailed analysis
            if self.do_reward_test and len(self.inside_training_test_loggers) > 0:
                for idx in range(len(self.logger.ep_avg_batch_rewards_episodes)):
                    logger_dict = self.inside_training_test_loggers[idx]
                    if not logger_dict["saved"]:
                        episode = self.logger.ep_avg_batch_rewards_episodes[idx]
                        backup_full_save_path = self.full_save_path
                        self.full_save_path = (
                            self.full_save_path + os.path.sep
                            + "inside_training_play_files" + os.path.sep
                            + "test_at_training_episode_{}".format(episode))
                        self.make_persistance_dirs(self.log_actions)
                        logger_dict["logger"].save(self.full_save_path)
                        logger_dict["saved"] = True
                        self.full_save_path = backup_full_save_path

    end_time = time.time()
    if is_testing:
        rp.report("\n> Test duration: {} seconds".format(end_time - start_time))
    else:
        rp.report("\n> Training duration: {} seconds".format(end_time - start_time))
    self.logger.log_train_stats()

    # Save the model at the end of the training loop
    if self.enable_save:
        if is_testing:
            self.logger.save(self.full_save_play_path)
            rp.save(self.full_save_play_path)
        else:
            self.save(self.full_save_path)

            # If tests were run during training, save all their loggers for further detailed analysis
            if self.do_reward_test and len(self.inside_training_test_loggers) > 0:
                for idx in range(len(self.logger.ep_avg_batch_rewards_episodes)):
                    logger_dict = self.inside_training_test_loggers[idx]
                    if not logger_dict["saved"]:
                        episode = self.logger.ep_avg_batch_rewards_episodes[idx]
                        backup_full_save_path = self.full_save_path
                        self.full_save_path = (
                            self.full_save_path + os.path.sep
                            + "inside_training_play_files" + os.path.sep
                            + "test_at_training_episode_{}".format(episode))
                        self.make_persistance_dirs(self.log_actions)
                        logger_dict["logger"].save(self.full_save_path)
                        logger_dict["saved"] = True
                        self.full_save_path = backup_full_save_path
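
# Usage sketch (illustrative assumption, not part of the original file): a fully
# configured trainer instance would typically drive a run by calling training_loop()
# twice, once for training and once for evaluation. Only the training_loop()
# signature above comes from this class; the constructor call is hypothetical.
#
#   trainer = Trainer(env=env, agent=agent, ...)   # hypothetical construction
#   trainer.training_loop(is_testing=False)        # train for max_training_episodes
#   trainer.training_loop(is_testing=True)         # evaluate for max_test_episodes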
def save_extra(self, save_path):
    self.env.save(save_path)
    self.agent.save(save_path)
    self.logger.save(save_path)
    self.versioner.save(save_path)
    rp.save(save_path)
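
# Note (assumption): save_extra() appears to be the persistence hook reached through
# self.save(save_path) in training_loop(), delegating to each component so that the
# environment, agent, logger, versioner and reporter all write their state under the
# same directory, e.g.:
#
#   trainer.save_extra(trainer.full_save_path)   # hypothetical call; each component saves its own files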
def old_play(self, test_params=None, reward_from_agent=True):
    rp.report("\n\n> Playing")

    self.logger = Logger(
        self.max_test_episodes,
        self.agent.__class__.__name__,
        self.agent.model.__class__.__name__,
        self.agent.model,
        self.agent.action_wrapper.__class__.__name__,
        self.agent.action_wrapper.get_action_space_dim(),
        self.agent.action_wrapper.get_named_actions(),
        self.agent.state_builder.__class__.__name__,
        self.agent.reward_builder.__class__.__name__,
        self.env.__class__.__name__,
        log_actions=self.log_actions,
        episode_batch_avg_calculation=self.episode_batch_avg_calculation,
        rolling_avg_window_size=self.rolling_avg_window_size)

    while self.curr_playing_episodes < self.max_test_episodes:
        self.curr_playing_episodes += 1
        self.env.start()

        # Reset the environment
        obs = self.env.reset()
        step_reward = 0
        done = False

        # Pass the episode number to the agent reset so it can be forwarded to the model reset,
        # allowing the model to track the episode number and decide whether to decay the
        # learning rate, depending on the currently selected strategy.
        self.agent.reset(self.curr_playing_episodes)

        ep_reward = 0
        victory = False

        ep_actions = np.zeros(self.agent.action_wrapper.get_action_space_dim())

        self.logger.record_episode_start()

        for step in range(self.max_steps_testing):
            action = self.agent.step(obs, done, is_testing=True)
            # Take the action (a) and observe the outcome state (s') and reward (r)
            obs, default_reward, done = self.env.step(action)

            is_last_step = step == self.max_steps_testing - 1
            done = done or is_last_step

            if reward_from_agent:
                step_reward = self.agent.get_reward(obs, default_reward, done)
            else:
                step_reward = default_reward

            ep_reward += step_reward

            ep_actions[self.agent.previous_action] += 1

            # If done, finish the episode
            if done:
                victory = default_reward == 1
                agent_info = {
                    "Learning rate": self.agent.model.learning_rate,
                    "Gamma": self.agent.model.gamma,
                    "Epsilon": self.agent.model.epsilon_greedy,
                }
                self.logger.record_episode(ep_reward, victory, step + 1,
                                           agent_info, ep_actions)
                break

        self.logger.log_ep_stats()

    if test_params is not None:
        test_params.logger.record_play_test(test_params.current_ep_count,
                                            self.logger.ep_rewards,
                                            self.logger.victories,
                                            self.max_test_episodes)
    else:
        # Only log train stats if this is not a test, to avoid cluttering the interface with info
        self.logger.log_train_stats()

    # Playing status needs to be saved as well
    if self.enable_save:
        self.logger.save(self.full_save_play_path)
        rp.save(self.full_save_play_path)