Example #1
    def training_loop(self, is_testing, reward_from_agent=True):
        start_time = time.time()
        #current_episodes = 0

        if is_testing:
            rp.report("\n\n> Playing")
            max_episodes = self.max_test_episodes
            max_steps = self.max_steps_testing
            current_episodes = self.curr_playing_episodes
        else:
            rp.report("> Training")
            max_episodes = self.max_training_episodes
            max_steps = self.max_steps_training
            current_episodes = self.curr_training_episodes

        if self.logger.ep_count == 0 or is_testing:
            self.logger = Logger(
                max_episodes,
                self.agent.__class__.__name__,
                self.agent.model.__class__.__name__,
                self.agent.model,
                self.agent.action_wrapper.__class__.__name__,
                self.agent.action_wrapper.get_action_space_dim(),
                self.agent.action_wrapper.get_named_actions(),
                self.agent.state_builder.__class__.__name__,
                self.agent.reward_builder.__class__.__name__,
                self.env.__class__.__name__,
                log_actions=self.log_actions,
                episode_batch_avg_calculation=self.episode_batch_avg_calculation,
                rolling_avg_window_size=self.rolling_avg_window_size)

        while current_episodes < max_episodes:
            current_episodes += 1
            self.env.start()

            if is_testing:
                self.curr_playing_episodes = current_episodes
            else:
                self.curr_training_episodes = current_episodes

            # Reset the environment
            obs = self.env.reset()
            step_reward = 0
            done = False
            # Pass the episode number to the agent's reset so it can be forwarded to the model's reset,
            # allowing the model to track the episode count and decide whether to decay the
            # learning rate, depending on the currently selected strategy.
            self.agent.reset(current_episodes)

            ep_reward = 0
            victory = False

            ep_actions = np.zeros(
                self.agent.action_wrapper.get_action_space_dim())
            self.logger.record_episode_start()

            for step in range(max_steps):
                # Choosing an action and passing it to our env.step() in order to act on our environment
                action = self.agent.step(obs, done, is_testing)
                # Take the action (a) and observe the outcome state (s') and reward (r)
                obs, default_reward, done = self.env.step(action)

                # Logic to test whether this is the last step of this episode
                is_last_step = step == max_steps - 1
                done = done or is_last_step

                # Checking whether or not to use the reward from the reward builder so we can pass that to the agent
                if reward_from_agent:
                    step_reward = self.agent.get_reward(
                        obs, default_reward, done)
                else:
                    step_reward = default_reward

                # Making the agent learn
                if not is_testing:
                    self.agent.learn(obs, step_reward, done)

                # Adding our step reward to the total count of the episode's reward
                ep_reward += step_reward
                ep_actions[self.agent.previous_action] += 1

                if done:
                    victory = default_reward == 1
                    agent_info = {
                        "Learning rate": self.agent.model.learning_rate,
                        "Gamma": self.agent.model.gamma,
                        "Epsilon": self.agent.model.epsilon_greedy,
                    }
                    self.logger.record_episode(ep_reward, victory, step + 1,
                                               agent_info, ep_actions)
                    break

            self.logger.log_ep_stats()

            # check if user wants to pause training and test agent
            # if self.do_reward_test and current_episodes % self.episode_batch_avg_calculation == 0 and current_episodes > 1:
            if (not is_testing and self.do_reward_test
                    and current_episodes % self.episode_batch_avg_calculation == 0):
                self.test_agent()

            # If this is not a test (evaluation), saving is enabled, and the current episode
            # is a multiple of save_every, save the model and generate graphs
            if (not is_testing and self.enable_save and current_episodes > 0
                    and current_episodes % self.save_every == 0):
                self.save(self.full_save_path)

                # If we ran tests during training, save all of their loggers for further detailed analysis
                if self.do_reward_test and len(self.inside_training_test_loggers) > 0:
                    for idx in range(len(self.logger.ep_avg_batch_rewards_episodes)):
                        logger_dict = self.inside_training_test_loggers[idx]
                        if not logger_dict["saved"]:
                            episode = self.logger.ep_avg_batch_rewards_episodes[idx]
                            backup_full_save_path = self.full_save_path
                            self.full_save_path = os.path.join(
                                self.full_save_path,
                                "inside_training_play_files",
                                "test_at_training_episode_{}".format(episode))
                            self.make_persistance_dirs(self.log_actions)
                            logger_dict["logger"].save(self.full_save_path)
                            logger_dict["saved"] = True
                            self.full_save_path = backup_full_save_path

        end_time = time.time()
        if is_testing:
            rp.report("\n> Test duration: {} seconds".format(end_time - start_time))
        else:
            rp.report("\n> Training duration: {} seconds".format(end_time - start_time))
        self.logger.log_train_stats()

        # Saving the model at the end of the training loop
        if self.enable_save:
            if is_testing:
                self.logger.save(self.full_save_play_path)
                rp.save(self.full_save_play_path)
            else:
                self.save(self.full_save_path)

                # If we ran tests during training, save all of their loggers for further detailed analysis
                if self.do_reward_test and len(self.inside_training_test_loggers) > 0:
                    for idx in range(len(self.logger.ep_avg_batch_rewards_episodes)):
                        logger_dict = self.inside_training_test_loggers[idx]
                        if not logger_dict["saved"]:
                            episode = self.logger.ep_avg_batch_rewards_episodes[idx]
                            backup_full_save_path = self.full_save_path
                            self.full_save_path = os.path.join(
                                self.full_save_path,
                                "inside_training_play_files",
                                "test_at_training_episode_{}".format(episode))
                            self.make_persistance_dirs(self.log_actions)
                            logger_dict["logger"].save(self.full_save_path)
                            logger_dict["saved"] = True
                            self.full_save_path = backup_full_save_path
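
For orientation, the listing above boils down to a standard reset/step/learn episode loop around self.env and self.agent, plus logging and periodic saving. The following is a minimal, self-contained sketch of that same control flow; ToyEnv and ToyAgent are stand-ins invented here for illustration and are not part of the project.

import time

import numpy as np


class ToyEnv:
    # Stand-in environment: the episode ends once the counter reaches 5.
    def start(self):
        pass

    def reset(self):
        self.counter = 0
        return self.counter

    def step(self, action):
        self.counter += 1
        done = self.counter >= 5
        reward = 1 if done else 0
        return self.counter, reward, done


class ToyAgent:
    # Stand-in agent with a single action, so previous_action is always 0.
    def __init__(self, action_space_dim=1):
        self.action_space_dim = action_space_dim
        self.previous_action = 0

    def reset(self, episode):
        pass

    def step(self, obs, done, is_testing):
        self.previous_action = 0
        return self.previous_action

    def learn(self, obs, reward, done):
        pass


env, agent = ToyEnv(), ToyAgent()
max_episodes, max_steps = 3, 10
start_time = time.time()

for episode in range(1, max_episodes + 1):
    env.start()
    obs, done, ep_reward = env.reset(), False, 0
    agent.reset(episode)
    ep_actions = np.zeros(agent.action_space_dim)

    for step in range(max_steps):
        action = agent.step(obs, done, is_testing=False)
        obs, reward, done = env.step(action)
        done = done or step == max_steps - 1  # force termination on the last step
        agent.learn(obs, reward, done)        # skipped when only evaluating
        ep_reward += reward
        ep_actions[agent.previous_action] += 1
        if done:
            break

    print("episode", episode, "reward", ep_reward, "steps", step + 1)

print("duration: {} seconds".format(time.time() - start_time))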
Example #2
    def save_extra(self, save_path):
        """Saves each of the trainer's components to save_path."""
        self.env.save(save_path)
        self.agent.save(save_path)
        self.logger.save(save_path)
        self.versioner.save(save_path)
        rp.save(save_path)
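
save_extra simply fans a single save_path out to the save method of every component. The sketch below illustrates the same composite-save pattern with hypothetical stand-in classes; none of them belong to the project.

import os


class StubComponent:
    # Stand-in for env / agent / logger / versioner: each writes one file.
    def __init__(self, name):
        self.name = name

    def save(self, save_path):
        print("would write", os.path.join(save_path, self.name + ".pkl"))


class CompositeSaver:
    def __init__(self, components):
        self.components = components

    def save_extra(self, save_path):
        # Delegate saving to every component, mirroring the method above.
        for component in self.components:
            component.save(save_path)


saver = CompositeSaver([StubComponent(name) for name in ("env", "agent", "logger", "versioner")])
saver.save_extra(os.path.join("saves", "run_01"))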
Example #3
    def old_play(self, test_params=None, reward_from_agent=True):
        rp.report("\n\n> Playing")

        self.logger = Logger(
            self.max_test_episodes,
            self.agent.__class__.__name__,
            self.agent.model.__class__.__name__,
            self.agent.model,
            self.agent.action_wrapper.__class__.__name__,
            self.agent.action_wrapper.get_action_space_dim(),
            self.agent.action_wrapper.get_named_actions(),
            self.agent.state_builder.__class__.__name__,
            self.agent.reward_builder.__class__.__name__,
            self.env.__class__.__name__,
            log_actions=self.log_actions,
            episode_batch_avg_calculation=self.episode_batch_avg_calculation,
            rolling_avg_window_size=self.rolling_avg_window_size)

        while self.curr_playing_episodes < self.max_test_episodes:
            self.curr_playing_episodes += 1
            self.env.start()

            # Reset the environment
            obs = self.env.reset()
            step_reward = 0
            done = False
            # Pass the episode number to the agent's reset so it can be forwarded to the model's reset,
            # allowing the model to track the episode count and decide whether to decay the
            # learning rate, depending on the currently selected strategy.
            self.agent.reset(self.curr_playing_episodes)

            ep_reward = 0
            victory = False

            ep_actions = np.zeros(
                self.agent.action_wrapper.get_action_space_dim())
            self.logger.record_episode_start()

            for step in range(self.max_steps_testing):
                action = self.agent.step(obs, done, is_testing=True)
                # Take the action (a) and observe the outcome state (s') and reward (r)
                obs, default_reward, done = self.env.step(action)

                is_last_step = step == self.max_steps_testing - 1
                done = done or is_last_step

                if reward_from_agent:
                    step_reward = self.agent.get_reward(
                        obs, default_reward, done)
                else:
                    step_reward = default_reward

                ep_reward += step_reward

                ep_actions[self.agent.previous_action] += 1

                # If done: finish episode
                if done:
                    victory = default_reward == 1
                    agent_info = {
                        "Learning rate": self.agent.model.learning_rate,
                        "Gamma": self.agent.model.gamma,
                        "Epsilon": self.agent.model.epsilon_greedy,
                    }
                    self.logger.record_episode(ep_reward, victory, step + 1,
                                               agent_info, ep_actions)
                    break

            self.logger.log_ep_stats()

        if test_params is not None:
            test_params.logger.record_play_test(test_params.current_ep_count,
                                                self.logger.ep_rewards,
                                                self.logger.victories,
                                                self.max_test_episodes)
        else:
            # Only logs train stats if this is not a test, to avoid cluttering the interface with info
            self.logger.log_train_stats()

        # We need to save playing status as well
        if self.enable_save:
            self.logger.save(self.full_save_play_path)
            rp.save(self.full_save_play_path)
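
Compared with Example #1 run with is_testing=True, the main addition here is the optional test_params hook: when old_play is used as an evaluation embedded in a training run, the per-episode rewards and victory count are handed back to the training logger via record_play_test. The snippet below is a minimal, hypothetical sketch of that hand-off; StubTrainingLogger and TestParams as written here are illustrative stand-ins, not the project's actual classes.

class StubTrainingLogger:
    # Stand-in for the training run's logger.
    def record_play_test(self, current_ep_count, ep_rewards, victories, max_episodes):
        avg_reward = sum(ep_rewards) / len(ep_rewards)
        print("play test at training episode", current_ep_count,
              "- avg reward:", avg_reward,
              "- victories:", victories, "of", max_episodes)


class TestParams:
    # Stand-in for the object old_play() receives as test_params.
    def __init__(self, logger, current_ep_count):
        self.logger = logger
        self.current_ep_count = current_ep_count


# After the play episodes finish, old_play() would report back roughly like this:
test_params = TestParams(StubTrainingLogger(), current_ep_count=40)
test_params.logger.record_play_test(test_params.current_ep_count,
                                    ep_rewards=[1.0, 0.0, 1.0],
                                    victories=2,
                                    max_episodes=3)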