Example 1
    def setUp(self):
        self.tc = core.EpisodesTrainContext()
        self.tc.num_episodes_per_iteration = 1
        self.tc.num_iterations = 1
        self.tc.num_episodes_per_eval = 2
        self.tc.max_steps_per_episode = 5
        self.pc = core.PlayContext(self.tc)
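A hedged companion sketch: one way a test in this class might consume the contexts prepared by setUp. The agent and constructor names follow Example 5 below, and the sum_of_rewards attribute follows Examples 4 and 7; the method itself is an assumption, not part of the original suite.

    def test_play_uses_prepared_context(self):
        # agent construction follows Example 5; asserting on sum_of_rewards
        # follows Examples 4/7 (hypothetical test, not from the original suite)
        model_config = core.ModelConfig("CartPole-v0")
        agent = tfagents.TfRandomAgent(model_config=model_config)
        agent.play(play_context=self.pc, callbacks=[])
        # one entry per completed episode
        assert len(self.pc.sum_of_rewards) >= 1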
Example 2
    def play(self,
             callbacks: Union[List[core.AgentCallback], core.AgentCallback,
                              None] = None,
             num_episodes: int = 1,
             max_steps_per_episode: int = 1000,
             play_context: core.PlayContext = None,
             default_plots: bool = None):
        """Plays num_episodes with the current policy.

        Args:
            callbacks: list of callbacks called during each episode play
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode
            play_context: play configuration to be used. If set, overrides all other play context arguments
            default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)

        Returns:
            play_context containing the actions taken and the rewards received during play
        """
        if play_context is None:
            play_context = core.PlayContext()
            play_context.max_steps_per_episode = max_steps_per_episode
            play_context.num_episodes = num_episodes
        self._play(play_context=play_context,
                   callbacks=callbacks,
                   default_plots=default_plots)
        return play_context
Example 3
    def play(self,
             callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
             num_episodes: int = 1,
             max_steps_per_episode: int = 1000,
             play_context: core.PlayContext = None,
             default_plots: bool = None):
        """Plays num_episodes with the current policy.

        Args:
            callbacks: list of callbacks called during each episode play
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode
            play_context: play configuration to be used. If set, overrides all other play context arguments
            default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)

        Returns:
            play_context containing the actions taken and the rewards received during play
        """
        assert self._backend_agent._agent_context._is_policy_trained, "No trained policy available. Call train() first."
        if play_context is None:
            play_context = core.PlayContext()
            play_context.max_steps_per_episode = max_steps_per_episode
            play_context.num_episodes = num_episodes
        callbacks = self._to_callback_list(callbacks=callbacks)
        callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()])
        self._backend_agent.play(play_context=play_context, callbacks=callbacks)
        return play_context
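For orientation, a minimal driver for this play() API, assuming the easyagents-style module paths used in these examples (the import paths are assumptions). A random agent is used because, per Example 5, it can play without a prior train() call:

from easyagents import core
from easyagents.backends import tfagents

agent = tfagents.TfRandomAgent(model_config=core.ModelConfig("CartPole-v0"))
# play() returns the PlayContext populated during the episodes
pc = agent.play(num_episodes=2, max_steps_per_episode=10, default_plots=False)
print(pc.sum_of_rewards)  # per-episode total rewards, as read in Example 4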
Example 4
    def evaluate(self,
                 num_episodes: int = 50,
                 max_steps_per_episode: int = 50):
        """Plays num_episodes with the current policy and computes metrics on rewards.

        Args:
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode

        Returns:
            a Metrics namedtuple with rewards and steps statistics (mean, std, min, max, all)
        """
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
        self.play(play_context=play_context, default_plots=False)
        Metrics = namedtuple('Metrics', 'steps rewards')

        Rewards = namedtuple('Rewards', 'mean std min max all')
        all_rewards = list(play_context.sum_of_rewards.values())
        mean_reward, std_reward = statistics.mean(all_rewards), statistics.stdev(all_rewards)
        min_reward, max_reward = min(all_rewards), max(all_rewards)
        rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards)

        Steps = namedtuple('Steps', 'mean std min max all')
        all_num_steps = [len(r) for r in play_context.rewards.values()]

        mean_steps, std_steps = statistics.mean(all_num_steps), statistics.stdev(all_num_steps)
        min_steps, max_steps = min(all_num_steps), max(all_num_steps)
        steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps)

        metrics = Metrics(rewards=rewards, steps=steps)
        return metrics
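A short usage sketch for evaluate(); the attribute names are exactly the namedtuple fields built above, while the agent is assumed to be one exposing this method (e.g. constructed as in the sketch after Example 3). Note that statistics.stdev raises on fewer than two samples, so num_episodes should stay >= 2:

# `agent` is assumed to expose evaluate(); construction as after Example 3
metrics = agent.evaluate(num_episodes=10, max_steps_per_episode=20)
print(f"rewards: {metrics.rewards.mean:.1f} +/- {metrics.rewards.std:.1f} "
      f"(min={metrics.rewards.min}, max={metrics.rewards.max})")
print(f"steps/episode: {metrics.steps.mean:.1f} over {len(metrics.steps.all)} episodes")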
Example 5
    def test_play(self):
        model_config = core.ModelConfig("CartPole-v0")
        random_agent = tfagents.TfRandomAgent(model_config=model_config)
        pc = core.PlayContext()
        pc.max_steps_per_episode = 10
        pc.num_episodes = 1
        random_agent.play(play_context=pc, callbacks=[])
        assert pc.num_episodes == 1
Example 6
    def _eval_current_policy(self):
        """Evaluates the current policy using play and updates the train_context

            If num_episodes_per_eval or num_iterations_per_eval is 0 no evaluation is performed.
        """
        tc = self._agent_context.train
        assert tc, "train_context not set"

        if tc.num_episodes_per_eval and tc.num_iterations_between_eval:
            callbacks = [_BackendEvalCallback(self._agent_context.train)] + self._callbacks
            self.play(play_context=core.PlayContext(self._agent_context.train),
                      callbacks=callbacks)
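To make the gating explicit: a configuration sketch using the EpisodesTrainContext attributes from Example 1 (treating num_iterations_between_eval as an attribute of the same context class, which is an assumption). Zeroing either value makes the condition above falsy, so no evaluation episodes are played during training:

tc = core.EpisodesTrainContext()
tc.num_episodes_per_eval = 0          # zero episodes per eval -> eval skipped
# or, equivalently:
tc.num_iterations_between_eval = 0    # no iterations between evals -> eval skipped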
Example 7
    def score(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
        """Plays num_episodes with the current policy and computes metrics on rewards.

        Args:
            num_episodes: number of episodes to play
            max_steps_per_episode: max steps per episode

        Returns:
            score metrics - mean, std, min, max, all
        """
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
        self.play(play_context=play_context, default_plots=False)
        all_rewards = list(play_context.sum_of_rewards.values())

        return (statistics.mean(all_rewards), statistics.stdev(all_rewards),
                min(all_rewards), max(all_rewards), all_rewards)
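Unlike evaluate() in Example 4, score() returns a plain 5-tuple, so callers unpack it positionally (the agent is assumed to be constructed as in the sketch after Example 3):

# hedged usage sketch; `agent` as after Example 3
mean, std, min_reward, max_reward, all_rewards = agent.score(
    num_episodes=10, max_steps_per_episode=20)
print(f"mean reward {mean:.1f} (std {std:.1f}, "
      f"min {min_reward}, max {max_reward}, n={len(all_rewards)})")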
Example 8
    def test_save_load(self):
        model_config = core.ModelConfig(_lineworld_name)
        tc = core.PpoTrainContext()
        ppo_agent = tfagents.TfPpoAgent(model_config=model_config)
        ppo_agent.train(
            train_context=tc,
            callbacks=[duration._SingleIteration(), log.Iteration()])
        tempdir = bcore._get_temp_path()
        bcore._mkdir(tempdir)
        ppo_agent.save(tempdir, [])
        ppo_agent = tfagents.TfPpoAgent(model_config=model_config)
        ppo_agent.load(tempdir, [])
        pc = core.PlayContext()
        pc.max_steps_per_episode = 10
        pc.num_episodes = 1
        ppo_agent.play(play_context=pc, callbacks=[])
        bcore._rmpath(tempdir)
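The test above leans on private backend helpers (bcore._get_temp_path, bcore._mkdir, bcore._rmpath) for the scratch directory; user code could get the same save/load round trip with the standard library's tempfile. A sketch reusing the agent calls from the test, with model_config and tfagents as defined there:

import tempfile

with tempfile.TemporaryDirectory() as tempdir:
    ppo_agent.save(tempdir, [])                        # persist the trained policy
    restored = tfagents.TfPpoAgent(model_config=model_config)
    restored.load(tempdir, [])                         # rebuild the agent from disk
    pc = core.PlayContext()
    pc.max_steps_per_episode = 10
    pc.num_episodes = 1
    restored.play(play_context=pc, callbacks=[])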