def setUp(self):
    self.tc = core.EpisodesTrainContext()
    self.tc.num_episodes_per_iteration = 1
    self.tc.num_iterations = 1
    self.tc.num_episodes_per_eval = 2
    self.tc.max_steps_per_episode = 5
    self.pc = core.PlayContext(self.tc)
def play(self,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
         num_episodes: int = 1,
         max_steps_per_episode: int = 1000,
         play_context: core.PlayContext = None,
         default_plots: bool = None):
    """Plays num_episodes with the current policy.

    Args:
        callbacks: list of callbacks called during each episode play
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode
        play_context: play configuration to be used. If set, overrides all other
            play context arguments.
        default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)

    Returns:
        play_context containing the actions taken and the rewards received during play
    """
    if play_context is None:
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
    self._play(play_context=play_context, callbacks=callbacks, default_plots=default_plots)
    return play_context
def play(self,
         callbacks: Union[List[core.AgentCallback], core.AgentCallback, None] = None,
         num_episodes: int = 1,
         max_steps_per_episode: int = 1000,
         play_context: core.PlayContext = None,
         default_plots: bool = None):
    """Plays num_episodes with the current policy.

    Args:
        callbacks: list of callbacks called during each episode play
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode
        play_context: play configuration to be used. If set, overrides all other
            play context arguments.
        default_plots: if set, adds a set of default callbacks (plot.State, plot.Rewards, ...)

    Returns:
        play_context containing the actions taken and the rewards received during play
    """
    assert self._backend_agent._agent_context._is_policy_trained, \
        "No trained policy available. Call train() first."
    if play_context is None:
        play_context = core.PlayContext()
        play_context.max_steps_per_episode = max_steps_per_episode
        play_context.num_episodes = num_episodes
    callbacks = self._to_callback_list(callbacks=callbacks)
    callbacks = self._add_plot_callbacks(callbacks, default_plots, [plot.Steps(), plot.Rewards()])
    self._backend_agent.play(play_context=play_context, callbacks=callbacks)
    return play_context
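# Usage sketch (hypothetical, not from the source): play() requires a trained
# policy, so train() must be called first; the returned PlayContext exposes the
# per-episode reward sums used by evaluate() and score() below.
agent = tfagents.TfPpoAgent(model_config=core.ModelConfig("CartPole-v0"))
agent.train(train_context=core.PpoTrainContext(), callbacks=[])
played = agent.play(num_episodes=3, max_steps_per_episode=100, default_plots=False)
print(played.sum_of_rewards)  # maps each played episode to its total reward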
def evaluate(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
    """Plays num_episodes with the current policy and computes metrics on rewards.

    Args:
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode

    Returns:
        Metrics namedtuple with `rewards` and `steps` statistics, each holding
        mean, std, min, max and all per-episode values.
    """
    play_context = core.PlayContext()
    play_context.max_steps_per_episode = max_steps_per_episode
    play_context.num_episodes = num_episodes
    self.play(play_context=play_context, default_plots=False)

    Metrics = namedtuple('Metrics', 'steps rewards')

    Rewards = namedtuple('Rewards', 'mean std min max all')
    all_rewards = list(play_context.sum_of_rewards.values())
    mean_reward, std_reward, min_reward, max_reward = \
        statistics.mean(all_rewards), statistics.stdev(all_rewards), min(all_rewards), max(all_rewards)
    rewards = Rewards(mean=mean_reward, std=std_reward, min=min_reward, max=max_reward, all=all_rewards)

    Steps = namedtuple('Steps', 'mean std min max all')
    all_num_steps = [len(episode_rewards) for episode_rewards in play_context.rewards.values()]
    mean_steps, std_steps, min_steps, max_steps = \
        statistics.mean(all_num_steps), statistics.stdev(all_num_steps), min(all_num_steps), max(all_num_steps)
    steps = Steps(mean=mean_steps, std=std_steps, min=min_steps, max=max_steps, all=all_num_steps)

    return Metrics(rewards=rewards, steps=steps)
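# Usage sketch (hypothetical, continuing the agent from above): the nested
# namedtuples returned by evaluate() are accessed by field name.
metrics = agent.evaluate(num_episodes=10, max_steps_per_episode=100)
print(metrics.rewards.mean, metrics.rewards.std)  # reward statistics
print(metrics.steps.min, metrics.steps.max)       # episode-length statistics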
def test_play(self):
    model_config = core.ModelConfig("CartPole-v0")
    random_agent = tfagents.TfRandomAgent(model_config=model_config)
    pc = core.PlayContext()
    pc.max_steps_per_episode = 10
    pc.num_episodes = 1
    random_agent.play(play_context=pc, callbacks=[])
    assert pc.num_episodes == 1
def _eval_current_policy(self):
    """Evaluates the current policy using play and updates the train_context.

    If num_episodes_per_eval or num_iterations_between_eval is 0, no evaluation
    is performed.
    """
    tc = self._agent_context.train
    assert tc, "train_context not set"

    if tc.num_episodes_per_eval and tc.num_iterations_between_eval:
        callbacks = [_BackendEvalCallback(self._agent_context.train)] + self._callbacks
        self.play(play_context=core.PlayContext(self._agent_context.train), callbacks=callbacks)
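# Sketch (an assumption based on the guard above, not confirmed elsewhere in
# the source): setting either counter to 0 on the train context makes
# _eval_current_policy() a no-op, disabling evaluation during training.
tc = core.EpisodesTrainContext()
tc.num_episodes_per_eval = 0  # evaluation is skipped entirely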
def score(self, num_episodes: int = 50, max_steps_per_episode: int = 50):
    """Plays num_episodes with the current policy and computes metrics on rewards.

    Args:
        num_episodes: number of episodes to play
        max_steps_per_episode: max steps per episode

    Returns:
        tuple (mean, std, min, max, all) over the sums of rewards per episode
    """
    play_context = core.PlayContext()
    play_context.max_steps_per_episode = max_steps_per_episode
    play_context.num_episodes = num_episodes
    self.play(play_context=play_context, default_plots=False)
    all_rewards = list(play_context.sum_of_rewards.values())
    return (statistics.mean(all_rewards), statistics.stdev(all_rewards),
            min(all_rewards), max(all_rewards), all_rewards)
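# Usage sketch (hypothetical, continuing the agent from above): score() returns
# a plain tuple, in contrast to the namedtuple returned by evaluate().
mean, std, min_reward, max_reward, all_rewards = agent.score(num_episodes=10)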
def test_save_load(self):
    model_config = core.ModelConfig(_lineworld_name)
    tc = core.PpoTrainContext()
    ppo_agent = tfagents.TfPpoAgent(model_config=model_config)
    ppo_agent.train(train_context=tc,
                    callbacks=[duration._SingleIteration(), log.Iteration()])

    tempdir = bcore._get_temp_path()
    bcore._mkdir(tempdir)
    ppo_agent.save(tempdir, [])

    ppo_agent = tfagents.TfPpoAgent(model_config=model_config)
    ppo_agent.load(tempdir, [])
    pc = core.PlayContext()
    pc.max_steps_per_episode = 10
    pc.num_episodes = 1
    ppo_agent.play(play_context=pc, callbacks=[])
    bcore._rmpath(tempdir)