def train(self,
          distributed_rollouts: ESDistributedRollouts,
          n_epochs: Optional[int] = None,
          model_selection: Optional[ModelSelectionBase] = None) -> None:
    """Run the ES training loop.

    :param distributed_rollouts: The distribution interface for experience collection.
    :param n_epochs: Number of epochs to train.
    :param model_selection: Optional model selection class, receives model evaluation results.
    """
    n_epochs = self.algorithm_config.n_epochs if n_epochs is None else n_epochs
    self.model_selection = model_selection

    for epoch in itertools.count():
        # check if we reached the max number of epochs
        if n_epochs and epoch == n_epochs:
            break

        print('********** Iteration {} **********'.format(epoch))
        step_start_time = time.time()

        # do the actual update step (disable autograd, as we calculate the gradient from the rollout returns)
        with torch.no_grad():
            self._update(distributed_rollouts)

        step_end_time = time.time()

        # log the step duration
        self.es_events.real_time(step_end_time - step_start_time)

        # update the epoch count
        increment_log_step()
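# The epoch loop above shows the core contract around increment_log_step():
# statistics accumulate in epoch-level aggregators while the epoch runs, and a
# single increment_log_step() call at the epoch boundary flushes them to all
# registered writers. A minimal sketch of that contract (a hedged illustration;
# the import paths are assumptions based on the Maze code base, and the loop
# body is a placeholder):
from maze.core.log_stats.log_stats import register_log_stats_writer, increment_log_step
from maze.core.log_stats.log_stats_writer_console import LogStatsWriterConsole

register_log_stats_writer(LogStatsWriterConsole())  # epoch stats go to stdout

for epoch in range(3):
    ...  # collect rollouts / run updates; stats accumulate in the aggregators
    increment_log_step()  # epoch boundary: write out all epoch-level stats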
def test_rollout_evaluator():
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env(), max_episode_steps=2)] * 2)
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())
    model_selection = _MockModelSelection()

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=3, model_selection=model_selection)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    assert model_selection.update_count == 2
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count"
    ) >= 2 * 3
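# _MockModelSelection is referenced but not defined in the excerpt above. A
# minimal stand-in consistent with how RolloutEvaluator uses it (a sketch; the
# import path and the reward-only update() signature are assumptions):
from maze.train.trainers.common.model_selection.model_selection_base import ModelSelectionBase


class _MockModelSelection(ModelSelectionBase):
    """Counts how often the evaluator reports an evaluation reward."""

    def __init__(self):
        self.update_count = 0

    def update(self, reward: float) -> None:
        """Invoked by RolloutEvaluator once per evaluate() call."""
        self.update_count += 1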
def on_train_result(self, trainer: Trainer, result: dict, **kwargs) -> None:
    """Aggregates stats of all rollouts in one local aggregator and then writes them out.
    Called at the end of Trainable.train().

    :param trainer: Current trainer instance.
    :param result: Dict of results returned from trainer.train() call.
           You can mutate this object to add additional metrics.
    :param kwargs: Forward compatibility placeholder.
    """
    # Initialize the logging for this process if not done yet
    if self.epoch_stats is None:
        print("Initializing logging of train results")
        self.init_logging(trainer.config)

    # The main local aggregator should be empty
    # - No stats should be collected here until we manually add them
    # - Stats from the last call should be cleared out already (written out to the logs)
    assert self.epoch_stats.input == {}, "input should be empty at the beginning"

    # Get the epoch stats from the individual rollouts
    epoch_aggregators = trainer.workers.foreach_worker(
        lambda worker: worker.foreach_env(lambda env: env.get_stats(LogStatsLevel.EPOCH)))

    # Collect all episode stats from the epoch aggregators of individual rollout envs in the main local aggregator
    for worker_epoch_aggregator in epoch_aggregators:
        for env_epoch_aggregator in worker_epoch_aggregator:
            # Pass stats from the individual env runs into the main epoch aggregator
            for stats_key, stats_value in env_epoch_aggregator.input.items():
                self.epoch_stats.input[stats_key].extend(stats_value)

    # clear logs at distributed workers
    def reset_episode_stats(env) -> None:
        """Empty inputs of the individual aggregators and make sure they don't have any consumers"""
        epoch_aggregator = env.get_stats(LogStatsLevel.EPOCH)
        epoch_aggregator.input = defaultdict(list)
        epoch_aggregator.consumers = []

    trainer.workers.foreach_worker(lambda worker: worker.foreach_env(reset_episode_stats))

    # Increment log step to trigger epoch logging
    increment_log_step()
def test_step_increment_in_single_step_core_env():
    """In single sub-step envs, events should be cleared out and env time incremented automatically."""
    env = build_dummy_maze_env()
    env = LogStatsWrapper.wrap(env)
    env.reset()
    assert env.get_env_time() == 0

    # 10 steps
    for _ in range(10):
        env.step(env.action_space.sample())
    assert env.get_env_time() == 10

    env.reset()
    increment_log_step()
    assert env.get_stats_value(BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="total_step_count") == 10
def test_step_increment_in_structured_core_environments():
    """Structured core envs manage the step incrementing themselves and the Maze env should not interfere with that."""
    env = build_dummy_maze_env_with_structured_core_env()
    env = LogStatsWrapper.wrap(env)
    env.reset()
    assert env.get_env_time() == 0

    # Do 10 agent steps => 5 structured steps (as we have two agents)
    for _ in range(10):
        env.step(env.action_space.sample())
    assert env.get_env_time() == 5

    env.reset()
    increment_log_step()
    assert env.get_stats_value(BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="total_step_count") == 5
def test_does_not_carry_over_stats_from_unfinished_episodes():
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())

    # Wrap envs in a time-limit wrapper
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env())] * 2)

    # Make one env slower than the other
    env.envs[0].set_max_episode_steps(2)
    env.envs[1].set_max_episode_steps(10)

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=1, model_selection=None)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    # We should get just one episode counted in stats
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="episode_count"
    ) == 1
def train(self,
          evaluator: Evaluator,
          n_epochs: Optional[int] = None,
          eval_every_k_iterations: Optional[int] = None) -> None:
    """Run training.

    :param evaluator: Evaluator to use for evaluation rollouts.
    :param n_epochs: How many epochs to train for.
    :param eval_every_k_iterations: Number of iterations after which to run evaluation (in addition to
           evaluations at the end of each epoch, which are run automatically). If set to None, evaluations
           will run on epoch end only.
    """
    if n_epochs is None:
        n_epochs = self.algorithm_config.n_epochs
    if eval_every_k_iterations is None:
        eval_every_k_iterations = self.algorithm_config.eval_every_k_iterations

    for epoch in range(n_epochs):
        print(f"\n********** Epoch {epoch + 1} started **********")
        evaluator.evaluate(self.policy)
        increment_log_step()

        for iteration, data in enumerate(self.data_loader, 0):
            observations, actions, actor_ids = data
            self._run_iteration(observations=observations, actions=actions, actor_ids=actor_ids)

            # Evaluate after each k iterations if set
            if eval_every_k_iterations is not None and \
                    iteration % eval_every_k_iterations == (eval_every_k_iterations - 1):
                print(f"\n********** Epoch {epoch + 1}: Iteration {iteration + 1} **********")
                evaluator.evaluate(self.policy)
                increment_log_step()

    print("\n********** Final evaluation **********")
    evaluator.evaluate(self.policy)
    increment_log_step()
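# A hedged usage sketch for the train() method above: the evaluator only needs
# an evaluate(policy) method, and RolloutEvaluator (see the tests above) fits
# that interface. trainer, eval_env, and the numeric values are placeholders:
evaluator = RolloutEvaluator(eval_env=eval_env, n_episodes=8, model_selection=None)
trainer.train(evaluator=evaluator, n_epochs=5, eval_every_k_iterations=100)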
def test_observation_statistics_logging():
    """Observation normalization logging test."""

    # normalization config
    normalization_config = {
        "default_strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {"clip_range": (None, None), "axis": 0},
        "default_statistics": None,
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": {
            "observation": {
                "strategy": "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
                "strategy_config": {"clip_range": (0, 1)},
                "statistics": {"mean": [0, 0, 0, 0], "std": [1, 1, 1, 1]}
            }
        }
    }

    writer = LogStatsWriterTensorboard(log_dir='test_log', tensorboard_render_figure=True)
    register_log_stats_writer(writer)
    # attach a console writer as well for immediate console feedback
    register_log_stats_writer(LogStatsWriterConsole())

    # init environment
    env = GymMazeEnv("CartPole-v0")

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])
    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    n_episodes = 10
    n_steps_per_episode = 100
    for episode in range(n_episodes):
        _ = env.reset()
        for step in range(n_steps_per_episode):
            # take random action
            action = env.action_space.sample()

            # take step in env and trigger log stats writing
            _, _, done, _ = env.step(action)
            if done:
                break

        increment_log_step()
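# Once increment_log_step() has flushed an epoch, the wrapped env can also be
# queried programmatically, mirroring the assertions in the tests above
# (a sketch; whether a "mean" reward entry exists depends on the collected events):
mean_reward = env.get_stats_value(BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="mean")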
def _train_async(self, n_epochs) -> None:
    """Train the policy using soft actor-critic (SAC).

    :param n_epochs: Number of epochs to train.
    """
    # run training epochs
    if n_epochs <= 0:
        n_epochs = sys.maxsize
    epoch_length = self.algorithm_config.epoch_length
    patience = self.algorithm_config.patience

    # perform a hard update on the target critic
    self.learner_model.critic.update_target_weights(1.0)

    # run training epochs
    for epoch in range(n_epochs):
        start = time.time()
        print("Update epoch - {}".format(epoch))

        # compute evaluation reward
        reward = -np.inf
        if self.evaluator:
            self.evaluate()

        # take training reward and notify model selection
        else:
            try:
                reward = self.distributed_workers.get_stats_value(
                    BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="mean")
            except KeyError:
                # no reward stats available yet: fall back to the previous epoch's reward
                if epoch > 0:
                    reward = prev_reward
            prev_reward = reward

            # best model selection
            self.model_selection.update(reward)

        # evaluate policy
        time_evaluation = time.time() - start

        # early stopping
        if patience and self.model_selection.last_improvement > patience:
            BColors.print_colored(
                "-> no improvement since {} epochs: EARLY STOPPING!".format(patience),
                color=BColors.WARNING)
            increment_log_step()
            break

        time_deq_actors = 0
        time_before_update = time.time()
        for epoch_step_idx in range(epoch_length):
            q_size_before, q_size_after, time_deq_actors = self.distributed_workers.collect_rollouts()

            # record the queue sizes
            self.events.estimated_queue_sizes(after=q_size_after, before=q_size_before)

            # policy update
            for batch_updates in range(self.algorithm_config.num_batches_per_iter):
                self._update()

                total_num_batch_updates = \
                    (batch_updates
                     + epoch_step_idx * self.algorithm_config.num_batches_per_iter
                     + (epoch_length * self.algorithm_config.num_batches_per_iter) * epoch)
                if total_num_batch_updates % self.algorithm_config.target_update_interval == 0:
                    self.learner_model.critic.update_target_weights(self.algorithm_config.tau)

            self.distributed_workers.broadcast_updated_policy(self.learner_model.policy.state_dict())
        time_updating = time.time() - time_before_update

        total_time = time.time() - start
        self.events.time_dequeuing_actors(time=time_deq_actors, percent=time_deq_actors / total_time)

        # buffer events
        self.events.buffer_size(len(self.distributed_workers.replay_buffer))
        self.events.buffer_avg_pick_per_transition(
            value=self.distributed_workers.replay_buffer.cum_moving_avg_num_picks)

        # increase step counter (which in turn triggers the log statistics writing)
        increment_log_step()

        print("Time required for epoch: {:.2f}s".format(total_time))
        print('  - total ({} steps) updating: {:.2f}s ({:.2f}%), mean time/step: {:.2f}s'.format(
            epoch_length * self.algorithm_config.num_batches_per_iter,
            time_updating,
            100.0 * time_updating / total_time,
            time_updating / (epoch_length * self.algorithm_config.num_batches_per_iter)))
        print('  - total time evaluating the model: {:.2f}s ({:.2f}%)'.format(
            time_evaluation, 100.0 * time_evaluation / total_time))
def train(self, n_epochs: Optional[int] = None) -> None:
    """Main train method of the actor critic trainer. The runner's main train method wraps this
    method so that algorithm-specific operations (e.g., multiprocessing setup) can be performed
    around it.

    :param n_epochs: Number of epochs to train.
    """
    n_epochs = self.algorithm_config.n_epochs if n_epochs is None else n_epochs

    # init minimum best model selection for early stopping
    if self.model_selection is None:
        self.model_selection = BestModelSelection(dump_file=None, model=None)

    # preserve the original training coefficient settings
    value_loss_coef = self.algorithm_config.value_loss_coef
    policy_loss_coef = self.algorithm_config.policy_loss_coef
    entropy_coef = self.algorithm_config.entropy_coef

    # run training epochs
    if n_epochs <= 0:
        n_epochs = sys.maxsize

    for epoch in range(n_epochs):
        start = time.time()
        print("Update epoch - {}".format(epoch))

        # during critic burn-in, set the coefficients such that only the critic is updated
        if epoch < self.algorithm_config.critic_burn_in_epochs:
            self.algorithm_config.value_loss_coef = 1.0
            self.algorithm_config.policy_loss_coef = 0.0
            self.algorithm_config.entropy_coef = 0.0
        else:
            self.algorithm_config.value_loss_coef = value_loss_coef
            self.algorithm_config.policy_loss_coef = policy_loss_coef
            self.algorithm_config.entropy_coef = entropy_coef

        # compute evaluation reward
        reward = -np.inf
        if self.evaluator:
            self.evaluate()

        # take training reward and notify best model selection manually
        else:
            try:
                reward = self.rollout_generator.get_stats_value(
                    BaseEnvEvents.reward, LogStatsLevel.EPOCH, name="mean")
            except KeyError:
                # no reward stats available yet: fall back to the previous epoch's reward
                if epoch > 0:
                    reward = prev_reward
            prev_reward = reward
            self.model_selection.update(reward)

        # early stopping
        if self.algorithm_config.patience and \
                self.model_selection.last_improvement > self.algorithm_config.patience:
            BColors.print_colored(
                "-> no improvement since {} epochs: EARLY STOPPING!"
                .format(self.algorithm_config.patience),
                color=BColors.WARNING)
            increment_log_step()
            break

        # policy update
        for _ in tqdm(range(self.algorithm_config.epoch_length)):
            update_start = time.time()
            self._update()
            self.ac_events.time_update(time.time() - update_start)

        epoch_time = time.time() - start
        self.ac_events.time_epoch(epoch_time)

        # increase step counter (which in turn triggers the log statistics writing)
        increment_log_step()

        print("Time required for epoch: {:.2f}s".format(epoch_time))