Example #1
    def train(self,
              distributed_rollouts: ESDistributedRollouts,
              n_epochs: Optional[int] = None,
              model_selection: Optional[ModelSelectionBase] = None) -> None:
        """
        Run the ES training loop.
        :param distributed_rollouts: The distribution interface for experience collection.
        :param n_epochs: Number of epochs to train.
        :param model_selection: Optional model selection class, receives model evaluation results.
        """

        n_epochs = self.algorithm_config.n_epochs if n_epochs is None else n_epochs
        self.model_selection = model_selection

        for epoch in itertools.count():
            # check if we reached the max number of epochs
            if n_epochs and epoch == n_epochs:
                break

            print('********** Iteration {} **********'.format(epoch))

            step_start_time = time.time()

            # do the actual update step (disable autograd, as we calculate the gradient from the rollout returns)
            with torch.no_grad():
                self._update(distributed_rollouts)

            step_end_time = time.time()

            # log the step duration
            self.es_events.real_time(step_end_time - step_start_time)

            # update the epoch count
            increment_log_step()
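
The loop above follows the core pattern of these examples: one update per epoch, then a single increment_log_step() call to trigger the epoch statistics writing. Below is a minimal, self-contained sketch of that loop structure; run_update and the local increment_log_step stub are hypothetical stand-ins, not the Maze API.

import itertools
import time
from typing import Optional


def increment_log_step() -> None:
    """Hypothetical stand-in for Maze's increment_log_step(): flush epoch-level stats."""
    print("log step incremented")


def run_update() -> None:
    """Hypothetical stand-in for the actual update step."""
    time.sleep(0.01)


def train_loop(n_epochs: Optional[int] = None) -> None:
    for epoch in itertools.count():
        # stop once the requested number of epochs is reached
        if n_epochs and epoch == n_epochs:
            break
        start = time.time()
        run_update()
        print("epoch {} took {:.3f}s".format(epoch, time.time() - start))
        # exactly one log step per epoch triggers the statistics writing
        increment_log_step()


train_loop(n_epochs=3)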
Example #2
def test_rollout_evaluator():
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env(), max_episode_steps=2)] * 2)
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())
    model_selection = _MockModelSelection()

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=3, model_selection=model_selection)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

    assert model_selection.update_count == 2
    assert evaluator.eval_env.get_stats_value(
        BaseEnvEvents.reward,
        LogStatsLevel.EPOCH,
        name="total_episode_count"
    ) >= 2 * 3
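
_MockModelSelection above is a test helper that only counts how often the evaluator reports a result. A hypothetical sketch of such a mock, assuming the evaluator calls update() once per evaluation run:

class _MockModelSelection:
    """Hypothetical sketch of the mock used above: counts evaluator callbacks."""

    def __init__(self) -> None:
        self.update_count = 0

    def update(self, reward: float) -> None:
        # the evaluator is assumed to report the mean evaluation reward here
        self.update_count += 1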
Example #3
    def on_train_result(self, trainer: Trainer, result: dict,
                        **kwargs) -> None:
        """Aggregates stats of all rollouts in one local aggregator and then writes them out.
        Called at the end of Trainable.train().

        :param trainer: Current trainer instance.
        :param result: Dict of results returned from trainer.train() call.
            You can mutate this object to add additional metrics.
        :param kwargs: Forward compatibility placeholder.
        """

        # Initialize the logging for this process if not done yet
        if self.epoch_stats is None:
            print("Initializing logging of train results")
            self.init_logging(trainer.config)

        # The main local aggregator should be empty
        #  - No stats should be collected here until we manually add them
        #  - Stats from the last call should be cleared out already (written out to the logs)
        assert self.epoch_stats.input == {}, "input should be empty at the beginning"

        # Get the epoch stats from the individual rollouts
        epoch_aggregators = trainer.workers.foreach_worker(
            lambda worker: worker.foreach_env(lambda env: env.get_stats(
                LogStatsLevel.EPOCH)))

        # Collect all episode stats from the epoch aggregators of individual rollout envs in the main local aggregator
        for worker_epoch_aggregator in epoch_aggregators:
            for env_epoch_aggregator in worker_epoch_aggregator:
                # Pass stats from the individual env runs into the main epoch aggregator
                for stats_key, stats_value in env_epoch_aggregator.input.items():
                    self.epoch_stats.input[stats_key].extend(stats_value)

        # clear logs at distributed workers
        def reset_episode_stats(env) -> None:
            """Empty inputs of the individual aggregators and make sure they don't have any consumers"""
            epoch_aggregator = env.get_stats(LogStatsLevel.EPOCH)
            epoch_aggregator.input = defaultdict(list)
            epoch_aggregator.consumers = []

        trainer.workers.foreach_worker(lambda worker: worker.foreach_env(
            lambda env: reset_episode_stats(env)))

        # Increment log step to trigger epoch logging
        increment_log_step()
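
The aggregation step in the callback above boils down to merging several per-env stats dictionaries (mapping stats keys to lists of recorded values) into one defaultdict(list). A minimal, Maze-independent sketch with illustrative keys:

from collections import defaultdict

main_stats = defaultdict(list)  # stands in for self.epoch_stats.input

per_env_stats = [
    {("reward", "mean"): [1.0, 2.0]},
    {("reward", "mean"): [3.0], ("step_count", "sum"): [7]},
]

for env_stats in per_env_stats:
    for stats_key, stats_values in env_stats.items():
        # extend, don't overwrite: values from all envs end up under the same key
        main_stats[stats_key].extend(stats_values)

assert main_stats[("reward", "mean")] == [1.0, 2.0, 3.0]
assert main_stats[("step_count", "sum")] == [7]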
Example #4
def test_step_increment_in_single_step_core_env():
    """In single sub-step envs, events should be cleared out and env time incremented automatically."""
    env = build_dummy_maze_env()
    env = LogStatsWrapper.wrap(env)

    env.reset()
    assert env.get_env_time() == 0

    # 10 steps
    for _ in range(10):
        env.step(env.action_space.sample())

    assert env.get_env_time() == 10
    env.reset()

    increment_log_step()

    assert env.get_stats_value(BaseEnvEvents.reward,
                               LogStatsLevel.EPOCH,
                               name="total_step_count") == 10
Example #5
def test_step_increment_in_structured_core_environments():
    """Structured core envs manage the step incrementing themselves and Maze env should not interfere with that."""
    env = build_dummy_maze_env_with_structured_core_env()
    env = LogStatsWrapper.wrap(env)

    env.reset()
    assert env.get_env_time() == 0

    # Do 10 agent steps => 5 structured steps (as we have two agents)
    for _ in range(10):
        env.step(env.action_space.sample())

    assert env.get_env_time() == 5
    env.reset()

    increment_log_step()

    assert env.get_stats_value(BaseEnvEvents.reward,
                               LogStatsLevel.EPOCH,
                               name="total_step_count") == 5
Example #6
def test_does_not_carry_over_stats_from_unfinished_episodes():
    policy = flatten_concat_probabilistic_policy_for_env(build_dummy_maze_env())

    # Wrap envs in a time-limit wrapper
    env = SequentialVectorEnv([lambda: TimeLimitWrapper.wrap(build_dummy_maze_env())] * 2)

    # Make one env slower than the other
    env.envs[0].set_max_episode_steps(2)
    env.envs[1].set_max_episode_steps(10)

    evaluator = RolloutEvaluator(eval_env=env, n_episodes=1, model_selection=None)
    for i in range(2):
        evaluator.evaluate(policy)
        increment_log_step()

        # We should get just one episode counted in stats
        assert evaluator.eval_env.get_stats_value(
            BaseEnvEvents.reward,
            LogStatsLevel.EPOCH,
            name="episode_count"
        ) == 1
Example #7
    def train(self,
              evaluator: Evaluator,
              n_epochs: Optional[int] = None,
              eval_every_k_iterations: Optional[int] = None) -> None:
        """
        Run training.
        :param evaluator: Evaluator to use for evaluation rollouts
        :param n_epochs: How many epochs to train for
        :param eval_every_k_iterations: Number of iterations after which to run evaluation (in addition to evaluations
        at the end of each epoch, which are run automatically). If set to None, evaluations will run on epoch end only.
        """

        if n_epochs is None:
            n_epochs = self.algorithm_config.n_epochs
        if eval_every_k_iterations is None:
            eval_every_k_iterations = self.algorithm_config.eval_every_k_iterations

        for epoch in range(n_epochs):
            print(f"\n********** Epoch {epoch + 1} started **********")
            evaluator.evaluate(self.policy)
            increment_log_step()

            for iteration, data in enumerate(self.data_loader, 0):
                observations, actions, actor_ids = data
                self._run_iteration(observations=observations,
                                    actions=actions,
                                    actor_ids=actor_ids)

                # Evaluate after each k iterations if set
                if eval_every_k_iterations is not None and \
                        iteration % eval_every_k_iterations == (eval_every_k_iterations - 1):
                    print(
                        f"\n********** Epoch {epoch + 1}: Iteration {iteration + 1} **********"
                    )
                    evaluator.evaluate(self.policy)
                    increment_log_step()

        print(f"\n********** Final evaluation **********")
        evaluator.evaluate(self.policy)
        increment_log_step()
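
The iteration % eval_every_k_iterations == (eval_every_k_iterations - 1) check above fires after every k-th iteration (iterations are 0-based). A tiny illustrative check:

eval_every_k_iterations = 3  # illustrative value

evaluated_at = [
    iteration
    for iteration in range(10)
    if iteration % eval_every_k_iterations == eval_every_k_iterations - 1
]

# evaluation is triggered after iterations 3, 6 and 9 (as printed 1-based above)
assert evaluated_at == [2, 5, 8]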
Example #8
def test_observation_statistics_logging():
    """ observation normalization logging test """

    # normalization config
    normalization_config = {
        "default_strategy":
        "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
        "default_strategy_config": {
            "clip_range": (None, None),
            "axis": 0
        },
        "default_statistics": None,
        "statistics_dump": "statistics.pkl",
        "exclude": None,
        "manual_config": {
            "observation": {
                "strategy":
                "maze.normalization_strategies.MeanZeroStdOneObservationNormalizationStrategy",
                "strategy_config": {
                    "clip_range": (0, 1)
                },
                "statistics": {
                    "mean": [0, 0, 0, 0],
                    "std": [1, 1, 1, 1]
                }
            }
        }
    }
    writer = LogStatsWriterTensorboard(log_dir='test_log',
                                       tensorboard_render_figure=True)
    register_log_stats_writer(writer)
    # attach a console writer as well for immediate console feedback
    register_log_stats_writer(LogStatsWriterConsole())

    # init environment
    env = GymMazeEnv("CartPole-v0")

    # wrap env with observation normalization
    env = ObservationNormalizationWrapper(
        env,
        default_strategy=normalization_config["default_strategy"],
        default_strategy_config=normalization_config["default_strategy_config"],
        default_statistics=normalization_config["default_statistics"],
        statistics_dump=normalization_config["statistics_dump"],
        sampling_policy=RandomPolicy(env.action_spaces_dict),
        exclude=normalization_config["exclude"],
        manual_config=normalization_config["manual_config"])

    env = LogStatsWrapper.wrap(env, logging_prefix="train")

    n_episodes = 10
    n_steps_per_episode = 100
    for episode in range(n_episodes):
        _ = env.reset()
        for step in range(n_steps_per_episode):
            # take random action
            action = env.action_space.sample()

            # take step in env and trigger log stats writing
            _, _, done, _ = env.step(action)

            if done:
                break

        increment_log_step()
Example #9
    def _train_async(self, n_epochs) -> None:
        """Train policy using the synchronous advantage actor critic.

        :param n_epochs: number of epochs to train.
        """

        # run training epochs
        if n_epochs <= 0:
            n_epochs = sys.maxsize
        epoch_length = self.algorithm_config.epoch_length
        patience = self.algorithm_config.patience

        # Perform a hard update on the critic
        self.learner_model.critic.update_target_weights(1.0)

        # run training epochs
        for epoch in range(n_epochs):
            start = time.time()
            print("Update epoch - {}".format(epoch))

            # compute evaluation reward
            reward = -np.inf
            if self.evaluator:
                self.evaluate()
            # take training reward and notify model selection
            else:
                if epoch > 0:
                    prev_reward = reward
                    try:
                        reward = self.distributed_workers.get_stats_value(
                            BaseEnvEvents.reward,
                            LogStatsLevel.EPOCH,
                            name="mean")
                    except KeyError:
                        reward = prev_reward

                # best model selection
                self.model_selection.update(reward)

            # evaluate policy
            time_evaluation = time.time() - start

            # early stopping
            if patience and self.model_selection.last_improvement > patience:
                BColors.print_colored(
                    "-> no improvement for {} epochs: EARLY STOPPING!".format(patience),
                    color=BColors.WARNING)
                increment_log_step()
                break

            time_deq_actors = 0
            time_before_update = time.time()
            for epoch_step_idx in range(epoch_length):
                q_size_before, q_size_after, time_deq_actors = self.distributed_workers.collect_rollouts()

                # Record the queue sizes
                self.events.estimated_queue_sizes(after=q_size_after,
                                                  before=q_size_before)

                # policy update
                for batch_updates in range(self.algorithm_config.num_batches_per_iter):
                    self._update()
                    total_num_batch_updates =\
                        (batch_updates + epoch_step_idx * self.algorithm_config.num_batches_per_iter +
                         (epoch_length * self.algorithm_config.num_batches_per_iter) * epoch)
                    if total_num_batch_updates % self.algorithm_config.target_update_interval == 0:
                        self.learner_model.critic.update_target_weights(
                            self.algorithm_config.tau)

                    self.distributed_workers.broadcast_updated_policy(
                        self.learner_model.policy.state_dict())
            time_updating = time.time() - time_before_update

            total_time = time.time() - start
            self.events.time_dequeuing_actors(time=time_deq_actors,
                                              percent=time_deq_actors / total_time)

            # Buffer events
            self.events.buffer_size(len(self.distributed_workers.replay_buffer))
            self.events.buffer_avg_pick_per_transition(
                value=self.distributed_workers.replay_buffer.cum_moving_avg_num_picks)

            # increase step counter (which in turn triggers the log statistics writing)
            increment_log_step()

            print("Time required for epoch: {:.2f}s".format(total_time))
            print(' - total ({} steps) updating: {:.2f}s ({:.2f}%), mean time/step: {:.2f}s'.format(
                epoch_length * self.algorithm_config.num_batches_per_iter,
                time_updating, 100 * time_updating / total_time,
                time_updating / (epoch_length * self.algorithm_config.num_batches_per_iter)))
            print(' - total time evaluating the model: {:.2f}s ({:.2f}%)'.format(
                time_evaluation, 100 * time_evaluation / total_time))
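
The total_num_batch_updates expression above produces a global, gap-free update counter across epochs and epoch steps, so target-network updates happen every target_update_interval batch updates regardless of epoch boundaries. An illustrative check with small, assumed values:

epoch_length = 4          # illustrative
num_batches_per_iter = 2  # illustrative

indices = [
    batch_updates + epoch_step_idx * num_batches_per_iter
    + (epoch_length * num_batches_per_iter) * epoch
    for epoch in range(2)
    for epoch_step_idx in range(epoch_length)
    for batch_updates in range(num_batches_per_iter)
]

# the expression simply enumerates all batch updates 0, 1, 2, ...
assert indices == list(range(2 * epoch_length * num_batches_per_iter))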
Example #10
    def train(self, n_epochs: Optional[int] = None) -> None:
        """Main train method of the actor critic trainer. This is used in order to do algorithm specific operations
        around this method in the main train method which is called by the runner. (e.g. this is used when it comes to
        multiprocessing)

        :param n_epochs: Number of epochs to train.
        """

        n_epochs = self.algorithm_config.n_epochs if n_epochs is None else n_epochs

        # init minimum best model selection for early stopping
        if self.model_selection is None:
            self.model_selection = BestModelSelection(dump_file=None, model=None)

        # preserve original training coef setting
        value_loss_coef = self.algorithm_config.value_loss_coef
        policy_loss_coef = self.algorithm_config.policy_loss_coef
        entropy_coef = self.algorithm_config.entropy_coef

        # run training epochs
        if n_epochs <= 0:
            n_epochs = sys.maxsize

        for epoch in range(n_epochs):
            start = time.time()
            print("Update epoch - {}".format(epoch))

            # check for critic burn in and reset coefficient to only update the critic
            if epoch < self.algorithm_config.critic_burn_in_epochs:
                self.algorithm_config.value_loss_coef = 1.0
                self.algorithm_config.policy_loss_coef = 0.0
                self.algorithm_config.entropy_coef = 0.0
            else:
                self.algorithm_config.value_loss_coef = value_loss_coef
                self.algorithm_config.policy_loss_coef = policy_loss_coef
                self.algorithm_config.entropy_coef = entropy_coef

            # compute evaluation reward
            reward = -np.inf
            if self.evaluator:
                self.evaluate()
            # take training reward and notify best model selection manually
            else:
                if epoch > 0:
                    prev_reward = reward
                    try:
                        reward = self.rollout_generator.get_stats_value(BaseEnvEvents.reward, LogStatsLevel.EPOCH,
                                                                        name="mean")
                    except KeyError:
                        reward = prev_reward

                self.model_selection.update(reward)

            # early stopping
            if self.algorithm_config.patience and \
                    self.model_selection.last_improvement > self.algorithm_config.patience:
                BColors.print_colored("-> no improvement since {} epochs: EARLY STOPPING!"
                                      .format(self.algorithm_config.patience), color=BColors.WARNING)
                increment_log_step()
                break

            # policy update
            for _ in tqdm(range(self.algorithm_config.epoch_length)):
                update_start = time.time()
                self._update()
                self.ac_events.time_update(time.time() - update_start)

            epoch_time = time.time() - start
            self.ac_events.time_epoch(epoch_time)

            # increase step counter (which in turn triggers the log statistics writing)
            increment_log_step()

            print("Time required for epoch: {:.2f}s".format(epoch_time))