Example 1
    def _generate_runners(self, run_mode: RunMode) -> List[TrainingRunner]:
        """
        Generates training or rollout runner(s).
        :param run_mode: Run mode. See :py:class:`~maze.maze.api.RunMode`.
        :return: Instantiated runner instances, one per loaded configuration.
        """

        cl = ConfigurationLoader(
            _run_mode=run_mode,
            _kwargs=self._auditors[run_mode].kwargs,
            _overrides=self._auditors[run_mode].overrides,
            _ephemeral_init_kwargs=self._auditors[run_mode].ephemeral_init_kwargs
        )
        cl.load()

        self._workdirs = cl.workdirs
        self._configs[run_mode] = cl.configs
        runners: List[TrainingRunner] = []

        # Change to correct working directory (necessary due to being outside of Hydra scope).
        for workdir, config in zip(self._workdirs, self._configs[run_mode]):
            with working_directory(workdir):
                # Allow non-primitives in Hydra config.
                with omegaconf.flag_override(config, "allow_objects",
                                             True) as cfg:
                    # Set up and return runner.
                    runner = Factory(
                        base_type=TrainingRunner if run_mode ==
                        RunMode.TRAINING else RolloutRunner).instantiate(
                            cfg.runner)
                    runner.setup(cfg)
                    runners.append(runner)

        return runners
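
In the method above, Factory(base_type=...).instantiate(cfg.runner) builds each runner from its Hydra config node and verifies it against the expected base class; Example 3 below shows the same pattern for a generic Runner. A rough, hypothetical sketch of how the returned runners could be driven afterwards (the call site is an assumption; runner.run() is taken from Example 3):

    # Hypothetical call site inside the owning API object.
    runners = self._generate_runners(run_mode=RunMode.TRAINING)
    for runner in runners:
        runner.run()
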
Example 2
def test_init_cartpole_rllib_model():
    """test the init methods"""
    hydra_overrides = {'rllib/runner': 'dev', 'model': 'rllib'}

    cfg = load_hydra_config('maze.conf', 'conf_rllib', hydra_overrides)

    runner = Factory(base_type=MazeRLlibRunner).instantiate(cfg.runner)
    runner.setup(cfg)
    ray_config, rllib_config, tune_config = runner.ray_config, runner.rllib_config, runner.tune_config

    assert isinstance(runner.env_factory(), CartPoleEnv)

    assert isinstance(ray_config, dict)
    assert isinstance(rllib_config, dict)
    assert isinstance(tune_config, dict)

    assert rllib_config['env'] == 'maze_env'
    assert rllib_config['framework'] == 'torch'
    assert rllib_config['num_workers'] == 1
    for k, v in rllib_config['model'].items():
        # Normalize the string sentinel stored in the Maze config back to RLlib's DEPRECATED_VALUE constant before comparing.
        if v == "DEPRECATED_VALUE":
            v = DEPRECATED_VALUE
        assert k in MODEL_DEFAULTS, f'Maze RLlib model parameter \'{k}\' not in RLlib MODEL_DEFAULTS (rllib version: ' \
                                    f'{ray.__version__})'
        assert MODEL_DEFAULTS[k] == v, f'RLlib default \'{k}\'={MODEL_DEFAULTS[k]} does not match the Maze-defined ' \
                                       f'value \'{v}\' (rllib version: {ray.__version__})'

    if 'ObservationNormalizationWrapper' in cfg.wrappers:
        assert os.path.exists(
            cfg.wrappers.ObservationNormalizationWrapper.statistics_dump)
        os.remove(cfg.wrappers.ObservationNormalizationWrapper.statistics_dump)
Example 3
def _run_job(cfg: DictConfig) -> None:
    """Runs a regular maze job.

    :param cfg: Hydra configuration for the job to run.
    """
    set_matplotlib_backend()

    # If no env or agent base seed is given, generate the seeds randomly and add them to the resolved Hydra config.
    if cfg.seeding.env_base_seed is None:
        cfg.seeding.env_base_seed = MazeSeeding.generate_seed_from_random_state(
            np.random.RandomState(None))
    if cfg.seeding.agent_base_seed is None:
        cfg.seeding.agent_base_seed = MazeSeeding.generate_seed_from_random_state(
            np.random.RandomState(None))

    # print and log config
    config_str = yaml.dump(OmegaConf.to_container(cfg, resolve=True),
                           sort_keys=False)
    with open("hydra_config.yaml", "w") as fp:
        fp.write("\n" + config_str)
    BColors.print_colored(config_str, color=BColors.HEADER)
    print("Output directory: {}\n".format(os.path.abspath(".")))

    # run job
    runner = Factory(base_type=Runner).instantiate(cfg.runner)
    runner.setup(cfg)
    runner.run()
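
For context, a job function like this is normally invoked through a Hydra entrypoint that resolves the configuration and switches into the run's output directory. A minimal, hypothetical sketch (the config path and name below are assumptions, not taken from the Maze codebase):

import hydra
from omegaconf import DictConfig


@hydra.main(config_path="conf", config_name="conf_run")  # hypothetical config location
def main(cfg: DictConfig) -> None:
    _run_job(cfg)


if __name__ == "__main__":
    main()
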
Example 4
def test_init_cartpole_maze_model():
    """test the init methods """
    hydra_overrides = {
        'rllib/runner': 'dev',
        'configuration': 'test',
        'env': 'gym_env',
        'model': 'vector_obs',
        'wrappers': 'vector_obs',
        'critic': 'template_state'
    }

    cfg = load_hydra_config('maze.conf', 'conf_rllib', hydra_overrides)

    runner = Factory(base_type=MazeRLlibRunner).instantiate(cfg.runner)
    runner.setup(cfg)
    ray_config, rllib_config, tune_config = runner.ray_config, runner.rllib_config, runner.tune_config

    assert isinstance(runner.env_factory(), CartPoleEnv)

    assert issubclass(_global_registry.get(RLLIB_ACTION_DIST, 'maze_dist'),
                      MazeRLlibActionDistribution)
    assert issubclass(_global_registry.get(RLLIB_MODEL, 'maze_model'),
                      MazeRLlibPolicyModel)

    assert isinstance(ray_config, dict)
    assert isinstance(rllib_config, dict)
    assert isinstance(tune_config, dict)

    assert rllib_config['env'] == 'maze_env'
    assert rllib_config['framework'] == 'torch'
    assert rllib_config['num_workers'] == 1
    model_config = rllib_config['model']

    assert model_config['custom_action_dist'] == 'maze_dist'
    assert model_config['custom_model'] == 'maze_model'
    assert model_config['vf_share_layers'] is False
    assert model_config['custom_model_config'][
        'maze_model_composer_config'] == cfg.model
    assert model_config['custom_model_config'][
        'spaces_config_dump_file'] == cfg.runner.spaces_config_dump_file

    if 'ObservationNormalizationWrapper' in cfg.wrappers:
        assert os.path.exists(
            cfg.wrappers.ObservationNormalizationWrapper.statistics_dump)
        os.remove(cfg.wrappers.ObservationNormalizationWrapper.statistics_dump)
Example 5
class ESTrainer(Trainer):
    """Trainer class for OpenAI Evolution Strategies.

    :param algorithm_config: Algorithm parameters.
    :param torch_policy: Multi-step policy encapsulating the policy networks.
    :param shared_noise: The noise table, with the same content for every worker and the master.
    :param normalization_stats: Normalization statistics as calculated by the NormalizeObservationWrapper.
    """
    def __init__(
        self, algorithm_config: ESAlgorithmConfig, torch_policy: TorchPolicy,
        shared_noise: SharedNoiseTable,
        normalization_stats: Optional[Dict[str, Tuple[np.ndarray, np.ndarray]]]
    ) -> None:
        super().__init__(algorithm_config)

        # --- training setup ---
        self.model_selection: Optional[ModelSelectionBase] = None
        self.policy: Union[Policy, TorchModel] = torch_policy

        self.shared_noise = shared_noise
        self.normalization_stats = normalization_stats

        # setup the optimizer, now that the policy is available
        self.optimizer = Factory(Optimizer).instantiate(
            algorithm_config.optimizer)
        self.optimizer.setup(self.policy)

        # prepare statistics collection
        self.eval_stats = LogStatsAggregator(LogStatsLevel.EPOCH,
                                             get_stats_logger("eval"))
        self.train_stats = LogStatsAggregator(LogStatsLevel.EPOCH,
                                              get_stats_logger("train"))
        # injection of ES-specific events
        self.es_events = self.train_stats.create_event_topic(ESEvents)

    @override(Trainer)
    def train(self,
              distributed_rollouts: ESDistributedRollouts,
              n_epochs: Optional[int] = None,
              model_selection: Optional[ModelSelectionBase] = None) -> None:
        """
        Run the ES training loop.
        :param distributed_rollouts: The distribution interface for experience collection.
        :param n_epochs: Number of epochs to train.
        :param model_selection: Optional model selection class, receives model evaluation results.
        """

        n_epochs = self.algorithm_config.n_epochs if n_epochs is None else n_epochs
        self.model_selection = model_selection

        for epoch in itertools.count():
            # check if we reached the max number of epochs
            if n_epochs and epoch == n_epochs:
                break

            print('********** Iteration {} **********'.format(epoch))

            step_start_time = time.time()

            # do the actual update step (disable autograd, as we calculate the gradient from the rollout returns)
            with torch.no_grad():
                self._update(distributed_rollouts)

            step_end_time = time.time()

            # log the step duration
            self.es_events.real_time(step_end_time - step_start_time)

            # update the epoch count
            increment_log_step()

    def load_state_dict(self, state_dict: Dict) -> None:
        """Set the model and optimizer state.
        :param state_dict: The state dict.
        """
        self.policy.load_state_dict(state_dict)

    @override(Trainer)
    def state_dict(self):
        """implementation of :class:`~maze.train.trainers.common.trainer.Trainer`
        """
        return self.policy.state_dict()

    @override(Trainer)
    def load_state(self, file_path: Union[str, BinaryIO]) -> None:
        """implementation of :class:`~maze.train.trainers.common.trainer.Trainer`
        """
        state_dict = torch.load(file_path,
                                map_location=torch.device(self.policy.device))
        self.load_state_dict(state_dict)

    def _update(self, distributed_rollouts: ESDistributedRollouts):
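        """Run a single ES update: collect perturbed training rollouts, rank the episode
        returns, estimate the gradient from the shared noise table and apply the optimizer step.

        :param distributed_rollouts: The distribution interface for experience collection.
        """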
        # Pop off results for the current task
        n_train_episodes, n_timesteps_popped = 0, 0

        # aggregate all collected training rollouts for this episode
        epoch_results = ESRolloutResult(is_eval=False)

        # obtain a generator from the distribution interface
        rollouts_generator = distributed_rollouts.generate_rollouts(
            policy=self.policy,
            max_steps=self.algorithm_config.max_steps,
            noise_stddev=self.algorithm_config.noise_stddev,
            normalization_stats=self.normalization_stats)

        # collect eval and training rollouts
        for result in rollouts_generator:
            if result.is_eval:
                # This was an eval job
                for e in result.episode_stats:
                    self.eval_stats.receive(e)
                continue

            # we received training experience from perturbed policy networks
            epoch_results.noise_indices.extend(result.noise_indices)
            epoch_results.episode_stats.extend(result.episode_stats)

            # update the training statistics
            for e in result.episode_stats:
                self.train_stats.receive(e)

                n_train_episodes += 1
                n_timesteps_popped += e[(BaseEnvEvents.reward, "count", None)]

            # continue until we collected enough episodes and timesteps
            if (n_train_episodes >= self.algorithm_config.n_rollouts_per_update
                    and n_timesteps_popped >= self.algorithm_config.n_timesteps_per_update):
                break

        # notify the model selection of the evaluation results
        eval_stats = self.eval_stats.reduce()
        if self.model_selection and len(eval_stats):
            reward = eval_stats[(BaseEnvEvents.reward, "mean", None)]
            self.model_selection.update(reward)

        # prepare returns, reshape the positive/negative antithetic estimation as (rollouts, 2)
        returns_n2 = np.array([
            e[(BaseEnvEvents.reward, "sum", None)]
            for e in epoch_results.episode_stats
        ]).reshape(-1, 2)

        # improve robustness: weight by rank, not by reward
        proc_returns_n2 = self._compute_centered_ranks(returns_n2)

        # compute the gradient
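        #   antithetic estimate: g ~ (1 / n_pairs) * sum_i (rank(R_i^+) - rank(R_i^-)) * eps_i,
        #   with eps_i the shared-noise perturbation and ranks taken from _compute_centered_ranks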
        g = self._batched_weighted_sum(
            proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
            (self.shared_noise.get(idx, self.policy.num_params)
             for idx in epoch_results.noise_indices),
            batch_size=500)

        g /= n_train_episodes / 2.0

        # apply the weight update
        theta = get_flat_parameters(self.policy)
        update_ratio = self.optimizer.update(
            -g + self.algorithm_config.l2_penalty * theta.numpy())

        # statistics logging
        self.es_events.update_ratio(update_ratio)

        for i in self.policy.state_dict().keys():
            self.es_events.policy_grad_norm(policy_id=i,
                                            value=np.square(g).sum()**0.5)
            self.es_events.policy_norm(policy_id=i,
                                       value=np.square(theta).sum()**0.5)

    @classmethod
    def _iter_groups(cls, items: Iterable,
                     group_size: int) -> Generator[Tuple, None, None]:
        """Yield consecutive tuples of at most `group_size` items."""
        assert group_size >= 1
        group = []
        for x in items:
            group.append(x)
            if len(group) == group_size:
                yield tuple(group)
                del group[:]
        if group:
            yield tuple(group)

    @classmethod
    def _batched_weighted_sum(cls, weights: Iterable[float],
                              vectors: Iterable[np.ndarray],
                              batch_size: int) -> np.ndarray:
        """calculate a weighted sum of the given vectors, in steps of at most `batch_size` vectors"""
        # start with float, at the first operation numpy broadcasting takes care of the correct shape
        total: Union[np.ndarray, float] = 0.

        for batch_weights, batch_vectors in zip(
                cls._iter_groups(weights, batch_size),
                cls._iter_groups(vectors, batch_size)):
            assert len(batch_weights) == len(batch_vectors) <= batch_size
            total += np.dot(np.asarray(batch_weights, dtype=np.float32),
                            np.asarray(batch_vectors, dtype=np.float32))

        return total

    @classmethod
    def _compute_ranks(cls, x: np.ndarray) -> np.ndarray:
        """
        Returns ranks in [0, len(x))
        Note: This is different from scipy.stats.rankdata, which returns ranks in [1, len(x)].
        """
        assert x.ndim == 1
        ranks = np.empty(len(x), dtype=int)
        ranks[x.argsort()] = np.arange(len(x))
        return ranks

    @classmethod
    def _compute_centered_ranks(cls, x: np.ndarray) -> np.ndarray:
        y = cls._compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
        y /= (x.size - 1)
        y -= .5
        return y
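
For intuition, _compute_centered_ranks maps any array of returns to evenly spaced values in [-0.5, 0.5], which makes the update depend only on the ordering of returns rather than their scale. A minimal standalone sketch of the same transformation (assumed equivalent, for illustration only):

import numpy as np


def centered_ranks(x: np.ndarray) -> np.ndarray:
    # Rank all entries in [0, x.size), then rescale the ranks to [-0.5, 0.5].
    ranks = np.empty(x.size, dtype=int)
    ranks[x.ravel().argsort()] = np.arange(x.size)
    return ranks.reshape(x.shape).astype(np.float32) / (x.size - 1) - 0.5


# e.g. centered_ranks(np.array([10., -3., 250.])) == [0., -0.5, 0.5]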