Beispiel #1
0
    def __init__(
        self,
        context: Any,
        env: det.EnvContext,
        workloads: workload.Stream,
        load_path: Optional[pathlib.Path],
        rendezvous_info: RendezvousInfo,
        hvd_config: horovod.HorovodContext,
    ) -> None:
        """Store the trial's collaborators, create its profiler agent, and
        verify the trial supports the requested configuration."""
        # Keep every collaborator on the instance for later use.
        (
            self.context,
            self.env,
            self.workloads,
            self.load_path,
            self.rendezvous_info,
            self.hvd_config,
        ) = (context, env, workloads, load_path, rendezvous_info, hvd_config)

        # The profiler is built from the environment plus two rank views:
        # the rendezvous rank and the distributed-context rank.
        rendezvous_rank = rendezvous_info.get_rank()
        distributed_rank = context.distributed.get_rank()
        self.prof = profiler.ProfilerAgent.from_env(
            env, rendezvous_rank, distributed_rank
        )

        # Delegated check: raises early if this trial class cannot support
        # the configuration described by `env`.
        self._check_if_trial_supports_configurations(env)
Beispiel #2
0
    def from_configs(
        experiment_config: ExperimentConfig,
        rendezvous_info: RendezvousInfo,
        hparams: Dict[str, Any],
    ) -> "HorovodContext":
        """
        Create the HorovodContext according to experiment config and rendezvous info for this trial.

        Raises a check failure when a required key is missing from the
        `optimizations` section, or when a legacy optimization setting is
        still present in `hyperparameters`.
        """

        # Horovod is always used for multi-machine distributed training. For
        # single-machine multi-GPU training, Horovod is used when native_parallel is
        # disabled.
        multi_machine_trial = rendezvous_info.get_size() > 1
        multi_slot_trial = experiment_config["resources"]["slots_per_trial"] > 1
        use_horovod = multi_machine_trial or (
            multi_slot_trial
            and not experiment_config.native_parallel_enabled())

        check.is_in("optimizations", experiment_config)
        optimizations_config = cast(Dict[str, Any],
                                    experiment_config.get("optimizations"))

        # Every optimization setting consumed below must be present; fail with
        # a clear check error instead of silently passing None along.
        check.is_in("aggregation_frequency", optimizations_config)
        check.is_in("gradient_compression", optimizations_config)
        check.is_in("average_training_metrics", optimizations_config)
        # Previously unchecked even though it is read below; checked now for
        # consistency with the other optimization keys.
        check.is_in("average_aggregated_gradients", optimizations_config)

        # Help users migrate from the old locations for these settings, in hparams.
        def error_message_removed_from_hparams(removed_hparam: str) -> str:
            # The message names the real (lowercase) `optimizations` config
            # section, matching the key checked above.
            return (
                f"Please move `{removed_hparam}` from the `hyperparameters` "
                f"section of the experiment config to the `optimizations` "
                f"section.")

        check.not_in(
            "aggregation_frequency",
            hparams,
            error_message_removed_from_hparams("aggregation_frequency"),
        )
        check.not_in(
            "gradient_compression",
            hparams,
            error_message_removed_from_hparams("gradient_compression"),
        )
        check.not_in(
            "grad_updates_size_file",
            hparams,
            error_message_removed_from_hparams("grad_updates_size_file"),
        )

        # Note: `fp16_compression` is driven by the config's
        # `gradient_compression` setting.
        hvd_config = HorovodContext(
            use=use_horovod,
            aggregation_frequency=cast(
                int, optimizations_config.get("aggregation_frequency")),
            fp16_compression=cast(
                bool, optimizations_config.get("gradient_compression")),
            grad_updates_size_file=optimizations_config.get(
                "grad_updates_size_file", None),
            average_aggregated_gradients=cast(
                bool,
                optimizations_config.get("average_aggregated_gradients")),
            average_training_metrics=cast(
                bool, optimizations_config.get("average_training_metrics")),
        )

        # Surface the optimizations that will actually take effect (Horovod
        # must be in use for either to matter).
        if hvd_config.use and hvd_config.aggregation_frequency > 1:
            logging.info(
                f"Setting `aggregation_frequency` to {hvd_config.aggregation_frequency} "
                "to optimize training.")

        if hvd_config.use and hvd_config.fp16_compression:
            logging.info(
                "Enabling `gradient_compression` to optimize training.")

        return hvd_config