def __init__(
    self,
    context: Any,
    env: det.EnvContext,
    workloads: workload.Stream,
    load_path: Optional[pathlib.Path],
    rendezvous_info: RendezvousInfo,
    hvd_config: horovod.HorovodContext,
) -> None:
    """Store the trial's runtime collaborators and start a profiler agent.

    Args:
        context: trial context object (provides ``distributed`` rank info).
        env: environment configuration for this trial run.
        workloads: stream of workloads to be executed.
        load_path: optional checkpoint directory to restore from.
        rendezvous_info: cluster rendezvous details (global rank/size).
        hvd_config: Horovod configuration for distributed training.
    """
    # Keep handles to every runtime object this controller coordinates.
    self.context = context
    self.env = env
    self.workloads = workloads
    self.load_path = load_path
    self.rendezvous_info = rendezvous_info
    self.hvd_config = hvd_config

    # Global rank comes from the rendezvous layer; local rank from the
    # distributed context.
    global_rank = rendezvous_info.get_rank()
    local_rank = context.distributed.get_rank()
    self.prof = profiler.ProfilerAgent.from_env(env, global_rank, local_rank)

    # Fail fast if the trial class cannot handle this configuration.
    self._check_if_trial_supports_configurations(env)
def from_configs(
    experiment_config: ExperimentConfig,
    rendezvous_info: RendezvousInfo,
    hparams: Dict[str, Any],
) -> "HorovodContext":
    """
    Create the HorovodContext according to experiment config and rendezvous
    info for this trial.
    """
    # Horovod is always used for multi-machine distributed training. For
    # single-machine multi-GPU training, Horovod is used when native_parallel
    # is disabled.
    multi_machine_trial = rendezvous_info.get_size() > 1
    multi_slot_trial = experiment_config["resources"]["slots_per_trial"] > 1
    use_horovod = multi_machine_trial or (
        multi_slot_trial and not experiment_config.native_parallel_enabled()
    )

    # These optimization keys must be present in the experiment config.
    check.is_in("optimizations", experiment_config)
    optimizations_config = cast(Dict[str, Any], experiment_config.get("optimizations"))
    for required_key in (
        "aggregation_frequency",
        "gradient_compression",
        "average_training_metrics",
    ):
        check.is_in(required_key, optimizations_config)

    # Help users migrate from the old locations for these settings, in hparams.
    def error_message_removed_from_hparams(removed_hparam: str) -> str:
        return (
            f"Please move `{removed_hparam}` in the experiment config to "
            f"`Optimizations` from `hyperparameters`."
        )

    for removed_hparam in (
        "aggregation_frequency",
        "gradient_compression",
        "grad_updates_size_file",
    ):
        check.not_in(
            removed_hparam,
            hparams,
            error_message_removed_from_hparams(removed_hparam),
        )

    hvd_config = HorovodContext(
        use=use_horovod,
        aggregation_frequency=cast(int, optimizations_config.get("aggregation_frequency")),
        fp16_compression=cast(bool, optimizations_config.get("gradient_compression")),
        grad_updates_size_file=optimizations_config.get("grad_updates_size_file", None),
        average_aggregated_gradients=cast(
            bool, optimizations_config.get("average_aggregated_gradients")
        ),
        average_training_metrics=cast(
            bool, optimizations_config.get("average_training_metrics")
        ),
    )

    # Surface the optimization settings that are actually in effect.
    if hvd_config.use and hvd_config.aggregation_frequency > 1:
        logging.info(
            f"Setting `aggregation_frequency` to {hvd_config.aggregation_frequency} "
            "to optimize training."
        )
    if hvd_config.use and hvd_config.fp16_compression:
        logging.info("Enabling `gradient_compression` to optimize training.")

    return hvd_config