def __init__(self, *args: Any, **kwargs: Any) -> None:
    """Initialize the PyTorch trial context.

    All positional and keyword arguments are forwarded unchanged to
    ``det.TrialContext.__init__``. The reducer context is then initialized
    with ``self.distributed.allgather``, the batch sizes are derived from the
    hyperparameters and the experiment's slots-per-trial, and the remaining
    bookkeeping attributes are reset to their empty defaults.
    """
    # Base-class initialization must come first: everything below reads
    # attributes (distributed, env, hparams) that it provides.
    det.TrialContext.__init__(self, *args, **kwargs)
    pytorch._PyTorchReducerContext.__init__(self, self.distributed.allgather)

    hparams = self.get_hparams()
    slots_per_trial = self.env.experiment_config.slots_per_trial()
    batch_sizes = util.calculate_batch_sizes(hparams, slots_per_trial, "PyTorchTrial")
    self._per_slot_batch_size, self._global_batch_size = batch_sizes

    self._distributed_backend = det._DistributedBackend()
    self.device = self._init_device()

    # Types for which to_device() has already issued a warning.
    self._to_device_warned_types = set()  # type: Set[Type]

    # State that is filled in over the lifetime of a PyTorchTrialContext.
    self.models = []  # type: List[nn.Module]
    self.optimizers = []  # type: List[torch.optim.Optimizer]
    self.profiler = None  # type: Any
    self.lr_schedulers = []  # type: List[pytorch.LRScheduler]
    self._epoch_len = None  # type: Optional[int]

    # Maps each wrapped model back to the form it was originally passed in;
    # torch DDP and apex need this to initialize in the correct order.
    self._wrapped_models = {}  # type: Dict[nn.Module, nn.Module]

    # A single container module holds every model as a sub-module, each
    # attached under a different name via __setattr__. When horovod broadcasts
    # model state, the container's state_dict is what gets broadcast, which
    # avoids name conflicts between the individual models' states. Note that
    # broadcast_parameters only accepts state_dict(), although its doc says it
    # also accepts named_parameters().
    self._main_model = nn.Module()
    self._scaler = None
    self._use_apex = False
    self._loss_ids = {}  # type: Dict[torch.Tensor, int]
    self._last_backward_batch_idx = None  # type: Optional[int]
    self._current_batch_idx = None  # type: Optional[int]

    self.experimental = pytorch.PyTorchExperimentalContext(self)
    self._reducers = pytorch._PyTorchReducerContext()
    self._determined_profiler = None  # type: Optional[profiler.ProfilerAgent]

    # Cache the optimization settings from the experiment config.
    opt_config = self.env.experiment_config.get_optimizations_config()
    self._aggregation_frequency = cast(int, opt_config.get("aggregation_frequency"))
    self._fp16_compression = cast(bool, opt_config.get("gradient_compression"))
    self._average_aggregated_gradients = cast(
        bool, opt_config.get("average_aggregated_gradients")
    )
    self._average_training_metrics = cast(
        bool, opt_config.get("average_training_metrics")
    )
def make_trial_controller_from_trial_implementation(
    trial_class: Type[det.Trial],
    hparams: Dict,
    workloads: workload.Stream,
    scheduling_unit: int = 1,
    trial_seed: int = 0,
    exp_config: Optional[Dict] = None,
    checkpoint_dir: Optional[str] = None,
    latest_checkpoint: Optional[str] = None,
    steps_completed: int = 0,
    expose_gpus: bool = False,
) -> det.TrialController:
    """Build a trial controller around an instance of ``trial_class``.

    Args:
        trial_class: The Trial implementation to instantiate. When
            ``exp_config`` is not supplied it must carry a
            ``_searcher_metric`` attribute.
        hparams: Hyperparameters for the default experiment config and the
            environment context.
        workloads: Workload stream handed to the controller.
        scheduling_unit: Used only when a default experiment config is built.
        trial_seed: Seed forwarded to the environment context.
        exp_config: Experiment config to use; a default one is generated when
            this is ``None`` or empty.
        checkpoint_dir: Root directory for the shared-fs storage manager;
            ``"/tmp"`` is used when it is ``None`` or empty.
        latest_checkpoint: Forwarded to the environment context.
        steps_completed: Forwarded to the environment context.
        expose_gpus: Forwarded to the environment context.

    Returns:
        The controller produced by the trial class's controller class via
        ``from_trial``.

    Raises:
        AssertionError: If a default config is needed but ``trial_class`` has
            no ``_searcher_metric``, or if the trial class declares no
            controller class.
    """
    if not exp_config:
        assert hasattr(
            trial_class, "_searcher_metric"
        ), "Trial classes for unit tests should be annotated with a _searcher_metric attribute"
        metric_name = trial_class._searcher_metric  # type: ignore
        exp_config = make_default_exp_config(
            hparams, scheduling_unit, metric_name, checkpoint_dir=checkpoint_dir
        )

    env = make_default_env_context(
        hparams=hparams,
        experiment_config=exp_config,
        trial_seed=trial_seed,
        latest_checkpoint=latest_checkpoint,
        steps_completed=steps_completed,
        expose_gpus=expose_gpus,
    )

    # Back the dummy core context with a shared-fs storage manager.
    storage_root = checkpoint_dir if checkpoint_dir else "/tmp"
    storage = det.common.storage.SharedFSStorageManager(storage_root)
    core_context = core._dummy_init(storage_manager=storage)

    backend = det._DistributedBackend()

    controller_cls = trial_class.trial_controller_class
    assert controller_cls is not None
    # The pre-execute hook runs before the trial context and the trial are built.
    controller_cls.pre_execute_hook(env, backend)

    context = trial_class.trial_context_class(core_context, env)
    trial = trial_class(context)

    return controller_cls.from_trial(
        trial_inst=trial,
        context=context,
        env=env,
        workloads=workloads,
    )
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """Run a minimal local test experiment for ``trial_class``.

    A local execution environment is created in test mode, limited to one
    GPU and backed by a temporary checkpoint directory; a controller is then
    built for the trial and run.

    Args:
        trial_class: The Trial implementation to exercise.
        config: Optional experiment config. It is copied, not mutated, and
            its ``scheduling_unit`` is always overridden to 1.
    """
    # Work on a copy so the caller's dict is untouched, and force the
    # scheduling_unit value to 1.
    config = dict(config or {})
    config["scheduling_unit"] = 1

    logging.info("Running a minimal test experiment locally")
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        core_context, env = det._make_local_execution_env(
            managed_training=True,
            test_mode=True,
            config=config,
            checkpoint_dir=checkpoint_dir,
            limit_gpus=1,
        )

        workloads = _make_test_workloads(env.experiment_config)
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(f"Using a test experiment config: {env.experiment_config}.")

        backend = det._DistributedBackend()
        controller_cls = trial_class.trial_controller_class
        assert controller_cls is not None
        # The pre-execute hook runs before the trial context is created.
        controller_cls.pre_execute_hook(env, backend)

        context = trial_class.trial_context_class(core_context, env)
        logging.info(f"Creating {trial_class.__name__}.")
        trial = trial_class(context)

        controller = controller_cls.from_trial(
            trial_inst=trial,
            context=context,
            env=env,
            workloads=workloads,
        )
        controller.run()

        logging.info("The test experiment passed.")
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
def main(train_entrypoint: str) -> int:
    """Run a trial on-cluster from the given training entrypoint.

    Args:
        train_entrypoint: Entrypoint specification passed to
            ``load.trial_class_from_entrypoint`` to locate the user's Trial
            class.

    Returns:
        ``0`` after the trial controller's ``run()`` has returned.

    Raises:
        AssertionError: If no cluster info is available, or the task type is
            not ``"TRIAL"``.
        ValueError: If the task spans more than one container or slot and
            none of the horovod, deepspeed, or torch backends is in use.
    """
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer, and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # Creating this big-blob-of-config object was natural, but passing it into the lowest layers
    # of the harness code was a mistake: it is too large to be easily mockable, which is part of
    # why building local training mode has always been a challenge.
    #
    # The better pattern is to pass exactly the information each layer needs. Future high-level
    # APIs will follow that pattern, but refactoring e.g. the TFKerasTrialController or
    # EstimatorTrialController to match is not worth it, so the legacy strategy stays for now.
    trial_info = info.trial
    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=trial_info._config,
        hparams=trial_info.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=trial_info._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=trial_info._debug,
        det_trial_unique_port_offset=trial_info._unique_port_offset,
        det_trial_id=str(trial_info.trial_id),
        det_experiment_id=str(trial_info.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=trial_info.trial_seed,
        trial_run_id=trial_info._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # A core.Context cannot be built without rank information, rank information cannot be
        # gathered until the distributed backend is initialized, and the correct distributed
        # backend cannot be chosen until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)

        if info.container_rank == 0:
            # Analytics are best-effort: a failure is logged and otherwise ignored.
            try:
                analytics.send_analytics(
                    "trial_loaded", analytics.get_trial_analytics(trial_class)
                )
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: With the dtrain framework initialized, build the DistributedContext object.
        # harness.py only supports a fixed set of Determined-provided launch layers, because
        # the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        else:
            # No distributed backend is active; that is only valid for single-slot tasks.
            multi_slot = len(info.container_addrs) > 1 or len(info.slot_ids) > 1
            if multi_slot:
                raise ValueError(
                    "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                    "directly. Instead, it must be wrapped in one of the following launch layers: "
                    "determined.launch.horovod, determined.launch.deepspeed"
                )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )
            controller.run()

    return 0