def _configure_amp(self) -> None:
    if self.use_amp():
        if self.hvd_config.use:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Mixed precision training (AMP) is not supported with "
                "aggregation frequency > 1.",
            )
        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )
        check.false(
            not self.hvd_config.use and self.n_gpus > 1,
            "To enable mixed precision training (AMP) for parallel training, "
            'please set `resources["optimized_parallel"] = True`.',
        )
        logging.info(
            f"Enabling mixed precision training with opt_level: {self._get_amp_setting()}."
        )
        self.context.model, self.context.optimizer = apex.amp.initialize(
            self.context.model,
            self.context.optimizer,
            opt_level=self._get_amp_setting(),
            verbosity=1 if self.is_chief or self.env.experiment_config.debug_enabled() else 0,
        )
def _prepare_metrics_reducers(self, keys: Any) -> Dict[str, Reducer]:
    metrics_reducers = {}  # type: Dict[str, Reducer]
    if isinstance(self.trial.evaluation_reducer(), Dict):
        metrics_reducers = cast(Dict[str, Any], self.trial.evaluation_reducer())
        check.eq(
            metrics_reducers.keys(),
            keys,
            "Please provide a single evaluation reducer or "
            "provide a reducer for every validation metric. "
            f"Expected keys: {keys}, provided keys: {metrics_reducers.keys()}.",
        )
    elif isinstance(self.trial.evaluation_reducer(), Reducer):
        for key in keys:
            metrics_reducers[key] = cast(Reducer, self.trial.evaluation_reducer())

    for key in keys:
        check.true(
            isinstance(metrics_reducers[key], Reducer),
            "Please select `det.pytorch.Reducer` for reducing validation metrics.",
        )

    return metrics_reducers
def __init__(
    self,
    context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
    train_config: keras.TFKerasTrainConfig,
) -> None:
    super().__init__(context=context)

    self._training_cacheable = self._context.experimental.get_train_cacheable()
    self._training_dataset = train_config.training_data

    check.true(
        self._training_cacheable.is_decorator_used(),
        "Please use `@context.experimental.cache_train_dataset(dataset_name, dataset_version)`"
        " for the training dataset.",
    )
    check.false(
        self._context.dataset_initialized,
        "Please do not use: `context.wrap_dataset(dataset)` if using "
        "`@context.experimental.cache_train_dataset()` and "
        "`@context.experimental.cache_validation_dataset()`.",
    )
    check.is_instance(
        train_config.training_data,
        tf.data.Dataset,
        "Pass in a `tf.data.Dataset` object if using "
        "`@context.experimental.cache_train_dataset()`.",
    )
def _combine_metrics_across_processes(
    self, metrics: Dict[str, Any], num_batches: int
) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
    # The chief receives the metric from every other training process.
    check.true(self.hvd_config.use)

    metrics_lists = {}  # type: Dict[str, Any]
    batches_per_process = []  # type: List[int]
    if self.is_chief:
        self.train_process_comm_chief = cast(ipc.ZMQServer, self.train_process_comm_chief)
        worker_metrics = self.train_process_comm_chief.barrier(num_connections=hvd.size() - 1)
        worker_metrics = cast(List[ipc.MetricsInfo], worker_metrics)

        for metric_name in metrics.keys():
            metrics_lists[metric_name] = [metrics[metric_name]]
            for worker_metric in worker_metrics:
                metrics_lists[metric_name].append(worker_metric.metrics[metric_name])

        batches_per_process.append(num_batches)
        for worker_metric in worker_metrics:
            batches_per_process.append(worker_metric.num_batches)

        return metrics_lists, batches_per_process
    else:
        self.train_process_comm_worker = cast(ipc.ZMQClient, self.train_process_comm_worker)
        self.train_process_comm_worker.barrier(
            message=ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
        )
        return None, None
def _initialize_train_process_comm(self) -> None:
    check.true(self.hvd_config.use)

    srv_pub_port = (
        constants.INTER_TRAIN_PROCESS_COMM_PORT_1 + self.env.det_trial_unique_port_offset
    )
    srv_pull_port = (
        constants.INTER_TRAIN_PROCESS_COMM_PORT_2 + self.env.det_trial_unique_port_offset
    )

    if self.is_chief:
        logging.debug(f"Chief setting up server with ports {srv_pub_port}/{srv_pull_port}.")
        self.train_process_comm_chief = ipc.ZMQBroadcastServer(
            num_connections=self.env.experiment_config.slots_per_trial() - 1,
            pub_port=srv_pub_port,
            pull_port=srv_pull_port,
        )
    else:
        chief_ip_address = self.rendezvous_info.get_ip_addresses()[0]
        logging.debug(
            f"Non-Chief {hvd.rank()} setting up comm to "
            f"{chief_ip_address} w/ ports "
            f"{srv_pub_port}/{srv_pull_port}."
        )
        self.train_process_comm_worker = ipc.ZMQBroadcastClient(
            srv_pub_url=f"tcp://{chief_ip_address}:{srv_pub_port}",
            srv_pull_url=f"tcp://{chief_ip_address}:{srv_pull_port}",
        )
def set_runpy_trial_result(
    cls, trial_cls: Type[det.Trial], controller_cls: Type[det.TrialController]
) -> None:
    check.true(cls.get_instance().controller_cls is None, "Please don't load twice.")
    cls.get_instance().trial_cls = trial_cls
    cls.get_instance().controller_cls = controller_cls
    raise det.errors.StopLoadingImplementation()
def _init_run_config( self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig: logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .") session_config = config.session_config train_distribute = None eval_distribute = None # The default session should already be defined, here we also set the session # for the estimator itself. self._init_session_config(session_config, self.env, self.hvd_config) if not self.hvd_config.use and len(self.env.container_gpus) > 1: check.true(len(self.rendezvous_info.get_addrs()) == 1) train_distribute = tf.distribute.MirroredStrategy() eval_distribute = tf.distribute.MirroredStrategy() config = config.replace( model_dir=str(self.estimator_dir), tf_random_seed=self.env.trial_seed, save_checkpoints_steps=None, # `train_and_evaluate()` requires that either # `save_checkpoints_steps` or `save_checkpoints_secs` is # set to greater than 0. save_checkpoints_secs=VERY_LARGE_NUMBER, session_config=session_config, train_distribute=train_distribute, eval_distribute=eval_distribute, experimental_distribute=None, ) logging.debug(f"Initialized RunConfig with args: {config}.") return config
def decode_bytes(s: str) -> str:
    r"""
    Hasura sends over any bytea value as the two-character string '\x' followed by the hex
    encoding of the bytes. This function turns such a value into the corresponding string.
    """
    check.true(s.startswith(r"\x"), "Invalid log value received")
    return bytes.fromhex(s[2:]).decode()
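# Usage sketch (illustrative, not part of the original source): the r"\x" prefix is
# stripped and the remaining characters are hex-decoded back into a UTF-8 string, so
# decode_bytes(r"\x68656c6c6f") returns "hello".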
def set_runpy_native_result(
    cls, context: det.NativeContext, controller_cls: Type[det.TrialController]
) -> None:
    check.true(cls.get_instance().controller_cls is None, "Please don't load twice.")
    cls.get_instance().context = context
    cls.get_instance().controller_cls = controller_cls
def average_metrics(self, metrics: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    check.true(self.hvd_config.use)
    if self.is_chief:
        self.train_process_comm_chief = cast(
            ipc.ZMQBroadcastServer, self.train_process_comm_chief
        )
        logging.debug(f"Chief {hvd.rank()} beginning receiving validation metrics.")
        worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(lambda: None)
        self.train_process_comm_chief.broadcast(None)
        logging.debug(f"Chief {hvd.rank()} done receiving validation metrics.")
        for metric_name in metrics:
            if isinstance(metrics[metric_name], numbers.Number):
                metrics[metric_name] /= hvd.size()
            else:
                logging.warning(f"Skipping averaging metric: {metric_name}.")
        for metric_name in metrics.keys():
            for worker_metric in worker_metrics:
                if isinstance(worker_metric[metric_name], numbers.Number):
                    metrics[metric_name] += worker_metric[metric_name] / hvd.size()
        return metrics
    else:
        self.train_process_comm_worker = cast(
            ipc.ZMQBroadcastClient, self.train_process_comm_worker
        )
        logging.debug(f"Worker {hvd.rank()} sending metrics.")
        self.train_process_comm_worker.send(metrics)
        # Synchronize with the chief so that there is no risk of accidentally calling send()
        # for a future gather before all workers have called send() on this gather.
        _ = self.train_process_comm_worker.recv()
        return None
def average_metrics(self, metrics: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    # The chief receives the metric from every worker and computes the average.
    check.true(self.hvd_config.use)
    if self.is_chief:
        self.train_process_comm_chief = cast(ipc.ZMQServer, self.train_process_comm_chief)
        logging.debug(f"Chief {hvd.rank()} beginning receiving validation metrics.")
        worker_metrics = self.train_process_comm_chief.barrier(num_connections=hvd.size() - 1)
        logging.debug(f"Chief {hvd.rank()} done receiving validation metrics.")
        for metric_name in metrics:
            if isinstance(metrics[metric_name], numbers.Number):
                metrics[metric_name] /= hvd.size()
            else:
                logging.warning(f"Skipping averaging metric: {metric_name}.")
        for metric_name in metrics.keys():
            for worker_metric in worker_metrics:
                if isinstance(worker_metric[metric_name], numbers.Number):
                    metrics[metric_name] += worker_metric[metric_name] / hvd.size()
        return metrics
    else:
        self.train_process_comm_worker = cast(ipc.ZMQClient, self.train_process_comm_worker)
        logging.debug(f"Worker {hvd.rank()} sending metrics.")
        self.train_process_comm_worker.barrier(message=metrics)
        return None
def _init_run_config( self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig: logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .") session_config = config.session_config train_distribute = None eval_distribute = None if self.hvd_config.use: if session_config is None: session_config = tf.compat.v1.ConfigProto() session_config.gpu_options.allow_growth = True session_config.gpu_options.visible_device_list = self.env.slot_ids[ horovod.hvd.local_rank()] elif len(self.env.container_gpus) > 1: check.true(len(self.rendezvous_info.get_addrs()) == 1) train_distribute = tf.distribute.MirroredStrategy() eval_distribute = tf.distribute.MirroredStrategy() config = config.replace( model_dir=str(self.estimator_dir), tf_random_seed=self.env.trial_seed, save_checkpoints_steps=None, # `train_and_evaluate()` requires that either # `save_checkpoints_steps` or `save_checkpoints_secs` is # set to greater than 0. save_checkpoints_secs=VERY_LARGE_NUMBER, session_config=session_config, train_distribute=train_distribute, eval_distribute=eval_distribute, experimental_distribute=None, ) logging.debug(f"Initialized RunConfig with args: {config}.") return config
def after_run(
    self,
    run_context: tf.estimator.SessionRunContext,
    run_values: tf.estimator.SessionRunValues,
) -> None:
    # Check for optimizer creation here because when model_fn is passed in as a closure,
    # the optimizer is not initialized until the first training step.
    check.true(
        self.estimator_trial_controller.context.optimizer_initialized,
        "Please pass your optimizer into "
        "`det.estimator.wrap_optimizer(optimizer)` "
        "right after creating it.",
    )
    self._session = run_context.session
    self._current_global_step = run_values.results["global_step"]

    self.num_batches = cast(int, self.num_batches)
    self._collect_batch_metrics(run_values)
    self.batches_processed_in_step += 1
    if self.batches_processed_in_step < self.num_batches:
        return

    # TODO: Average training results across GPUs. This might
    # degrade performance due to an increase in communication.

    # Loss training metric is sometimes called `loss_1` instead of `loss`.
    for step_metrics in self.step_metrics:
        if "loss" not in step_metrics and "loss_1" in step_metrics:
            step_metrics["loss"] = step_metrics["loss_1"]

    # Send the result of the training step back to the main process.
    check.is_not_none(self.train_response_func, "no response_func at end of train_for_step")
    self.train_response_func = cast(workload.ResponseFunc, self.train_response_func)
    if self.estimator_trial_controller.is_chief:
        response = {
            "metrics": det.util.make_metrics(self.batches_processed_in_step, self.step_metrics),
            "stop_requested": self.estimator_trial_controller.context.get_stop_requested(),
            "invalid_hp": False,
        }
        self.train_response_func(response)
    else:
        self.train_response_func(workload.Skipped())

    # Reset step counter and clear the step metrics from memory.
    self.train_response_func = None
    self.batches_processed_in_step = 0
    self.step_metrics = []

    estimator._cleanup_after_train_step(self.estimator_trial_controller.estimator_dir)

    # Re-enter the control loop (block on receiving the next instruction).
    self.control_loop()
def __init__(self, *args: Any, **kwargs: Any) -> None:
    super().__init__(*args, **kwargs)
    check_startup_hook_ran = self.env.hparams.get("check_startup_hook_ran", False)
    if check_startup_hook_ran:
        check.true(os.path.isfile("startup-hook-ran"), "File should exist.")
    self.chaos = random.SystemRandom()
    self._batch_size = self.context.get_per_slot_batch_size()
    self.chaos_probability = self.env.hparams.get("chaos_probability", 0)
    self.chaos_probability_train = self.env.hparams.get("chaos_probability_train")
    self.chaos_probability_validate = self.env.hparams.get("chaos_probability_validate")
    self.chaos_probability_checkpoint = self.env.hparams.get("chaos_probability_checkpoint")
    self.fail_on_first_validation = self.env.hparams.get("fail_on_first_validation", "")
    self.fail_on_chechpoint_save = self.env.hparams.get("fail_on_chechpoint_save", "")
    self.validation_set_size = self.env.hparams.get("validation_set_size", 32 * 32)
    self.train_batch_secs = self.env.hparams.get("training_batch_seconds", 0)
    self.validation_secs = self.env.hparams.get(
        "validation_seconds",
        self.validation_set_size * self.train_batch_secs / self._batch_size,
    )
    self.num_training_metrics = self.env.hparams.get("num_training_metrics", 1)
    assert self.num_training_metrics > 0
    self.num_validation_metrics = self.env.hparams.get("num_validation_metrics", 1)
    assert self.num_validation_metrics > 0
    self.save_secs = self.env.hparams.get("save_checkpoint_seconds", 0)
    self.load_secs = self.env.hparams.get("load_checkpoint_secs", 0)
    self.metrics_progression = self.env.hparams.get("metrics_progression", "decreasing")
    assert self.metrics_progression in ("increasing", "decreasing", "constant")
    self.metrics_base = self.env.hparams.get("metrics_base", 0.9)
    assert 0 < self.metrics_base < 1
    self.metrics_sigma = self.env.hparams.get("metrics_sigma", 0.0)
    assert 0 <= self.metrics_sigma
    self.write_null = self.env.hparams.get("write_null", False)
    self.request_stop = self.env.hparams.get("request_stop", False)
    if self.load_path is None:
        self.trained_steps = collections.Counter()
    else:
        self.load(self.load_path)
def get_runpy_result(
    cls,
) -> Tuple[Optional[det.NativeContext], Optional[Type[det.Trial]], Type[det.TrialController]]:
    check.true(
        cls.get_instance().controller_cls is not None, "Please load native implementation."
    )
    return (
        cls.get_instance().context,
        cls.get_instance().trial_cls,
        cast(Type[det.TrialController], cls.get_instance().controller_cls),
    )
def _check_if_trial_supports_configurations(self, env: det.EnvContext) -> None:
    if self.env.experiment_config.mixed_precision_enabled():
        check.true(
            self.supports_mixed_precision(),
            "Mixed precision training is not supported for this framework interface. "
            'Please set `mixed_precision = "O0"`.',
        )
    if env.experiment_config.averaging_training_metrics_enabled():
        check.true(self.supports_averaging_training_metrics())
def __init__(
    self,
    context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
    train_config: keras.TFKerasTrainConfig,
) -> None:
    super().__init__(context=context)

    check.true(
        self._context.dataset_initialized,
        "Please use: `context.wrap_dataset(dataset)` if using `tf.data.Dataset`.",
    )
    self._validation_dataset = train_config.validation_data
def load_native_implementation_controller(
    env: det.EnvContext,
    workloads: workload.Stream,
    load_path: Optional[pathlib.Path],
    rendezvous_info: det.RendezvousInfo,
    hvd_config: horovod.HorovodContext,
) -> det.TrialController:
    check.true(
        env.experiment_config.native_enabled(),
        "Experiment configuration does not have an internal.native "
        f"configuration: {env.experiment_config}",
    )

    context, trial_class, controller_class = load.load_native_implementation(env, hvd_config)
    if trial_class is not None:
        return load_controller_from_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
    else:
        # Framework-specific native implementation.
        check.is_not_none(
            controller_class,
            "The class attribute `trial_controller_class` is "
            "None; please set it to the correct subclass of `det.TrialController`",
        )
        check.is_subclass(
            controller_class,
            det.TrialController,
            "The class attribute `trial_controller_class` is "
            "not a valid subclass of `det.TrialController`",
        )
        logging.info(f"Creating {controller_class.__name__} with {type(context).__name__}.")
        return cast(det.TrialController, controller_class).from_native(
            context=cast(det.NativeContext, context),
            env=env,
            workloads=workloads,
            load_path=load_path,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
def convert_notebook_to_python_script(notebook_path: str) -> str:
    check.check_true(
        notebook_path.endswith(".ipynb"),
        f"Notebook file {notebook_path} must have the suffix .ipynb",
    )
    processed_cells_path = f"{notebook_path[:-6]}__det__.py"

    with open(notebook_path, "r") as f1, open(processed_cells_path, "w") as f2:
        obj = json.load(f1)
        check.true("cells" in obj, f"Invalid notebook file {notebook_path}")
        for cell in obj["cells"]:
            if cell["cell_type"] == "code":
                lines = [line for line in cell["source"] if not line.lstrip().startswith("!")]
                f2.writelines(lines)
                f2.write("\n")
    return processed_cells_path
def restore_path(self, metadata: StorageMetadata) -> Iterator[str]:
    """
    Prepare a local directory exposing the checkpoint. Do some simple checks to make sure the
    configuration seems reasonable.
    """
    storage_dir = os.path.join(self._base_path, metadata.storage_id)
    check.true(
        os.path.exists(storage_dir),
        "Storage directory does not exist: {}. Please verify "
        "that you are using the correct configuration value for "
        "checkpoint_storage.host_path".format(storage_dir),
    )
    check.true(
        os.path.isdir(storage_dir), "Checkpoint path is not a directory: {}".format(storage_dir)
    )
    yield storage_dir
def _average_training_metrics(
    self, per_batch_metrics: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """Average training metrics across GPUs."""
    check.true(self.hvd_config.use, "Can only average training metrics in multi-GPU training.")
    metrics_timeseries = util._list_to_dict(per_batch_metrics)

    # combined_timeseries is: dict[metric_name] -> 2d-array.
    # A measurement is accessed via combined_timeseries[metric_name][process_idx][batch_idx].
    combined_timeseries, _ = self._combine_metrics_across_processes(
        metrics_timeseries, num_batches=len(per_batch_metrics)
    )

    # If the value for a metric is a single-element array, the averaging process will
    # change that into just the element. We record what metrics are single-element arrays
    # so we can wrap them in an array later (for perfect compatibility with the
    # non-averaging codepath).
    array_metrics = []
    for metric_name in per_batch_metrics[0].keys():
        if isinstance(per_batch_metrics[0][metric_name], np.ndarray):
            array_metrics.append(metric_name)

    if self.is_chief:
        combined_timeseries_type = Dict[str, List[List[Any]]]
        combined_timeseries = cast(combined_timeseries_type, combined_timeseries)
        num_batches = len(per_batch_metrics)
        num_processes = hvd.size()
        averaged_metrics_timeseries = {}  # type: Dict[str, List]

        for metric_name in combined_timeseries.keys():
            averaged_metrics_timeseries[metric_name] = []
            for batch_idx in range(num_batches):
                batch = [
                    combined_timeseries[metric_name][process_idx][batch_idx]
                    for process_idx in range(num_processes)
                ]

                np_batch = np.array(batch)
                batch_avg = np.mean(np_batch[np_batch != None])  # noqa: E711
                if metric_name in array_metrics:
                    batch_avg = np.array(batch_avg)
                averaged_metrics_timeseries[metric_name].append(batch_avg)
        per_batch_metrics = util._dict_to_list(averaged_metrics_timeseries)
    return per_batch_metrics
def _initialize_train_process_comm(self) -> None: check.true(self.hvd_config.use) if self.is_chief: logging.debug(f"Chief {hvd.rank()} setting up server with " f"port {constants.INTER_TRAIN_PROCESS_COMM_PORT}.") self.train_process_comm_chief = ipc.ZMQServer( ports=[constants.INTER_TRAIN_PROCESS_COMM_PORT], num_connections=1) else: chief_ip_address = self.rendezvous_info.get_ip_addresses()[0] logging.debug( f"Non-Chief {hvd.rank()} setting up comm to " f"{chief_ip_address} w/ port {constants.INTER_TRAIN_PROCESS_COMM_PORT}." ) self.train_process_comm_worker = ipc.ZMQClient( ip_address=chief_ip_address, port=constants.INTER_TRAIN_PROCESS_COMM_PORT)
def validate_batch_metrics(batch_metrics: List[Dict[str, Any]]) -> None:
    metric_dict = _list_to_dict(batch_metrics)

    # We expect that every batch has a metric named "loss".
    check.true(
        any(v for v in metric_dict if v.startswith("loss")),
        "model did not compute 'loss' training metric",
    )

    # We expect that all batches have the same set of metrics.
    metric_dict_keys = metric_dict.keys()
    for idx, metric_dict in zip(range(len(batch_metrics)), batch_metrics):
        keys = metric_dict.keys()
        if metric_dict_keys == keys:
            continue
        check.eq(metric_dict_keys, keys, "inconsistent training metrics: index: {}".format(idx))
def _launch_horovodrun(self) -> subprocess.Popen:
    check.true(self.hvd_config.use)
    logging.debug(f"Starting training process on: {self.rendezvous_info.get_rank()}.")

    horovod_process_cmd = horovod.create_run_command(
        num_gpus_per_machine=self.num_gpus,
        ip_addresses=self.rendezvous_info.get_ip_addresses(),
        env=self.env,
        debug=self.env.experiment_config.debug_enabled(),
        optional_args=self.env.experiment_config.horovod_optional_args(),
        worker_process_env_path=self._worker_process_env_path,
    )
    subprocess_env = {
        **os.environ,
        "NCCL_DEBUG": "INFO",
        "DET_HOROVOD_GLOO_RENDEZVOUS_PORT": str(constants.HOROVOD_GLOO_RENDEZVOUS_PORT),
    }
    return subprocess.Popen(horovod_process_cmd, env=subprocess_env)
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
def _init_run_config( self, config: tf.estimator.RunConfig) -> tf.estimator.RunConfig: logging.debug(f"Initializing RunConfig. Got RunConfig: {config} .") session_config = config.session_config train_distribute = None eval_distribute = None if self.hvd_config.use: if session_config is None: session_config = tf.compat.v1.ConfigProto() session_config.gpu_options.allow_growth = True # If using CUDA_VISIBLE_DEVICES there is only one visible GPU # so there is no need to set visible devices for TF. # TODO (DET-3762): Remove this once it's no longer necessary. if not self.env.experiment_config.get("data", {}).get( "set_cuda_visible_devices", False): session_config.gpu_options.visible_device_list = str( self.env.slot_ids[horovod.hvd.local_rank()]) elif len(self.env.container_gpus) > 1: check.true(len(self.rendezvous_info.get_addrs()) == 1) train_distribute = tf.distribute.MirroredStrategy() eval_distribute = tf.distribute.MirroredStrategy() config = config.replace( model_dir=str(self.estimator_dir), tf_random_seed=self.env.trial_seed, save_checkpoints_steps=None, # `train_and_evaluate()` requires that either # `save_checkpoints_steps` or `save_checkpoints_secs` is # set to greater than 0. save_checkpoints_secs=VERY_LARGE_NUMBER, session_config=session_config, train_distribute=train_distribute, eval_distribute=eval_distribute, experimental_distribute=None, ) logging.debug(f"Initialized RunConfig with args: {config}.") return config
def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
    ds = f(*args, **kwargs)

    if self.context.experimental.get_train_cacheable().is_decorator_used():
        check.false(
            self.context.dataset_initialized,
            "Please do not use: `context.wrap_dataset(dataset)` if using "
            "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
            "and `@context.experimental.cache_validation_dataset(dataset_name, "
            "dataset_version)`.",
        )
    else:
        check.true(
            self.context.dataset_initialized,
            "Please pass your datasets (train and test) into "
            "`context.wrap_dataset(dataset)` right after creating them.",
        )

    if isinstance(ds, tf.data.Dataset):
        ds = ds.repeat()

    return ds
def wrap_scaler(self, scaler: Any) -> Any:
    """
    Prepares to use automatic mixed precision through PyTorch's native AMP API. The returned
    scaler should be passed to ``step_optimizer``, but usage does not otherwise differ from
    vanilla PyTorch APIs. Loss should be scaled before calling ``backward``, ``unscale_``
    should be called before clipping gradients, ``update`` should be called after stepping
    all optimizers, etc.

    PyTorch 1.6 or greater is required for this feature.

    Arguments:
        scaler (``torch.cuda.amp.GradScaler``): Scaler to wrap and track.

    Returns:
        The scaler. It may be wrapped to add additional functionality for use in Determined.
    """
    check.false(amp_import_error, "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")
    check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")
    check.is_none(self._scaler, "Please only call wrap_scaler or use_amp once.")
    check.true(len(self.models) == 0, "Please call wrap_scaler before wrap_model.")
    check.true(
        torch.cuda.is_available(),
        "Mixed precision training (AMP) is supported only on GPU slots.",
    )
    self._scaler = scaler
    return scaler
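# A minimal, self-contained sketch of the vanilla torch.cuda.amp pattern the docstring
# above refers to (illustrative only: the model, optimizer, and tensors below are
# stand-ins, and this shows plain PyTorch usage rather than the Determined trial API,
# where the wrapped scaler is also passed to ``step_optimizer``).
import torch

if torch.cuda.is_available():
    model = torch.nn.Linear(4, 2).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler()

    inputs = torch.randn(8, 4, device="cuda")
    targets = torch.randn(8, 2, device="cuda")

    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(inputs), targets)

    scaler.scale(loss).backward()  # scale the loss before calling backward()
    scaler.unscale_(optimizer)     # unscale before clipping gradients
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    scaler.step(optimizer)         # skips the step if inf/NaN gradients were found
    scaler.update()                # adjust the scale for the next iteration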
def _full_storage_path(
    host_path: str,
    storage_path: Optional[str] = None,
    container_path: Optional[str] = None,
) -> str:
    """
    Return the full path to the storage_path, either as a subdirectory of the host_path in the
    host environment, where container_path must be None, or as a subdirectory of the
    container_path when in the container environment, where container_path must not be None.
    """
    check.true(os.path.isabs(host_path), "`host_path` must be an absolute path.")

    if storage_path is None:
        return host_path if container_path is None else container_path

    abs_path = os.path.normpath(os.path.join(host_path, storage_path))
    check.true(abs_path.startswith(host_path), "storage path must be a subdirectory of host path.")
    storage_path = os.path.relpath(abs_path, host_path)

    return os.path.join(host_path if container_path is None else container_path, storage_path)
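# Worked example (illustrative; the paths are hypothetical): with
# host_path="/mnt/checkpoints" and storage_path="exp1/trial3",
#   - outside a container (container_path=None) the result is
#     "/mnt/checkpoints/exp1/trial3";
#   - inside a container with container_path="/determined_shared_fs" the result is
#     "/determined_shared_fs/exp1/trial3".
# A storage_path such as "../elsewhere" normalizes outside host_path and fails the
# subdirectory check above.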
def _combine_metrics_across_processes(
    self, metrics: Dict[str, Any], num_batches: int
) -> Tuple[Optional[Dict[str, Any]], Optional[List[int]]]:
    # The chief receives the metric from every other training process.
    check.true(self.hvd_config.use)

    metrics_lists = {}  # type: Dict[str, Any]
    batches_per_process = []  # type: List[int]
    if self.is_chief:
        self.train_process_comm_chief = cast(
            ipc.ZMQBroadcastServer, self.train_process_comm_chief
        )
        worker_metrics, _ = self.train_process_comm_chief.gather_with_polling(lambda: None)
        self.train_process_comm_chief.broadcast(None)
        worker_metrics = cast(List[ipc.MetricsInfo], worker_metrics)

        for metric_name in metrics.keys():
            metrics_lists[metric_name] = [metrics[metric_name]]
            for worker_metric in worker_metrics:
                metrics_lists[metric_name].append(worker_metric.metrics[metric_name])

        batches_per_process.append(num_batches)
        for worker_metric in worker_metrics:
            batches_per_process.append(worker_metric.num_batches)

        return metrics_lists, batches_per_process
    else:
        self.train_process_comm_worker = cast(
            ipc.ZMQBroadcastClient, self.train_process_comm_worker
        )
        self.train_process_comm_worker.send(
            ipc.MetricsInfo(metrics=metrics, num_batches=num_batches)
        )
        # Synchronize with the chief so that there is no risk of accidentally calling send()
        # for a future gather before all workers have called send() on this gather.
        _ = self.train_process_comm_worker.recv()
        return None, None