def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
    """Returns a wrapped model."""
    wrapped = model
    if self.env.managed_training:
        # wrap_model() must run before configure_apex_amp() (enforced here).
        check.false(self._use_apex, "Must call wrap_model() before configure_apex_amp.")
        model = model.to(self.device)
        use_torch_ddp = self.distributed.size > 1 and self._distributed_backend.use_torch()
        wrapped = self._PyTorchDistributedDataParallel(model) if use_torch_ddp else model
        # Map wrapper -> underlying model so the pre-wrapped model can be
        # recovered later (used when configuring APEX).
        self._wrapped_models[wrapped] = model
    # Register the (pre-autocast) wrapper as an attribute of the main module.
    setattr(self._main_model, f"model_{len(self.models)}", wrapped)
    if self.experimental._auto_amp:
        wrapped = self.autocast_forward_pass(wrapped)
    self.models.append(wrapped)
    return wrapped
def wrap_scaler(self, scaler: Any) -> Any:
    """
    Set up PyTorch's native automatic mixed precision by registering a gradient scaler.

    Pass the returned scaler to ``step_optimizer``; otherwise use it exactly as in
    vanilla PyTorch: scale the loss before ``backward``, call ``unscale_`` before
    clipping gradients, and call ``update`` after all optimizers have stepped.

    Requires PyTorch >= 1.6.

    Arguments:
        scaler (``torch.cuda.amp.GradScaler``): Scaler to wrap and track.

    Returns:
        The scaler, possibly wrapped with extra Determined functionality.
    """
    # Precondition order matters for which error message the user sees first.
    check.false(amp_import_error, "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")
    check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")
    check.is_none(self._scaler, "Please only call wrap_scaler or use_amp once.")
    check.true(not self.models, "Please call wrap_scaler before wrap_model.")
    check.true(
        torch.cuda.is_available(),
        "Mixed precision training (AMP) is supported only on GPU slots.",
    )
    self._scaler = scaler
    return scaler
def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
    """Returns a wrapped model."""
    if self.env.managed_training:
        check.false(self._use_apex, "Must call wrap_model() before configure_apex_amp.")
        model = model.to(self.device)
        # Single-machine multi-GPU without horovod: use torch's native DataParallel.
        if self.n_gpus > 1 and not self.hvd_config.use:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Please enable `optimized_parallel` to use aggregation "
                "frequency greater than 1 for single machine multi-GPU "
                "training.",
            )
            model = nn.DataParallel(model)
            logging.debug("Initialized model for native parallel training.")
    # Register the model as an attribute of the main module.
    next_id = len(self.models)
    setattr(self._main_model, f"model_{next_id}", model)
    if self.experimental._auto_amp:
        model = self.autocast_forward_pass(model)
    self.models.append(model)
    return model
def wrap_optimizer(
    self,
    optimizer: torch.optim.Optimizer,
    backward_passes_per_step: int = 1,
) -> torch.optim.Optimizer:
    """Returns a wrapped optimizer.

    The optimizer must use models wrapped by :meth:`wrap_model`. Under
    parallel/distributed training this creates a ``horovod.DistributedOptimizer``.

    ``backward_passes_per_step`` specifies how many gradient aggregation steps are
    performed per optimizer step within a single ``train_batch`` call. The default of 1
    suits most cases; the advanced form supports training loops such as:

    .. code-block:: python

        def train_batch(
            self, batch: TorchData, epoch_idx: int, batch_idx: int
        ) -> Dict[str, torch.Tensor]:
            data, labels = batch
            output = self.model(data)
            loss1 = output['loss1']
            loss2 = output['loss2']
            self.context.backward(loss1)
            self.context.backward(loss2)
            self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
            return {"loss1": loss1, "loss2": loss2}
    """
    if self.env.managed_training:
        check.false(self._use_apex, "Must call wrap_optimizer() before configure_apex_amp.")
        check.gt_eq(
            backward_passes_per_step,
            1,
            "backward_passes_per_step for local gradient aggregation must be >= 1",
        )
        use_hvd = self.distributed.size > 1 and self._distributed_backend.use_horovod()
        if use_hvd:
            # Hoisted for readability: pick gradient compression up front.
            compression = hvd.Compression.fp16 if self._fp16_compression else hvd.Compression.none
            optimizer = hvd.DistributedOptimizer(
                optimizer,
                named_parameters=self._filter_named_parameters(optimizer),
                backward_passes_per_step=backward_passes_per_step * self._aggregation_frequency,
                compression=compression,
            )
            logging.debug(
                "Initialized optimizer for distributed and optimized parallel training."
            )
    self.optimizers.append(optimizer)
    return optimizer
def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
    """Call the wrapped dataset factory and validate wrap_dataset/cache-decorator usage."""
    dataset = f(*args, **kwargs)
    cache_decorator_used = self.context.experimental.get_train_cacheable().is_decorator_used()
    if cache_decorator_used:
        # The cache decorators and wrap_dataset() are mutually exclusive.
        check.false(
            self.context.dataset_initialized,
            "Please do not use: `context.wrap_dataset(dataset)` if using "
            "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
            "and `@context.experimental.cache_validation_dataset(dataset_name, "
            "dataset_version)`.",
        )
    else:
        check.true(
            self.context.dataset_initialized,
            "Please pass your datasets (train and test) into "
            "`context.wrap_dataset(dataset)` right after creating them.",
        )
    if isinstance(dataset, tf.data.Dataset):
        dataset = dataset.repeat()
    return dataset
def configure_apex_amp(
    self,
    models: Union[torch.nn.Module, List[torch.nn.Module]],
    optimizers: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
    enabled: Optional[bool] = True,
    opt_level: Optional[str] = "O1",
    cast_model_type: Optional[torch.dtype] = None,
    patch_torch_functions: Optional[bool] = None,
    keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
    master_weights: Optional[bool] = None,
    loss_scale: Optional[Union[float, str]] = None,
    cast_model_outputs: Optional[torch.dtype] = None,
    num_losses: Optional[int] = 1,
    verbosity: Optional[int] = 1,
    min_loss_scale: Optional[float] = None,
    max_loss_scale: Optional[float] = 2.0**24,
) -> Tuple:
    """
    Configure automatic mixed precision for your models and optimizers using NVIDIA's Apex
    PyTorch extension. Note that details for apex.amp are handled automatically within
    Determined after this call.

    This function must be called **after** you have finished constructing your models and
    optimizers with :meth:`wrap_model` and :meth:`wrap_optimizer`.

    This function has the same arguments as
    `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_.

    .. warning::
        When using distributed training and automatic mixed precision,
        we only support ``num_losses=1`` and calling backward on the loss once.

    Arguments:
        models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):
            Model(s) to modify/cast.
        optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
            Optimizers to modify/cast. REQUIRED for training.
        enabled (bool, optional, default=True):
            If False, renders all Amp calls no-ops, so your script should run as if Amp
            were not present.
        opt_level (str, optional, default="O1"):
            Pure or mixed precision optimization level. Accepted values are "O0", "O1",
            "O2", and "O3", explained in detail above.
        cast_model_type (``torch.dtype``, optional, default=None):
            Optional property override, see above.
        patch_torch_functions (bool, optional, default=None):
            Optional property override.
        keep_batchnorm_fp32 (bool or str, optional, default=None):
            Optional property override. If passed as a string, must be the string "True"
            or "False".
        master_weights (bool, optional, default=None):
            Optional property override.
        loss_scale (float or str, optional, default=None):
            Optional property override. If passed as a string, must be a string
            representing a number, e.g., "128.0", or the string "dynamic".
        cast_model_outputs (torch.dtype, optional, default=None):
            Option to ensure that the outputs of your model is always cast to a
            particular type regardless of ``opt_level``.
        num_losses (int, optional, default=1):
            Option to tell Amp in advance how many losses/backward passes you plan to
            use. When used in conjunction with the ``loss_id`` argument to
            ``amp.scale_loss``, enables Amp to use a different loss scale per
            loss/backward pass, which can improve stability. If ``num_losses`` is left
            to 1, Amp will still support multiple losses/backward passes, but use a
            single global loss scale for all of them.
        verbosity (int, default=1):
            Set to 0 to suppress Amp-related output.
        min_loss_scale (float, default=None):
            Sets a floor for the loss scale values that can be chosen by dynamic loss
            scaling. The default value of None means that no floor is imposed. If
            dynamic loss scaling is not used, `min_loss_scale` is ignored.
        max_loss_scale (float, default=2.**24):
            Sets a ceiling for the loss scale values that can be chosen by dynamic loss
            scaling. If dynamic loss scaling is not used, `max_loss_scale` is ignored.

    Returns:
        Model(s) and optimizer(s) modified according to the ``opt_level``. If
        ``optimizers`` args were lists, the corresponding return value will also be a
        list.
    """
    # Outside managed training there is nothing to configure; hand back inputs as-is.
    if not self.env.managed_training:
        return models, optimizers
    # APEX and native PyTorch AMP are mutually exclusive, and this function is one-shot.
    check.is_none(self._scaler, "Do not mix APEX with PyTorch AMP")
    check.false(self._use_apex, "Please only call configure_apex_amp once.")
    if self.distributed.size > 1:
        check.eq(
            num_losses,
            1,
            "When using parallel/distributed training, "
            "Determined only supports configure_apex_amp with num_losses = 1",
        )
    self._use_apex = True
    if self.distributed.size > 1:
        # Gradient aggregation and APEX loss scaling are not supported together.
        check.eq(
            self._aggregation_frequency,
            1,
            "Mixed precision training (AMP) is not supported with "
            "aggregation frequency > 1.",
        )
    check.true(
        torch.cuda.is_available(),
        "Mixed precision training (AMP) is supported only on GPU slots.",
    )
    if self._distributed_backend.use_torch():
        # We need to get the pre-wrapped input models to initialize APEX because
        # NOTE(review): the original comment was truncated here — presumably APEX
        # must patch the bare modules rather than the DDP wrappers; confirm.
        if isinstance(models, list):
            models = [
                self._wrapped_models[wrapped_model] for wrapped_model in models
            ]
        else:
            models = self._wrapped_models[models]
    logging.info(
        f"Enabling mixed precision training with opt_level: {opt_level}.")
    # Delegate to apex.amp.initialize; verbosity is silenced on non-chief ranks
    # unless debug is enabled, to avoid duplicated log output.
    models, optimizers = apex.amp.initialize(
        models=models,
        optimizers=optimizers,
        enabled=enabled,
        opt_level=opt_level,
        cast_model_type=cast_model_type,
        patch_torch_functions=patch_torch_functions,
        keep_batchnorm_fp32=keep_batchnorm_fp32,
        master_weights=master_weights,
        loss_scale=loss_scale,
        cast_model_outputs=cast_model_outputs,
        num_losses=num_losses,
        min_loss_scale=min_loss_scale,
        max_loss_scale=max_loss_scale,
        verbosity=verbosity
        if self.distributed.get_rank() == 0 or self.env.experiment_config.debug_enabled()
        else 0,
    )
    # NOTE(review): self.models / self.optimizers are replaced only when the inputs
    # were NOT lists; when lists are passed, the tracked collections are not
    # refreshed with APEX-initialized objects here — confirm this is intentional.
    if not isinstance(models, list):
        self.models = [models]
    if self.distributed.size > 1 and self._distributed_backend.use_torch():
        # If Torch DDP is in use, re-wrap the models
        self.models = [
            self._PyTorchDistributedDataParallel(model) for model in self.models
        ]
    if not isinstance(optimizers, list):
        self.optimizers = [optimizers]
    return models, optimizers