Example #1
    def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
        """Returns a wrapped model."""

        if self.env.managed_training:
            check.false(self._use_apex,
                        "Must call wrap_model() before configure_apex_amp.")

            model = model.to(self.device)

            if self.distributed.size > 1 and self._distributed_backend.use_torch():
                wrapped_model = self._PyTorchDistributedDataParallel(model)
            else:
                wrapped_model = model

            self._wrapped_models[wrapped_model] = model
        else:
            wrapped_model = model

        model_id = len(self.models)
        self._main_model.__setattr__(f"model_{model_id}", wrapped_model)

        if self.experimental._auto_amp:
            wrapped_model = self.autocast_forward_pass(wrapped_model)

        self.models.append(wrapped_model)
        return wrapped_model
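For context, ``wrap_model`` is typically called from a trial's ``__init__``, and the returned object is what the rest of the trial should use. A minimal sketch under that assumption (the trial class, model, and hyperparameters are illustrative, not part of the example above; data-loader and evaluation methods are omitted):

from typing import Any, Dict

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext


class MyTrial(PyTorchTrial):  # data-loader and evaluation methods omitted for brevity
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        # wrap_model moves the model to the right device and, on multi-slot runs,
        # wraps it for distributed training; always use the returned object.
        self.model = self.context.wrap_model(torch.nn.Linear(10, 1))
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )

    def train_batch(self, batch: Any, epoch_idx: int, batch_idx: int) -> Dict[str, Any]:
        data, labels = batch
        loss = torch.nn.functional.mse_loss(self.model(data), labels)
        self.context.backward(loss)
        self.context.step_optimizer(self.optimizer)
        return {"loss": loss}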
Example #2
    def wrap_scaler(self, scaler: Any) -> Any:
        """
        Prepares to use automatic mixed precision through PyTorch’s native AMP API. The returned
        scaler should be passed to ``step_optimizer``, but usage does not otherwise differ from
        vanilla PyTorch APIs. Loss should be scaled before calling ``backward``, ``unscale_`` should
        be called before clipping gradients, ``update`` should be called after stepping all
        optimizers, etc.

        PyTorch 1.6 or greater is required for this feature.

        Arguments:
            scaler (``torch.cuda.amp.GradScaler``):  Scaler to wrap and track.

        Returns:
            The scaler. It may be wrapped to add additional functionality for use in Determined.
        """

        check.false(amp_import_error, "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")

        check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")

        check.is_none(self._scaler, "Please only call wrap_scaler or use_amp once.")

        check.true(len(self.models) == 0, "Please call wrap_scaler before wrap_model.")

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        self._scaler = scaler

        return scaler
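The docstring above describes the usage protocol in prose; a minimal sketch of how it might look in a trial, assuming the ``scaler`` argument of ``step_optimizer`` available in versions that support ``wrap_scaler`` (the model, loss, and hyperparameters are illustrative):

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext


class AmpTrial(PyTorchTrial):  # data-loader and evaluation methods omitted for brevity
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        # The checks above require wrap_scaler to be called before wrap_model.
        self.scaler = self.context.wrap_scaler(torch.cuda.amp.GradScaler())
        self.model = self.context.wrap_model(torch.nn.Linear(10, 2))
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )

    def train_batch(self, batch, epoch_idx, batch_idx):
        data, labels = batch
        with torch.cuda.amp.autocast():
            loss = torch.nn.functional.cross_entropy(self.model(data), labels)
        # Scale the loss before backward, pass the scaler to step_optimizer, and
        # call update() after all optimizers have stepped, as the docstring requires.
        self.context.backward(self.scaler.scale(loss))
        self.context.step_optimizer(self.optimizer, scaler=self.scaler)
        self.scaler.update()
        return {"loss": loss}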
Example #3
    def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
        """Returns a wrapped model."""

        if self.env.managed_training:
            check.false(self._use_apex, "Must call wrap_model() before configure_apex_amp.")

            model = model.to(self.device)
            if not self.hvd_config.use and self.n_gpus > 1:
                check.eq(
                    self.hvd_config.aggregation_frequency,
                    1,
                    "Please enable `optimized_parallel` to use aggregation "
                    "frequency greater than 1 for single machine multi-GPU "
                    "training.",
                )
                model = nn.DataParallel(model)
                logging.debug("Initialized model for native parallel training.")

        model_id = len(self.models)
        self._main_model.__setattr__(f"model_{model_id}", model)

        if self.experimental._auto_amp:
            model = self.autocast_forward_pass(model)

        self.models.append(model)
        return model
Example #4
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer,
        backward_passes_per_step: int = 1,
    ) -> torch.optim.Optimizer:
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

        ``backward_passes_per_step`` can be used to specify how many gradient aggregation
        steps will be performed in a single ``train_batch`` call per optimizer step.
        In most cases, this will just be the default value 1. However, this advanced
        functionality can be used to support training loops like the one shown below:

        .. code-block:: python

            def train_batch(
                self, batch: TorchData, epoch_idx: int, batch_idx: int
            ) -> Dict[str, torch.Tensor]:
                data, labels = batch
                output = self.model(data)
                loss1 = output['loss1']
                loss2 = output['loss2']
                self.context.backward(loss1)
                self.context.backward(loss2)
                self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
                return {"loss1": loss1, "loss2": loss2}

        """
        if self.env.managed_training:
            check.false(
                self._use_apex,
                "Must call wrap_optimizer() before configure_apex_amp.")
            check.gt_eq(
                backward_passes_per_step,
                1,
                "backward_passes_per_step for local gradient aggregation must be >= 1",
            )

            if self.distributed.size > 1 and self._distributed_backend.use_horovod():
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=(
                        backward_passes_per_step * self._aggregation_frequency
                    ),
                    compression=(
                        hvd.Compression.fp16 if self._fp16_compression else hvd.Compression.none
                    ),
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
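Complementing the ``train_batch`` shown in the docstring, the ``__init__`` side would wrap the optimizer with a matching ``backward_passes_per_step``. A sketch under that assumption (the two-head toy model is illustrative, not part of the example above):

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext


class TwoHeadModel(torch.nn.Module):
    """Toy model producing two losses, matching the train_batch in the docstring above."""

    def __init__(self) -> None:
        super().__init__()
        self.head1 = torch.nn.Linear(4, 1)
        self.head2 = torch.nn.Linear(4, 1)

    def forward(self, x: torch.Tensor) -> dict:
        return {"loss1": self.head1(x).mean(), "loss2": self.head2(x).mean()}


class TwoLossTrial(PyTorchTrial):  # train_batch as in the docstring; other methods omitted
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        # The optimizer must be built from the parameters of the wrapped model.
        self.model = self.context.wrap_model(TwoHeadModel())
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.Adam(self.model.parameters(), lr=1e-3),
            backward_passes_per_step=2,  # two context.backward() calls per optimizer step
        )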
Example #5
        def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
            ds = f(*args, **kwargs)

            if self.context.experimental.get_train_cacheable().is_decorator_used():
                check.false(
                    self.context.dataset_initialized,
                    "Please do not use: `context.wrap_dataset(dataset)` if using "
                    "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
                    "and `@context.experimental.cache_validation_dataset(dataset_name, "
                    "dataset_version)`.",
                )
            else:
                check.true(
                    self.context.dataset_initialized,
                    "Please pass your datasets (train and test) into "
                    "`context.wrap_dataset(dataset)` right after creating them.",
                )

            if isinstance(ds, tf.data.Dataset):
                ds = ds.repeat()

            return ds
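This wrapper expects trials that are not using the caching decorators to pass their datasets through ``context.wrap_dataset`` as soon as they are created. A minimal TF Keras sketch of that pattern (the toy dataset is illustrative; ``build_model`` and the validation loader are omitted):

import tensorflow as tf
from determined.keras import TFKerasTrial, TFKerasTrialContext


class MyKerasTrial(TFKerasTrial):  # build_model and validation data loader omitted for brevity
    def __init__(self, context: TFKerasTrialContext) -> None:
        self.context = context

    def build_training_data_loader(self) -> tf.data.Dataset:
        # Wrap the dataset right after creating it, before batching or other transforms.
        ds = tf.data.Dataset.from_tensor_slices((tf.zeros((8, 4)), tf.zeros((8,))))
        return self.context.wrap_dataset(ds).batch(4)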
Example #6
    def configure_apex_amp(
        self,
        models: Union[torch.nn.Module, List[torch.nn.Module]],
        optimizers: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
        enabled: Optional[bool] = True,
        opt_level: Optional[str] = "O1",
        cast_model_type: Optional[torch.dtype] = None,
        patch_torch_functions: Optional[bool] = None,
        keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
        master_weights: Optional[bool] = None,
        loss_scale: Optional[Union[float, str]] = None,
        cast_model_outputs: Optional[torch.dtype] = None,
        num_losses: Optional[int] = 1,
        verbosity: Optional[int] = 1,
        min_loss_scale: Optional[float] = None,
        max_loss_scale: Optional[float] = 2.0**24,
    ) -> Tuple:
        """
        Configure automatic mixed precision for your models and optimizers using NVIDIA's Apex
        PyTorch extension. Note that details for apex.amp are handled automatically within
        Determined after this call.

        This function must be called **after** you have finished constructing your models and
        optimizers with :meth:`wrap_model` and :meth:`wrap_optimizer`.

        This function has the same arguments as
        `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_.

        .. warning::
            When using distributed training and automatic mixed precision,
            we only support ``num_losses=1`` and calling backward on the loss once.

        Arguments:
            models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):  Model(s) to modify/cast.
            optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
                Optimizers to modify/cast. REQUIRED for training.
            enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops,
                so your script should run as if Amp were not present.
            opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.
                Accepted values are "O0", "O1", "O2", and "O3", explained in detail above.
            cast_model_type (``torch.dtype``, optional, default=None):  Optional property override,
                see above.
            patch_torch_functions (bool, optional, default=None):  Optional property override.
            keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.
                If passed as a string, must be the string "True" or "False".
            master_weights (bool, optional, default=None):  Optional property override.
            loss_scale (float or str, optional, default=None):  Optional property override.
                If passed as a string, must be a string representing a number, e.g., "128.0",
                or the string "dynamic".
            cast_model_outputs (``torch.dtype``, optional, default=None):  Option to ensure that
                the outputs of your model are always cast to a particular type regardless of
                ``opt_level``.
            num_losses (int, optional, default=1):  Option to tell Amp in advance how many
                losses/backward passes you plan to use.  When used in conjunction with the
                ``loss_id`` argument to ``amp.scale_loss``, enables Amp to use a different
                loss scale per loss/backward pass, which can improve stability.
                If ``num_losses`` is left at 1, Amp will still support multiple losses/backward
                passes, but use a single global loss scale for all of them.
            verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
            min_loss_scale (float, default=None):  Sets a floor for the loss scale values that
                can be chosen by dynamic loss scaling.  The default value of None means that no
                floor is imposed. If dynamic loss scaling is not used, ``min_loss_scale`` is
                ignored.
            max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values
                that can be chosen by dynamic loss scaling.  If dynamic loss scaling is not used,
                ``max_loss_scale`` is ignored.

        Returns:
            Model(s) and optimizer(s) modified according to the ``opt_level``.
            If the ``models`` or ``optimizers`` arguments were lists, the corresponding return
            values will also be lists.
        """
        if not self.env.managed_training:
            return models, optimizers

        check.is_none(self._scaler, "Do not mix APEX with PyTorch AMP")

        check.false(self._use_apex,
                    "Please only call configure_apex_amp once.")
        if self.distributed.size > 1:
            check.eq(
                num_losses,
                1,
                "When using parallel/distributed training, "
                "Determined only supports configure_apex_amp with num_losses = 1",
            )

        self._use_apex = True

        if self.distributed.size > 1:
            check.eq(
                self._aggregation_frequency,
                1,
                "Mixed precision training (AMP) is not supported with "
                "aggregation frequency > 1.",
            )

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        if self._distributed_backend.use_torch():
            # We need to get the pre-wrapped input models to initialize APEX; the DDP
            # wrappers are re-applied below after apex.amp.initialize has run.
            if isinstance(models, list):
                models = [self._wrapped_models[wrapped_model] for wrapped_model in models]
            else:
                models = self._wrapped_models[models]

        logging.info(f"Enabling mixed precision training with opt_level: {opt_level}.")
        models, optimizers = apex.amp.initialize(
            models=models,
            optimizers=optimizers,
            enabled=enabled,
            opt_level=opt_level,
            cast_model_type=cast_model_type,
            patch_torch_functions=patch_torch_functions,
            keep_batchnorm_fp32=keep_batchnorm_fp32,
            master_weights=master_weights,
            loss_scale=loss_scale,
            cast_model_outputs=cast_model_outputs,
            num_losses=num_losses,
            min_loss_scale=min_loss_scale,
            max_loss_scale=max_loss_scale,
            verbosity=(
                verbosity
                if self.distributed.get_rank() == 0 or self.env.experiment_config.debug_enabled()
                else 0
            ),
        )

        if not isinstance(models, list):
            self.models = [models]

        if self.distributed.size > 1 and self._distributed_backend.use_torch():
            # If Torch DDP is in use, re-wrap the models.
            self.models = [self._PyTorchDistributedDataParallel(model) for model in self.models]

        if not isinstance(optimizers, list):
            self.optimizers = [optimizers]
        return models, optimizers
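As the docstring requires, ``configure_apex_amp`` is called only after the models and optimizers have been wrapped. A minimal sketch of that ordering (the model and hyperparameters are illustrative; it must not be combined with ``wrap_scaler``):

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext


class ApexTrial(PyTorchTrial):  # data-loader and batch methods omitted for brevity
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        self.model = self.context.wrap_model(torch.nn.Linear(10, 2))
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01, momentum=0.9)
        )
        # configure_apex_amp comes last: after wrap_model and wrap_optimizer,
        # and never together with wrap_scaler (PyTorch-native AMP).
        self.model, self.optimizer = self.context.configure_apex_amp(
            models=self.model,
            optimizers=self.optimizer,
            opt_level="O1",
        )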