Code example #1
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer  # type: ignore
    ) -> torch.optim.Optimizer:  # type: ignore
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.
        """
        if self.env.training:
            check.false(
                self._use_amp,
                "Must call wrap_optimizer() before configure_apex_amp.")

            if self.hvd_config.use:
                use_compression = self.hvd_config.fp16_compression
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=self.hvd_config.aggregation_frequency,
                    compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none,
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
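
The docstring states that the optimizer passed in must be built from models that already went through :meth:`wrap_model`. A minimal user-side sketch of that ordering, assuming the public Determined PyTorch API (the trial class, layer sizes, and learning rate below are illustrative, and the other required trial methods are omitted):

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext

class MyTrial(PyTorchTrial):  # hypothetical trial, for illustration only
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        # Wrap the model first; the context moves it to the right device and,
        # when applicable, sets up data/distributed parallelism.
        self.model = self.context.wrap_model(torch.nn.Linear(10, 1))
        # Build the optimizer from the wrapped model's parameters, then wrap it;
        # under Horovod this yields a hvd.DistributedOptimizer as shown above.
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )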
Code example #2
    def _launch_fit(self) -> None:
        check.false(self.fit_loop_started)
        self.fit_loop_started = True

        self.tf_keras_callbacks.append(DeterminedEarlyStoppingCallback(self))
        self.tf_keras_callbacks.append(WaitForInstructionsCallback(self))

        profile_frequency = self.env.experiment_config.profile_frequency()
        if profile_frequency:
            self.tf_keras_callbacks.append(
                DeterminedProfiler(profile_frequency,
                                   DeterminedProfiler.OUTPUT_FILENAME))

        if self.hvd_config.use:
            # When using Horovod, broadcast initial variable states from rank 0 to
            # all other processes.
            self.tf_keras_callbacks.append(
                hvd.callbacks.BroadcastGlobalVariablesCallback(0))

        (
            training_input,
            batches_per_epoch,
        ) = self._train_input_manager.get_training_input_and_batches_per_epoch()

        _ = self.model.fit(
            training_input,
            callbacks=self.tf_keras_callbacks,
            shuffle=False,
            steps_per_epoch=batches_per_epoch,
            initial_epoch=self._train_input_manager.get_initial_epoch(),
            epochs=IMPOSSIBLY_LARGE_EPOCHS,
            validation_split=0,
            verbose=0,
        ).history
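
Outside of Determined, the rank-0 broadcast that _launch_fit appends above can be reproduced with plain Horovod and Keras. A minimal sketch using the standard Horovod Keras API (the model and data here are placeholders, not taken from the snippet):

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()

model = tf.keras.Sequential([tf.keras.layers.Dense(1, input_shape=(4,))])
model.compile(
    optimizer=hvd.DistributedOptimizer(tf.keras.optimizers.SGD(0.01)),
    loss="mse",
)

callbacks = [
    # Broadcast initial variable states from rank 0 so every worker starts
    # from identical weights, mirroring the callback added in _launch_fit.
    hvd.callbacks.BroadcastGlobalVariablesCallback(0),
]

model.fit(tf.zeros((8, 4)), tf.zeros((8, 1)), epochs=1, callbacks=callbacks, verbose=0)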
Code example #3
    def wrap_model(self, model: torch.nn.Module) -> torch.nn.Module:
        """Returns a wrapped model."""

        if self.env.managed_training:
            check.false(self._use_amp,
                        "Must call wrap_model() before configure_apex_amp.")

            model = model.to(self.device)
            if not self.hvd_config.use and self.n_gpus > 1:
                check.eq(
                    self.hvd_config.aggregation_frequency,
                    1,
                    "Please enable `optimized_parallel` to use aggregation "
                    "frequency greater than 1 for single machine multi-GPU "
                    "training.",
                )
                model = nn.DataParallel(model)
                logging.debug(
                    "Initialized model for native parallel training.")

        model_id = len(self.models)
        self._main_model.__setattr__(f"model_{model_id}", model)

        self.models.append(model)
        return model
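
For reference, the single-machine multi-GPU branch above amounts to the following plain PyTorch sequence. This sketch only restates that behavior and assumes CUDA is available:

import torch
import torch.nn as nn

model = nn.Linear(10, 1)
model = model.to(torch.device("cuda:0"))

# Without Horovod but with more than one visible GPU, the context falls back to
# torch.nn.DataParallel, which replicates the model across the available devices.
if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)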
Code example #4
    def _configure_amp(self) -> None:
        if self.use_amp():
            if self.hvd_config.use:
                check.eq(
                    self.hvd_config.aggregation_frequency,
                    1,
                    "Mixed precision training (AMP) is not supported with "
                    "aggregation frequency > 1.",
                )

            check.true(
                torch.cuda.is_available(),
                "Mixed precision training (AMP) is supported only on GPU slots.",
            )
            check.false(
                not self.hvd_config.use and self.n_gpus > 1,
                "To enable mixed precision training (AMP) for parallel training, "
                'please set `resources["optimized_parallel"] = True`.',
            )

            logging.info(
                f"Enabling mixed precision training with opt_level: {self._get_amp_setting()}."
            )
            self.context.model, self.context.optimizer = apex.amp.initialize(
                self.context.model,
                self.context.optimizer,
                opt_level=self._get_amp_setting(),
                verbosity=1 if self.is_chief or self.env.experiment_config.debug_enabled() else 0,
            )
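
The call at the end of _configure_amp is the standard NVIDIA Apex entry point. A stripped-down sketch of the same call outside Determined, assuming apex is installed and a GPU is present (the model and opt_level below are illustrative):

import torch
from apex import amp

model = torch.nn.Linear(10, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# opt_level "O1" patches torch functions to run in mixed precision; _configure_amp
# passes whatever _get_amp_setting() returns and silences output on non-chief ranks.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1", verbosity=1)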
Code example #5
File: _tf_keras_inputs.py  Project: yitang/determined
    def __init__(
        self,
        context: Union[keras.TFKerasTrialContext, keras.TFKerasNativeContext],
        train_config: keras.TFKerasTrainConfig,
    ) -> None:
        super().__init__(context=context)

        self._training_cacheable = self._context.experimental.get_train_cacheable()
        self._training_dataset = train_config.training_data

        check.true(
            self._training_cacheable.is_decorator_used(),
            "Please use `@context.experimental.cache_train_dataset(dataset_name, dataset_version)`"
            " for the training dataset.",
        )
        check.false(
            self._context.dataset_initialized,
            "Please do not use: `context.wrap_dataset(dataset)` if using "
            "`@context.experimental.cache_train_dataset()` and "
            "`@context.experimental.cache_validation_dataset()`.",
        )
        check.is_instance(
            train_config.training_data,
            tf.data.Dataset,
            "Pass in a `tf.data.Dataset` object if using "
            "`@context.experimental.cache_train_dataset()`.",
        )
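
The three checks above spell out how the caching decorator is expected to be used. A hedged sketch of a trial method that satisfies them, assuming the decorator wraps a function returning a tf.data.Dataset (the method name, dataset name/version, and data are illustrative assumptions):

import tensorflow as tf

class MyKerasTrial:  # stands in for a TFKerasTrial subclass
    def __init__(self, context) -> None:
        self.context = context

    def build_training_data_loader(self) -> tf.data.Dataset:
        @self.context.experimental.cache_train_dataset("my_dataset", "v1")
        def make_dataset() -> tf.data.Dataset:
            # Return the tf.data.Dataset directly; do not call
            # context.wrap_dataset() when the caching decorator is used.
            return tf.data.Dataset.from_tensor_slices((tf.zeros((8, 4)), tf.zeros((8, 1))))

        return make_dataset()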
Code example #6
    def wrap_optimizer(
        self,
        optimizer: torch.optim.Optimizer,  # type: ignore
        backward_passes_per_step: int = 1,
    ) -> torch.optim.Optimizer:  # type: ignore
        """Returns a wrapped optimizer.

        The optimizer must use the models wrapped by :meth:`wrap_model`. This function
        creates a ``horovod.DistributedOptimizer`` if using parallel/distributed training.

        `backward_passes_per_step` can be used to specify how many gradient aggregation
        steps will be performed in a single `train_batch` call per optimizer step.
        In most cases, this will just be the default value 1.  However, this advanced functionality
        can be used to support training loops like the one shown below:

        .. code-block:: python

            def train_batch(
                self, batch: TorchData, epoch_idx: int, batch_idx: int
            ) -> Dict[str, torch.Tensor]:
                data, labels = batch
                output = self.model(data)
                loss1 = output['loss1']
                loss2 = output['loss2']
                self.context.backward(loss1)
                self.context.backward(loss2)
                self.context.step_optimizer(self.optimizer, backward_passes_per_step=2)
                return {"loss1": loss1, "loss2": loss2}

        """
        if self.env.managed_training:
            check.false(
                self._use_amp,
                "Must call wrap_optimizer() before configure_apex_amp.")
            check.gt_eq(
                backward_passes_per_step,
                1,
                "backwar_passes_per_step for local gradient aggregation must be >= 1",
            )

            if self.hvd_config.use:
                use_compression = self.hvd_config.fp16_compression
                optimizer = hvd.DistributedOptimizer(
                    optimizer,
                    named_parameters=self._filter_named_parameters(optimizer),
                    backward_passes_per_step=(
                        backward_passes_per_step * self.hvd_config.aggregation_frequency
                    ),
                    compression=hvd.Compression.fp16 if use_compression else hvd.Compression.none,
                )
                logging.debug(
                    "Initialized optimizer for distributed and optimized parallel training."
                )

        self.optimizers.append(optimizer)
        return optimizer
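
To match the train_batch loop shown in the docstring, the optimizer itself would be wrapped with backward_passes_per_step=2. A short hedged sketch (the helper function and the Adam settings are illustrative):

import torch

def make_optimizer(context, model):
    # Hypothetical helper: accumulate two backward passes (loss1 and loss2 in the
    # docstring's train_batch) before each optimizer step.
    return context.wrap_optimizer(
        torch.optim.Adam(model.parameters(), lr=1e-3),
        backward_passes_per_step=2,
    )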
Code example #7
File: _pytorch_trial.py  Project: hugokce/determined
    def _clip_grads(self, parameters: Any) -> None:
        # TODO: Support clip by norm other than L2.
        clip_grad_l2_norm = self.env.hparams.get("clip_grad_l2_norm", None)
        clip_by_val = self.env.hparams.get("clip_grad_val", None)
        check.false(
            clip_grad_l2_norm is not None and clip_by_val is not None,
            "Please specify either `clip_grad_l2_norm` or `clip_grad_val` "
            "in your hparams, not both.",
        )
        if clip_grad_l2_norm is not None:
            logging.debug(f"Clipping gradients by L2 norm of: {clip_grad_l2_norm}.")
            torch.nn.utils.clip_grad_norm_(parameters, clip_grad_l2_norm)  # type: ignore
        elif clip_by_val is not None:
            logging.debug(f"Clipping gradients by value of: {clip_by_val}.")
            torch.nn.utils.clip_grad_value_(parameters, clip_by_val)  # type: ignore
        else:
            logging.debug("No gradient clipping enabled.")
Code example #8
        def wrapper(*args: Any, **kwargs: Any) -> tf.data.Dataset:
            ds = f(*args, **kwargs)

            if self.context.experimental.get_train_cacheable().is_decorator_used():
                check.false(
                    self.context.dataset_initialized,
                    "Please do not use: `context.wrap_dataset(dataset)` if using "
                    "`@context.experimental.cache_train_dataset(dataset_name, dataset_version)` "
                    "and `@context.experimental.cache_validation_dataset(dataset_name, "
                    "dataset_version)`.",
                )
            else:
                check.true(
                    self.context.dataset_initialized,
                    "Please pass your datasets (train and test) into "
                    "`context.wrap_dataset(dataset)` right after creating them.",
                )

            if isinstance(ds, tf.data.Dataset):
                ds = ds.repeat()

            return ds
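
When the caching decorator is not used, the else branch expects the dataset to have gone through context.wrap_dataset right after it was created, as the check message says. A minimal sketch of that path (the method name and dataset contents are illustrative):

import tensorflow as tf

def build_training_data_loader(self) -> tf.data.Dataset:
    ds = tf.data.Dataset.from_tensor_slices((tf.zeros((8, 4)), tf.zeros((8, 1))))
    # Register the dataset with Determined immediately after creating it;
    # wrapping is what sets the context.dataset_initialized flag checked above.
    return self.context.wrap_dataset(ds)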
Code example #9
    def wrap_scaler(self, scaler: Any) -> Any:
        """
        Prepares to use automatic mixed precision through PyTorch’s native AMP API. The returned
        scaler should be passed to ``step_optimizer``, but usage does not otherwise differ from
        vanilla PyTorch APIs. Loss should be scaled before calling ``backward``, ``unscale_`` should
        be called before clipping gradients, ``update`` should be called after stepping all
        optimizers, etc.

        PyTorch 1.6 or greater is required for this feature.

        Arguments:
            scaler (``torch.cuda.amp.GradScaler``):  Scaler to wrap and track.

        Returns:
            The scaler. It may be wrapped to add additional functionality for use in Determined.
        """

        check.false(
            amp_import_error,
            "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.")

        check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")

        check.is_none(self._scaler,
                      "Please only call wrap_scaler or use_amp once.")

        check.true(
            len(self.models) == 0,
            "Please call wrap_scaler before wrap_model.")

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        self._scaler = scaler

        return scaler
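
The docstring describes the standard torch.cuda.amp ordering: scale the loss before ``backward``, ``unscale_`` before clipping, ``update`` after stepping. A vanilla PyTorch sketch of that ordering, assuming a GPU (inside a Determined trial the step itself would go through ``step_optimizer`` with the wrapped scaler):

import torch

model = torch.nn.Linear(4, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
scaler = torch.cuda.amp.GradScaler()

data = torch.randn(8, 4, device="cuda")
target = torch.zeros(8, 1, device="cuda")

with torch.cuda.amp.autocast():
    loss = torch.nn.functional.mse_loss(model(data), target)

scaler.scale(loss).backward()              # scale the loss before backward
scaler.unscale_(optimizer)                 # unscale before gradient clipping
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
scaler.step(optimizer)                     # step all optimizers...
scaler.update()                            # ...then update the scaler once
optimizer.zero_grad()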
Code example #10
    def configure_apex_amp(
        self,
        models: Union[torch.nn.Module, List[torch.nn.Module]],
        optimizers: Union[torch.optim.Optimizer, List[torch.optim.Optimizer]],
        enabled: Optional[bool] = True,
        opt_level: Optional[str] = "O1",
        cast_model_type: Optional[torch.dtype] = None,
        patch_torch_functions: Optional[bool] = None,
        keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
        master_weights: Optional[bool] = None,
        loss_scale: Optional[Union[float, str]] = None,
        cast_model_outputs: Optional[torch.dtype] = None,
        num_losses: Optional[int] = 1,
        verbosity: Optional[int] = 1,
        min_loss_scale: Optional[float] = None,
        max_loss_scale: Optional[float] = 2.0 ** 24,
    ) -> Tuple:
        """
        Configure automatic mixed precision for your models and optimizers. Note that details
        for apex.amp are handled automatically within Determined after this call.

        This function must be called **after** you have finished constructing your models and
        optimizers with :meth:`wrap_model` and :meth:`wrap_optimizer`.

        This function has the same arguments as
        `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_.

        .. warning::
            When using distributed training and automatic mixed precision,
            we only support ``num_losses=1`` and calling backward on the loss once.

        Arguments:
            models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):  Model(s) to modify/cast.
            optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
                Optimizers to modify/cast. REQUIRED for training.
            enabled (bool, optional, default=True):  If False, renders all Amp calls no-ops,
                so your script should run as if Amp were not present.
            opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.
                Accepted values are "O0", "O1", "O2", and "O3", explained in detail above.
            cast_model_type (``torch.dtype``, optional, default=None):  Optional property override,
                see above.
            patch_torch_functions (bool, optional, default=None):  Optional property override.
            keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.
                If passed as a string, must be the string "True" or "False".
            master_weights (bool, optional, default=None):  Optional property override.
            loss_scale (float or str, optional, default=None):  Optional property override.
                If passed as a string, must be a string representing a number, e.g., "128.0",
                or the string "dynamic".
            cast_model_outputs (torch.dtype, optional, default=None):  Option to ensure that
                the outputs of your model are always cast to a particular type regardless of
                ``opt_level``.
            num_losses (int, optional, default=1):  Option to tell Amp in advance how many
                losses/backward passes you plan to use.  When used in conjunction with the
                ``loss_id`` argument to ``amp.scale_loss``, enables Amp to use a different
                loss scale per loss/backward pass, which can improve stability.
                If ``num_losses`` is left to 1, Amp will still support multiple losses/backward
                passes, but use a single global loss scale for all of them.
            verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
            min_loss_scale (float, default=None):  Sets a floor for the loss scale values that
                can be chosen by dynamic loss scaling.  The default value of None means that no
                floor is imposed. If dynamic loss scaling is not used, `min_loss_scale` is ignored.
            max_loss_scale (float, default=2.**24):  Sets a ceiling for the loss scale values
                that can be chosen by dynamic loss scaling.  If dynamic loss scaling is not used,
                `max_loss_scale` is ignored.

        Returns:
            Model(s) and optimizer(s) modified according to the ``opt_level``.
            If the ``models`` or ``optimizers`` arguments were lists, the corresponding
            return values will also be lists.
        """
        if not self.env.managed_training:
            return models, optimizers

        check.false(self._use_amp, "Please only call configure_apex_amp once.")
        if self.hvd_config.use:
            check.eq(
                num_losses,
                1,
                "When using parallel/distributed training, "
                "Determined only supports configure_apex_amp with num_losses = 1",
            )

        self._use_amp = True

        if self.hvd_config.use:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Mixed precision training (AMP) is not supported with "
                "aggregation frequency > 1.",
            )

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        logging.info(f"Enabling mixed precision training with opt_level: {opt_level}.")
        models, optimizers = apex.amp.initialize(
            models=models,
            optimizers=optimizers,
            enabled=enabled,
            opt_level=opt_level,
            cast_model_type=cast_model_type,
            patch_torch_functions=patch_torch_functions,
            keep_batchnorm_fp32=keep_batchnorm_fp32,
            master_weights=master_weights,
            loss_scale=loss_scale,
            cast_model_outputs=cast_model_outputs,
            num_losses=num_losses,
            min_loss_scale=min_loss_scale,
            max_loss_scale=max_loss_scale,
            verbosity=verbosity
            if self.distributed.get_rank() == 0 or self.env.experiment_config.debug_enabled()
            else 0,
        )
        if not isinstance(models, list):
            self.models = [models]
        if not isinstance(optimizers, list):
            self.optimizers = [optimizers]
        return models, optimizers
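
Putting the docstring's ordering together, a hedged sketch of a trial ``__init__`` that wraps the model and optimizer first and only then configures Apex AMP (the class name, model, and optimizer settings are illustrative, and the other required trial methods are omitted):

import torch
from determined.pytorch import PyTorchTrial, PyTorchTrialContext

class AmpTrial(PyTorchTrial):  # hypothetical trial, for illustration only
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        self.model = self.context.wrap_model(torch.nn.Linear(10, 1))
        self.optimizer = self.context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01)
        )
        # Must come after wrap_model/wrap_optimizer; with distributed training
        # only num_losses=1 (the default) is supported.
        self.model, self.optimizer = self.context.configure_apex_amp(
            models=self.model,
            optimizers=self.optimizer,
            opt_level="O1",
        )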
Code example #11
    def _train_for_step(self, step_id: int, batches_per_step: int) -> workload.Response:
        check.gt(step_id, 0)

        # Set the behavior of certain layers (e.g., dropout) that are different
        # between training and inference.
        self.context.model.train()

        for callback in self.callbacks.values():
            callback.on_train_step_start(step_id)

        step_idx = step_id - 1
        start = step_idx * batches_per_step
        end = start + batches_per_step

        per_batch_metrics = []  # type: List[Dict]
        num_inputs = 0

        for batch_idx in range(start, end):
            batch = next(self.training_iterator)
            num_inputs += data_length(batch)

            batch = self._to_device(batch)
            # Forward pass.
            tr_metrics = self.trial.train_batch(
                batch=batch,
                model=self.context.model,
                epoch_idx=self.get_epoch_idx(batch_idx),
                batch_idx=batch_idx,
            )

            if isinstance(tr_metrics, torch.Tensor):
                tr_metrics = {"loss": tr_metrics}

            check.is_instance(
                tr_metrics,
                dict,
                "train_batch() must return a dictionary "
                f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
            )
            check.is_in("loss", tr_metrics.keys(), 'Please include "loss" in your training metrics.')

            # Backwards pass.
            loss = tr_metrics["loss"]
            communicate_and_update = (batch_idx + 1) % self.hvd_config.aggregation_frequency == 0
            if self.use_amp():
                with apex.amp.scale_loss(loss, self.context.optimizer) as scaled_loss:
                    scaled_loss.backward()
                    if self.hvd_config.use and communicate_and_update:
                        # When using Horovod, we need to finish communicating gradient
                        # updates before they are unscaled, which happens when we exit
                        # this context manager.
                        self.context.optimizer.synchronize()
            else:
                loss.backward()

                # Communication needs to be synchronized so that it is completed
                # before we apply gradient clipping and `step()`.
                if communicate_and_update and self.hvd_config.use:
                    self.context.optimizer.synchronize()

            if communicate_and_update:
                parameters = (
                    self.context.model.parameters()
                    if not self.use_amp()
                    else apex.amp.master_params(self.context.optimizer)
                )

                if self.hvd_config.average_aggregated_gradients:
                    self._average_gradients(
                        parameters=parameters, divisor=self.hvd_config.aggregation_frequency
                    )

                # TODO: Remove this check in v0.12.8.
                check.false(
                    self.env.hparams.get("clip_grad_l2_norm", None)
                    or self.env.hparams.get("clip_grad_val", None),
                    "Please specify gradient clipping via callbacks.",
                )

                for callback in self.callbacks.values():
                    callback.on_before_optimizer_step(parameters)

                if self.hvd_config.use:
                    with self.context.optimizer.skip_synchronize():
                        self.context.optimizer.step()
                else:
                    self.context.optimizer.step()
                self.context.optimizer.zero_grad()

                # Step learning rate of a LRScheduler.
                if self.context.lr_scheduler is not None:
                    self._auto_step_lr_scheduler_per_batch(batch_idx, self.context.lr_scheduler)

            for name, metric in tr_metrics.items():
                # Convert PyTorch metric values to NumPy, so that
                # `det.util.encode_json` handles them properly without
                # needing a dependency on PyTorch.
                if isinstance(metric, torch.Tensor):
                    metric = metric.cpu().detach().numpy()
                tr_metrics[name] = metric

            check.is_in("loss", tr_metrics, 'Please include "loss" in your training metrics.')
            per_batch_metrics.append(tr_metrics)

        if self.hvd_config.use and self.hvd_config.average_training_metrics:
            per_batch_metrics = self._average_training_metrics(per_batch_metrics)

        if self.hvd_config.use:
            num_inputs *= hvd.size()

        metrics = det.util.make_metrics(num_inputs, per_batch_metrics)

        for callback in self.callbacks.values():
            callback.on_train_step_end(step_id, metrics)

        if not self.is_chief:
            return workload.Skipped()

        logging.debug(f"Done training step: {num_inputs} records in {batches_per_step} batches.")

        return metrics
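
Stripped of Horovod and AMP, the communicate_and_update cadence in the loop above reduces to plain gradient accumulation. A minimal sketch (the model, data, and an aggregation frequency of 2 are placeholders):

import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
aggregation_frequency = 2

for batch_idx in range(4):
    data, target = torch.randn(8, 4), torch.zeros(8, 1)
    loss = torch.nn.functional.mse_loss(model(data), target)
    loss.backward()

    # Only every `aggregation_frequency` batches do we step and clear gradients,
    # mirroring the communicate_and_update flag in _train_for_step.
    if (batch_idx + 1) % aggregation_frequency == 0:
        optimizer.step()
        optimizer.zero_grad()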