def _compute_validation_metrics(self) -> Any:
        """
        Compute validation metrics with the Evaluator or the CustomInferenceRunner.

        When ``self.evaluator`` is set it computes the metrics, and
        ``self.validation_metrics_names`` must be None. Otherwise the
        ``CustomInferenceRunner`` found among the trainer's callbacks is
        triggered with ``self.validation_metrics_names``, which must be set.

        Returns:
            ``{"validation_metrics": metrics}`` when ``self.is_chief`` is true,
            otherwise ``workload.Skipped()``.

        Raises:
            A ``check`` failure when the evaluator / metric-names configuration
            is inconsistent, or when no ``CustomInferenceRunner`` callback is
            registered on the trainer.
        """
        if self.evaluator:
            check.is_none(self.validation_metrics_names)
            metrics = self.evaluator.compute_validation_metrics()
        else:
            check.is_not_none(self.validation_metrics_names)
            metric_names = cast(List[str], self.validation_metrics_names)

            # Find our custom inference callback (first match wins).
            found = next(
                (cb for cb in self.trainer._callbacks.cbs
                 if isinstance(cb, CustomInferenceRunner)),
                None,
            )
            # typing.cast performs no runtime check, so without this guard a
            # missing callback would only surface as an AttributeError on None.
            check.is_not_none(
                found,
                "No CustomInferenceRunner callback is registered on the trainer.",
            )
            custom_inference_callback = cast(CustomInferenceRunner, found)
            metrics = custom_inference_callback.trigger_on_validation_step(
                metric_names)

        # The metrics are computed on every worker, but only the chief
        # returns them; the others report a skipped workload.
        if not self.is_chief:
            return workload.Skipped()

        return {"validation_metrics": metrics}
Exemple #2
0
    def _send_recv_workload(self, wkld: workload.Workload,
                            args: List[Any]) -> workload.Response:
        """
        Broadcast a workload to the local workers and collect their answer.

        Arguments:
            wkld: The workload to hand to every worker on this machine.
            args: Extra arguments broadcast together with the workload.

        Returns:
            The single non-``Skipped`` response gathered from the workers, or
            ``workload.Skipped()`` when every worker skipped.  An empty dict is
            returned when a TERMINATE workload ends in a ``WorkerError``.

        Raises:
            det.errors.WorkerError: when gathering fails for a non-TERMINATE
                workload, or when the gather reports that an exception was
                received.
        """
        # Every worker on this machine receives every workload.
        self.broadcast_server.broadcast((wkld, args))

        is_terminate = wkld.kind == workload.Workload.Kind.TERMINATE
        if is_terminate:
            # Workers that were told to terminate are no longer health-checked.
            self._worker_process_ids = []

        try:
            gathered = self.broadcast_server.gather_with_polling(
                self._health_check)
        except det.errors.WorkerError:
            if not is_terminate:
                raise
            return {}

        responses, exception_received = gathered
        if exception_received:
            raise det.errors.WorkerError("Training process died.")

        # Look for the chief worker's response: it is the only one that is not
        # a SkippedWorkload.  The chief may report to another container, in
        # which case only SkippedWorkloads are seen here.
        chief_reply = None  # Optional[workload.Metrics]
        for reply in responses:
            if not isinstance(reply, workload.Skipped):
                # Anything that is not skipped must be a Dict[str, Any]-like
                # object ...
                check.is_instance(
                    reply, dict,
                    f"Received non-metrics object from worker: {reply}")
                # ... and at most one such response may exist.
                check.is_none(
                    chief_reply,
                    "Received multiple non-SkippedWorkload messages.")
                chief_reply = cast(Dict[str, Any], reply)

        if chief_reply is not None:
            return chief_reply

        # Without a chief response, this must not be the chief machine.
        check.gt(
            self.rendezvous_info.get_rank(),
            0,
            "Received SkippedWorkload message from chief worker.",
        )
        return workload.Skipped()
Exemple #3
0
    def __init__(
        self,
        estimator: tf.estimator.Estimator,
        user_train_spec: tf.estimator.TrainSpec,
        val_spec: tf.estimator.EvalSpec,
        serving_input_receiver_fns: Dict[str,
                                         estimator.ServingInputReceiverFn],
        context: estimator.EstimatorContext,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """
        Store the estimator and its specs, then initialize the model.

        Arguments:
            estimator: The estimator to run; its ``config.train_distribute``
                must be None or have both ``train_distribute`` and
                ``eval_distribute`` unset.
            user_train_spec: Training spec stored on the instance.
            val_spec: Evaluation spec stored on the instance.
            serving_input_receiver_fns: Mapping stored on the instance.
            context: Passed to the parent constructor; also receives
                ``self.allgather_metrics`` as its allgather function.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(context, *args, **kwargs)  # type: ignore

        # An estimator configured with a tf.distribute.Strategy can conflict
        # with Determined's distributed training and lead to crashes/OOM, so
        # reject it here.  The user cannot reliably be told that this caused
        # their failure, since user code may already crash earlier, inside
        # build_estimator().  train_distribute is acceptable when it is None
        # or an empty tf.contrib.distribute.DistributeConfig.
        distribute_config = estimator.config.train_distribute
        if distribute_config is not None:
            check.is_none(
                distribute_config.train_distribute,
                f"TensorFlow's approach to distributed training can conflict with "
                f"Determined's. Currently Determined requires that the train_distribute "
                f"field of the RunConfig not be set. Your estimator has "
                f"train_distribute={str(distribute_config.train_distribute)}",
            )
            check.is_none(
                distribute_config.eval_distribute,
                f"TensorFlow's approach to distributed training can conflict with "
                f"Determined's. Currently Determined requires that the eval_distribute "
                f"field of the RunConfig not be set. Your estimator has "
                f"eval_distribute={str(distribute_config.eval_distribute)}",
            )

        self.estimator = estimator
        self.user_train_spec = user_train_spec
        self.val_spec = val_spec
        self.serving_input_receiver_fns = serving_input_receiver_fns

        # Holds the function used to send the Terminate response after the
        # post-trial close callback.
        self.exit_response_func = None  # type: Optional[workload.ResponseFunc]

        context.experimental._set_allgather_fn(self.allgather_metrics)

        self._init_model()
Exemple #4
0
    def __init__(
        self,
        num_connections: Optional[int] = None,
        ports: Optional[List[int]] = None,
        port_range: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Create the ZMQ context and bind sockets to explicit or random ports.

        Arguments:
            num_connections: Number of sockets to bind when ``ports`` is not
                given; required in that case.
            ports: Explicit ports to bind to.  When given (and non-empty),
                ``port_range`` must be None.
            port_range: Range to draw random ports from when ``ports`` is not
                given; required in that case.
        """
        self.context = zmq.Context()
        self.sockets = []  # type: List[zmq.Socket]
        self.ports = []  # type: List[int]

        if ports:
            # Explicit ports: a port range makes no sense alongside them.
            check.is_none(port_range)
            self._bind_to_specified_ports(ports=ports)
            check.eq(len(self.ports), len(ports))
            return

        # Random ports: both the count and the range must be supplied.
        check.is_not_none(num_connections)
        check.is_not_none(port_range)
        connection_count = cast(int, num_connections)
        allowed_range = cast(Tuple[int, int], port_range)
        self._bind_to_random_ports(port_range=allowed_range,
                                   num_connections=connection_count)
        check.eq(len(self.ports), connection_count)
    def wrap_scaler(self, scaler: Any) -> Any:
        """
        Register a gradient scaler for PyTorch's native automatic mixed precision (AMP) API.

        Pass the returned scaler to ``step_optimizer``; apart from that, use
        it exactly as in vanilla PyTorch: scale the loss before calling
        ``backward``, call ``unscale_`` before clipping gradients, call
        ``update`` after stepping all optimizers, etc.

        This feature requires PyTorch 1.6 or greater.

        Arguments:
            scaler (``torch.cuda.amp.GradScaler``):  Scaler to wrap and track.

        Returns:
            The scaler. It may be wrapped to add additional functionality for use in Determined.
        """
        # Preconditions, checked in this order: AMP importable, APEX not in
        # use, no scaler registered yet, no model wrapped yet, GPU present.
        check.false(
            amp_import_error,
            "Failed to import torch.cuda.amp. PyTorch >= 1.6 required.",
        )
        check.false(self._use_apex, "Do not mix APEX with PyTorch AMP.")
        check.is_none(
            self._scaler,
            "Please only call wrap_scaler or use_amp once.",
        )
        check.true(
            len(self.models) == 0,
            "Please call wrap_scaler before wrap_model.",
        )
        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        self._scaler = scaler
        return scaler
Exemple #6
0
 def _respond(self, resp: Response) -> None:
     """Store the trial controller's response, checking that none is stored yet."""
     duplicate_call_msg = "_respond() was called twice by the TrialController"
     check.is_none(self._response, duplicate_call_msg)
     self._response = resp
    def configure_apex_amp(
        self,
        models: Union[torch.nn.Module, List[torch.nn.Module]],
        optimizers: Union[torch.optim.Optimizer,
                          List[torch.optim.Optimizer]],  # type: ignore
        enabled: Optional[bool] = True,
        opt_level: Optional[str] = "O1",
        cast_model_type: Optional[torch.dtype] = None,
        patch_torch_functions: Optional[bool] = None,
        keep_batchnorm_fp32: Optional[Union[bool, str]] = None,
        master_weights: Optional[bool] = None,
        loss_scale: Optional[Union[float, str]] = None,
        cast_model_outputs: Optional[torch.dtype] = None,
        num_losses: Optional[int] = 1,
        verbosity: Optional[int] = 1,
        min_loss_scale: Optional[float] = None,
        max_loss_scale: Optional[float] = 2.0**24,
    ) -> Tuple:
        """
        Set up automatic mixed precision for models and optimizers with NVIDIA's Apex
        PyTorch extension. After this call, the details of apex.amp are handled
        automatically within Determined.

        Call this function only **after** all models and optimizers have been
        constructed with :meth:`wrap_model` and :meth:`wrap_optimizer`.

        The arguments are the same as those of
        `apex.amp.initialize <https://nvidia.github.io/apex/amp.html#apex.amp.initialize>`_.

        .. warning::
            With distributed training and automatic mixed precision, only
            ``num_losses=1`` and a single backward call on the loss are supported.

        Arguments:
            models (``torch.nn.Module`` or list of ``torch.nn.Module`` s):  Model(s) to modify/cast.
            optimizers (``torch.optim.Optimizer`` or list of ``torch.optim.Optimizer`` s):
                Optimizers to modify/cast. REQUIRED for training.
            enabled (bool, optional, default=True):  When False, all Amp calls become no-ops,
                so the script should run as if Amp were not present.
            opt_level (str, optional, default="O1"):  Pure or mixed precision optimization level.
                Accepted values are "O0", "O1", "O2", and "O3"; see the apex.amp.initialize
                documentation linked above.
            cast_model_type (``torch.dtype``, optional, default=None):  Optional property override.
            patch_torch_functions (bool, optional, default=None):  Optional property override.
            keep_batchnorm_fp32 (bool or str, optional, default=None):  Optional property override.
                A string value must be either "True" or "False".
            master_weights (bool, optional, default=None):  Optional property override.
            loss_scale (float or str, optional, default=None):  Optional property override.
                A string value must represent a number, e.g., "128.0", or be the string
                "dynamic".
            cast_model_outputs (torch.dtype, optional, default=None):  Ensures the outputs of
                the model are always cast to a particular type regardless of ``opt_level``.
            num_losses (int, optional, default=1):  Tells Amp in advance how many
                losses/backward passes will be used.  Combined with the ``loss_id`` argument
                to ``amp.scale_loss``, this lets Amp use a different loss scale per
                loss/backward pass, which can improve stability.  With ``num_losses`` left
                at 1, Amp still supports multiple losses/backward passes, but uses a single
                global loss scale for all of them.
            verbosity (int, default=1):  Set to 0 to suppress Amp-related output.
            min_loss_scale (float, default=None):  Floor for the loss scale values that dynamic
                loss scaling may choose.  None (the default) imposes no floor.  Ignored when
                dynamic loss scaling is not used.
            max_loss_scale (float, default=2.**24):  Ceiling for the loss scale values that
                dynamic loss scaling may choose.  Ignored when dynamic loss scaling is not used.

        Returns:
            Model(s) and optimizer(s) modified according to the ``opt_level``.
            If  ``optimizers`` args were lists, the corresponding return value will
            also be a list.
        """
        # Outside of managed training the inputs are handed back unchanged.
        if not self.env.managed_training:
            return models, optimizers

        check.is_none(self._scaler, "Do not mix APEX with PyTorch AMP")
        check.false(
            self._use_apex,
            "Please only call configure_apex_amp once.",
        )

        hvd_enabled = self.hvd_config.use
        if hvd_enabled:
            check.eq(
                num_losses,
                1,
                "When using parallel/distributed training, "
                "Determined only supports configure_apex_amp with num_losses = 1",
            )

        # The flag is raised before the remaining checks run.
        self._use_apex = True

        if hvd_enabled:
            check.eq(
                self.hvd_config.aggregation_frequency,
                1,
                "Mixed precision training (AMP) is not supported with "
                "aggregation frequency > 1.",
            )

        check.true(
            torch.cuda.is_available(),
            "Mixed precision training (AMP) is supported only on GPU slots.",
        )

        logging.info(
            f"Enabling mixed precision training with opt_level: {opt_level}.")

        # Amp output is kept only on rank 0 or when the experiment runs in
        # debug mode; everywhere else verbosity is forced to 0.
        show_amp_output = (
            self.distributed.get_rank() == 0
            or self.env.experiment_config.debug_enabled()
        )
        models, optimizers = apex.amp.initialize(
            models=models,
            optimizers=optimizers,
            enabled=enabled,
            opt_level=opt_level,
            cast_model_type=cast_model_type,
            patch_torch_functions=patch_torch_functions,
            keep_batchnorm_fp32=keep_batchnorm_fp32,
            master_weights=master_weights,
            loss_scale=loss_scale,
            cast_model_outputs=cast_model_outputs,
            num_losses=num_losses,
            min_loss_scale=min_loss_scale,
            max_loss_scale=max_loss_scale,
            verbosity=verbosity if show_amp_output else 0,
        )

        # Only a single (non-list) result replaces the tracked list; list
        # results leave self.models / self.optimizers as they were.
        if not isinstance(models, list):
            self.models = [models]
        if not isinstance(optimizers, list):
            self.optimizers = [optimizers]

        return models, optimizers