Code example #1
File: horovod.py  Project: shiyuann/determined
    def require_horovod_type(self, horovod_type: str, reason: str) -> None:
        """
        Declare the required type of horovod and give a unique reason as to why it is required.

        The reason makes for clear error reporting if require_horovod_type() is called a second
        time but with a different type.
        """

        known_types = {"tensorflow", "tensorflow.keras", "torch"}
        check.is_in(horovod_type, known_types,
                    "Unknown horovod type requested.")

        if self._poly_hvd_type is not None:
            check.eq(
                horovod_type,
                self._poly_hvd_type,
                f"require_horovod_type() called with with type {horovod_type} after a previous "
                f"call with type {self._poly_hvd_type} in the same process. The reason for the "
                f"first call was '{self._poly_hvd_first_reason}'; the reason for this call is "
                f"'{reason}'.",
            )
        else:
            self._poly_hvd_type = horovod_type
            self._poly_hvd_first_reason = reason
            # If horovod has not been imported yet, do it now.
            try:
                self._poly_hvd_module = importlib.import_module(
                    f"horovod.{horovod_type}")
            except ImportError:
                pass
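
A hedged usage sketch (the `context` object and the reason strings are illustrative assumptions; only require_horovod_type() itself comes from the snippet above):

# First call records the type and imports horovod.torch if it is installed.
context.require_horovod_type("torch", "PyTorchTrialContext was created")
# A repeated call with the same type passes; a different type would fail check.eq().
context.require_horovod_type("torch", "wrap_optimizer() was called")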
Code example #2
    @classmethod
    def from_config(
        cls, config: Dict[str, Any], container_path: Optional[str]
    ) -> "StorageManager":
        allowed_keys = {"host_path", "storage_path", "container_path", "propagation"}
        for key in config.keys():
            check.is_in(key, allowed_keys, "extra key in shared_fs config")
        check.is_in("host_path", config, "shared_fs config is missing host_path")
        # Ignore legacy configuration values propagation and container_path.
        base_path = _full_storage_path(
            config["host_path"], config.get("storage_path"), container_path
        )
        return cls(base_path)
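
A hedged usage sketch (the config values are made up for illustration; StorageManager and _full_storage_path come from the snippet's module):

# Hypothetical shared_fs configuration; an unknown key or a missing host_path
# would trip the check.is_in() guards above.
config = {"host_path": "/mnt/nfs", "storage_path": "checkpoints"}
manager = StorageManager.from_config(config, container_path=None)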
Code example #3
def binary_error_rate(predictions: torch.Tensor,
                      labels: torch.Tensor) -> float:
    """Return the classification error rate for binary classification."""
    check.eq(predictions.shape[0], labels.shape[0])
    check.is_in(len(predictions.shape), [1, 2])
    if len(predictions.shape) == 2:
        check.eq(predictions.shape[1], 1)
    check.len_eq(labels.shape, 1, "Labels must be a column vector")

    if len(predictions.shape) > 1:
        predictions = torch.squeeze(predictions)

    errors = torch.sum(
        labels.to(torch.long) != torch.round(predictions).to(torch.long))
    result = float(errors) / predictions.shape[0]  # type: float
    return result
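
A minimal, self-contained check of the expected behaviour (assumes only torch and the binary_error_rate function above):

import torch

predictions = torch.tensor([0.9, 0.2, 0.7, 0.4])  # rounds to [1, 0, 1, 0]
labels = torch.tensor([1.0, 0.0, 0.0, 0.0])
# Exactly one of the four rounded predictions disagrees with its label.
assert binary_error_rate(predictions, labels) == 0.25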
Code example #4
    def wrap_lr_scheduler(
        self,
        lr_scheduler: torch.optim.lr_scheduler._LRScheduler,
        step_mode: pytorch.LRScheduler.StepMode,
        frequency: int = 1,
    ) -> torch.optim.lr_scheduler._LRScheduler:
        """
        Returns a wrapped LR scheduler.

        The LR scheduler must use an optimizer wrapped by :meth:`wrap_optimizer`.  If ``apex.amp``
        is in use, the optimizer must also have been configured with :meth:`configure_apex_amp`.
        """
        if isinstance(lr_scheduler,
                      torch.optim.lr_scheduler.ReduceLROnPlateau):
            if step_mode != pytorch.LRScheduler.StepMode.MANUAL_STEP:
                raise det.errors.InvalidExperimentException(
                    "detected that context.wrap_lr_scheduler() was called with an instance of "
                    "torch.optim.lr_scheduer.ReduceLROnPlateau as the lr_scheduler.  This lr "
                    "scheduler class does not have the usual step() parameters, and so it can "
                    "only be used with step_mode=MANUAL_STEP.\n"
                    "\n"
                    "For example, if you wanted to step it on every validation step, you might "
                    "wrap your lr_scheduler and pass it to a callback like this:\n"
                    "\n"
                    "class MyLRStepper(PyTorchCallback):\n"
                    "    def __init__(self, wrapped_lr_scheduler):\n"
                    "        self.wrapped_lr_scheduler = wrapped_lr_scheduler\n"
                    "\n"
                    "    def on_validation_end(self, metrics):\n"
                    '        self.wrapped_lr_scheduler.step(metrics["validation_error"])\n'
                )

        opt = getattr(lr_scheduler, "optimizer", None)
        if opt is not None:
            check.is_in(
                opt,
                self.optimizers,
                "Must use an optimizer that is returned by wrap_optimizer()",
            )
        wrapped = pytorch.LRScheduler(lr_scheduler, step_mode, frequency)
        self.lr_schedulers.append(wrapped)

        # Return the original LR scheduler to the user in case they have customizations that we
        # don't care about.
        return lr_scheduler
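
A hedged usage sketch (assumes `context` is the trial context this method belongs to and `optimizer` was already returned by wrap_optimizer(); the ReduceLROnPlateau/MANUAL_STEP pairing mirrors the error message above):

scheduler = context.wrap_lr_scheduler(
    torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer),
    step_mode=pytorch.LRScheduler.StepMode.MANUAL_STEP,
)
# The original scheduler object is returned, so it can be stepped manually,
# e.g. from a validation callback as in the error message's example.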
Code example #5
File: horovod.py  Project: shiyuann/determined
    @staticmethod
    def from_configs(
        experiment_config: ExperimentConfig,
        rendezvous_info: RendezvousInfo,
        hparams: Dict[str, Any],
    ) -> "HorovodContext":
        """
        Create the HorovodContext according to experiment config and rendezvous info for this trial.
        """

        # Horovod is always used for multi-machine distributed training. For
        # single-machine multi-GPU training, Horovod is used when native_parallel is
        # disabled.
        multi_machine_trial = rendezvous_info.get_size() > 1
        multi_slot_trial = experiment_config["resources"]["slots_per_trial"] > 1
        use_horovod = multi_machine_trial or (
            multi_slot_trial
            and not experiment_config.native_parallel_enabled())

        check.is_in("optimizations", experiment_config)
        optimizations_config = cast(Dict[str, Any],
                                    experiment_config.get("optimizations"))

        check.is_in("aggregation_frequency", optimizations_config)
        check.is_in("gradient_compression", optimizations_config)
        check.is_in("average_training_metrics", optimizations_config)

        # Help users migrate from the old locations for these settings, in hparams.
        def error_message_removed_from_hparams(removed_hparam: str) -> str:
            return (
                f"Please move `{removed_hparam}` in the experiment config to "
                f"`Optimizations` from `hyperparameters`.")

        check.not_in(
            "aggregation_frequency",
            hparams,
            error_message_removed_from_hparams("aggregation_frequency"),
        )
        check.not_in(
            "gradient_compression",
            hparams,
            error_message_removed_from_hparams("gradient_compression"),
        )
        check.not_in(
            "grad_updates_size_file",
            hparams,
            error_message_removed_from_hparams("grad_updates_size_file"),
        )

        hvd_config = HorovodContext(
            use=use_horovod,
            aggregation_frequency=cast(
                int, optimizations_config.get("aggregation_frequency")),
            fp16_compression=cast(
                bool, optimizations_config.get("gradient_compression")),
            grad_updates_size_file=optimizations_config.get(
                "grad_updates_size_file", None),
            average_aggregated_gradients=cast(
                bool,
                optimizations_config.get("average_aggregated_gradients")),
            average_training_metrics=cast(
                bool, optimizations_config.get("average_training_metrics")),
        )

        if hvd_config.use and hvd_config.aggregation_frequency > 1:
            logging.info(
                f"Setting `aggregation_frequency` to {hvd_config.aggregation_frequency} "
                "to optimize training.")

        if hvd_config.use and hvd_config.fp16_compression:
            logging.info(
                "Enabling `gradient_compression` to optimize training.")

        return hvd_config
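
A hedged sketch of how the result might be consumed (experiment_config, rendezvous_info, and hparams would normally be supplied by the Determined harness rather than built by hand):

hvd_config = HorovodContext.from_configs(experiment_config, rendezvous_info, hparams)
if hvd_config.use:
    # Distributed training path: gradients are aggregated every
    # hvd_config.aggregation_frequency batches.
    ...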