Example 1
    def from_json(dict: Dict[str, Any]) -> "Workload":
        """Reconstruct a Workload from its JSON-style dictionary representation."""
        check.check_in(dict["kind"], Workload.Kind.__members__)
        return Workload(
            Workload.Kind[dict["kind"]],
            dict["experiment_id"],
            dict["trial_id"],
            dict["step_id"],
            dict["num_batches"],
            dict["total_batches_processed"],
        )
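
A minimal usage sketch, assuming the function is exposed as a method of `Workload` (consistent with its indentation and return type) and that "RUN_STEP" is a valid `Workload.Kind` member; all field values are invented:

payload = {
    "kind": "RUN_STEP",  # assumed Workload.Kind member, for illustration only
    "experiment_id": 1,
    "trial_id": 7,
    "step_id": 3,
    "num_batches": 100,
    "total_batches_processed": 300,
}
workload = Workload.from_json(payload)
# Equivalent to Workload(Workload.Kind.RUN_STEP, 1, 7, 3, 100, 300).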
Example 2
def build(config: Dict[str, Any],
          container_path: Optional[str]) -> StorageManager:
    """
    Return a checkpoint manager defined by the value of the `type` key in
    the configuration dictionary. Throws a `TypeError` if no storage manager
    with `type` is defined.
    """
    check_in("type", config,
             "Missing 'type' parameter of storage configuration")

    # Make a deep copy of the config because we are removing items to
    # pass to the constructor of the `StorageManager`.
    config = copy.deepcopy(config)
    identifier = config.pop("type")
    check_type(identifier, str,
               "`type` parameter of storage configuration must be a string")

    try:
        subclass = _STORAGE_MANAGERS[identifier]
    except KeyError:
        raise TypeError("Unknown storage type: {}".format(identifier))

    # Remove configuration keys that should not be passed directly to the
    # subclass constructor. Keeping them would make the subclass __init__()
    # fail with a TypeError for an unexpected keyword argument.
    config.pop("save_experiment_best", None)
    config.pop("save_trial_best", None)
    config.pop("save_trial_latest", None)

    # For shared_fs, maintain backwards compatibility by folding the old
    # path keys into storage_path.
    if identifier == "shared_fs" and "storage_path" not in config:
        if "tensorboard_path" in config:
            config["storage_path"] = config.get("tensorboard_path", None)
        else:
            config["storage_path"] = config.get("checkpoint_path", None)
    elif identifier == "azure":
        if not ("connection_string" in config or "account_url" in config):
            raise ValueError(
                """At least one of [connection_string, account_url] must be specified for Azure Blob
                 Storage, but none were.""")
        if "container" not in config:
            raise ValueError(
                "Container name must be specified for Azure Blob Storage.")

    config.pop("tensorboard_path", None)
    config.pop("checkpoint_path", None)

    try:
        return subclass.from_config(config, container_path)
    except TypeError as e:
        raise TypeError(
            "Failed to instantiate {} checkpoint storage: {}".format(
                identifier, str(e)))
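
For illustration, a configuration dictionary that `build` would accept, assuming "shared_fs" is registered in `_STORAGE_MANAGERS` (as the branch above implies); the extra keys show what is stripped or folded before `from_config` is called:

config = {
    "type": "shared_fs",
    "checkpoint_path": "/mnt/checkpoints",  # folded into storage_path for backwards compatibility
    "save_trial_latest": 1,  # stripped before reaching the subclass constructor
}
manager = build(config, container_path=None)
# The shared_fs manager receives {"storage_path": "/mnt/checkpoints"}.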
Example 3
def create_performance_args(env: det.EnvContext) -> List[str]:
    optimizations = env.experiment_config.get("optimizations", {})
    check.check_in("auto_tune_tensor_fusion", optimizations)
    check.check_in("tensor_fusion_threshold", optimizations)
    check.check_in("tensor_fusion_cycle_time", optimizations)

    if optimizations.get("auto_tune_tensor_fusion"):
        performance_args = [
            "--autotune",
            "--autotune-log-file",
            str(constants.HOROVOD_AUTOTUNE_LOG_FILEPATH),
        ]
    else:
        performance_args = [
            "--fusion-threshold-mb",
            str(optimizations.get("tensor_fusion_threshold")),
            "--cycle-time-ms",
            str(optimizations.get("tensor_fusion_cycle_time")),
        ]

    # Prevent horovod from auto-tuning these parameters.
    performance_args.extend([
        "--cache-capacity",
        str(1024),
        "--no-hierarchical-allreduce",
        "--no-hierarchical-allgather",
    ])
    return performance_args
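
For reference, a hypothetical `optimizations` section (values invented) and the argument list `create_performance_args` would produce from it:

optimizations = {
    "auto_tune_tensor_fusion": False,
    "tensor_fusion_threshold": 64,
    "tensor_fusion_cycle_time": 5,
}
# With these values the function returns:
# ["--fusion-threshold-mb", "64", "--cycle-time-ms", "5",
#  "--cache-capacity", "1024",
#  "--no-hierarchical-allreduce", "--no-hierarchical-allgather"]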
Example 4
    def __init__(
        self,
        context: PyTorchTrialContext,
        lightning_module: pl.LightningModule,
        precision: Union[Literal[32], Literal[16]] = 32,
        amp_backend: Union[Literal["native"], Literal["apex"]] = "native",
        amp_level: Literal["O0", "O1", "O2", "O3"] = "O2",
    ):
        """
        This performs the necessary initialization steps to:

        1. check the compatibility of the provided ``LightningModule`` with ``LightningAdapter``.
        2. define a ``PyTorchTrial`` with models, optimizers, and LR schedulers that are provided
           by ``LightningModule``.
        3. patch the ``LightningModule`` methods that depend on a ``Trainer``.

        After subclassing this class, override this function to initialize the adapted
        ``PyTorchTrial``.
        Within your ``__init__``, instantiate the ``LightningModule`` and call
        ``super().__init__``.

        Here is a minimal code example.

        .. code-block:: python

            def __init__(self, context: PyTorchTrialContext) -> None:
                lm = mnist.LightningMNISTClassifier(lr=context.get_hparam('learning_rate'))
                super().__init__(context, lightning_module=lm)

        Arguments:
            context (PyTorchTrialContext)
            lightning_module (``LightningModule``):
                User-defined lightning module.
            precision (int, default=32):
                Precision to use.
                Accepted values are 16 and 32.
            amp_backend (str, default="native"):
                Automatic mixed precision backend to use.
                Accepted values are "native" and "apex".
            amp_level (str, optional, default="O2"):
                Apex amp optimization level.
                Accepted values are "O0", "O1", "O2", and "O3".
                https://nvidia.github.io/apex/amp.html#opt-levels-and-properties

        """

        check.check_in(precision, {16, 32},
                       "only precisions 16 & 32 are supported.")
        check.check_in(amp_backend, {"native", "apex"},
                       'only "native" and "apex" are supported.')

        check_compatibility(lightning_module)
        override_unsupported_nud(lightning_module, context)

        if precision == 16 and amp_backend == "native":
            context.experimental.use_amp()

        context.wrap_model(lightning_module)

        pls = _LightningAdapterState(context, lightning_module, [], [])
        self._pls = pls
        pls.optimizers, pls.lr_schedulers = self.setup_optimizers_schedulers()

        if precision == 16 and amp_backend == "apex":
            context.configure_apex_amp(
                context.models,
                context.optimizers,
                enabled=True,
                opt_level=amp_level,
            )

        # Set the lightning_module properties that a Trainer would normally configure.
        pls.lm.use_ddp = False
        pls.lm.use_ddp2 = False
        pls.lm.use_dp = False
        pls.lm.use_tpu = False
        type(pls.lm).local_rank = context.distributed.get_local_rank()  # type: ignore
        type(pls.lm).global_rank = context.distributed.get_rank()  # type: ignore
        pls.lm.to(context.device)
        use_amp = context.experimental._auto_amp or context._use_apex
        pls.lm.use_amp = use_amp
        pls.lm.precision = "mixed" if use_amp else precision  # type: ignore
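
A short usage sketch, assuming this `__init__` belongs to a `LightningAdapter`-style trial class (as the docstring describes) and a user-defined `MyLightningModule` (hypothetical):

class MyTrial(LightningAdapter):
    def __init__(self, context: PyTorchTrialContext) -> None:
        lm = MyLightningModule(lr=context.get_hparam("learning_rate"))
        # precision=16 with the "native" backend enables automatic mixed precision
        # via context.experimental.use_amp(), as in the code above.
        super().__init__(context, lightning_module=lm, precision=16, amp_backend="native")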