def from_json(dict: Dict[str, Any]) -> "Workload":
    check.check_in(dict["kind"], Workload.Kind.__members__)
    return Workload(
        Workload.Kind[dict["kind"]],
        dict["experiment_id"],
        dict["trial_id"],
        dict["step_id"],
        dict["num_batches"],
        dict["total_batches_processed"],
    )
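
# Illustrative usage (a hedged sketch; the field values below are hypothetical,
# and "RUN_STEP" is assumed to be a member of Workload.Kind):
#
#     workload = from_json(
#         {
#             "kind": "RUN_STEP",
#             "experiment_id": 1,
#             "trial_id": 3,
#             "step_id": 10,
#             "num_batches": 100,
#             "total_batches_processed": 900,
#         }
#     )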
def build(config: Dict[str, Any], container_path: Optional[str]) -> StorageManager:
    """
    Return a checkpoint manager defined by the value of the `type` key in the
    configuration dictionary. Raises a `TypeError` if no storage manager is
    registered for the given `type`.
    """
    check_in("type", config, "Missing 'type' parameter of storage configuration")

    # Make a deep copy of the config because we are removing items to
    # pass to the constructor of the `StorageManager`.
    config = copy.deepcopy(config)

    identifier = config.pop("type")
    check_type(identifier, str, "`type` parameter of storage configuration must be a string")

    try:
        subclass = _STORAGE_MANAGERS[identifier]
    except KeyError:
        raise TypeError("Unknown storage type: {}".format(identifier))

    # Remove configurations that should not be directly passed to
    # subclasses. Keeping these would result in the subclass __init__()
    # failing with a TypeError due to an unexpected keyword argument.
    config.pop("save_experiment_best", None)
    config.pop("save_trial_best", None)
    config.pop("save_trial_latest", None)

    # For shared_fs, maintain backwards compatibility by folding old keys into
    # storage_path.
    if identifier == "shared_fs" and "storage_path" not in config:
        if "tensorboard_path" in config:
            config["storage_path"] = config.get("tensorboard_path", None)
        else:
            config["storage_path"] = config.get("checkpoint_path", None)
    elif identifier == "azure":
        if not ("connection_string" in config or "account_url" in config):
            raise ValueError(
                "At least one of [connection_string, account_url] must be "
                "specified for Azure Blob Storage, but none were."
            )
        if "container" not in config:
            raise ValueError("Container name must be specified for Azure Blob Storage.")

    config.pop("tensorboard_path", None)
    config.pop("checkpoint_path", None)

    try:
        return subclass.from_config(config, container_path)
    except TypeError as e:
        raise TypeError(
            "Failed to instantiate {} checkpoint storage: {}".format(identifier, str(e))
        )
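
# Illustrative usage (a hedged sketch; assumes a "shared_fs" manager is registered
# in _STORAGE_MANAGERS and that "checkpoint_path" is one of the legacy keys handled
# above):
#
#     manager = build(
#         {"type": "shared_fs", "checkpoint_path": "/mnt/checkpoints"},
#         container_path=None,
#     )
#
# Per the backwards-compatibility fold above, the legacy "checkpoint_path" key is
# rewritten to "storage_path" before the config reaches the subclass's from_config().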
def create_performance_args(env: det.EnvContext) -> List[str]:
    optimizations = env.experiment_config.get("optimizations", {})
    check.check_in("auto_tune_tensor_fusion", optimizations)
    check.check_in("tensor_fusion_threshold", optimizations)
    check.check_in("tensor_fusion_cycle_time", optimizations)

    if optimizations.get("auto_tune_tensor_fusion"):
        performance_args = [
            "--autotune",
            "--autotune-log-file",
            str(constants.HOROVOD_AUTOTUNE_LOG_FILEPATH),
        ]
    else:
        performance_args = [
            "--fusion-threshold-mb",
            str(optimizations.get("tensor_fusion_threshold")),
            "--cycle-time-ms",
            str(optimizations.get("tensor_fusion_cycle_time")),
        ]

    # Prevent Horovod from auto-tuning these parameters.
    performance_args.extend(
        [
            "--cache-capacity",
            str(1024),
            "--no-hierarchical-allreduce",
            "--no-hierarchical-allgather",
        ]
    )
    return performance_args
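
# Illustrative result (derived from the branches above; the threshold and cycle
# time values are hypothetical):
#
#     optimizations = {
#         "auto_tune_tensor_fusion": False,
#         "tensor_fusion_threshold": 64,
#         "tensor_fusion_cycle_time": 5,
#     }
#
# yields:
#
#     ["--fusion-threshold-mb", "64", "--cycle-time-ms", "5",
#      "--cache-capacity", "1024",
#      "--no-hierarchical-allreduce", "--no-hierarchical-allgather"]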
def __init__(
    self,
    context: PyTorchTrialContext,
    lightning_module: pl.LightningModule,
    precision: Union[Literal[32], Literal[16]] = 32,
    amp_backend: Union[Literal["native"], Literal["apex"]] = "native",
    amp_level: Literal["O0", "O1", "O2", "O3"] = "O2",
):
    """
    This performs the necessary initialization steps to:

    1. check the compatibility of the provided ``LightningModule`` with ``LightningAdapter``.
    2. define a ``PyTorchTrial`` with models, optimizers, and LR schedulers that are provided
       by ``LightningModule``.
    3. patch the ``LightningModule`` methods that depend on a ``Trainer``.

    After inheriting this class, you need to override this function to initialize the adapted
    ``PyTorchTrial``. Within your ``__init__``, you should instantiate the ``LightningModule``
    and call ``super().__init__``.

    Here is a minimal code example.

    .. code-block:: python

        def __init__(self, context: PyTorchTrialContext) -> None:
            lm = mnist.LightningMNISTClassifier(lr=context.get_hparam('learning_rate'))
            super().__init__(context, lightning_module=lm)

    Arguments:
        context (PyTorchTrialContext)
        lightning_module (``LightningModule``):
            User-defined lightning module.
        precision (int, default=32):
            Precision to use. Accepted values are 16 and 32.
        amp_backend (str):
            Automatic mixed precision backend to use.
            Accepted values are "native" and "apex".
        amp_level (str, optional, default="O2"):
            Apex amp optimization level. Accepted values are "O0", "O1", "O2", and "O3".
            https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    """
    check.check_in(precision, {16, 32}, "only precisions 16 & 32 are supported.")
    check.check_in(amp_backend, {"native", "apex"}, 'only "native" and "apex" are supported')

    check_compatibility(lightning_module)
    override_unsupported_nud(lightning_module, context)

    if precision == 16 and amp_backend == "native":
        context.experimental.use_amp()

    context.wrap_model(lightning_module)

    pls = _LightningAdapterState(context, lightning_module, [], [])
    self._pls = pls
    pls.optimizers, pls.lr_schedulers = self.setup_optimizers_schedulers()

    if precision == 16 and amp_backend == "apex":
        context.configure_apex_amp(
            context.models,
            context.optimizers,
            enabled=True,
            opt_level=amp_level,
        )

    # Set lightning_module properties.
    pls.lm.use_ddp = False
    pls.lm.use_ddp2 = False
    pls.lm.use_dp = False
    pls.lm.use_tpu = False
    type(pls.lm).local_rank = context.distributed.get_local_rank()  # type: ignore
    type(pls.lm).global_rank = context.distributed.get_rank()  # type: ignore
    pls.lm.to(context.device)
    use_amp = context.experimental._auto_amp or context._use_apex
    pls.lm.use_amp = use_amp
    pls.lm.precision = "mixed" if use_amp else precision  # type: ignore
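
# Illustrative subclass using Apex mixed precision (a hedged sketch; LightningAdapter
# is assumed to be the enclosing class, and MyLightningModule and the "learning_rate"
# hparam are hypothetical):
#
#     class MyTrial(LightningAdapter):
#         def __init__(self, context: PyTorchTrialContext) -> None:
#             lm = MyLightningModule(lr=context.get_hparam("learning_rate"))
#             super().__init__(
#                 context,
#                 lightning_module=lm,
#                 precision=16,
#                 amp_backend="apex",
#                 amp_level="O2",
#             )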