Ejemplo n.º 1
0
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Build an instance of a Trial class for local debugging.

    Useful for exercising trial logic in any development environment without
    submitting an experiment to a cluster.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            Directory the trial will use for loading and saving checkpoints.
        config:
            Optional experiment configuration used to initialize the
            :class:`determined.TrialContext`; a minimal default is used when
            not specified.
        hparams:
            Optional hyperparameter dictionary for the trial.
    """
    # Enable debug logging when either the environment or the config asks for it.
    debug = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined_common.set_logger(debug)
    env, rendezvous_info, hvd_config = det._make_local_execution_env(False, config, hparams)
    context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(context)
Ejemplo n.º 2
0
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Run a minimal one-batch test experiment locally.

    Arguments:
        trial_class: A class definition that inherits from the det.Trial interface.
        config:
            Optional experiment configuration; ``scheduling_unit`` is forced
            to 1 so only a single batch runs.
    """
    # Override the scheduling_unit value to 1.
    config = {**(config or {}), "scheduling_unit": 1}

    logging.info("Running a minimal test experiment locally")
    # Use the context-manager form so the temporary checkpoint directory is
    # removed even when controller.run() raises (the original only cleaned up
    # on the success path and leaked the directory on error).
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        env, rendezvous_info, hvd_config = det._make_local_execution_env(
            managed_training=True, test_mode=True, config=config, limit_gpus=1)
        workloads = _make_test_workloads(
            pathlib.Path(checkpoint_dir).joinpath("checkpoint"),
            env.experiment_config)
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(f"Using a test experiment config: {env.experiment_config}.")

        # Case 2: test one batch for Trial implementation.
        controller = load.load_trial(
            trial_class=trial_class,
            env=env,
            workloads=workloads,
            load_path=None,
            rendezvous_info=rendezvous_info,
            hvd_config=hvd_config,
        )
        controller.run()
    logging.info(
        "Note: to submit an experiment to the cluster, change local parameter to False"
    )
Ejemplo n.º 3
0
def _load_trial_on_local(
        context_dir: pathlib.Path, training: bool, config: Dict[str, Any],
        hparams: Dict[str, Any]) -> Tuple[Type[det.Trial], det.TrialContext]:
    """Load the trial class named by ``config["entrypoint"]`` and build its context."""
    with det._local_execution_manager(context_dir):
        cls = load.load_trial_implementation(config["entrypoint"])
        # rendezvous info is not needed for a local context; discard it.
        env, _rendezvous_info, hvd_config = det._make_local_execution_env(
            training, config, hparams)
        ctx = cls.trial_context_class(env, hvd_config)
    return cls, ctx
Ejemplo n.º 4
0
def _load_trial_on_local(
    context_dir: pathlib.Path,
    managed_training: bool,
    config: Dict[str, Any],
    hparams: Dict[str, Any],
) -> Tuple[Type[det.Trial], det.TrialContext]:
    """Resolve the trial class from the config entrypoint and construct its context."""
    with det._local_execution_manager(context_dir):
        cls = load.trial_class_from_entrypoint(config["entrypoint"])
        env, rendezvous, hvd_config = det._make_local_execution_env(
            managed_training=managed_training,
            test_mode=False,
            config=config,
            hparams=hparams,
        )
        ctx = cls.trial_context_class(env, hvd_config, rendezvous)
    return cls, ctx
Ejemplo n.º 5
0
def test_one_batch(
    trial_class: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Run a trial for exactly one batch inside a throwaway local environment.

    Arguments:
        trial_class: A class definition that inherits from the det.Trial interface.
        config:
            Optional experiment configuration; ``scheduling_unit`` is forced to 1.
    """
    # Force a scheduling unit of one so a single batch is all that runs.
    config = {**(config or {}), "scheduling_unit": 1}

    logging.info("Running a minimal test experiment locally")
    with tempfile.TemporaryDirectory() as checkpoint_dir:
        core_context, env = det._make_local_execution_env(
            managed_training=True,
            test_mode=True,
            config=config,
            checkpoint_dir=checkpoint_dir,
            limit_gpus=1,
        )

        test_workloads = _make_test_workloads(env.experiment_config)
        logging.info(f"Using hyperparameters: {env.hparams}.")
        logging.debug(
            f"Using a test experiment config: {env.experiment_config}.")

        # The controller class must exist on the trial class; run its
        # pre-execution hook before building any contexts.
        backend = det._DistributedBackend()
        ctrl_cls = trial_class.trial_controller_class
        assert ctrl_cls is not None
        ctrl_cls.pre_execute_hook(env, backend)

        ctx = trial_class.trial_context_class(core_context, env)
        logging.info(f"Creating {trial_class.__name__}.")
        trial = trial_class(ctx)

        ctrl = ctrl_cls.from_trial(
            trial_inst=trial,
            context=ctx,
            env=env,
            workloads=test_workloads,
        )

        ctrl.run()

        logging.info("The test experiment passed.")
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
    def from_config(cls, config: Dict[str, Any]) -> "TrialContext":
        """
        Build a context object suitable for debugging a trial outside of Determined.

        An example for a subclass of :class:`~determined.pytorch._pytorch_trial.PyTorchTrial`:

        .. code-block:: python

            config = { ... }
            context = det.pytorch.PyTorchTrialContext.from_config(config)
            my_trial = MyPyTorchTrial(context)

            train_ds = my_trial.build_training_data_loader()
            for epoch_idx in range(3):
                for batch_idx, batch in enumerate(train_ds):
                    metrics = my_trial.train_batch(batch, epoch_idx, batch_idx)
                    ...

        An example for a subclass of :class:`~determined.keras._tf_keras_trial.TFKerasTrial`:

        .. code-block:: python

            config = { ... }
            context = det.keras.TFKerasTrialContext.from_config(config)
            my_trial = tf_keras_one_var_model.OneVarTrial(context)

            model = my_trial.build_model()
            model.fit(my_trial.build_training_data_loader())
            eval_metrics = model.evaluate(my_trial.build_validation_data_loader())

        Arguments:
            config: An experiment config file, in dictionary form.
        """
        # A local, unmanaged, non-test environment; checkpoints go to /tmp.
        local_env_args = dict(
            managed_training=False,
            test_mode=False,
            config=config,
            checkpoint_dir="/tmp",
            limit_gpus=1,
        )
        core_context, env = det._make_local_execution_env(**local_env_args)
        return cls(core_context, env)
Ejemplo n.º 7
0
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Deprecated: please use your TrialContext's .from_config() method instead.

    Build a trial instance from a Trial class definition, for debugging trial
    logic in a local development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            Directory the trial will use for loading and saving checkpoints.
        config:
            Optional experiment configuration used to initialize the
            :class:`determined.TrialContext`; a minimal default is used when
            not specified.
        hparams:
            Optional hyperparameter dictionary for the trial.
    """
    # Steer callers toward the supported replacement API.
    warnings.warn(
        "det.experimental.create_trial_instance() is now deprecated.  Please use\n"
        "your TrialContext's .from_config() method instead.  Example\n"
        "\n"
        "    context = PyTorchTrialContext.from_config()\n"
        "    my_trial = MyPyTorchTrial(context)\n",
        FutureWarning,
    )
    debug = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    determined.common.set_logger(debug)
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=False,
        test_mode=False,
        config=config,
        hparams=hparams,
    )
    context = trial_def.trial_context_class(
        env, hvd_config, rendezvous_info=rendezvous_info)
    return trial_def(context)
Ejemplo n.º 8
0
def test_one_batch(
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    trial_class: Optional[Type[det.Trial]] = None,
    config: Optional[Dict[str, Any]] = None,
) -> Any:
    """
    Run a minimal one-batch test experiment locally.

    Supports two calling conventions: the Native API (``controller_cls`` plus
    ``native_context_cls``, which returns a context whose train function runs
    the batch) and the Trial API (``trial_class``, which runs synchronously).

    Arguments:
        controller_cls: Controller class for the Native API path.
        native_context_cls: Context class for the Native API path.
        trial_class: A class definition that inherits from the det.Trial interface.
        config:
            Optional experiment configuration; ``scheduling_unit`` is forced to 1.

    Raises:
        errors.InternalException: if neither API's required arguments are given.
    """
    # Validate arguments up front, before allocating any resources: the
    # original created the temporary checkpoint directory first and leaked it
    # when it reached the error branch.
    if (native_context_cls is None or controller_cls is None) and trial_class is None:
        raise errors.InternalException(
            "Must provide a trial_def if using Trial API or "
            "a controller_cls and a native_context_cls if using Native API.")

    # Override the scheduling_unit value to 1.
    config = {**(config or {}), "scheduling_unit": 1}

    logging.info("Running a minimal test experiment locally")
    checkpoint_dir = tempfile.TemporaryDirectory()
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=True, test_mode=True, config=config, limit_gpus=1)
    workloads = _make_test_workloads(
        pathlib.Path(checkpoint_dir.name).joinpath("checkpoint"),
        env.experiment_config)
    logging.info(f"Using hyperparameters: {env.hparams}.")
    logging.debug(f"Using a test experiment config: {env.experiment_config}.")

    if native_context_cls is not None and controller_cls is not None:
        # Case 1: test one batch for Native implementation.  Cleanup of the
        # checkpoint dir is deferred into train_fn, which the caller triggers.
        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(
            env=env,
            hvd_config=hvd_config,
            rendezvous_info=rendezvous_info,
        )

        def train_fn() -> None:
            controller = cast(Type[det.TrialController],
                              controller_cls).from_native(
                                  context=context,
                                  env=env,
                                  workloads=workloads,
                                  load_path=None,
                                  rendezvous_info=rendezvous_info,
                                  hvd_config=hvd_config,
                              )
            try:
                controller.run()
            finally:
                # Remove the temporary directory even if the run fails
                # (the original leaked it when controller.run() raised).
                checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        logging.info(
            "Note: to submit an experiment to the cluster, change local parameter to False"
        )
        return context

    # Case 2: test one batch for Trial implementation.
    assert trial_class is not None
    controller = load.load_controller_from_trial(
        trial_class=trial_class,
        env=env,
        workloads=workloads,
        load_path=None,
        rendezvous_info=rendezvous_info,
        hvd_config=hvd_config,
    )
    try:
        controller.run()
    finally:
        # Guarantee cleanup on both the success and error paths.
        checkpoint_dir.cleanup()
    logging.info(
        "Note: to submit an experiment to the cluster, change local parameter to False"
    )