Esempio n. 1
0
def make_test_workloads(checkpoint_dir: pathlib.Path,
                        config: det.ExperimentConfig) -> workload.Stream:
    """
    Generate the workload stream used by a minimal local test run.

    The stream issues, in order: one training workload, one validation
    workload, one checkpoint workload and a terminate workload. Progress is
    reported with ``print``; collected metrics are only printed when
    ``util.debug_mode()`` is true.

    Arguments:
        checkpoint_dir:
            Path sent along with the checkpoint workload.
        config:
            Experiment configuration; only ``batches_per_step()`` is read
            here.
    """
    response_interceptor = workload.WorkloadResponseInterceptor()

    # --- training ---------------------------------------------------------
    print("Training one batch")
    train_request = workload.train_workload(1)
    yield from response_interceptor.send(train_request,
                                         [config.batches_per_step()])
    train_result = response_interceptor.metrics_result()
    per_batch = train_result["batch_metrics"]
    # One metrics entry is expected for every batch in the step.
    check.eq(len(per_batch), config.batches_per_step())
    if util.debug_mode():
        print(f"Finished training, metrics: {per_batch}")

    # --- validation -------------------------------------------------------
    print("Validating one step")
    validation_request = workload.validation_workload(1)
    yield from response_interceptor.send(validation_request, [])
    validation_result = response_interceptor.metrics_result()
    validation_metrics = validation_result["validation_metrics"]
    if util.debug_mode():
        print(
            f"Finished validating, validation metrics: {validation_metrics}")

    # --- checkpoint -------------------------------------------------------
    # The responses to the checkpoint and terminate workloads are discarded,
    # so these are yielded directly instead of going through the interceptor.
    print(f"Saving a checkpoint to {checkpoint_dir}.")
    checkpoint_request = workload.checkpoint_workload()
    yield (checkpoint_request, [checkpoint_dir],
           workload.ignore_workload_response)
    print(f"Finished saving a checkpoint to {checkpoint_dir}.")

    # --- shutdown ---------------------------------------------------------
    terminate_request = workload.terminate_workload()
    yield terminate_request, [], workload.ignore_workload_response
    print("The test experiment passed.")
Esempio n. 2
0
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Build an instance of a Trial class for local use, e.g. to debug trial
    logic in a development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
            NOTE(review): this argument is not referenced anywhere in the
            function body -- confirm how it is meant to reach the trial.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default
            is used.
        hparams:
            Optional hyperparameters, forwarded unchanged to
            ``det._make_local_execution_env``.

    Returns:
        The object produced by calling ``trial_def`` with a freshly created
        trial context.
    """
    # Debug logging is on if either the process-wide debug switch or the
    # experiment configuration asks for it.
    debug_logging = (util.debug_mode()
                     or det.ExperimentConfig(config or {}).debug_enabled())
    determined_common.set_logger(debug_logging)

    # The middle element (rendezvous info) is not needed to build a context.
    env, _, hvd_config = det._make_local_execution_env(False, config, hparams)

    context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(context)
Esempio n. 3
0
        def die(message: str, always_print_traceback: bool = False) -> None:
            """
            Report a fatal error and exit through ``parser.exit`` with status 1.

            The traceback of the exception currently being handled is printed
            first when ``always_print_traceback`` is true or ``debug_mode()``
            returns a truthy value. ``parser``, ``colored`` and ``debug_mode``
            come from the enclosing scope.
            """
            show_traceback = always_print_traceback or debug_mode()
            if show_traceback:
                # Imported lazily: only needed on this diagnostic path.
                import traceback

                traceback.print_exc()

            red_message = colored(message + "\n", "red")
            parser.exit(1, red_message)
Esempio n. 4
0
def test_one_batch(
    context_path: pathlib.Path,
    trial_class: Optional[Type[det.Trial]] = None,
    config: Optional[Dict[str, Any]] = None,
) -> None:
    """
    Run a minimal test experiment in the current process.

    Arguments:
        context_path:
            Path handed to ``local_execution_manager`` for the duration of
            the run.
        trial_class:
            Trial class to run. When omitted, it is loaded from the
            ``"entrypoint"`` key of the generated experiment configuration.
        config:
            Optional experiment configuration. The caller's dictionary is not
            modified; a copy with ``batches_per_step`` forced to 1 is used.
    """
    # Override the batches_per_step value to 1.
    # TODO(DET-2931): Make the validation step a single batch as well.
    config = {**(config or {}), "batches_per_step": 1}

    print("Running a minimal test experiment locally")
    # Use the temporary directory as a context manager so that it is removed
    # even when environment setup, trial loading or the controller raises.
    with tempfile.TemporaryDirectory() as checkpoint_dir_name:
        env, workloads, rendezvous_info, hvd_config = make_test_experiment_env(
            checkpoint_dir=pathlib.Path(checkpoint_dir_name), config=config)
        print(f"Using hyperparameters: {env.hparams}")
        if util.debug_mode():
            print(f"Using a test experiment config: {env.experiment_config}")

        with local_execution_manager(context_path):
            if not trial_class:
                if util.debug_mode():
                    print("Loading trial class from experiment configuration")
                trial_class = load.load_trial_implementation(
                    env.experiment_config["entrypoint"])

            controller = load.load_controller_from_trial(
                trial_class=trial_class,
                env=env,
                workloads=workloads,
                load_path=None,
                rendezvous_info=rendezvous_info,
                hvd_config=hvd_config,
            )
            controller.run()

    print(
        "Note: to submit an experiment to the cluster, change mode argument to Mode.CLUSTER"
    )
Esempio n. 5
0
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    """
    Configure logging, then dispatch to local testing or cluster mode.

    When ``local`` is false, every argument except ``local`` is forwarded to
    ``_init_cluster_mode`` and its result is returned.

    When ``local`` is true, ``test_one_batch`` is run inside
    ``det._local_execution_manager`` for the resolved ``context_dir`` and its
    result is returned. Local runs always go through ``test_one_batch``; a
    warning is logged if ``test`` was not set.
    """
    debug_logging = (util.debug_mode()
                     or det.ExperimentConfig(config or {}).debug_enabled())
    determined_common.set_logger(debug_logging)

    # Cluster path first, so the local path below reads without nesting.
    if not local:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )

    if not test:
        logging.warning("local training is not supported, testing instead")

    resolved_context_dir = pathlib.Path(context_dir).resolve()
    with det._local_execution_manager(resolved_context_dir):
        return test_one_batch(
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            trial_class=trial_def,
            config=config,
        )
Esempio n. 6
0
def _init_native(
    controller_cls: Type[det.TrialController],
    native_context_cls: Type[det.NativeContext],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    """
    Build a native context for the requested mode.

    Arguments:
        controller_cls:
            Controller class; its ``pre_execute_hook`` is invoked before the
            context is created, and in LOCAL mode ``from_native`` builds the
            controller that is run.
        native_context_cls:
            Class instantiated with ``env`` and ``hvd_config`` to produce the
            returned context.
        config: Optional experiment configuration dictionary.
        mode: A ``Mode`` member, or a value accepted by ``Mode(...)``.
        context_dir, command, master_url:
            Forwarded to ``create_experiment`` when an experiment is
            submitted from CLUSTER mode.

    Returns:
        The native context. In CLUSTER mode with ``load.RunpyGlobals`` not
        initialized, the function does not return: it submits the experiment
        and calls ``sys.exit(0)``.

    Raises:
        errors.InvalidExperimentException: if the resolved mode is neither
            ``Mode.CLUSTER`` nor ``Mode.LOCAL``.
    """
    # Resolve the mode once instead of re-evaluating Mode(mode) per branch.
    selected_mode = Mode(mode)

    if selected_mode == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            runpy_globals = load.RunpyGlobals.get_instance()
            controller_cls.pre_execute_hook(
                env=runpy_globals.env,
                hvd_config=runpy_globals.hvd_config,
            )
            context = native_context_cls(
                env=runpy_globals.env,
                hvd_config=runpy_globals.hvd_config,
            )
            load.RunpyGlobals.set_runpy_native_result(context, controller_cls)
            context._set_train_fn(_stop_loading_implementation)
            return context

        else:
            create_experiment(config=config,
                              context_dir=context_dir,
                              command=command,
                              master_url=master_url)
            print("Exiting the program after submitting the experiment.")
            sys.exit(0)

    elif selected_mode == Mode.LOCAL:
        print("Running a minimal test experiment locally")
        checkpoint_dir = tempfile.TemporaryDirectory()
        env, workloads, rendezvous_info, hvd_config = make_test_experiment_env(
            checkpoint_dir=pathlib.Path(checkpoint_dir.name), config=config)
        print(f"Using hyperparameters: {env.hparams}")
        if util.debug_mode():
            print(f"Using a test experiment config: {env.experiment_config}")

        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(env=env, hvd_config=hvd_config)

        def train_fn() -> None:
            # Remove the temporary checkpoint directory even if building or
            # running the controller fails; previously it was only removed
            # after a successful run.
            try:
                controller = controller_cls.from_native(
                    context=context,
                    env=env,
                    workloads=workloads,
                    load_path=None,
                    rendezvous_info=rendezvous_info,
                    hvd_config=hvd_config,
                )
                controller.run()
            finally:
                checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        return context

    else:
        raise errors.InvalidExperimentException(
            "Must use either local mode or cluster mode.")
Esempio n. 7
0
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> None:
    """
    Create an experiment from a trial class.

    Arguments:
        trial_def:
            A class definition implementing the ``det.Trial`` interface.
        config:
            A dictionary holding the experiment configuration to associate
            with the experiment.
        mode:
            The :py:class:`determined.experimental.Mode` used when creating
            the experiment:

            1. ``Mode.CLUSTER`` (default): submit the experiment to a remote
            Determined cluster.

            2. ``Mode.LOCAL``: test the experiment inside the calling Python
            process, for local development / debugging. This runs through a
            minimal loop of training, validation, and checkpointing steps.

        context_dir:
            A string filepath naming the context directory. All model code
            is executed with this directory as the current working directory.

            In CLUSTER mode this argument is required. Every file in the
            directory is uploaded to the Determined cluster, and the total
            size of the directory must be under 96 MB.

            In LOCAL mode this argument is optional and defaults to the
            current working directory.
        command:
            A list of strings used as the entrypoint of the training script
            in the Determined task environment. When this function is run
            from a python script, the value is inferred to be ``sys.argv`` by
            default. When it is run from IPython or a Jupyter notebook, the
            argument is required.

            Example: creating an experiment with "python train.py --flag
            value" infers the default command ["train.py", "--flag",
            "value"].

        master_url:
            An optional string to use as the Determined master URL in submit
            mode. When not given, it is inferred from the environment
            variable ``DET_MASTER``.

    Raises:
        errors.InvalidExperimentException: if the resolved mode is neither
            ``Mode.CLUSTER`` nor ``Mode.LOCAL``.
    """
    # TODO: Add a reference to the local development tutorial.

    debug_logging = util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled()
    det._set_logger(debug_logging)

    selected_mode = Mode(mode)

    if selected_mode == Mode.LOCAL:
        # An empty context_dir means "use the current working directory".
        if context_dir:
            context_path = pathlib.Path(context_dir)
        else:
            context_path = pathlib.Path.cwd()
        test_one_batch(context_path, trial_class=trial_def, config=config)
        return

    if selected_mode != Mode.CLUSTER:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")

    if not load.RunpyGlobals.is_initialized():
        create_experiment(
            config=config, context_dir=context_dir, command=command, master_url=master_url
        )
        return

    # RunpyGlobals is initialized: record the trial class together with its
    # controller class, then stop loading the implementation.
    trial_controller_cls = cast(Type[det.TrialController], trial_def.trial_controller_class)
    load.RunpyGlobals.set_runpy_trial_result(trial_def, trial_controller_cls)
    _stop_loading_implementation()