Example #1
def create_default_env_context(
        experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )
Example #2
def do_test_launch(config: Dict[str, Any], cmd: List[str],
                   mock_popen: mock.MagicMock) -> None:
    mock_proc = mock.MagicMock()
    mock_proc.wait.return_value = 99
    mock_popen.return_value = mock_proc
    assert launch.launch(det.ExperimentConfig(config)) == 99
    mock_popen.assert_called_once_with(cmd)
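For context, a hedged sketch of how this helper might be driven from a test; the patch target, config contents, and expected command are illustrative assumptions, not the launcher's real interface:

from typing import Any, Dict, List
from unittest import mock

@mock.patch("subprocess.Popen")
def test_launch_exit_code(mock_popen: mock.MagicMock) -> None:
    # Hypothetical config and command; the real launch module decides its own cmd.
    config: Dict[str, Any] = {"entrypoint": "model_def:MyTrial"}
    expected_cmd: List[str] = ["python3", "-m", "my_launcher", "model_def:MyTrial"]
    do_test_launch(config, expected_cmd, mock_popen)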
Example #3
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Create a trial instance from a Trial class definition. This can be a useful
    utility for debugging your trial logic in any development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default
            is used.
    """
    determined_common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        False, config, hparams)
    trial_context = trial_def.trial_context_class(env, hvd_config)
    return trial_def(trial_context)
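As a usage illustration only, a user-defined trial could be instantiated for local debugging roughly like this; MyTrial and the hyperparameter values are placeholders, not part of the library:

# Hypothetical debugging call; MyTrial is the user's det.Trial subclass.
trial = create_trial_instance(
    trial_def=MyTrial,
    checkpoint_dir="/tmp/det-checkpoints",
    hparams={"global_batch_size": 32, "learning_rate": 0.01},
)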
Example #4
    def __init__(
        self,
        master_addr: str,
        master_port: int,
        use_tls: bool,
        master_cert_file: Optional[str],
        master_cert_name: Optional[str],
        container_id: str,
        experiment_config: Dict[str, Any],
        hparams: Dict[str, Any],
        initial_workload: workload.Workload,
        latest_checkpoint: Optional[Dict[str, Any]],
        use_gpu: bool,
        container_gpus: List[str],
        slot_ids: List[int],
        debug: bool,
        workload_manager_type: str,
        det_rendezvous_port: str,
        det_trial_unique_port_offset: int,
        det_trial_runner_network_interface: str,
        det_trial_id: str,
        det_experiment_id: str,
        det_agent_id: str,
        det_cluster_id: str,
        det_task_token: str,
        trial_seed: int,
        managed_training: bool,
        test_mode: bool,
        on_cluster: bool,
    ):
        self.master_addr = master_addr
        self.master_port = master_port
        self.use_tls = use_tls
        self.master_cert_file = master_cert_file
        self.master_cert_name = master_cert_name
        self.container_id = container_id
        self.experiment_config = det.ExperimentConfig(experiment_config)
        self.hparams = hparams
        self.initial_workload = initial_workload
        self.latest_checkpoint = latest_checkpoint
        self.use_gpu = use_gpu
        self.container_gpus = container_gpus
        self.slot_ids = slot_ids
        self.debug = debug
        self.workload_manager_type = workload_manager_type
        self.det_rendezvous_port = det_rendezvous_port
        self.det_trial_unique_port_offset = det_trial_unique_port_offset
        self.det_trial_runner_network_interface = det_trial_runner_network_interface
        self.det_trial_id = det_trial_id
        self.det_experiment_id = det_experiment_id
        self.det_agent_id = det_agent_id
        self.det_cluster_id = det_cluster_id
        self.det_task_token = det_task_token
        self.trial_seed = trial_seed
        self.managed_training = managed_training
        self.test_mode = test_mode
        self.on_cluster = on_cluster

        self._per_slot_batch_size, self._global_batch_size = self._calculate_batch_sizes(
        )
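The final assignment above splits the global batch size across slots. A minimal sketch of that convention, assuming the hyperparameters carry global_batch_size and the experiment config exposes a slot count; this is not the library's _calculate_batch_sizes():

from typing import Any, Dict, Tuple

def _calculate_batch_sizes_sketch(hparams: Dict[str, Any], slots_per_trial: int) -> Tuple[int, int]:
    # Per-slot batch size is the global batch size divided evenly across slots.
    global_batch_size = int(hparams["global_batch_size"])
    per_slot_batch_size = global_batch_size // max(slots_per_trial, 1)
    return per_slot_batch_size, global_batch_size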
Example #5
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(config,
                                         managed_training=managed_training,
                                         test_mode=test_mode))
    hparams = hparams or api.generate_random_hparam_values(
        config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1,
                                                 config.scheduling_unit()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.
        AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0)
    hvd_config = horovod.HorovodContext.from_configs(env.experiment_config,
                                                     rendezvous_info,
                                                     env.hparams)

    return env, rendezvous_info, hvd_config
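A sketch of how a caller might consume the returned triple; the config and hparams values are placeholders, and only attributes that appear elsewhere in these examples (env.hparams, hvd_config.use) are touched:

# Hypothetical caller; config and hparams contents are illustrative.
env, rendezvous_info, hvd_config = _make_local_execution_env(
    managed_training=False,
    test_mode=True,
    config={"description": "local-debug"},
    hparams={"global_batch_size": 32},
)
print(env.hparams, hvd_config.use)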
Example #6
def make_default_env_context(hparams: Dict[str, Any],
                             experiment_config: Optional[Dict] = None,
                             trial_seed: int = 0) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment variables,
    # and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="TRIAL_WORKLOAD_MANAGER",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.
        AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_agent_id="1",
        det_cluster_id="uuid-123",
        det_task_token="",
        trial_seed=trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )
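A minimal usage sketch for this test helper; the hyperparameter values are placeholders:

# Hypothetical test usage of the helper above.
env = make_default_env_context(hparams={"global_batch_size": 32}, trial_seed=42)
assert env.hparams["global_batch_size"] == 32
assert env.trial_seed == 42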
Example #7
 def __init__(
     self,
     master_url: str,
     master_cert_file: Optional[str],
     master_cert_name: Optional[str],
     experiment_config: Dict[str, Any],
     hparams: Dict[str, Any],
     latest_checkpoint: Optional[str],
     steps_completed: int,
     use_gpu: bool,
     container_gpus: List[str],
     slot_ids: List[int],
     debug: bool,
     det_trial_unique_port_offset: int,
     det_trial_id: str,
     det_experiment_id: str,
     det_agent_id: str,
     det_cluster_id: str,
     trial_seed: int,
     trial_run_id: int,
     allocation_id: str,
     managed_training: bool,
     test_mode: bool,
     on_cluster: bool,
 ):
     self.master_url = master_url
     self.master_cert_file = master_cert_file
     self.master_cert_name = master_cert_name
     self.experiment_config = det.ExperimentConfig(experiment_config)
     self.hparams = hparams
     self.latest_checkpoint = latest_checkpoint
     self.steps_completed = steps_completed
     self.use_gpu = use_gpu
     self.container_gpus = container_gpus
     self.slot_ids = slot_ids
     self.debug = debug
     self.det_trial_unique_port_offset = det_trial_unique_port_offset
     self.det_trial_id = det_trial_id
     self.det_experiment_id = det_experiment_id
     self.det_agent_id = det_agent_id
     self.det_cluster_id = det_cluster_id
     self.trial_seed = trial_seed
     self.trial_run_id = trial_run_id
     self.allocation_id = allocation_id
     self.managed_training = managed_training
     self.test_mode = test_mode
     self.on_cluster = on_cluster
Example #8
def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo,
           horovod.HorovodContext]:
    config = det.ExperimentConfig(_make_local_test_experiment_config(config))
    hparams = hparams or _generate_test_hparam_values(config)
    use_gpu, container_gpus, slot_ids = _get_gpus()
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT+1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1,
                                                 config.batches_per_step()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_runner_network_interface=constants.
        AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=config.experiment_seed(),
    )
    workloads = _make_test_workloads(checkpoint_dir.joinpath("checkpoint"),
                                     config)
    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0)
    hvd_config = horovod.HorovodContext.from_configs(env.experiment_config,
                                                     rendezvous_info,
                                                     env.hparams)

    return env, workloads, rendezvous_info, hvd_config
Example #9
def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    checkpoint_dir: str,
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[core.Context, det.EnvContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(config,
                                         checkpoint_dir,
                                         managed_training=managed_training,
                                         test_mode=test_mode))
    hparams = hparams or api.generate_random_hparam_values(
        config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)

    env = det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config=config,
        hparams=hparams,
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        det_trial_unique_port_offset=0,
        det_trial_id="",
        det_agent_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        trial_run_id=1,
        allocation_id="",
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    core_context = core._dummy_init()

    return core_context, env
Example #10
    def __init__(
        self,
        master_addr: str,
        master_port: int,
        container_id: str,
        experiment_config: Dict[str, Any],
        hparams: Dict[str, Any],
        initial_workload: workload.Workload,
        latest_checkpoint: Optional[Dict[str, Any]],
        use_gpu: bool,
        container_gpus: List[str],
        slot_ids: List[int],
        debug: bool,
        workload_manager_type: str,
        det_rendezvous_ports: str,
        det_trial_runner_network_interface: str,
        det_trial_id: str,
        det_experiment_id: str,
        det_cluster_id: str,
        trial_seed: int,
        training: bool = True,
    ):
        self.master_addr = master_addr
        self.master_port = master_port
        self.container_id = container_id
        self.experiment_config = det.ExperimentConfig(experiment_config)
        self.hparams = hparams
        self.initial_workload = initial_workload
        self.latest_checkpoint = latest_checkpoint
        self.use_gpu = use_gpu
        self.container_gpus = container_gpus
        self.slot_ids = slot_ids
        self.debug = debug
        self.workload_manager_type = workload_manager_type
        self.det_rendezvous_ports = det_rendezvous_ports
        self.det_trial_runner_network_interface = det_trial_runner_network_interface
        self.det_trial_id = det_trial_id
        self.det_experiment_id = det_experiment_id
        self.det_cluster_id = det_cluster_id
        self.trial_seed = trial_seed
        self.training = training

        self._per_slot_batch_size, self._global_batch_size = self._calculate_batch_sizes(
        )
Example #11
def create_trial_instance(
    trial_def: Type[det.Trial],
    checkpoint_dir: str,
    config: Optional[Dict[str, Any]] = None,
    hparams: Optional[Dict[str, Any]] = None,
) -> det.Trial:
    """
    Deprecated: please use your TrialContext's .from_config() method instead.

    Create a trial instance from a Trial class definition. This can be a useful
    utility for debugging your trial logic in any development environment.

    Arguments:
        trial_def: A class definition that inherits from the det.Trial interface.
        checkpoint_dir:
            The checkpoint directory that the trial will use for loading and
            saving checkpoints.
        config:
            An optional experiment configuration that is used to initialize the
            :class:`determined.TrialContext`. If not specified, a minimal default
            is used.
    """
    warnings.warn(
        "det.experimental.create_trial_instance() is now deprecated.  Please use\n"
        "your TrialContext's .from_config() method instead.  Example\n"
        "\n"
        "    context = PyTorchTrialContext.from_config()\n"
        "    my_trial = MyPyTorchTrial(context)\n",
        FutureWarning,
    )
    determined.common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())
    env, rendezvous_info, hvd_config = det._make_local_execution_env(
        managed_training=False,
        test_mode=False,
        config=config,
        hparams=hparams)
    trial_context = trial_def.trial_context_class(
        env, hvd_config, rendezvous_info=rendezvous_info)
    return trial_def(trial_context)
Example #12
def init_native(
    trial_def: Optional[Type[det.Trial]] = None,
    controller_cls: Optional[Type[det.TrialController]] = None,
    native_context_cls: Optional[Type[det.NativeContext]] = None,
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    determined.common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())

    if local:
        if not test:
            logging.warning("local training is not supported, testing instead")

        with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
            return test_one_batch(
                controller_cls=controller_cls,
                native_context_cls=native_context_cls,
                trial_class=trial_def,
                config=config,
            )

    else:
        return _init_cluster_mode(
            trial_def=trial_def,
            controller_cls=controller_cls,
            native_context_cls=native_context_cls,
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )
Example #13
def _init_native(
    controller_cls: Type[det.TrialController],
    native_context_cls: Type[det.NativeContext],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    det._set_logger(util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled())

    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            controller_cls.pre_execute_hook(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            context = native_context_cls(
                env=load.RunpyGlobals.get_instance().env,
                hvd_config=load.RunpyGlobals.get_instance().hvd_config,
            )
            load.RunpyGlobals.set_runpy_native_result(context, controller_cls)
            context._set_train_fn(_stop_loading_implementation)
            return context

        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )
            logging.info("Exiting the program after submitting the experiment.")
            sys.exit(0)

    elif Mode(mode) == Mode.LOCAL:
        logging.info("Running a minimal test experiment locally")
        checkpoint_dir = tempfile.TemporaryDirectory()
        env, workloads, rendezvous_info, hvd_config = make_test_experiment_env(
            checkpoint_dir=pathlib.Path(checkpoint_dir.name), config=config
        )
        logging.info(f"Using hyperparameters: {env.hparams}")
        logging.debug(f"Using a test experiment config: {env.experiment_config}")

        controller_cls.pre_execute_hook(env=env, hvd_config=hvd_config)
        context = native_context_cls(env=env, hvd_config=hvd_config)

        def train_fn() -> None:
            controller = controller_cls.from_native(
                context=context,
                env=env,
                workloads=workloads,
                load_path=None,
                rendezvous_info=rendezvous_info,
                hvd_config=hvd_config,
            )
            controller.run()
            checkpoint_dir.cleanup()

        context._set_train_fn(train_fn)
        return context

    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")
Example #14
    def __init__(self, trial_inst: det.Trial, *args: Any,
                 **kwargs: Any) -> None:
        super().__init__(*args, **kwargs)

        check.is_instance(trial_inst, PyTorchTrial,
                          "PyTorchTrialController needs an PyTorchTrial")
        self.trial = cast(PyTorchTrial, trial_inst)
        self.context = cast(PyTorchTrialContext, self.context)
        self.callbacks = self.trial.build_callbacks()

        # TODO(DET-3262): remove this backward compatibility of old interface.
        if (util.is_overridden(self.trial.build_model, PyTorchTrial)
                or util.is_overridden(self.trial.optimizer, PyTorchTrial)
                or util.is_overridden(self.trial.create_lr_scheduler,
                                      PyTorchTrial)):
            check.true(
                util.is_overridden(self.trial.build_model, PyTorchTrial)
                and util.is_overridden(self.trial.optimizer, PyTorchTrial),
                "Both build_model() and optimizer() must be defined "
                "if any of build_model(), optimizer(), and create_lr_scheduler() are defined. "
                "If you want to use the new interface, you should instead instantiate your models, "
                "optimizers, and LR schedulers in __init__ and call context.backward(loss) "
                "and context.step_optimizer(optimizer) in train_batch.",
            )

            model = self.context._Model(self.trial.build_model())
            optim = self.context._Optimizer(self.trial.optimizer(model))

            lr_scheduler = self.trial.create_lr_scheduler(optim)
            if lr_scheduler is not None:
                self.context.lr_schedulers.append(lr_scheduler)

            if det.ExperimentConfig(self.context.get_experiment_config()
                                    ).mixed_precision_enabled():
                self.context._configure_apex_amp(
                    models=model,
                    optimizers=optim,
                    opt_level=self.context.get_experiment_config().get(
                        "optimizations", {}).get("mixed_precision", "O0"),
                )

            train_batch = self.trial.train_batch

            def new_train_batch(
                    batch: TorchData, model: nn.Module, epoch_idx: int,
                    batch_idx: int) -> Union[torch.Tensor, Dict[str, Any]]:
                tr_metrics = train_batch(batch, model, epoch_idx, batch_idx)
                if isinstance(tr_metrics, torch.Tensor):
                    tr_metrics = {"loss": tr_metrics}
                check.is_instance(
                    tr_metrics,
                    dict,
                    "train_batch() must return a dictionary "
                    f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
                )
                check.is_in("loss", tr_metrics.keys(),
                            'Please include "loss" in you training metrics.')

                def clip_grads(parameters: Iterator) -> None:
                    for callback in self.callbacks.values():
                        callback.on_before_optimizer_step(parameters)

                self.context._backward(tr_metrics["loss"])
                self.context._step_optimizer(self.context.optimizers[0],
                                             clip_grads=clip_grads)
                return tr_metrics

            self.trial.__setattr__("train_batch", new_train_batch)

        check.gt_eq(
            len(self.context.models),
            1,
            "Must have at least one model. "
            "This might be caused by not wrapping your model with Model()",
        )
        check.gt_eq(
            len(self.context.optimizers),
            1,
            "Must have at least one optimizer. "
            "This might be caused by not wrapping your model with Optimizer()",
        )
        self._check_evaluate_implementation()

        # Validation loader will be undefined on process ranks > 0
        # when the user defines `validate_full_dataset()`.
        self.validation_loader = None  # type: Optional[torch.utils.data.DataLoader]
        self._set_data_loaders()

        # If a load path is provided load weights and restore the data location.
        self._load()

        if self.hvd_config.use:
            hvd.broadcast_parameters(self.context._main_model.state_dict(),
                                     root_rank=0)
            for optimizer in self.context.optimizers:
                hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        self.training_iterator = iter(self.training_loader)
Example #15
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    local: bool = False,
    test: bool = False,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> Any:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def:
            A class definition implementing the :class:`determined.Trial`
            interface.

        config:
            A dictionary representing the experiment configuration to be
            associated with the experiment.

        local:
            A boolean indicating if training should be done locally. When
            ``False``, the experiment will be submitted to the Determined
            cluster. Defaults to ``False``.

        test:
            A boolean indicating if the experiment should be shortened
            to a minimal loop of training on a small amount of data,
            performing validation, and checkpointing.  ``test=True`` is
            useful for quick iteration during model porting or debugging
            because common errors will surface more quickly.  Defaults
            to ``False``.

        context_dir:
            A string filepath that defines the context directory. All model
            code will be executed with this as the current working directory.

            When ``local=False``, this argument is required. All files in this
            directory will be uploaded to the Determined cluster. The total
            size of this directory must be under 96 MB.

            When ``local=True``, this argument is optional and defaults to
            the current working directory.

        command:
            A list of strings that is used as the entrypoint of the training
            script in the Determined task environment. When executing this
            function via a Python script, this argument is inferred to be
            ``sys.argv`` by default. When executing this function via IPython
            or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running ``python train.py
            --flag value``, the default command is inferred as ``["train.py",
            "--flag", "value"]``.

        master_url:
            An optional string to use as the Determined master URL when
            ``local=False``. If not specified, will be inferred from the
            environment variable ``DET_MASTER``.
    """

    if local and not test:
        raise NotImplementedError(
            "det.create(local=True, test=False) is not yet implemented. Please set local=False "
            "or test=True.")

    determined.common.set_logger(
        util.debug_mode()
        or det.ExperimentConfig(config or {}).debug_enabled())

    if local:
        # Local test mode.
        with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
            return test_one_batch(
                trial_class=trial_def,
                config=config,
            )

    elif not load.RunpyGlobals.is_initialized():
        # Cluster mode, but still running locally; submit the experiment.
        _submit_experiment(
            config=config,
            test=test,
            context_dir=context_dir,
            command=command,
            master_url=master_url,
        )

    else:
        # Cluster mode, now on the cluster; actually train.
        load.RunpyGlobals.set_runpy_trial_result(trial_def)
        raise det.errors.StopLoadingImplementation()
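Given the arguments documented above, a local test-mode call could look like the following sketch; MyTrial and the config contents are placeholders:

# Hypothetical local smoke test; per the check above, local=True requires test=True.
create(
    trial_def=MyTrial,
    config={"description": "local-smoke-test", "hyperparameters": {"global_batch_size": 32}},
    local=True,
    test=True,
    context_dir=".",
)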
Example #16
    def _apply_backwards_compatibility(self) -> None:
        # TODO(DET-3262): remove this backward compatibility of old interface.
        if (util.is_overridden(self.trial.build_model, PyTorchTrial)
                or util.is_overridden(self.trial.optimizer, PyTorchTrial)
                or util.is_overridden(self.trial.create_lr_scheduler,
                                      PyTorchTrial)):
            logging.warning(
                "build_model(), optimizer(), and create_lr_scheduler(), which belong to "
                "the old interface, are deprecated. Please see the following documentation "
                "of PyTorchTrial for the new interface \n"
                f"{PyTorchTrial.__doc__}")
            logging.warning(
                "The callback on_before_optimizer_step is deprecated."
                "Please use context.step_optimizer to clip gradients.")
            check.true(
                util.is_overridden(self.trial.build_model, PyTorchTrial)
                and util.is_overridden(self.trial.optimizer, PyTorchTrial),
                "Both build_model() and optimizer() must be defined "
                "if any of build_model(), optimizer(), and create_lr_scheduler() are defined. "
                "If you want to use the new interface, you should instead instantiate your models, "
                "optimizers, and LR schedulers in __init__ and call context.backward(loss) "
                "and context.step_optimizer(optimizer) in train_batch.",
            )

            model = self.context.wrap_model(self.trial.build_model())
            optim = self.context.wrap_optimizer(self.trial.optimizer(model))

            lr_scheduler = self.trial.create_lr_scheduler(optim)
            if lr_scheduler is not None:
                opt = getattr(lr_scheduler._scheduler, "optimizer", None)
                if opt is not None:
                    check.is_in(
                        opt,
                        self.context.optimizers,
                        "Must use a wrapped optimizer that is passed in by the optimizer "
                        "argument of create_lr_scheduler",
                    )
                self.context.lr_schedulers.append(lr_scheduler)

            if det.ExperimentConfig(self.context.get_experiment_config()
                                    ).mixed_precision_enabled():
                logging.warning(
                    "The experiment configuration field optimization.mixed_precision is deprecated."
                    "Please use configure_apex_amp in __init__ to configrue apex amp. "
                    "See the following documentation of PyTorchTrial for the new interface \n"
                    f"{PyTorchTrial.__doc__}")
                self.context.configure_apex_amp(
                    models=model,
                    optimizers=optim,
                    opt_level=self.context.get_experiment_config().get(
                        "optimizations", {}).get("mixed_precision", "O0"),
                )

            # Backward compatibility: train_batch
            train_batch = cast(Callable, self.trial.train_batch)

            def new_train_batch(batch: pytorch.TorchData, epoch_idx: int,
                                batch_idx: int) -> Any:
                tr_metrics = train_batch(
                    batch=batch,
                    model=model,
                    epoch_idx=epoch_idx,
                    batch_idx=batch_idx,
                )
                if isinstance(tr_metrics, torch.Tensor):
                    tr_metrics = {"loss": tr_metrics}
                check.is_instance(
                    tr_metrics,
                    dict,
                    "train_batch() must return a dictionary "
                    f"mapping string names to Tensor metrics, got {type(tr_metrics)}",
                )
                check.is_in("loss", tr_metrics.keys(),
                            'Please include "loss" in you training metrics.')

                def clip_grads(parameters: Iterator) -> None:
                    for callback in self.callbacks.values():
                        callback.on_before_optimizer_step(parameters)

                self.context.backward(tr_metrics["loss"])
                self.context.step_optimizer(self.context.optimizers[0],
                                            clip_grads=clip_grads)

                return tr_metrics

            self.trial.__setattr__("train_batch", new_train_batch)

            # Backward compatibility: evaluate_batch
            if self._evaluate_batch_defined():
                evaluate_batch = cast(Callable, self.trial.evaluate_batch)

                def new_evaluate_batch(batch: pytorch.TorchData) -> Any:
                    return evaluate_batch(model=model, batch=batch)

                self.trial.__setattr__("evaluate_batch", new_evaluate_batch)

            # Backward compatibility: evaluate_full_dataset
            if self._evaluate_full_dataset_defined():
                evaluate_full_dataset = cast(Callable,
                                             self.trial.evaluate_full_dataset)

                def new_evaluate_full_dataset(
                        data_loader: torch.utils.data.DataLoader) -> Any:
                    return evaluate_full_dataset(model=model,
                                                 data_loader=data_loader)

                self.trial.__setattr__("evaluate_full_dataset",
                                       new_evaluate_full_dataset)
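For contrast with this shim, the new-style interface that the warnings point to would look roughly like the sketch below; the network, optimizer, and loss are illustrative, and the data-loader and evaluation methods a full trial needs are omitted:

# Minimal sketch of the new interface: wrap objects in __init__, then call
# context.backward() and context.step_optimizer() inside train_batch().
class NewStyleTrial(PyTorchTrial):
    def __init__(self, context: PyTorchTrialContext) -> None:
        self.context = context
        self.model = context.wrap_model(torch.nn.Linear(10, 1))
        self.optimizer = context.wrap_optimizer(
            torch.optim.SGD(self.model.parameters(), lr=0.01))

    def train_batch(self, batch: pytorch.TorchData, epoch_idx: int,
                    batch_idx: int) -> Dict[str, Any]:
        data, labels = batch
        loss = torch.nn.functional.mse_loss(self.model(data), labels)
        self.context.backward(loss)
        self.context.step_optimizer(self.optimizer)
        return {"loss": loss}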
Example #17
def create(
    trial_def: Type[det.Trial],
    config: Optional[Dict[str, Any]] = None,
    mode: Mode = Mode.CLUSTER,
    context_dir: str = "",
    command: Optional[List[str]] = None,
    master_url: Optional[str] = None,
) -> None:
    # TODO: Add a reference to the local development tutorial.
    """
    Create an experiment.

    Arguments:
        trial_def:
            A class definition implementing the ``det.Trial`` interface.
        config:
            A dictionary representing the experiment configuration to be
            associated with the experiment.
        mode:
            The :py:class:`determined.experimental.Mode` used when creating
            an experiment.

            1. ``Mode.CLUSTER`` (default): Submit the experiment to a remote
            Determined cluster.

            2. ``Mode.LOCAL``: Test the experiment in the calling
            Python process for local development / debugging purposes.
            Run through a minimal loop of training, validation, and checkpointing steps.

        context_dir:
            A string filepath that defines the context directory. All model
            code will be executed with this as the current working directory.

            In CLUSTER mode, this argument is required. All files in this
            directory will be uploaded to the Determined cluster. The total
            size of this directory must be under 96 MB.

            In LOCAL mode, this argument is optional and assumed to be the
            current working directory by default.
        command:
            A list of strings that is used as the entrypoint of the training
            script in the Determined task environment. When executing this
            function via a Python script, this argument is inferred to be
            ``sys.argv`` by default. When executing this function via IPython
            or Jupyter notebook, this argument is required.

            Example: When creating an experiment by running "python train.py
            --flag value", the default command is inferred as ["train.py",
            "--flag", "value"].

        master_url:
            An optional string to use as the Determined master URL in submit
            mode. If not specified, will be inferred from the environment
            variable ``DET_MASTER``.
    """

    det._set_logger(util.debug_mode() or det.ExperimentConfig(config or {}).debug_enabled())
    if Mode(mode) == Mode.CLUSTER:
        if load.RunpyGlobals.is_initialized():
            load.RunpyGlobals.set_runpy_trial_result(
                trial_def, cast(Type[det.TrialController], trial_def.trial_controller_class)
            )
            _stop_loading_implementation()

        else:
            create_experiment(
                config=config, context_dir=context_dir, command=command, master_url=master_url
            )

    elif Mode(mode) == Mode.LOCAL:
        context_path = pathlib.Path(context_dir) if context_dir else pathlib.Path.cwd()
        test_one_batch(context_path, trial_class=trial_def, config=config)
    else:
        raise errors.InvalidExperimentException("Must use either local mode or cluster mode.")


Example #18
if __name__ == "__main__":
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = det.ExperimentConfig(info.trial._config)

    determined.common.set_logger(experiment_config.debug_enabled())

    logging.info(
        f"New trial runner in (container {resources_id}) on agent {info.agent_id}: "
        + json.dumps(mask_config_dict(info.trial._config)))

    # Perform validations
    try:
        logging.info("Validating checkpoint storage ...")
        storage.validate_config(
            experiment_config.get_checkpoint_storage(),
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
Example #19
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'
    experiment_config = det.ExperimentConfig(info.trial._config)
    determined.common.set_logger(experiment_config.debug_enabled())

    multi_machine = len(info.container_addrs) > 1
    check_deepspeed_version(multi_machine)

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    # If the NCCL_SOCKET_IFNAME environment variable wasn't explicitly set by
    # the user in the experiment's YAML file, then set it to the distributed
    # network interface, if the value of "dtrain_network_interface" under
    # "task_container_defaults" has been set in the "master.yaml".
    if is_using_cuda() and not is_nccl_socket_ifname_env_var_set():
        dtrain_network_interface = os.environ.get("DET_INTER_NODE_NETWORK_INTERFACE", None)

        if dtrain_network_interface is not None and len(dtrain_network_interface) > 0:
            os.environ["NCCL_SOCKET_IFNAME"] = dtrain_network_interface

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed which does not have good error handling for remote processes
        # spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    hostfile_path = get_hostfile_path(multi_machine)

    master_address = create_hostlist_file(
        hostfile_path=hostfile_path,
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    if not multi_machine:
        p = subprocess.Popen(full_cmd)
        with det.util.forward_signals(p):
            return p.wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()
    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # Chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # Chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        p = subprocess.Popen(full_cmd)
        with det.util.forward_signals(p):
            return p.wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()