Example 1
def trigger_preemption(signum: int, frame: types.FrameType) -> None:
    info = det.get_cluster_info()
    if info and info.container_rank == 0:
        # Only the chief container requests preemption; other containers ignore the signal.
        logging.debug(
            f"[rank={info.container_rank}] SIGTERM: Preemption imminent.")
        # Notify the master that we need to be preempted
        api.post(
            info.master_url,
            f"/api/v1/allocations/{info.allocation_id}/signals/pending_preemption"
        )
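A handler like this only has an effect once it is registered for SIGTERM; a minimal sketch of that registration using the standard library (where exactly this happens in the real launcher is an assumption):

import signal

# Register the handler so a SIGTERM delivered to this process triggers the preemption request.
signal.signal(signal.SIGTERM, trigger_preemption)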
Example 2
def main(override_args: List[str], script: List[str]) -> int:
    override_args = override_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"

    single_slot = len(info.container_addrs) == 1 and len(info.slot_ids) <= 1

    # Detect single-slot trials and skip distributed launch
    if single_slot:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    os.environ["USE_TORCH_DISTRIBUTED"] = "True"

    chief_ip = info.container_addrs[0]
    os.environ["DET_CHIEF_IP"] = chief_ip

    torch_distributed_cmd = create_launch_cmd(
        len(info.container_addrs),
        len(info.slot_ids),
        info.container_rank,
        "localhost" if len(info.container_addrs) == 1 else chief_ip,
        override_args,
    )

    log_redirect_cmd = create_log_redirect_cmd()

    # Due to a bug in PyTorch, we need to wrap the launcher in pid_server/pid_client to correctly
    # handle errors and ensure workers don't hang when a process fails
    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))
    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    launch_cmd = pid_server_cmd + torch_distributed_cmd + pid_client_cmd + log_redirect_cmd + script

    logging.debug(f"Torch distributed launching with: {launch_cmd}")

    p = subprocess.Popen(launch_cmd)
    with det.util.forward_signals(p):
        return p.wait()
def _get_training_port_offset() -> int:
    info = det.get_cluster_info()
    if info and info.task_type == "TRIAL":
        return info.trial._unique_port_offset
    return 0
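The offset returned above is meant to be added to whatever base port a component listens on, so that multiple trials scheduled onto the same machine do not collide; a minimal sketch of that use, with BASE_PORT as an assumed placeholder constant:

BASE_PORT = 12350  # assumed placeholder; the real base port depends on the component

def pick_listen_port() -> int:
    # Shift the base port by the trial's unique offset to avoid clashes between
    # trials that share a machine.
    return BASE_PORT + _get_training_port_offset()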
Example 4
                new_dict["checkpoint_storage"][key] = mask
    except (KeyError, AttributeError):
        pass

    try:
        if new_dict["environment"]["registry_auth"].get(
                "password") is not None:
            new_dict["environment"]["registry_auth"]["password"] = mask
    except (KeyError, AttributeError):
        pass

    return new_dict


if __name__ == "__main__":
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = det.ExperimentConfig(info.trial._config)

    determined.common.set_logger(experiment_config.debug_enabled())

    logging.info(
        f"New trial runner in (container {resources_id}) on agent {info.agent_id}: "
        + json.dumps(mask_config_dict(info.trial._config)))
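The fragment at the top of this example is the tail of a config-masking helper; a simplified, self-contained sketch of the same idea (the registry_auth handling mirrors the fragment above, while the checkpoint_storage key names are assumed for illustration):

import copy
from typing import Any, Dict

def mask_config_dict_sketch(config: Dict[str, Any], mask: str = "********") -> Dict[str, Any]:
    new_dict = copy.deepcopy(config)

    # Hide credential-like fields in checkpoint_storage (key names assumed).
    try:
        for key in ("access_key", "secret_key"):
            if key in new_dict["checkpoint_storage"]:
                new_dict["checkpoint_storage"][key] = mask
    except (KeyError, TypeError):
        pass

    # Hide the registry password, mirroring the fragment above.
    try:
        if new_dict["environment"]["registry_auth"].get("password") is not None:
            new_dict["environment"]["registry_auth"]["password"] = mask
    except (KeyError, AttributeError, TypeError):
        pass

    return new_dict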
Example 5
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed which does not have good error handling for remote processes
        # spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()
    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # Chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # Chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()
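The util.check_sshd calls above boil down to waiting for a TCP port to accept connections before a deadline; a generic sketch of that idea (not the library's actual implementation):

import socket
import time

def wait_for_tcp(host: str, deadline: float, port: int) -> None:
    # Poll until (host, port) accepts a TCP connection or the deadline passes.
    while True:
        try:
            with socket.create_connection((host, port), timeout=1):
                return
        except OSError:
            if time.time() > deadline:
                raise TimeoutError(f"{host}:{port} did not become reachable in time")
            time.sleep(0.5)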
Example 6
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer.  We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.

    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics("trial_loaded", analytics.get_trial_analytics(trial_class))
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext object.
        # For harness.py, we only support a fixed set of Determined-provided launch layers, since
        # the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly.  Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0
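For context, load.trial_class_from_entrypoint resolves an entrypoint string such as "model_def:MyTrial" into a class; a generic sketch of that kind of resolution (simplified, not the library's actual implementation):

import importlib
from typing import Type

def class_from_entrypoint_sketch(entrypoint: str) -> Type:
    # Split "package.module:ClassName" into a module path and an attribute name,
    # then import the module and look up the class on it.
    module_name, _, class_name = entrypoint.partition(":")
    module = importlib.import_module(module_name)
    return getattr(module, class_name)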
Example 7
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug)

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}.")
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # Chief machine waits for every worker's sshd to be available.  All machines should be pretty
    # close to in-step by now because all machines just finished synchronizing rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id,
                                               len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    hvd_optional_args = experiment_config.get("data", {}).get("__det_dtrain_args", [])
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(
        f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.  If SLURM_JOBID is set when mpirun is
    # invoked, it triggers OpenMPI's SLURM integration; however, we are running in a singularity
    # container and SLURM may or may not have a compatible configuration enabled.  We therefore
    # clear SLURM_JOBID before invoking mpi so that mpirun honors the host and process-topology
    # args passed to it by horovodrun.  Otherwise, mpi tries to launch all -np processes on the
    # local node, causing an oversubscription error ("There are not enough slots available in
    # the system").
    os.environ.pop("SLURM_JOBID", None)
    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script)
    with det.util.forward_signals(p):
        return p.wait()
Example 8
def get_allocation_token() -> str:
    info = det.get_cluster_info()
    if info is None:
        return ""
    return info.session_token
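A hedged example of how such a token is commonly attached to requests against the master; the Bearer header scheme and the use of urllib here are assumptions for illustration, not taken from the source:

import urllib.request

def fetch_with_allocation_token(master_url: str, path: str) -> bytes:
    # Attach the allocation/session token as a bearer credential (header scheme assumed).
    req = urllib.request.Request(
        master_url.rstrip("/") + path,
        headers={"Authorization": f"Bearer {get_allocation_token()}"},
    )
    with urllib.request.urlopen(req) as resp:
        return resp.read()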
Example 9
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the Core
    API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context`` directly.
    Certain components of the Core API may be configured by passing arguments to ``core.init()``.
    The only arg that is required is a ``DistributedContext``, and even that is only required
    for multi-slot tasks.

    All of your training must occur within the scope of the ``with core.init() as core_context``
    block, as there are resources necessary for training which start in the ``core.Context``'s
    ``__enter__`` method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training.  Defaults to
            ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method.  See
            :class:`~determined.core.PreemptMode` for more detail.  Defaults to ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard
            metrics and profiling data are retained. See
            :class:`~determined.core.TensorboardMode` for more detail. Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )

    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info("no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
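Per the docstring above, the intended call pattern is the context-manager form; a minimal single-slot sketch, assuming a TRIAL task so that core_context.train is available (the loop body and metric values are placeholders):

from determined import core

def train_sketch() -> None:
    with core.init() as core_context:
        for batch_idx in range(100):
            loss = 1.0 / (batch_idx + 1)  # placeholder computation
            if batch_idx % 10 == 0:
                # Report metrics to the master via the Core API.
                core_context.train.report_training_metrics(
                    steps_completed=batch_idx, metrics={"loss": loss}
                )
            # Honor preemption requests, as described in the docstring above.
            if core_context.preempt.should_preempt():
                break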