def create_test_session() -> session.Session:
    murl = conf.make_master_url()
    certs.cli_cert = certs.default_load(murl)
    authentication.cli_auth = authentication.Authentication(murl,
                                                            try_reauth=True)
    return session.Session(murl, "determined", authentication.cli_auth,
                           certs.cli_cert)
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
Exemple #3
0
def main():
    port = os.environ["NOTEBOOK_PORT"]
    notebook_id = os.environ["DET_TASK_ID"]
    notebook_server = f"http://127.0.0.1:{port}/proxy/{notebook_id}"
    master_url = os.environ["DET_MASTER"]
    cert = certs.default_load(master_url)
    try:
        idle_type = IdleType[os.environ["NOTEBOOK_IDLE_TYPE"].upper()]
    except KeyError:
        logging.warning("unknown idle type '%s', using default value",
                        os.environ["NOTEBOOK_IDLE_TYPE"])
        idle_type = IdleType.KERNELS_OR_TERMINALS

    wait_for_jupyter(("127.0.0.1", int(port)))

    while True:
        try:
            idle = is_idle(notebook_server, idle_type)
            api.put(
                master_url,
                f"/api/v1/notebooks/{notebook_id}/report_idle",
                {
                    "notebook_id": notebook_id,
                    "idle": idle
                },
                cert=cert,
            )
        except Exception as e:
            logging.warning("ignoring error communicating with master",
                            exc_info=True)
        time.sleep(1)
Exemple #4
0
def num_experiments() -> int:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "experiments")
    assert r.status_code == requests.codes.ok, r.text
    return len(r.json())
Exemple #5
0
def get_command(command_id: str) -> Any:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "api/v1/commands/" + command_id)
    assert r.status_code == requests.codes.ok, r.text
    return r.json()["command"]
Exemple #6
0
def trial_metrics(trial_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "trials/{}/metrics".format(trial_id))
    json = r.json()  # type: Dict[str, Any]
    return json
    def __init__(
        self,
        master: Optional[str] = None,
        user: Optional[str] = None,
        password: Optional[str] = None,
        cert_path: Optional[str] = None,
        cert_name: Optional[str] = None,
        noverify: bool = False,
    ):
        master = master or util.get_default_master_address()

        cert = certs.default_load(
            master_url=master,
            explicit_path=cert_path,
            explicit_cert_name=cert_name,
            explicit_noverify=noverify,
        )

        # TODO: This should probably be try_reauth=False, but it appears that would break the case
        # where the default credentials are available from the master and could be discovered by
        # a REST API call against the master.
        auth = authentication.Authentication(master,
                                             user,
                                             password,
                                             try_reauth=True,
                                             cert=cert)

        self._session = session.Session(master, user, auth, cert)
Exemple #8
0
def cancel_experiment_v1(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.post(conf.make_master_url(),
                 "/api/v1/experiments/{}/cancel".format(experiment_id))
    r.raise_for_status()
    wait_for_experiment_state(experiment_id, "CANCELED")
Exemple #9
0
def main(ready: Pattern):
    master_url = str(os.environ["DET_MASTER"])
    cert = certs.default_load(master_url)
    allocation_id = str(os.environ["DET_ALLOCATION_ID"])
    for line in sys.stdin:
        if ready.match(line):
            post_ready(master_url, cert, allocation_id)
            return
Exemple #10
0
def experiment_json(experiment_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "experiments/{}".format(experiment_id))
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return json
Exemple #11
0
def trial_logs(trial_id: int) -> List[str]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    return [
        tl["message"]
        for tl in api.trial_logs(conf.make_master_url(), trial_id)
    ]
def get_num_running_commands() -> int:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "api/v1/commands")
    assert r.status_code == requests.codes.ok, r.text

    return len([command for command in r.json()["commands"] if command["state"] == "STATE_RUNNING"])
Exemple #13
0
def experiment_has_active_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "tasks").json()
    for task in r.values():
        if "Experiment {}".format(experiment_id) in task["name"] and len(
                task["containers"]) > 0:
            return True

    return False
Exemple #14
0
def experiment_has_completed_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    trials = experiment_trials(experiment_id)

    if not any(trials):
        return False

    return any(
        any(s["state"] == "COMPLETED" for s in t["steps"]) for t in trials)
Exemple #15
0
def cluster_slots() -> Dict[str, Any]:
    """
    cluster_slots returns a dict of slots that each agent has.
    :return:  Dict[AgentID, List[Slot]]
    """
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "agents")
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return {agent["id"]: agent["slots"].values() for agent in json.values()}
Exemple #16
0
def change_experiment_state(experiment_id: int, new_state: str) -> None:
    # TODO(DET-5678): refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    r = api.patch(
        conf.make_master_url(),
        "experiments/{}".format(experiment_id),
        headers={"Content-Type": "application/merge-patch+json"},
        body={"state": new_state},
    )
    assert r.status_code == requests.codes.no_content, r.text
Exemple #17
0
def main() -> None:
    if len(sys.argv) != 2:
        print("worker_process_env_path must be provided as a commandline argument", file=sys.stderr)
        sys.exit(1)

    # Load the worker process env.
    worker_process_env_path = pathlib.Path(sys.argv[1])
    worker_process_env = layers.WorkerProcessContext.from_file(worker_process_env_path)

    config_logging(worker_process_env)

    # API code expects credential to be available as an environment variable
    os.environ["DET_TASK_TOKEN"] = worker_process_env.env.det_task_token

    # TODO: refactor websocket, data_layer, and profiling to to not use the cli_cert.
    master_url = (
        f"http{'s' if worker_process_env.env.use_tls else ''}://"
        f"{worker_process_env.env.master_addr}:{worker_process_env.env.master_port}"
    )
    certs.cli_cert = certs.default_load(master_url=master_url)

    if worker_process_env.env.experiment_config.debug_enabled():
        faulthandler.dump_traceback_later(30, repeat=True)

    # Establish the connection to the ZMQBroadcastServer in this container.
    pub_url = f"tcp://localhost:{worker_process_env.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{worker_process_env.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:

        # Wrap the communication layer in a workload.Stream.
        subrec = layers.SubprocessReceiver(broadcast_client)
        workloads = iter(subrec)

        with det._catch_sys_exit():
            with det._catch_init_invalid_hp(workloads):
                controller = load.prepare_controller(
                    worker_process_env.env,
                    workloads,
                    worker_process_env.load_path,
                    worker_process_env.rendezvous_info,
                    worker_process_env.hvd_config,
                )

            try:
                controller.run()

            except Exception as e:
                broadcast_client.send_exception_message()
                raise e
Exemple #18
0
def experiment_has_completed_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    trials = experiment_trials(experiment_id)

    if not any(trials):
        return False

    for t in trials:
        for s in t.workloads:
            if (s.training is not None and s.training.state
                    == determinedexperimentv1State.STATE_COMPLETED) or (
                        s.validation is not None and s.validation.state
                        == determinedexperimentv1State.STATE_COMPLETED):
                return True
    return False
def wait_for_gc_to_finish(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    saw_gc = False
    # Don't wait longer than 5 minutes (as 600 half-seconds to improve our sampling resolution).
    for _ in range(600):
        r = api.get(conf.make_master_url(), "tasks").json()
        names = [task["name"] for task in r.values()]
        gc_name = f"Checkpoint GC (Experiment {experiment_id})"
        if gc_name in names:
            saw_gc = True
        elif saw_gc:
            # We previously saw checkpoint gc but now we don't, so it must have finished.
            return
        time.sleep(0.5)

    # It's possible that it ran really fast and we missed it, so just log this.
    print("Did not observe checkpoint gc start or finish!", file=sys.stderr)
Exemple #20
0
def test_task_logs(task_type: str, task_config: Dict[str, Any],
                   log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth,
                        certs.cli_cert))
    assert rps.resourcePools and len(
        rps.resourcePools) > 0, "missing resource pool"

    if (rps.resourcePools[0].type
            == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
            and task_type == command.TaskTypeCommand):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs,
                   api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)
def test_hp_importance_api() -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=1)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/random.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    hp_importance_thread = pool.apply_async(request_hp_importance,
                                            (experiment_id, ))

    hp_importance_results = hp_importance_thread.get()

    if hp_importance_results is not None:
        pytest.fail("hyperparameter-importance: %s. Results: %s" %
                    hp_importance_results)
def test_change_displayname(clean_auth: None) -> None:
    u_patch = create_test_user(ADMIN_CREDENTIALS, False)
    original_name = u_patch.username

    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(),
        requested_user=original_name,
        password="",
        try_reauth=True)
    sess = session.Session(master_url, original_name, authentication.cli_auth,
                           certs.cli_cert)

    # Get API bindings object for the created test user
    all_users = bindings.get_GetUsers(sess).users
    assert all_users is not None
    current_user = list(
        filter(lambda u: u.username == original_name, all_users))[0]
    assert current_user is not None and current_user.id

    # Rename user using display name
    patch_user = bindings.v1PatchUser(displayName="renamed")
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == "renamed"

    # Avoid display name of 'admin'
    patch_user.displayName = "Admin"
    with pytest.raises(errors.APIException):
        bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    # Clear display name (UI will show username)
    patch_user.displayName = ""
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == ""
Exemple #23
0
def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs,
               api.trial_log_fields)
    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs,
               api.task_log_fields)
def test_experimental_experiment_api_determined_disabled() -> None:
    context_path = pathlib.Path(conf.fixtures_path("no_op"))
    model_def_path = pathlib.Path(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(context_path)

    with model_def_path.open("r") as fin:
        dai_experiment_config = util.safe_load_yaml_with_exceptions(fin)

    determined_master = conf.make_master_url()
    requested_user, password = create_test_user(ADMIN_CREDENTIALS,
                                                add_password=True)
    a_username, _ = ADMIN_CREDENTIALS

    try:
        det_spawn(["-u", a_username, "user", "deactivate", "determined"])

        certs.cli_cert = certs.default_load(master_url=determined_master, )
        determined_api.authentication.cli_auth = determined_api.authentication.Authentication(
            determined_master,
            requested_user=requested_user,
            password=password,
            try_reauth=True,
            cert=certs.cli_cert,
        )
        exp_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=determined_master,
            config=dai_experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", a_username, "user", "activate", "determined"])
Exemple #25
0
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(
            info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # contiainers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=
            f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug)

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}.")
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # Chief machine waits for every worker's sshd to be available.  All machines should be pretty
    # close to in-step by now because all machines just finished synchronizing rendezvous info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id,
                                               len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    hvd_optional_args = experiment_config.get("data",
                                              {}).get("__det_dtrain_args", [])
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(
        f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.   When invoked the
    # SLURM_JOBID variable triggers integration with SLURM, however, we are
    # running in a singularity container and SLURM may or may not have
    # compatible configuration enabled.  We therefore clear the SLURM_JOBID variable
    # before invoking mpi so that mpirun will honor the args passed via horvod
    # run to it describing the hosts and process topology, otherwise mpi ends
    # up wanting to launch all -np# processes on the local causing an oversubscription
    # error ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)
    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd +
                         script)
    with det.util.forward_signals(p):
        return p.wait()
import distutils.util
import io
import os
import tarfile

from determined import constants
from determined.common.api import certs, request

if __name__ == "__main__":
    exp_id = os.environ["DET_EXPERIMENT_ID"]
    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = os.environ["DET_MASTER_PORT"]
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))

    master_url = f"http{'s' if use_tls else ''}://{master_addr}:{master_port}"
    certs.cli_cert = certs.default_load(master_url=master_url)

    resp = request.get(master_url, f"api/v1/experiments/{exp_id}/model_def")
    resp.raise_for_status()

    tgz = base64.b64decode(resp.json()["b64Tgz"])

    with tarfile.open(fileobj=io.BytesIO(tgz), mode="r:gz") as model_def:
        # Ensure all members of the tarball resolve to subdirectories.
        for path in model_def.getnames():
            if os.path.relpath(path).startswith("../"):
                raise ValueError(
                    f"'{path}' in tarball would expand to a parent directory")
        model_def.extractall(path=constants.MANAGED_TRAINING_MODEL_COPY)
        model_def.extractall(path=".")
Exemple #27
0
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer, and profiling to to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer.  We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.

    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics("trial_loaded", analytics.get_trial_analytics(trial_class))
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext object.
        # For harness.py, we only support a fixed set of Determined-provided launch layers, since
        # the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly.  Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 4: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 5: Create a TrialController and execute training
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0
Exemple #28
0
def main(args: List[str] = sys.argv[1:], ) -> None:
    # TODO: we lazily import "det deploy" but in the future we'd want to lazily import everything.
    parser = make_parser()

    full_cmd, aliases = generate_aliases(deploy_cmd.name)
    is_deploy_cmd = len(args) > 0 and any(args[0] == alias
                                          for alias in [*aliases, full_cmd])
    if is_deploy_cmd:
        from determined.deploy.cli import args_description as deploy_args_description

        add_args(parser, [deploy_args_description])
    else:
        add_args(parser, all_args_description)

    try:
        argcomplete.autocomplete(parser)

        parsed_args = parser.parse_args(args)

        def die(message: str, always_print_traceback: bool = False) -> None:
            if always_print_traceback or debug_mode():
                import traceback

                traceback.print_exc(file=sys.stderr)

            parser.exit(1, colored(message + "\n", "red"))

        v = vars(parsed_args)
        if not v.get("func"):
            parser.print_usage()
            parser.exit(2, "{}: no subcommand specified\n".format(parser.prog))

        try:
            # For `det deploy`, skip interaction with master.
            if is_deploy_cmd:
                parsed_args.func(parsed_args)
                return

            # Configure the CLI's Cert singleton.
            certs.cli_cert = certs.default_load(parsed_args.master)

            try:
                check_version(parsed_args)
            except requests.exceptions.SSLError:
                # An SSLError usually means that we queried a master over HTTPS and got an untrusted
                # cert, so allow the user to store and trust the current cert. (It could also mean
                # that we tried to talk HTTPS on the HTTP port, but distinguishing that based on the
                # exception is annoying, and we'll figure that out in the next step anyway.)
                addr = api.parse_master_address(parsed_args.master)
                check_not_none(addr.hostname)
                check_not_none(addr.port)
                try:
                    ctx = SSL.Context(SSL.TLSv1_2_METHOD)
                    conn = SSL.Connection(ctx, socket.socket())
                    conn.set_tlsext_host_name(
                        cast(str, addr.hostname).encode())
                    conn.connect(
                        cast(Sequence[Union[str, int]],
                             (addr.hostname, addr.port)))
                    conn.do_handshake()
                    cert_pem_data = "".join(
                        crypto.dump_certificate(crypto.FILETYPE_PEM,
                                                cert).decode()
                        for cert in conn.get_peer_cert_chain())
                except crypto.Error:
                    die("Tried to connect over HTTPS but couldn't get a certificate from the "
                        "master; consider using HTTP")

                cert_hash = hashlib.sha256(
                    ssl.PEM_cert_to_DER_cert(cert_pem_data)).hexdigest()
                cert_fingerprint = ":".join(chunks(cert_hash, 2))

                if not render.yes_or_no(
                        "The master sent an untrusted certificate chain with this SHA256 fingerprint:\n"
                        "{}\nDo you want to trust this certificate from now on?"
                        .format(cert_fingerprint)):
                    die("Unable to verify master certificate")

                certs.CertStore(certs.default_store()).set_cert(
                    parsed_args.master, cert_pem_data)
                # Reconfigure the CLI's Cert singleton, but preserve the certificate name.
                old_cert_name = certs.cli_cert.name
                certs.cli_cert = certs.Cert(cert_pem=cert_pem_data,
                                            name=old_cert_name)

                check_version(parsed_args)

            parsed_args.func(parsed_args)
        except KeyboardInterrupt as e:
            raise e
        except (api.errors.BadRequestException,
                api.errors.BadResponseException) as e:
            die("Failed to {}: {}".format(parsed_args.func.__name__, e))
        except api.errors.CorruptTokenCacheException:
            die("Failed to login: Attempted to read a corrupted token cache. "
                "The store has been deleted; please try again.")
        except EnterpriseOnlyError as e:
            die(f"Determined Enterprise Edition is required for this functionality: {e}"
                )
        except Exception:
            die("Failed to {}".format(parsed_args.func.__name__),
                always_print_traceback=True)
    except KeyboardInterrupt:
        # die() may not be defined yet.
        if debug_mode():
            import traceback

            traceback.print_exc(file=sys.stderr)

        print(colored("Interrupting...\n", "red"), file=sys.stderr)
        exit(3)
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.
        # This is useful for deepspeed which does not have good error handling for remote processes
        # spun up by pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()
    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # Chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # Chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()
Exemple #30
0
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined.common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"]))

    # TODO: refactor websocket, data_layer, and profiling to to not use the cli_cert.
    certs.cli_cert = certs.default_load(
        master_url=f"http{'s' if use_tls else ''}://{master_addr}:{master_port}"
    )

    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_port = os.environ["DET_RENDEZVOUS_PORT"]
    det_trial_unique_port_offset = int(
        os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ[
        "DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_agent_id = os.environ["DET_AGENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    det_task_token = os.environ["DET_TASK_TOKEN"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_port,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_agent_id,
        det_cluster_id,
        det_task_token,
        trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        logging.info("InvalidHP detected, trial is exiting")
        pass