def create_test_session() -> session.Session:
    murl = conf.make_master_url()
    certs.cli_cert = certs.default_load(murl)
    authentication.cli_auth = authentication.Authentication(murl, try_reauth=True)
    return session.Session(murl, "determined", authentication.cli_auth, certs.cli_cert)
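# A minimal usage sketch (not part of the original tests): the Session returned
# above is what the generated REST bindings accept as their first argument. The
# import location and the specific binding are assumptions based on how other
# tests in this file use the bindings module.
from determined.common.api import bindings  # assumed location of the generated bindings

def example_list_experiments() -> int:
    sess = create_test_session()
    resp = bindings.get_GetExperiments(sess)  # any generated binding would work here
    return len(resp.experiments)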
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, model_def_path)

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
def main():
    port = os.environ["NOTEBOOK_PORT"]
    notebook_id = os.environ["DET_TASK_ID"]
    notebook_server = f"http://127.0.0.1:{port}/proxy/{notebook_id}"
    master_url = os.environ["DET_MASTER"]
    cert = certs.default_load(master_url)

    try:
        idle_type = IdleType[os.environ["NOTEBOOK_IDLE_TYPE"].upper()]
    except KeyError:
        logging.warning(
            "unknown idle type '%s', using default value",
            os.environ["NOTEBOOK_IDLE_TYPE"],
        )
        idle_type = IdleType.KERNELS_OR_TERMINALS

    wait_for_jupyter(("127.0.0.1", int(port)))

    while True:
        try:
            idle = is_idle(notebook_server, idle_type)
            api.put(
                master_url,
                f"/api/v1/notebooks/{notebook_id}/report_idle",
                {"notebook_id": notebook_id, "idle": idle},
                cert=cert,
            )
        except Exception:
            logging.warning("ignoring error communicating with master", exc_info=True)
        time.sleep(1)
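# wait_for_jupyter is defined elsewhere; a plausible shape, offered as a hedged
# sketch only, is a plain TCP connect-retry loop like the one below. The real
# helper may differ.
import socket
from typing import Tuple

def _wait_for_port(addr: Tuple[str, int], interval: float = 1.0) -> None:
    # Block until something is accepting connections on addr.
    while True:
        try:
            with socket.create_connection(addr, timeout=1):
                return
        except OSError:
            time.sleep(interval)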
def num_experiments() -> int:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "experiments")
    assert r.status_code == requests.codes.ok, r.text
    return len(r.json())
def get_command(command_id: str) -> Any:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "api/v1/commands/" + command_id)
    assert r.status_code == requests.codes.ok, r.text
    return r.json()["command"]
def trial_metrics(trial_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "trials/{}/metrics".format(trial_id))
    json = r.json()  # type: Dict[str, Any]
    return json
def __init__(
    self,
    master: Optional[str] = None,
    user: Optional[str] = None,
    password: Optional[str] = None,
    cert_path: Optional[str] = None,
    cert_name: Optional[str] = None,
    noverify: bool = False,
):
    master = master or util.get_default_master_address()

    cert = certs.default_load(
        master_url=master,
        explicit_path=cert_path,
        explicit_cert_name=cert_name,
        explicit_noverify=noverify,
    )

    # TODO: This should probably be try_reauth=False, but it appears that would break the case
    # where the default credentials are available from the master and could be discovered by
    # a REST API call against the master.
    auth = authentication.Authentication(master, user, password, try_reauth=True, cert=cert)

    self._session = session.Session(master, user, auth, cert)
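# A hedged usage sketch: this constructor belongs to the experimental client
# class (assumed here to be named Determined, as in determined.experimental.client).
# Every argument is optional, so connecting to the default master needs none:
#
#     d = Determined()  # default master address, cached credentials
#     d = Determined(master="https://master.example.com:8443", user="alice")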
def cancel_experiment_v1(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.post(conf.make_master_url(), "/api/v1/experiments/{}/cancel".format(experiment_id))
    r.raise_for_status()
    wait_for_experiment_state(experiment_id, "CANCELED")
def main(ready: Pattern):
    master_url = str(os.environ["DET_MASTER"])
    cert = certs.default_load(master_url)
    allocation_id = str(os.environ["DET_ALLOCATION_ID"])

    for line in sys.stdin:
        if ready.match(line):
            post_ready(master_url, cert, allocation_id)
            return
def experiment_json(experiment_id: int) -> Dict[str, Any]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "experiments/{}".format(experiment_id))
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return json
def trial_logs(trial_id: int) -> List[str]:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    return [tl["message"] for tl in api.trial_logs(conf.make_master_url(), trial_id)]
def get_num_running_commands() -> int:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "api/v1/commands")
    assert r.status_code == requests.codes.ok, r.text
    return len(
        [command for command in r.json()["commands"] if command["state"] == "STATE_RUNNING"]
    )
def experiment_has_active_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "tasks").json()
    for task in r.values():
        if "Experiment {}".format(experiment_id) in task["name"] and len(task["containers"]) > 0:
            return True
    return False
def experiment_has_completed_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    trials = experiment_trials(experiment_id)
    if not any(trials):
        return False
    return any(any(s["state"] == "COMPLETED" for s in t["steps"]) for t in trials)
def cluster_slots() -> Dict[str, Any]:
    """
    cluster_slots returns a dict of the slots that each agent has.

    :return: Dict[AgentID, List[Slot]]
    """
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.get(conf.make_master_url(), "agents")
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return {agent["id"]: agent["slots"].values() for agent in json.values()}
def change_experiment_state(experiment_id: int, new_state: str) -> None:
    # TODO(DET-5678): refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    r = api.patch(
        conf.make_master_url(),
        "experiments/{}".format(experiment_id),
        headers={"Content-Type": "application/merge-patch+json"},
        body={"state": new_state},
    )
    assert r.status_code == requests.codes.no_content, r.text
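# Why "application/merge-patch+json": under RFC 7386 merge-patch semantics, the
# request body above overwrites only the "state" field and leaves the rest of
# the experiment object untouched. A minimal, self-contained illustration of
# those semantics for dict-shaped patches (not part of the API client):
def merge_patch(doc: Dict[str, Any], patch: Dict[str, Any]) -> Dict[str, Any]:
    out = dict(doc)
    for key, value in patch.items():
        if value is None:
            out.pop(key, None)  # null deletes a field
        elif isinstance(value, dict) and isinstance(out.get(key), dict):
            out[key] = merge_patch(out[key], value)  # recurse into nested objects
        else:
            out[key] = value  # anything else replaces the field wholesale
    return out

# e.g. merge_patch({"state": "ACTIVE", "notes": "x"}, {"state": "PAUSED"})
# == {"state": "PAUSED", "notes": "x"}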
def main() -> None:
    if len(sys.argv) != 2:
        print("worker_process_env_path must be provided as a commandline argument", file=sys.stderr)
        sys.exit(1)

    # Load the worker process env.
    worker_process_env_path = pathlib.Path(sys.argv[1])
    worker_process_env = layers.WorkerProcessContext.from_file(worker_process_env_path)

    config_logging(worker_process_env)

    # API code expects the credential to be available as an environment variable.
    os.environ["DET_TASK_TOKEN"] = worker_process_env.env.det_task_token

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    master_url = (
        f"http{'s' if worker_process_env.env.use_tls else ''}://"
        f"{worker_process_env.env.master_addr}:{worker_process_env.env.master_port}"
    )
    certs.cli_cert = certs.default_load(master_url=master_url)

    if worker_process_env.env.experiment_config.debug_enabled():
        faulthandler.dump_traceback_later(30, repeat=True)

    # Establish the connection to the ZMQBroadcastServer in this container.
    pub_url = f"tcp://localhost:{worker_process_env.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{worker_process_env.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:
        # Wrap the communication layer in a workload.Stream.
        subrec = layers.SubprocessReceiver(broadcast_client)
        workloads = iter(subrec)

        with det._catch_sys_exit():
            with det._catch_init_invalid_hp(workloads):
                controller = load.prepare_controller(
                    worker_process_env.env,
                    workloads,
                    worker_process_env.load_path,
                    worker_process_env.rendezvous_info,
                    worker_process_env.hvd_config,
                )

            try:
                controller.run()
            except Exception as e:
                broadcast_client.send_exception_message()
                raise e
def experiment_has_completed_workload(experiment_id: int) -> bool:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    trials = experiment_trials(experiment_id)
    if not any(trials):
        return False

    for t in trials:
        for s in t.workloads:
            if (
                s.training is not None
                and s.training.state == determinedexperimentv1State.STATE_COMPLETED
            ) or (
                s.validation is not None
                and s.validation.state == determinedexperimentv1State.STATE_COMPLETED
            ):
                return True
    return False
def wait_for_gc_to_finish(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    saw_gc = False
    # Don't wait longer than 5 minutes (as 600 half-seconds, to improve our sampling resolution).
    for _ in range(600):
        r = api.get(conf.make_master_url(), "tasks").json()
        names = [task["name"] for task in r.values()]
        gc_name = f"Checkpoint GC (Experiment {experiment_id})"
        if gc_name in names:
            saw_gc = True
        elif saw_gc:
            # We previously saw checkpoint gc but now we don't, so it must have finished.
            return
        time.sleep(0.5)

    # It's possible that it ran really fast and we missed it, so just log this.
    print("Did not observe checkpoint gc start or finish!", file=sys.stderr)
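# The loop above is an instance of a general poll-until-deadline pattern; a
# hedged sketch of the reusable form, in case other helpers want it (the name
# and signature are illustrative, not from the original code):
from typing import Callable

def poll_until(predicate: Callable[[], bool], timeout: float = 300, interval: float = 0.5) -> bool:
    # Returns True as soon as predicate() holds, False if the deadline passes first.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if predicate():
            return True
        time.sleep(interval)
    return False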
def test_task_logs(task_type: str, task_config: Dict[str, Any], log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(master_url, try_reauth=True)

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth, certs.cli_cert)
    )
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (
        rps.resourcePools[0].type == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
        and task_type == command.TaskTypeCommand
    ):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)
def test_hp_importance_api() -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    pool = mp.pool.ThreadPool(processes=1)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/random.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    hp_importance_thread = pool.apply_async(request_hp_importance, (experiment_id,))
    hp_importance_results = hp_importance_thread.get()

    if hp_importance_results is not None:
        pytest.fail("hyperparameter-importance: %s. Results: %s" % hp_importance_results)
def test_change_displayname(clean_auth: None) -> None:
    u_patch = create_test_user(ADMIN_CREDENTIALS, False)
    original_name = u_patch.username

    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(
        master_url, requested_user=original_name, password="", try_reauth=True
    )
    sess = session.Session(master_url, original_name, authentication.cli_auth, certs.cli_cert)

    # Get the API bindings object for the created test user.
    all_users = bindings.get_GetUsers(sess).users
    assert all_users is not None
    current_user = list(filter(lambda u: u.username == original_name, all_users))[0]
    assert current_user is not None and current_user.id

    # Rename the user using a display name.
    patch_user = bindings.v1PatchUser(displayName="renamed")
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)
    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == "renamed"

    # Avoid a display name of 'admin'.
    patch_user.displayName = "Admin"
    with pytest.raises(errors.APIException):
        bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    # Clear the display name (the UI will show the username).
    patch_user.displayName = ""
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)
    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == ""
def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(master_url, try_reauth=True)

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")

    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs, api.trial_log_fields)

    # And so should the new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)
def test_experimental_experiment_api_determined_disabled() -> None:
    context_path = pathlib.Path(conf.fixtures_path("no_op"))
    model_def_path = pathlib.Path(conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(context_path)

    with model_def_path.open("r") as fin:
        dai_experiment_config = util.safe_load_yaml_with_exceptions(fin)

    determined_master = conf.make_master_url()
    requested_user, password = create_test_user(ADMIN_CREDENTIALS, add_password=True)
    a_username, _ = ADMIN_CREDENTIALS

    try:
        det_spawn(["-u", a_username, "user", "deactivate", "determined"])

        certs.cli_cert = certs.default_load(master_url=determined_master)
        determined_api.authentication.cli_auth = determined_api.authentication.Authentication(
            determined_master,
            requested_user=requested_user,
            password=password,
            try_reauth=True,
            cert=certs.cli_cert,
        )
        exp_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=determined_master,
            config=dai_experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )
        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", a_username, "user", "activate", "determined"])
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config. The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment. Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug
        )

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # The chief machine waits for every worker's sshd to be available. All machines should be
    # pretty close to in-step by now because all machines just finished synchronizing rendezvous
    # info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.

    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    hvd_optional_args = experiment_config.get("data", {}).get("__det_dtrain_args", [])
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI. When invoked, the SLURM_JOBID variable
    # triggers integration with SLURM; however, we are running in a singularity container and SLURM
    # may or may not have compatible configuration enabled. We therefore clear the SLURM_JOBID
    # variable before invoking mpi, so that mpirun will honor the args passed via horovodrun
    # describing the hosts and process topology; otherwise mpi ends up wanting to launch all -np#
    # processes on the local host, causing an oversubscription error
    # ("There are not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)

    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script)
    with det.util.forward_signals(p):
        return p.wait()
import base64
import distutils.util
import io
import os
import tarfile

from determined import constants
from determined.common.api import certs, request

if __name__ == "__main__":
    exp_id = os.environ["DET_EXPERIMENT_ID"]
    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = os.environ["DET_MASTER_PORT"]
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_url = f"http{'s' if use_tls else ''}://{master_addr}:{master_port}"
    certs.cli_cert = certs.default_load(master_url=master_url)

    resp = request.get(master_url, f"api/v1/experiments/{exp_id}/model_def")
    resp.raise_for_status()

    tgz = base64.b64decode(resp.json()["b64Tgz"])

    with tarfile.open(fileobj=io.BytesIO(tgz), mode="r:gz") as model_def:
        # Ensure all members of the tarball resolve to subdirectories.
        for path in model_def.getnames():
            if os.path.relpath(path).startswith("../"):
                raise ValueError(f"'{path}' in tarball would expand to a parent directory")
        model_def.extractall(path=constants.MANAGED_TRAINING_MODEL_COPY)
        model_def.extractall(path=".")
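# A standalone demonstration of the traversal check above: os.path.relpath
# normalizes "a/../b" forms, so only members that genuinely escape the
# extraction root keep a leading "../" (assuming the working directory is not
# the filesystem root).
assert os.path.relpath("../../etc/passwd").startswith("../")
assert not os.path.relpath("model_def/train.py").startswith("../")
assert not os.path.relpath("subdir/../train.py").startswith("../")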
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include the EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer. We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.
    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics("trial_loaded", analytics.get_trial_analytics(trial_class))
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext
        # object. For harness.py, we only support a fixed set of Determined-provided launch
        # layers, since the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly. Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0
def main(args: List[str] = sys.argv[1:]) -> None:
    # TODO: we lazily import "det deploy", but in the future we'd want to lazily import everything.
    parser = make_parser()

    full_cmd, aliases = generate_aliases(deploy_cmd.name)
    is_deploy_cmd = len(args) > 0 and any(args[0] == alias for alias in [*aliases, full_cmd])
    if is_deploy_cmd:
        from determined.deploy.cli import args_description as deploy_args_description

        add_args(parser, [deploy_args_description])
    else:
        add_args(parser, all_args_description)

    try:
        argcomplete.autocomplete(parser)

        parsed_args = parser.parse_args(args)

        def die(message: str, always_print_traceback: bool = False) -> None:
            if always_print_traceback or debug_mode():
                import traceback

                traceback.print_exc(file=sys.stderr)

            parser.exit(1, colored(message + "\n", "red"))

        v = vars(parsed_args)
        if not v.get("func"):
            parser.print_usage()
            parser.exit(2, "{}: no subcommand specified\n".format(parser.prog))

        try:
            # For `det deploy`, skip interaction with the master.
            if is_deploy_cmd:
                parsed_args.func(parsed_args)
                return

            # Configure the CLI's Cert singleton.
            certs.cli_cert = certs.default_load(parsed_args.master)

            try:
                check_version(parsed_args)
            except requests.exceptions.SSLError:
                # An SSLError usually means that we queried a master over HTTPS and got an
                # untrusted cert, so allow the user to store and trust the current cert. (It could
                # also mean that we tried to talk HTTPS on the HTTP port, but distinguishing that
                # based on the exception is annoying, and we'll figure that out in the next step
                # anyway.)
                addr = api.parse_master_address(parsed_args.master)
                check_not_none(addr.hostname)
                check_not_none(addr.port)
                try:
                    ctx = SSL.Context(SSL.TLSv1_2_METHOD)
                    conn = SSL.Connection(ctx, socket.socket())
                    conn.set_tlsext_host_name(cast(str, addr.hostname).encode())
                    conn.connect(cast(Sequence[Union[str, int]], (addr.hostname, addr.port)))
                    conn.do_handshake()
                    cert_pem_data = "".join(
                        crypto.dump_certificate(crypto.FILETYPE_PEM, cert).decode()
                        for cert in conn.get_peer_cert_chain()
                    )
                except crypto.Error:
                    die(
                        "Tried to connect over HTTPS but couldn't get a certificate from the "
                        "master; consider using HTTP"
                    )

                cert_hash = hashlib.sha256(ssl.PEM_cert_to_DER_cert(cert_pem_data)).hexdigest()
                cert_fingerprint = ":".join(chunks(cert_hash, 2))

                if not render.yes_or_no(
                    "The master sent an untrusted certificate chain with this SHA256 fingerprint:\n"
                    "{}\nDo you want to trust this certificate from now on?".format(
                        cert_fingerprint
                    )
                ):
                    die("Unable to verify master certificate")

                certs.CertStore(certs.default_store()).set_cert(parsed_args.master, cert_pem_data)
                # Reconfigure the CLI's Cert singleton, but preserve the certificate name.
                old_cert_name = certs.cli_cert.name
                certs.cli_cert = certs.Cert(cert_pem=cert_pem_data, name=old_cert_name)

                check_version(parsed_args)

            parsed_args.func(parsed_args)
        except KeyboardInterrupt as e:
            raise e
        except (api.errors.BadRequestException, api.errors.BadResponseException) as e:
            die("Failed to {}: {}".format(parsed_args.func.__name__, e))
        except api.errors.CorruptTokenCacheException:
            die(
                "Failed to login: Attempted to read a corrupted token cache. "
                "The store has been deleted; please try again."
            )
        except EnterpriseOnlyError as e:
            die(f"Determined Enterprise Edition is required for this functionality: {e}")
        except Exception:
            die("Failed to {}".format(parsed_args.func.__name__), always_print_traceback=True)
    except KeyboardInterrupt:
        # die() may not be defined yet.
        if debug_mode():
            import traceback

            traceback.print_exc(file=sys.stderr)

        print(colored("Interrupting...\n", "red"), file=sys.stderr)
        exit(3)
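# "chunks" above is assumed to be a small helper that splits the hex digest
# into byte pairs for the usual colon-separated fingerprint rendering; a
# hedged sketch of one plausible implementation:
from typing import List

def _chunks(s: str, n: int) -> List[str]:
    return [s[i : i + n] for i in range(0, len(s), n)]

# e.g. ":".join(_chunks("deadbeef", 2)) == "de:ad:be:ef"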
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment. Inside Determined, the
    # easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (the deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails. This is useful
        # for deepspeed, which does not have good error handling for remote processes spun up by
        # pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()

    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # The chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # The chief machine waits for every worker's sshd to be available. All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous
        # info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()
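# create_hostlist_file is defined elsewhere; DeepSpeed's documented hostfile
# format is one "<address> slots=<n>" line per machine, so a plausible shape,
# offered only as a hedged sketch (the real helper may differ), is:
import pathlib
from typing import List

def _write_hostfile(path: pathlib.Path, addrs: List[str], slots_per_machine: int) -> str:
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("".join(f"{addr} slots={slots_per_machine}\n" for addr in addrs))
    return addrs[0]  # the first container doubles as the DeepSpeed master address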
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined.common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(
        master_url=f"http{'s' if use_tls else ''}://{master_addr}:{master_port}"
    )

    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)

    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_port = os.environ["DET_RENDEZVOUS_PORT"]
    det_trial_unique_port_offset = int(os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_agent_id = os.environ["DET_AGENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    det_task_token = os.environ["DET_TASK_TOKEN"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_port,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_agent_id,
        det_cluster_id,
        det_task_token,
        trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        logging.info("InvalidHP detected, trial is exiting")