def trigger_preemption(signum: int, frame: types.FrameType) -> None:
    info = det.get_cluster_info()
    if info and info.container_rank == 0:
        # Only the chief container requests preemption; other containers ignore the signal.
        logging.debug(f"[rank={info.container_rank}] SIGTERM: Preemption imminent.")
        # Notify the master that we need to be preempted.
        api.post(
            info.master_url,
            f"/api/v1/allocations/{info.allocation_id}/signals/pending_preemption",
        )
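
# Illustrative sketch (not part of the original launcher): a handler like the one above is only
# useful once it is installed for SIGTERM.  The registration site is an assumption here; the call
# itself is just the standard-library signal.signal.
def _example_install_preemption_handler() -> None:
    import signal

    # Route SIGTERM (sent before preemption) to trigger_preemption.
    signal.signal(signal.SIGTERM, trigger_preemption)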
def main(override_args: List[str], script: List[str]) -> int:
    override_args = override_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"

    single_slot = len(info.container_addrs) == 1 and len(info.slot_ids) <= 1

    # Detect single-slot trials and skip distributed launch.
    if single_slot:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    os.environ["USE_TORCH_DISTRIBUTED"] = "True"

    chief_ip = info.container_addrs[0]
    os.environ["DET_CHIEF_IP"] = chief_ip

    torch_distributed_cmd = create_launch_cmd(
        len(info.container_addrs),
        len(info.slot_ids),
        info.container_rank,
        "localhost" if len(info.container_addrs) == 1 else chief_ip,
        override_args,
    )

    log_redirect_cmd = create_log_redirect_cmd()

    # Due to a bug in PyTorch, we need to wrap the launcher in pid_server/pid_client to correctly
    # handle errors and ensure workers don't hang when a process fails.
    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))
    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    launch_cmd = pid_server_cmd + torch_distributed_cmd + pid_client_cmd + log_redirect_cmd + script

    logging.debug(f"Torch distributed launching with: {launch_cmd}")

    p = subprocess.Popen(launch_cmd)
    with det.util.forward_signals(p):
        return p.wait()
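
# For orientation only: create_launch_cmd is defined elsewhere in this module and is not shown
# here.  This is a minimal sketch of the kind of command it might assemble, assuming the stock
# ``python -m torch.distributed.run`` flags (--nnodes, --nproc_per_node, --node_rank,
# --master_addr, --master_port); the port value is hypothetical.
def _example_torch_distributed_cmd(
    num_nodes: int, proc_per_node: int, node_rank: int, master_addr: str
) -> List[str]:
    return [
        "python3",
        "-m",
        "torch.distributed.run",
        f"--nnodes={num_nodes}",
        f"--nproc_per_node={proc_per_node}",
        f"--node_rank={node_rank}",
        f"--master_addr={master_addr}",
        "--master_port=29400",  # hypothetical fixed port for illustration
    ]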
def _get_training_port_offset() -> int:
    info = det.get_cluster_info()
    if info and info.task_type == "TRIAL":
        return info.trial._unique_port_offset
    return 0
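
# Hypothetical usage sketch: the offset above exists so that multiple trials sharing one agent can
# shift any fixed listening ports and avoid collisions.  BASE_EXAMPLE_PORT is an assumed value for
# illustration, not a Determined constant.
def _example_shifted_port() -> int:
    BASE_EXAMPLE_PORT = 12350  # assumed base port
    return BASE_EXAMPLE_PORT + _get_training_port_offset()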
new_dict["checkpoint_storage"][key] = mask except (KeyError, AttributeError): pass try: if new_dict["environment"]["registry_auth"].get( "password") is not None: new_dict["environment"]["registry_auth"]["password"] = mask except (KeyError, AttributeError): pass return new_dict if __name__ == "__main__": info = det.get_cluster_info() assert info is not None, "must be run on-cluster" assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"' # Hack: get the resources id from the environment. resources_id = os.environ.get("DET_RESOURCES_ID") assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset" # Hack: read the full config. The experiment config is not a stable API! experiment_config = det.ExperimentConfig(info.trial._config) determined.common.set_logger(experiment_config.debug_enabled()) logging.info( f"New trial runner in (container {resources_id}) on agent {info.agent_id}: " + json.dumps(mask_config_dict(info.trial._config)))
def main(script: List[str]) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined,
    # the easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    # All ranks will need to run sshd.
    run_sshd_command = create_sshd_cmd()

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon containers that the master should kill when all non-daemon
        # containers (the deepspeed launcher, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        # Wrap it in a pid_server to ensure that we can't hang if a worker fails.  This is useful
        # for deepspeed, which does not have good error handling for remote processes spun up by
        # pdsh.
        pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        return subprocess.Popen(pid_server_cmd + run_sshd_command).wait()

    # We always need to set this variable to initialize the context correctly, even in the single
    # slot case.
    os.environ["USE_DEEPSPEED"] = "1"

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - deepspeed, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker

    pid_server_cmd = create_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    master_address = create_hostlist_file(
        hostfile_path=pathlib.Path(hostfile_path),
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
    )
    cmd = create_run_command(master_address, hostfile_path)

    pid_client_cmd = create_pid_client_cmd(info.allocation_id)

    log_redirect_cmd = create_log_redirect_cmd()

    harness_cmd = script

    logging.debug(f"chief worker calling deepspeed with args: {cmd[1:]} ...")

    full_cmd = pid_server_cmd + cmd + pid_client_cmd + log_redirect_cmd + harness_cmd

    multi_machine = len(info.container_addrs) > 1
    if not multi_machine:
        return subprocess.Popen(full_cmd).wait()

    # Create the environment file that will be passed by deepspeed to individual ranks.
    create_deepspeed_env_file()

    # Set custom PDSH args:
    # * bypass strict host checking
    # * -p our custom port
    # * other args are default ssh args for pdsh
    os.environ["PDSH_SSH_ARGS"] = (
        "-o PasswordAuthentication=no -o StrictHostKeyChecking=no "
        f"-p {constants.DTRAIN_SSH_PORT} -2 -a -x %h"
    )

    # The chief worker also needs to run sshd when using pdsh and multi-machine training.
    sshd_process = subprocess.Popen(run_sshd_command)

    try:
        # The chief machine waits for every worker's sshd to be available.  All machines should be
        # close to in-step by now because all machines just finished synchronizing rendezvous info.
        deadline = time.time() + 20
        for peer_addr in info.container_addrs:
            util.check_sshd(peer_addr, deadline, constants.DTRAIN_SSH_PORT)

        return subprocess.Popen(full_cmd).wait()
    finally:
        sshd_process.kill()
        sshd_process.wait()
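
# util.check_sshd is provided by Determined's launch utilities and is not shown here.  This is
# only a hedged sketch of what such a readiness probe could look like (retry a TCP connect until
# the deadline); it is an assumption for orientation, not the actual implementation.
def _example_check_sshd(host: str, deadline: float, port: int) -> None:
    import socket
    import time as _time

    while True:
        try:
            with socket.create_connection((host, port), timeout=1):
                return  # sshd is accepting connections
        except OSError:
            if _time.time() > deadline:
                raise TimeoutError(f"sshd on {host}:{port} was not reachable before the deadline")
            _time.sleep(0.1)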
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include EnvContext object in the future high-level APIs for PyTorch or Keras.
    # It was natural to create this big-blob-of-config object, but it was a mistake to pass it into
    # the lowest layers of the harness code; it's too large of an object to be easily mockable,
    # which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer.  We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g. the
    # TFKerasTrialController or EstimatorTrialController to add that functionality, so for now we
    # continue with the legacy strategy.
    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics(
                    "trial_loaded", analytics.get_trial_analytics(trial_class)
                )
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext
        # object.  For harness.py, we only support a fixed set of Determined-provided launch
        # layers, since the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly.  Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )

            controller.run()

    return 0
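
# For orientation: ``train_entrypoint`` is a "<module>:<TrialClass>" reference that
# load.trial_class_from_entrypoint resolves to the user's Trial class.  The module and class names
# below are hypothetical and only illustrate the expected shape of the string.
def _example_resolve_entrypoint() -> type:
    return load.trial_class_from_entrypoint("model_def:MyTrial")  # hypothetical names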
def main(hvd_args: List[str], script: List[str], autohorovod: bool) -> int:
    hvd_args = hvd_args or []

    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # When --autohorovod was set, detect single-slot and zero-slot trials.
    if autohorovod and len(info.container_addrs) == 1 and len(info.slot_ids) <= 1:
        p = subprocess.Popen(script)
        with det.util.forward_signals(p):
            return p.wait()

    # Hack: get the resources id from the environment.
    resources_id = os.environ.get("DET_RESOURCES_ID")
    assert resources_id is not None, "Unable to run with DET_RESOURCES_ID unset"

    # Hack: read the full config.  The experiment config is not a stable API!
    experiment_config = info.trial._config

    debug = experiment_config.get("debug", False)
    if debug:
        logging.getLogger().setLevel(logging.DEBUG)

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    cert = certs.default_load(info.master_url)
    certs.cli_cert = cert

    # The launch layer should provide the chief_ip to the training code, so that the training code
    # can function with a different launch layer in a different environment.  Inside Determined,
    # the easiest way to get the chief_ip is with container_addrs.
    chief_ip = info.container_addrs[0]

    # Chief IP is set as an environment variable to support nested launch layers.
    os.environ["DET_CHIEF_IP"] = chief_ip

    if info.container_rank > 0:
        # Non-chief machines just run sshd.

        # Mark sshd containers as daemon resources that the master should kill when all non-daemon
        # containers (horovodrun, in this case) have exited.
        api.post(
            info.master_url,
            path=f"/api/v1/allocations/{info.allocation_id}/resources/{resources_id}/daemon",
            cert=cert,
        )

        pid_server_cmd, run_sshd_command = create_sshd_worker_cmd(
            info.allocation_id, len(info.slot_ids), debug=debug
        )

        logging.debug(
            f"Non-chief [{info.container_rank}] training process launch "
            f"command: {run_sshd_command}."
        )
        p = subprocess.Popen(pid_server_cmd + run_sshd_command)
        with det.util.forward_signals(p):
            return p.wait()

    # The chief machine waits for every worker's sshd to be available.  All machines should be
    # pretty close to in-step by now because all machines just finished synchronizing rendezvous
    # info.
    deadline = time.time() + 20
    for peer_addr in info.container_addrs[1:]:
        util.check_sshd(peer_addr, deadline, DTRAIN_SSH_PORT)

    # The chief has several layers of wrapper processes:
    # - a top-level pid_server, which causes the whole container to exit if any local worker dies.
    # - horovodrun, which launches $slots_per_trial copies of the following layers:
    #     - a pid_client process to contact the local pid_server
    #     - wrap_rank, which redirects stdin/stdout to the local container
    #     - harness.py, which actually does the training for the worker
    #
    # It is a bug in horovod that causes us to have this pid_server/pid_client pair of layers.
    # We can remove these layers when the upstream fix has been around for long enough that we can
    # reasonably require user images to have patched horovod installations.
    pid_server_cmd = create_hvd_pid_server_cmd(info.allocation_id, len(info.slot_ids))

    # TODO: remove this (very old) hack when we have a configurable launch layer.
    hvd_optional_args = experiment_config.get("data", {}).get("__det_dtrain_args", [])
    hvd_optional_args += hvd_args
    if debug:
        hvd_optional_args += ["--mpi-args=-v --display-map"]

    hvd_cmd = horovod.create_run_command(
        num_proc_per_machine=len(info.slot_ids),
        ip_addresses=info.container_addrs,
        inter_node_network_interface=info.trial._inter_node_network_interface,
        optimizations=experiment_config["optimizations"],
        debug=debug,
        optional_args=hvd_optional_args,
    )

    worker_wrapper_cmd = create_worker_wrapper_cmd(info.allocation_id)

    logging.debug(f"chief worker calling horovodrun with args: {hvd_cmd[1:]} ...")

    os.environ["USE_HOROVOD"] = "1"

    # We now have environment images with built-in OpenMPI.  When invoked, the SLURM_JOBID
    # variable triggers integration with SLURM; however, we are running in a singularity container
    # and SLURM may or may not have compatible configuration enabled.  We therefore clear the
    # SLURM_JOBID variable before invoking mpi so that mpirun will honor the args passed via
    # horovodrun describing the hosts and process topology; otherwise mpi ends up wanting to
    # launch all -np# processes on the local node, causing an oversubscription error ("There are
    # not enough slots available in the system").
    os.environ.pop("SLURM_JOBID", None)

    p = subprocess.Popen(pid_server_cmd + hvd_cmd + worker_wrapper_cmd + script)
    with det.util.forward_signals(p):
        return p.wait()
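
# For orientation only: horovod.create_run_command is defined in Determined's horovod helpers and
# is not shown here.  A hedged sketch of the general shape of a horovodrun invocation it might
# produce, using only standard horovodrun flags (-np and -H); host addresses and slot counts are
# hypothetical.
def _example_horovodrun_cmd() -> List[str]:
    return [
        "horovodrun",
        "-np", "4",                     # total number of worker processes
        "-H", "10.0.0.1:2,10.0.0.2:2",  # hypothetical hosts with two slots each
        "python3", "train.py",          # hypothetical worker command
    ]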
def get_allocation_token() -> str:
    info = det.get_cluster_info()
    if info is None:
        return ""
    return info.session_token
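
# Hedged usage sketch: a session token like the one above is typically sent to the master as a
# Bearer token.  Only the header shape is the point of this example; how and where the header is
# attached is an assumption.
def _example_auth_headers() -> dict:
    token = get_allocation_token()
    return {"Authorization": f"Bearer {token}"} if token else {}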
def init(
    *,
    distributed: Optional[core.DistributedContext] = None,
    # TODO: figure out a better way to deal with checkpointing in the local training case.
    storage_manager: Optional[storage.StorageManager] = None,
    preempt_mode: core.PreemptMode = core.PreemptMode.WorkersAskChief,
    tensorboard_mode: core.TensorboardMode = core.TensorboardMode.AUTO,
) -> Context:
    """
    ``core.init()`` builds a :class:`core.Context <determined.core.Context>` for use with the Core
    API.

    Always use ``with core.init() as context`` instead of instantiating a ``core.Context``
    directly.  Certain components of the Core API may be configured by passing arguments to
    ``core.init()``.  The only arg that is required is a ``DistributedContext``, and even that is
    only required for multi-slot tasks.

    All of your training must occur within the scope of the ``with core.init() as core_context``,
    as there are resources necessary for training which start in the ``core.Context``'s
    ``__enter__`` method and must be cleaned up in its ``__exit__()`` method.

    Arguments:
        distributed (``core.DistributedContext``, optional): Passing a ``DistributedContext`` is
            required for multi-slot training, but unnecessary for single-slot training.  Defaults
            to ``None``.
        preempt_mode (``core.PreemptMode``, optional): Configure the calling pattern for the
            ``core_context.preempt.should_preempt()`` method.  See
            :class:`~determined.core.PreemptMode` for more detail.  Defaults to
            ``WorkersAskChief``.
        storage_manager: Internal use only.
        tensorboard_mode (``core.TensorboardMode``, optional): Define how Tensorboard metrics and
            profiling data are retained.  See :class:`~determined.core.TensorboardMode` for more
            detail.  Defaults to ``AUTO``.
    """
    info = det.get_cluster_info()
    if info is None:
        return _dummy_init(distributed=distributed, storage_manager=storage_manager)

    # We are on the cluster.
    cert = certs.default_load(info.master_url)
    session = Session(info.master_url, None, None, cert, max_retries=get_max_retries_config())

    if distributed is None:
        if len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError("you must provide a valid DistributedContext for a multi-slot task")

    distributed = distributed or core.DummyDistributedContext()

    preempt = core.PreemptContext(session, info.allocation_id, distributed, preempt_mode)

    # At present, we only support tensorboards in Trial tasks.
    tbd_writer = None

    train = None
    searcher = None

    if info.task_type == "TRIAL":
        # Prepare the tensorboard hooks.
        tensorboard_manager = tensorboard.build(
            info.cluster_id,
            str(info.trial.experiment_id),
            str(info.trial.trial_id),
            info.trial._config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
        if tensorboard_mode == core.TensorboardMode.AUTO:
            tbd_writer = tensorboard.get_metric_writer()

        train = core.TrainContext(
            session,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.trial.experiment_id,
            distributed,
            tensorboard_mode,
            tensorboard_manager,
            tbd_writer,
        )
        units = core._parse_searcher_units(info.trial._config)
        searcher = core.SearcherContext(
            session,
            distributed,
            info.trial.trial_id,
            info.trial._trial_run_id,
            info.allocation_id,
            units,
        )

        if storage_manager is None:
            storage_manager = storage.build(
                info.trial._config["checkpoint_storage"],
                container_path=constants.SHARED_FS_CONTAINER_PATH,
            )

        checkpoint = core.CheckpointContext(
            distributed,
            storage_manager,
            session,
            info.task_id,
            info.allocation_id,
            tensorboard_mode,
            tensorboard_manager,
        )
    else:
        # TODO: support checkpointing for non-trial tasks.
        if storage_manager is None:
            base_path = appdirs.user_data_dir("determined")
            logger.info(f"no storage_manager provided; storing checkpoints in {base_path}")
            storage_manager = storage.SharedFSStorageManager(base_path)
        checkpoint = core.DummyCheckpointContext(distributed, storage_manager)

    _install_stacktrace_on_sigusr1()

    return Context(
        distributed=distributed,
        checkpoint=checkpoint,
        preempt=preempt,
        train=train,
        searcher=searcher,
    )
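
# Usage sketch matching the docstring above: always enter core.init() as a context manager.  The
# torch-distributed backend is chosen here only as one example of a multi-slot setup; single-slot
# code can omit the distributed argument entirely.
def _example_core_init_usage() -> None:
    distributed = core.DistributedContext.from_torch_distributed()  # multi-slot only
    with core.init(distributed=distributed) as core_context:
        if core_context.preempt.should_preempt():
            return  # save state and exit early on preemption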