def make_default_env_context(
    hparams: Dict[str, Any],
    experiment_config: Optional[Dict] = None,
    trial_seed: int = 0,
) -> det.EnvContext:
    if experiment_config is None:
        experiment_config = make_default_exp_config(hparams, 1)

    # TODO(ryan): Fix the parameter passing so that this doesn't read from environment
    # variables, and we can get rid of the @expose_gpus fixture.
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu)

    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP, ExperimentID(1), TrialID(1), StepID(1)
        ),
        master_addr="",
        master_port=0,
        container_id="",
        hparams=hparams,
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
    )

def create_default_env_context(experiment_config: Dict[str, Any]) -> det.EnvContext:
    det_trial_runner_network_interface = constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE
    return det.EnvContext(
        experiment_config=experiment_config,
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            ExperimentID(1),
            TrialID(1),
            StepID(1),
            det.ExperimentConfig(experiment_config).scheduling_unit(),
            0,
        ),
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        hparams={"global_batch_size": 32},
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        det_rendezvous_ports="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=det_trial_runner_network_interface,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )

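# A minimal usage sketch for the helper above. The config literal is
# illustrative only; any dict accepted by det.ExperimentConfig (one from
# which scheduling_unit() can be derived, or that falls back to its
# default) would do. The wrapper function name is hypothetical.
def _example_create_default_env_context() -> None:
    env = create_default_env_context({"scheduling_unit": 100})
    # The helper hard-codes a batch size of 32 and a zero trial seed.
    assert env.hparams["global_batch_size"] == 32
    assert env.trial_seed == 0
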
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    det._set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )
    latest_checkpoint = simplejson.loads(os.environ["DET_LATEST_CHECKPOINT"])
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in container {container_id} on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(env.experiment_config["checkpoint_storage"])
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    build_and_run_training_pipeline(env)

def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        hparams={"global_batch_size": 1},
        det_trial_unique_port_offset=0,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
        trial_run_id=1,
        allocation_id="",
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )

def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        container_id="",
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined_common.types.ExperimentID(1),
            determined_common.types.TrialID(1),
            determined_common.types.StepID(1),
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_ports="",
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="uuid-123",
        trial_seed=0,
    )

def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[det.EnvContext, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(
            config, managed_training=managed_training, test_mode=test_mode
        )
    )
    hparams = hparams or api.generate_random_hparam_values(config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT + 1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, config.scheduling_unit()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )

    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, rendezvous_info, hvd_config

def _make_local_test_experiment_env(
    checkpoint_dir: pathlib.Path,
    config: Optional[Dict[str, Any]],
    hparams: Optional[Dict[str, Any]] = None,
) -> Tuple[det.EnvContext, workload.Stream, det.RendezvousInfo, horovod.HorovodContext]:
    config = det.ExperimentConfig(_make_local_test_experiment_config(config))
    hparams = hparams or _generate_test_hparam_values(config)
    use_gpu, container_gpus, slot_ids = _get_gpus()
    local_rendezvous_ports = (
        f"{constants.LOCAL_RENDEZVOUS_PORT},{constants.LOCAL_RENDEZVOUS_PORT + 1}"
    )

    env = det.EnvContext(
        master_addr="",
        master_port=1,
        container_id="test_mode",
        experiment_config=config,
        hparams=hparams,
        initial_workload=workload.train_workload(1, 1, 1, config.batches_per_step()),
        latest_checkpoint=None,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        workload_manager_type="",
        det_rendezvous_ports=local_rendezvous_ports,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_experiment_id="1",
        det_cluster_id="test_mode",
        trial_seed=config.experiment_seed(),
    )

    workloads = _make_test_workloads(checkpoint_dir.joinpath("checkpoint"), config)

    rendezvous_ports = env.rendezvous_ports()
    rendezvous_info = det.RendezvousInfo(
        addrs=[f"0.0.0.0:{rendezvous_ports[0]}"],
        addrs2=[f"0.0.0.0:{rendezvous_ports[1]}"],
        rank=0,
    )

    hvd_config = horovod.HorovodContext.from_configs(
        env.experiment_config, rendezvous_info, env.hparams
    )

    return env, workloads, rendezvous_info, hvd_config

def _make_local_execution_env(
    managed_training: bool,
    test_mode: bool,
    config: Optional[Dict[str, Any]],
    checkpoint_dir: str,
    hparams: Optional[Dict[str, Any]] = None,
    limit_gpus: Optional[int] = None,
) -> Tuple[core.Context, det.EnvContext]:
    config = det.ExperimentConfig(
        _make_local_execution_exp_config(
            config, checkpoint_dir, managed_training=managed_training, test_mode=test_mode
        )
    )
    hparams = hparams or api.generate_random_hparam_values(config.get("hyperparameters", {}))
    use_gpu, container_gpus, slot_ids = _get_gpus(limit_gpus)

    env = det.EnvContext(
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        experiment_config=config,
        hparams=hparams,
        latest_checkpoint=None,
        steps_completed=0,
        use_gpu=use_gpu,
        container_gpus=container_gpus,
        slot_ids=slot_ids,
        debug=config.debug_enabled(),
        det_trial_unique_port_offset=0,
        det_trial_id="",
        det_agent_id="",
        det_experiment_id="",
        det_cluster_id="",
        trial_seed=config.experiment_seed(),
        trial_run_id=1,
        allocation_id="",
        managed_training=managed_training,
        test_mode=test_mode,
        on_cluster=False,
    )

    core_context = core._dummy_init()

    return core_context, env

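# Sketch of how a caller might consume the (core.Context, EnvContext) pair
# returned above, e.g. from a local test-mode entry point. The checkpoint
# directory is illustrative, and passing config=None assumes
# _make_local_execution_exp_config fills in defaults for a missing config.
core_context, env = _make_local_execution_env(
    managed_training=False,
    test_mode=True,
    config=None,
    checkpoint_dir="/tmp/det-checkpoints",
)
assert env.test_mode and not env.on_cluster
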
def get_dummy_env() -> det.EnvContext:
    return det.EnvContext(
        master_addr="",
        master_port=0,
        use_tls=False,
        master_cert_file=None,
        master_cert_name=None,
        container_id="",
        experiment_config={"resources": {"slots_per_trial": 1, "native_parallel": False}},
        initial_workload=workload.Workload(
            workload.Workload.Kind.RUN_STEP,
            determined.common.types.ExperimentID(1),
            determined.common.types.TrialID(1),
            determined.common.types.StepID(1),
            constants.DEFAULT_SCHEDULING_UNIT,
            0,
        ),
        latest_checkpoint=None,
        use_gpu=False,
        container_gpus=[],
        slot_ids=[],
        debug=False,
        workload_manager_type="",
        hparams={"global_batch_size": 1},
        det_rendezvous_port="",
        det_trial_unique_port_offset=0,
        det_trial_runner_network_interface=constants.AUTO_DETECT_TRIAL_RUNNER_NETWORK_INTERFACE,
        det_trial_id="1",
        det_agent_id="1",
        det_experiment_id="1",
        det_task_token="",
        det_cluster_id="uuid-123",
        trial_seed=0,
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )

def make_default_env_context(
    hparams: Dict[str, Any],
    experiment_config: Dict,
    trial_seed: int = 0,
    latest_checkpoint: Optional[str] = None,
    steps_completed: int = 0,
    expose_gpus: bool = False,
) -> det.EnvContext:
    assert (latest_checkpoint is None) == (steps_completed == 0)

    if expose_gpus:
        gpu_uuids = gpu.get_gpu_uuids()
        use_gpu = bool(gpu_uuids)
    else:
        gpu_uuids = []
        use_gpu = False

    return det.EnvContext(
        experiment_config=experiment_config,
        master_url="",
        master_cert_file=None,
        master_cert_name=None,
        hparams=hparams,
        latest_checkpoint=latest_checkpoint,
        steps_completed=steps_completed,
        use_gpu=use_gpu,
        container_gpus=gpu_uuids,
        slot_ids=[],
        debug=False,
        det_trial_unique_port_offset=0,
        det_trial_id="1",
        det_experiment_id="1",
        det_agent_id="1",
        det_cluster_id="uuid-123",
        trial_seed=trial_seed,
        trial_run_id=1,
        allocation_id="",
        managed_training=True,
        test_mode=False,
        on_cluster=False,
    )

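# Usage sketch for the fixture above, as a unit test might call it. The
# function name and config literal are hypothetical; real tests would build
# the config with their own make_default_exp_config-style helper.
def _example_make_default_env_context() -> None:
    hparams = {"global_batch_size": 32}
    env = make_default_env_context(
        hparams,
        experiment_config={"resources": {"slots_per_trial": 1}},
        trial_seed=42,
        expose_gpus=False,
    )
    # With expose_gpus=False, the GPU fields are forced off.
    assert env.trial_seed == 42 and not env.use_gpu
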
def main() -> None:
    for k in ENVIRONMENT_VARIABLE_KEYS:
        if k not in os.environ:
            sys.exit("Environment not set: missing " + k)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined.common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )
    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_port = os.environ["DET_RENDEZVOUS_PORT"]
    det_trial_unique_port_offset = int(os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_agent_id = os.environ["DET_AGENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    det_task_token = os.environ["DET_TASK_TOKEN"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_port,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_agent_id,
        det_cluster_id,
        det_task_token,
        trial_seed,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    logging.info(
        f"New trial runner in container {container_id} on agent {agent_id}: {env.__dict__}."
    )

    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        logging.info("InvalidHP detected, gracefully exiting trial")

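# For reference, ENVIRONMENT_VARIABLE_KEYS presumably enumerates the required
# DET_* variables read unconditionally above. This is a hypothetical
# reconstruction (the real constant is defined elsewhere in the module);
# variables read via os.environ.get(), such as DET_USE_TLS, DET_MASTER_CERT_FILE,
# DET_MASTER_CERT_NAME, and DET_USE_GPU, are optional and would not belong here.
ENVIRONMENT_VARIABLE_KEYS = [
    "DET_EXPERIMENT_CONFIG",
    "DET_MASTER_ADDR",
    "DET_MASTER_PORT",
    "DET_AGENT_ID",
    "DET_CONTAINER_ID",
    "DET_HPARAMS",
    "DET_INITIAL_WORKLOAD",
    "DET_LATEST_CHECKPOINT",
    "DET_SLOT_IDS",
    "DET_WORKLOAD_MANAGER_TYPE",
    "DET_RENDEZVOUS_PORT",
    "DET_TRIAL_UNIQUE_PORT_OFFSET",
    "DET_TRIAL_RUNNER_NETWORK_INTERFACE",
    "DET_TRIAL_ID",
    "DET_EXPERIMENT_ID",
    "DET_CLUSTER_ID",
    "DET_TASK_TOKEN",
    "DET_TRIAL_SEED",
]
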
def main(train_entrypoint: str) -> int:
    info = det.get_cluster_info()
    assert info is not None, "must be run on-cluster"
    assert info.task_type == "TRIAL", f'must be run with task_type="TRIAL", not "{info.task_type}"'

    # TODO: refactor data_layer and profiling to not use the cli_cert.
    certs.cli_cert = certs.default_load(info.master_url)

    # TODO: Don't include the EnvContext object in the future high-level APIs for PyTorch or
    # Keras. It was natural to create this big-blob-of-config object, but it was a mistake to
    # pass it into the lowest layers of the harness code; it's too large an object to be easily
    # mockable, which is part of why building local training mode has always been a challenge.
    #
    # A better pattern is to pass in exactly the information that is necessary at each layer. We
    # will use that pattern for the future high-level APIs, but it's not worth refactoring e.g.
    # the TFKerasTrialController or EstimatorTrialController to add that functionality, so for
    # now we continue with the legacy strategy.
    env = det.EnvContext(
        master_url=info.master_url,
        master_cert_file=info.master_cert_file,
        master_cert_name=info.master_cert_name,
        experiment_config=info.trial._config,
        hparams=info.trial.hparams,
        latest_checkpoint=info.latest_checkpoint,
        steps_completed=info.trial._steps_completed,
        use_gpu=bool(info.gpu_uuids),
        container_gpus=info.gpu_uuids,
        slot_ids=info.slot_ids,
        debug=info.trial._debug,
        det_trial_unique_port_offset=info.trial._unique_port_offset,
        det_trial_id=str(info.trial.trial_id),
        det_experiment_id=str(info.trial.experiment_id),
        det_agent_id=info.agent_id,
        det_cluster_id=info.cluster_id,
        trial_seed=info.trial.trial_seed,
        trial_run_id=info.trial._trial_run_id,
        allocation_id=info.allocation_id,
        managed_training=True,
        test_mode=False,
        on_cluster=True,
    )

    det.common.set_logger(env.debug)
    logging.debug("Starting harness.")

    with maybe_periodic_stacktraces(env.debug):
        # Step 1: Load user code.
        # We can't build a core.Context without rank information, and we can't gather rank
        # information until the distributed backend is initialized, and we can't initialize the
        # correct distributed backend until we know which Trial class the user implemented.
        trial_class = load.trial_class_from_entrypoint(train_entrypoint)
        controller_class = load.get_trial_controller_class(trial_class)
        if info.container_rank == 0:
            try:
                analytics.send_analytics(
                    "trial_loaded", analytics.get_trial_analytics(trial_class)
                )
            except Exception as e:
                logging.debug(f"Cannot send analytics: {e}")

        # Step 2: Initialize framework-specific details (dtrain framework, random seeds, etc).
        distributed_backend = det._DistributedBackend()
        controller_class.pre_execute_hook(env, distributed_backend)

        # Step 3: Now that the dtrain framework is initialized, build the DistributedContext
        # object. For harness.py, we only support a fixed set of Determined-provided launch
        # layers, since the TrialControllers only support a fixed set of launch layers.
        distributed = None
        if distributed_backend.use_horovod():
            distributed = core.DistributedContext.from_horovod(horovod.hvd)
        elif distributed_backend.use_deepspeed():
            distributed = core.DistributedContext.from_deepspeed()
        elif distributed_backend.use_torch():
            distributed = core.DistributedContext.from_torch_distributed()
        elif len(info.container_addrs) > 1 or len(info.slot_ids) > 1:
            raise ValueError(
                "In multi-slot tasks, the determined.exec.harness module must not be invoked "
                "directly. Instead, it must be wrapped in one of the following launch layers: "
                "determined.launch.horovod, determined.launch.deepspeed"
            )

        # Step 4: Let core.init() create the core.Context.
        with core.init(
            distributed=distributed,
            preempt_mode=core.PreemptMode.ChiefOnly,
            tensorboard_mode=core.TensorboardMode.MANUAL,
        ) as core_context:
            trial_context = trial_class.trial_context_class(core_context, env)

            # Step 5: Instantiate the user's Trial.
            trial_inst = trial_class(trial_context)

            # Step 6: Create a TrialController and execute training.
            logging.info(f"Creating {controller_class.__name__} with {trial_class.__name__}.")
            controller = controller_class.from_trial(
                trial_inst=trial_inst,
                context=trial_context,
                env=env,
            )
            controller.run()

    return 0

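# Hypothetical entry-point wiring for the harness above (a sketch only; the
# real determined.exec.harness module may parse its arguments differently):
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="trial harness (sketch)")
    parser.add_argument("train_entrypoint", help="entrypoint spec for the user's Trial class")
    args = parser.parse_args()
    sys.exit(main(args.train_entrypoint))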