def main() -> None:
    """Trial-runner entry point.

    Reads all configuration from environment variables, assembles a
    det.EnvContext, validates the checkpoint-storage configuration, and
    launches the training pipeline. Exits the process when the
    environment is misconfigured.
    """
    # Abort immediately on the first missing required variable.
    for var_name in ENVIRONMENT_VARIABLE_KEYS:
        if var_name not in os.environ:
            sys.exit("Environment not set: missing " + var_name)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    det._set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )
    latest_checkpoint = simplejson.loads(os.environ["DET_LATEST_CHECKPOINT"])
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    # A bad checkpoint-storage config is fatal; surface it before training starts.
    try:
        storage.validate_config(env.experiment_config["checkpoint_storage"])
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    build_and_run_training_pipeline(env)
def test_verify_s3_upload_error(tmp_path: Path, monkeypatch: MonkeyPatch) -> None:
    """validate_config must raise on an S3 upload failure and leave temp_dir clean."""
    temp_dir = str(tmp_path)
    # Substitute a boto3 client whose uploads always fail.
    monkeypatch.setattr("boto3.client", s3.s3_faulty_client)
    config = {
        "type": "s3",
        "bucket": "bucket",
        "access_key": "key",
        "secret_key": "secret",
        "temp_dir": temp_dir,
    }
    assert not os.listdir(temp_dir)
    with pytest.raises(S3UploadFailedError):
        storage.validate_config(config, container_path=None)
    # The failed validation must not leave stray files behind.
    assert not os.listdir(temp_dir)
def main() -> None:
    """Trial-runner entry point.

    Reads all configuration from environment variables (including optional
    TLS settings), assembles a det.EnvContext, validates the
    checkpoint-storage configuration, and runs the training pipeline.
    Exits the process when the environment is misconfigured; an InvalidHP
    raised during training is treated as a graceful exit.
    """
    # Abort immediately on the first missing required variable.
    for var_name in ENVIRONMENT_VARIABLE_KEYS:
        if var_name not in os.environ:
            sys.exit("Environment not set: missing " + var_name)

    experiment_config = simplejson.loads(os.environ["DET_EXPERIMENT_CONFIG"])
    debug = experiment_config.get("debug", False)
    determined_common.set_logger(debug)

    master_addr = os.environ["DET_MASTER_ADDR"]
    master_port = int(os.environ["DET_MASTER_PORT"])
    # TLS material is optional; absent variables resolve to falsy defaults.
    use_tls = distutils.util.strtobool(os.environ.get("DET_USE_TLS", "false"))
    master_cert_file = os.environ.get("DET_MASTER_CERT_FILE")
    master_cert_name = os.environ.get("DET_MASTER_CERT_NAME")
    agent_id = os.environ["DET_AGENT_ID"]
    container_id = os.environ["DET_CONTAINER_ID"]
    hparams = simplejson.loads(os.environ["DET_HPARAMS"])
    initial_work = workload.Workload.from_json(
        simplejson.loads(os.environ["DET_INITIAL_WORKLOAD"])
    )
    # The latest checkpoint is delivered via a file path, not inline JSON.
    with open(os.environ["DET_LATEST_CHECKPOINT"], "r") as f:
        latest_checkpoint = json.load(f)
    use_gpu = distutils.util.strtobool(os.environ.get("DET_USE_GPU", "false"))
    slot_ids = json.loads(os.environ["DET_SLOT_IDS"])
    workload_manager_type = os.environ["DET_WORKLOAD_MANAGER_TYPE"]
    det_rendezvous_ports = os.environ["DET_RENDEZVOUS_PORTS"]
    det_trial_unique_port_offset = int(os.environ["DET_TRIAL_UNIQUE_PORT_OFFSET"])
    det_trial_runner_network_interface = os.environ["DET_TRIAL_RUNNER_NETWORK_INTERFACE"]
    det_trial_id = os.environ["DET_TRIAL_ID"]
    det_experiment_id = os.environ["DET_EXPERIMENT_ID"]
    det_cluster_id = os.environ["DET_CLUSTER_ID"]
    trial_seed = int(os.environ["DET_TRIAL_SEED"])

    gpu_uuids = gpu.get_gpu_uuids_and_validate(use_gpu, slot_ids)

    env = det.EnvContext(
        master_addr,
        master_port,
        use_tls,
        master_cert_file,
        master_cert_name,
        container_id,
        experiment_config,
        hparams,
        initial_work,
        latest_checkpoint,
        use_gpu,
        gpu_uuids,
        slot_ids,
        debug,
        workload_manager_type,
        det_rendezvous_ports,
        det_trial_unique_port_offset,
        det_trial_runner_network_interface,
        det_trial_id,
        det_experiment_id,
        det_cluster_id,
        trial_seed,
    )

    logging.info(
        f"New trial runner in (container {container_id}) on agent {agent_id}: {env.__dict__}."
    )

    # A bad checkpoint-storage config is fatal; surface it before training starts.
    try:
        storage.validate_config(
            env.experiment_config["checkpoint_storage"],
            container_path=constants.SHARED_FS_CONTAINER_PATH,
        )
    except Exception as e:
        logging.error("Checkpoint storage validation failed: {}".format(e))
        sys.exit(1)

    try:
        build_and_run_training_pipeline(env)
    except det.InvalidHP:
        # Hyperparameter rejection is an expected, non-fatal outcome.
        logging.info("InvalidHP detected, gracefully exiting trial")