def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Start, pause, and resume a single no-op experiment whose resource request
    (an impossibly large slots_per_trial) guarantees that it can never
    schedule a trial.
    """
    config = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    config["max_restarts"] = 0
    # 100 slots per trial is far beyond any test cluster, so no trial is scheduled.
    config["resources"] = {"slots_per_trial": 100}

    with tempfile.NamedTemporaryFile() as temp_conf:
        with open(temp_conf.name, "w") as handle:
            yaml.dump(config, handle)
        experiment_id = exp.create_experiment(temp_conf.name, conf.fixtures_path("no_op"), None)

    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # While unschedulable, the experiment should remain ACTIVE rather than
    # completing or erroring out.
    active_state = bindings.determinedexperimentv1State.STATE_ACTIVE
    for _ in range(5):
        assert exp.experiment_state(experiment_id) == active_state
        time.sleep(1)

    exp.cancel_single(experiment_id)
def run_failure_test_with_temp_config(
    config: Dict[Any, Any],
    model_def_path: str,
    error_str: Optional[str] = None,
) -> None:
    """Serialize ``config`` to a temporary YAML file and run a failure test on it.

    Delegates to ``run_failure_test``; the temp file lives only for the call.
    """
    with tempfile.NamedTemporaryFile() as temp_conf:
        with open(temp_conf.name, "w") as handle:
            yaml.dump(config, handle)
        run_failure_test(temp_conf.name, model_def_path, error_str=error_str)
def test_noop_experiment_config_override() -> None:
    """Verify that a ``--config`` CLI flag overrides a value from the config file."""
    base_config = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))

    with tempfile.NamedTemporaryFile() as temp_conf:
        with open(temp_conf.name, "w") as handle:
            yaml.dump(base_config, handle)
        experiment_id = exp.create_experiment(
            temp_conf.name,
            conf.fixtures_path("no_op"),
            ["--config", "reproducibility.experiment_seed=8200"],
        )

    # The master-side config must reflect the overridden seed.
    exp_config = exp.experiment_config_json(experiment_id)
    assert exp_config["reproducibility"]["experiment_seed"] == 8200
    exp.cancel_single(experiment_id)
def test_non_root_experiment(clean_auth: None, tmp_path: pathlib.Path) -> None:
    """Run a basic experiment as a non-root linked user (nobody/nogroup)."""
    user = create_linked_user(65534, "nobody", 65534, "nogroup")
    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/model_def.py")) as src:
            model_source = src.read()
        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as src:
            exp_conf = yaml.safe_load(src)

        # Use a user-owned path to ensure shared_fs uses the container_path and not host_path.
        with non_tmp_shared_fs_path() as host_path:
            exp_conf["checkpoint_storage"] = {
                "type": "shared_fs",
                "host_path": host_path,
            }

            # Call `det --version` in a startup hook to ensure that det is on the PATH.
            tree_files = {
                "startup-hook.sh": "det --version || exit 77",
                "const.yaml": yaml.dump(exp_conf),  # type: ignore
                "model_def.py": model_source,
            }
            with FileTree(tmp_path, tree_files) as tree:
                exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)
def run_basic_test_with_temp_config(
    config: Dict[Any, Any],
    model_def_path: str,
    expected_trials: Optional[int],
    create_args: Optional[List[str]] = None,
    max_wait_secs: int = conf.DEFAULT_MAX_WAIT_SECS,
) -> int:
    """Dump ``config`` to a temporary YAML file and run a basic test against it.

    Returns the experiment id produced by ``run_basic_test``.
    """
    with tempfile.NamedTemporaryFile() as temp_conf:
        with open(temp_conf.name, "w") as handle:
            yaml.dump(config, handle)
        return run_basic_test(
            temp_conf.name,
            model_def_path,
            expected_trials,
            create_args,
            max_wait_secs=max_wait_secs,
        )
def master_up(
    port: int,
    master_config_path: Optional[Path],
    # FIX: was annotated plain `Path`, but the body explicitly handles None
    # (`if storage_host_path is not None`), so Optional[Path] is the real type.
    storage_host_path: Optional[Path],
    master_name: str,
    image_repo_prefix: Optional[str],
    version: Optional[str],
    db_password: str,
    delete_db: bool,
    autorestart: bool,
    cluster_name: str,
    auto_work_dir: Optional[Path],
) -> None:
    """Bring up a local Determined master via docker-compose.

    Resolves defaults for the image repo prefix and version, builds the
    docker-compose environment, optionally writes a temporary master.yaml when
    CLI flags override the config, ensures the checkpoint-storage directory
    exists, then tears down any existing master and starts a fresh one.
    """
    command = ["up", "-d"]

    if image_repo_prefix is None:
        image_repo_prefix = "determinedai"
    if version is None:
        version = determined.__version__

    restart_policy = "unless-stopped" if autorestart else "no"

    env = {
        "INTEGRATIONS_HOST_PORT": str(port),
        "DET_DB_PASSWORD": db_password,
        "IMAGE_REPO_PREFIX": image_repo_prefix,
        "DET_VERSION": version,
        "DET_RESTART_POLICY": restart_policy,
    }

    # Some cli flags for det deploy local will cause us to write a temporary master.yaml.
    master_conf = {}
    make_temp_conf = False

    if master_config_path is not None:
        with master_config_path.open() as f:
            master_conf = yaml.safe_load(f)
    else:
        # These defaults come from master/packaging/master.yaml (except for host_path).
        master_conf = {
            "db": {
                "user": "******",
                "host": "determined-db",
                "port": 5432,
                "name": "determined",
            },
            "checkpoint_storage": {
                "type": "shared_fs",
                "host_path": appdirs.user_data_dir("determined"),
                "save_experiment_best": 0,
                "save_trial_best": 1,
                "save_trial_latest": 1,
            },
        }
        make_temp_conf = True

    if storage_host_path is not None:
        master_conf["checkpoint_storage"] = {
            "type": "shared_fs",
            "host_path": str(storage_host_path.resolve()),
        }
        make_temp_conf = True

    if auto_work_dir is not None:
        work_dir = str(auto_work_dir.resolve())
        master_conf.setdefault("task_container_defaults", {})["work_dir"] = work_dir
        master_conf["task_container_defaults"].setdefault("bind_mounts", []).append(
            {"host_path": work_dir, "container_path": work_dir}
        )
        make_temp_conf = True

    # Ensure checkpoint storage directory exists.
    final_storage_host_path = master_conf.get("checkpoint_storage", {}).get("host_path")
    if final_storage_host_path is not None:
        # FIX: exist_ok=True replaces the separate exists() check, avoiding the
        # race between checking and creating the directory.
        Path(final_storage_host_path).mkdir(parents=True, exist_ok=True)

    if make_temp_conf:
        fd, temp_path = tempfile.mkstemp(prefix="det-deploy-local-master-config-")
        with open(fd, "w") as f:
            yaml.dump(master_conf, f)
        master_config_path = Path(temp_path)

    # This is always true by now, but mypy needs help.
    assert master_config_path is not None
    env["DET_MASTER_CONFIG"] = str(master_config_path.resolve())

    master_down(master_name, delete_db)
    docker_compose(command, master_name, env)
    _wait_for_master("localhost", port, cluster_name)
def test_noop_single_warm_start() -> None:
    """
    Exercise warm starts via both ``searcher.source_trial_id`` and
    ``searcher.source_checkpoint_uuid`` against a single no-op experiment.
    """
    parent_exp_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    parent_trials = exp.experiment_trials(parent_exp_id)
    assert len(parent_trials) == 1
    parent_trial = parent_trials[0].trial
    parent_trial_id = parent_trial.id
    parent_workloads = parent_trials[0].workloads
    assert len(parent_workloads) == 90

    parent_checkpoints = exp.workloads_with_checkpoint(parent_workloads)
    assert len(parent_checkpoints) == 30
    first_checkpoint_uuid = parent_checkpoints[0].uuid
    last_checkpoint_uuid = parent_checkpoints[-1].uuid

    final_validation = exp.workloads_with_validation(parent_workloads)[-1]
    assert final_validation.metrics["validation_error"] == pytest.approx(0.9 ** 30)

    base_config = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Warm start from a source trial id.
    trial_config = copy.deepcopy(base_config)
    trial_config["searcher"]["source_trial_id"] = parent_trial_id
    child_exp_id = exp.run_basic_test_with_temp_config(trial_config, conf.fixtures_path("no_op"), 1)

    child_trials = exp.experiment_trials(child_exp_id)
    assert len(child_trials) == 1
    child_trial = child_trials[0]
    assert len(child_trial.workloads) == 90
    # The child trial must record the parent's last checkpoint as its warm start.
    assert child_trial.trial.warmStartCheckpointUuid == last_checkpoint_uuid
    child_validations = exp.workloads_with_validation(child_trial.workloads)
    assert child_validations[-1].metrics["validation_error"] == pytest.approx(0.9 ** 60)

    # Warm start from an explicit source checkpoint uuid.
    ckpt_config = copy.deepcopy(base_config)
    ckpt_config["searcher"]["source_checkpoint_uuid"] = parent_checkpoints[0].uuid
    with tempfile.NamedTemporaryFile() as temp_conf:
        with open(temp_conf.name, "w") as handle:
            yaml.dump(ckpt_config, handle)
        ckpt_exp_id = exp.run_basic_test(temp_conf.name, conf.fixtures_path("no_op"), 1)

    ckpt_trials = exp.experiment_trials(ckpt_exp_id)
    assert len(ckpt_trials) == 1
    ckpt_trial = ckpt_trials[0]
    assert len(ckpt_trial.workloads) == 90
    assert ckpt_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    ckpt_validations = exp.workloads_with_validation(ckpt_trial.workloads)
    assert ckpt_validations[1].metrics["validation_error"] == pytest.approx(0.9 ** 3)
# NOTE(review): `test_noop_single_warm_start` is defined twice in this file; this
# later definition shadows the earlier one at import time, so pytest collects only
# one of them. This variant reads trials/steps as plain dicts, while the other uses
# the `bindings` object API — confirm which version is intended and drop the other.
def test_noop_single_warm_start() -> None:
    """Exercise warm starts via ``searcher.source_trial_id`` and
    ``searcher.source_checkpoint_uuid`` using the dict-shaped trial/step data.
    """
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1
    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    # Expect 30 steps, each carrying a checkpoint and a validation.
    assert len(first_trial["steps"]) == 30
    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]
    last_step = first_trial["steps"][29]
    last_checkpoint_id = last_step["checkpoint"]["id"]
    # The no-op trial's validation error appears to decay geometrically
    # (0.9 per step), hence 0.9 ** 30 after 30 steps — matches the asserts below.
    assert last_step["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    second_trial = trials[0]
    assert len(second_trial["steps"]) == 30

    # Second trial should have a warm start checkpoint id.
    assert second_trial["warm_start_checkpoint_id"] == last_checkpoint_id
    # Training resumes after the parent's 30 steps, so the error reaches 0.9 ** 60.
    assert second_trial["steps"][29]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1
    third_trial = trials[0]
    assert len(third_trial["steps"]) == 30
    assert third_trial["warm_start_checkpoint_id"] == first_checkpoint_id
    assert third_trial["steps"][1]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 3)