def test_non_root_experiment(auth: Authentication, tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")
    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config = yaml.safe_load(f)

        # Use a user-owned path to ensure shared_fs uses the container_path and not host_path.
        with non_tmp_shared_fs_path() as host_path:
            config["checkpoint_storage"] = {
                "type": "shared_fs",
                "host_path": host_path,
            }

            # Call `det --version` in a startup hook to ensure that det is on the PATH.
            with FileTree(
                tmp_path,
                {
                    "startup-hook.sh": "det --version || exit 77",
                    "const.yaml": yaml.dump(config),  # type: ignore
                    "model_def.py": model_def_content,
                },
            ) as tree:
                exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)

def test_experiment_creation_and_listing(auth: Authentication) -> None:
    # Create 2 users.
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    # Create an experiment as the first user.
    with logged_in_user(creds1):
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    # Create another experiment, this time as the second user.
    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        # By default, the listing should contain only the first user's experiment.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) not in output

        # Now use the -a flag to list all experiments. The output should include both experiments.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list", "-a"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) in output

    # Clean up.
    delete_experiments(experiment_id1, experiment_id2)

def test_tensorboard_creation_and_listing(auth: Authentication) -> None:
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    with logged_in_user(creds1):
        # Create an experiment.
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds1):
        tensorboard_id1 = start_tensorboard(experiment_id1)

    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(creds2):
        tensorboard_id2 = start_tensorboard(experiment_id2)

    with logged_in_user(creds1):
        output = extract_columns(det_run(["tensorboard", "list"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) not in output

        output = extract_columns(det_run(["tensorboard", "list", "-a"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) in output

    kill_tensorboards(tensorboard_id1, tensorboard_id2)
    delete_experiments(experiment_id1, experiment_id2)

def test_large_model_def_experiment() -> None:
    with tempfile.TemporaryDirectory() as td:
        shutil.copy(conf.fixtures_path("no_op/model_def.py"), td)

        # Write a 94MB file into the directory. Use random data because it is not compressible.
        with open(os.path.join(td, "junk.txt"), "wb") as f:
            f.write(os.urandom(94 * 1024 * 1024))

        exp.run_basic_test(conf.fixtures_path("no_op/single-one-short-step.yaml"), td, 1)

def test_core_api_tutorials(
    stage: str, ntrials: int, expect_workloads: bool, expect_checkpoints: bool
) -> None:
    exp.run_basic_test(
        conf.tutorials_path(f"core_api/{stage}.yaml"),
        conf.tutorials_path("core_api"),
        ntrials,
        expect_workloads=expect_workloads,
        expect_checkpoints=expect_checkpoints,
    )

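# The test above relies on pytest parametrization to supply (stage, ntrials,
# expect_workloads, expect_checkpoints); the decorator is not part of this
# section. A minimal sketch of what it might look like, with illustrative
# (assumed, not actual) stage names and flag values:
#
# @pytest.mark.parametrize(
#     "stage,ntrials,expect_workloads,expect_checkpoints",
#     [("0_start", 1, False, False)],
# )
# def test_core_api_tutorials(...) -> None:
#     ...
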
def test_priority_scheduler_noop_experiment(
    managed_cluster_priority_scheduler: ManagedCluster,
) -> None:
    managed_cluster_priority_scheduler.ensure_agent_ok()
    # Uses the default priority set in the cluster config.
    exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1)
    # Uses an explicit priority.
    exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1, priority=50
    )

def test_start_tensorboard_for_multi_experiment(tmp_path: Path, secrets: Dict[str, str]) -> None:
    """
    Start 3 random experiments configured with the s3 and shared_fs backends,
    start a TensorBoard instance pointed to the experiments and some select
    trials, and kill the TensorBoard instance.
    """
    with FileTree(
        tmp_path,
        {
            "shared_fs_config.yaml": shared_fs_config(1),
            "s3_config.yaml": s3_config(1, secrets),
            "multi_trial_config.yaml": shared_fs_config(3),
        },
    ) as tree:
        shared_conf_path = tree.joinpath("shared_fs_config.yaml")
        shared_fs_exp_id = exp.run_basic_test(
            str(shared_conf_path), conf.fixtures_path("no_op"), num_trials
        )

        s3_conf_path = tree.joinpath("s3_config.yaml")
        s3_exp_id = exp.run_basic_test(str(s3_conf_path), conf.fixtures_path("no_op"), num_trials)

        multi_trial_config = tree.joinpath("multi_trial_config.yaml")
        multi_trial_exp_id = exp.run_basic_test(
            str(multi_trial_config), conf.fixtures_path("no_op"), 3
        )

        trial_ids = [str(t["id"]) for t in exp.experiment_trials(multi_trial_exp_id)]

        command = [
            "tensorboard",
            "start",
            str(shared_fs_exp_id),
            str(s3_exp_id),
            "-t",
            *trial_ids,
            "--no-browser",
        ]

        with cmd.interactive_command(*command) as tensorboard:
            for line in tensorboard.stdout:
                if SERVICE_READY in line:
                    break
                if AWAITING_METRICS in line:
                    raise AssertionError("Tensorboard did not find metrics")
            else:
                raise AssertionError(f"Did not find {SERVICE_READY} in output")

def test_custom_etc() -> None:
    master_host = "localhost"
    master_port = "8080"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    etc_path = str(Path(__file__).parent.joinpath("etc/master.yaml").resolve())
    cluster_up(["--master-config-path", etc_path])
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-default-ckpt.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    assert os.path.exists("/tmp/ckpt-test/")
    cluster_down([])

def test_mask_rcnn_64_slots() -> None:
    experiment_id = exp.run_basic_test(
        conf.experimental_path("FasterRCNN_tp/64-gpus.yaml"),
        conf.experimental_path("FasterRCNN_tp/"),
        1,
        max_wait_secs=5 * 60 * 60,
    )

    validation_metric_name = "mAP(bbox)/IoU=0.5:0.95"
    validation_metric = exp.get_validation_metric_from_last_step(
        experiment_id, 0, validation_metric_name
    )
    durations = exp.get_experiment_durations(experiment_id, 0)
    wait_for_agents_time = (
        durations.experiment_duration
        - durations.training_duration
        - durations.validation_duration
        - durations.checkpoint_duration
    )

    print(validation_metric_name, validation_metric)
    print(durations)
    print(f"wait for agents duration: {wait_for_agents_time}")

    assert validation_metric > 0.375
    assert durations.training_duration < datetime.timedelta(hours=2, minutes=45)
    assert durations.validation_duration < datetime.timedelta(hours=1, minutes=15)

def test_experiment_delete() -> None:
    user = create_test_user(ADMIN_CREDENTIALS, False)

    with logged_in_user(user):
        experiment_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

        # The "det experiment delete" call should fail, because the user is not an admin.
        child = det_spawn(["experiment", "delete", str(experiment_id), "--yes"])
        child.wait()
        assert child.exitstatus > 0

    with logged_in_user(ADMIN_CREDENTIALS):
        child = det_spawn(["experiment", "delete", str(experiment_id), "--yes"])
        child.wait()
        assert child.exitstatus == 0

        # The "det experiment describe" call should fail, because the
        # experiment is no longer in the database.
        child = det_spawn(["experiment", "describe", str(experiment_id)])
        child.wait()
        assert child.exitstatus > 0

def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "experiment",
            "delete",
            str(experiment_id),
            "--yes",
        ],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # The "det experiment describe" call should fail, because the
    # experiment is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )

def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint)
    assert model_version == 1
    assert mnist.get_version().uuid == checkpoint.uuid

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name for m in models] == ["mnist", "object-detection", "transformer"]

def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that flipping smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}

def test_noop_load() -> None:
    """
    Load a checkpoint.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    assert checkpoint.task_id == trials[0].trial.taskId

def test_custom_port() -> None:
    name = "port_test"
    master_host = "localhost"
    master_port = "12321"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    arguments = [
        "--cluster-name",
        name,
        "--master-port",
        f"{master_port}",
    ]
    cluster_up(arguments)
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    cluster_down(["--cluster-name", name])

def test_trial_logs() -> None:
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0]["id"]
    subprocess.check_call(["det", "-m", conf.make_master_url(), "trial", "logs", str(trial_id)])
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--head", "10", str(trial_id)],
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--tail", "10", str(trial_id)],
    )

def test_non_root_experiment(auth: Authentication, tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config_content = f.read()

        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        # Call `det --version` in a startup hook to ensure that det is on the PATH.
        with FileTree(
            tmp_path,
            {
                "startup-hook.sh": "det --version || exit 77",
                "const.yaml": config_content,
                "model_def.py": model_def_content,
            },
        ) as tree:
            exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)

def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)

def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        assert metrics["num_inputs"] == scheduling_unit

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + (step["id"] - 1) * scheduling_unit
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0]["steps"]:
        validation = step["validation"]
        metrics = validation["metrics"]
        actual = metrics["validation_metrics"]

        value = base_value + step["id"] * scheduling_unit
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)

def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep
    all checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"

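# A usage sketch for the helper above. The fixture name and metric list mirror
# test_gpu_restore later in this section; the wrapper test name itself is
# hypothetical, not taken from this file:
#
# def test_pytorch_rng_restore() -> None:
#     _test_rng_restore("pytorch-rng-saver", ["np_rand", "rand_rand", "torch_rand", "gpu_rand"])
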
def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/nans.yaml"), conf.fixtures_path("metric_maker"), 1
    )
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed in the YAML->JSON deserializer,
    # so add them to the expected values here.
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)

def test_start_tensorboard_for_shared_fs_experiment(tmp_path: Path) -> None:
    """
    Start a random experiment configured with the shared_fs backend, start a
    TensorBoard instance pointed to the experiment, and kill the TensorBoard
    instance.
    """
    with FileTree(tmp_path, {"config.yaml": shared_fs_config(1)}) as tree:
        config_path = tree.joinpath("config.yaml")
        experiment_id = exp.run_basic_test(
            str(config_path), conf.fixtures_path("no_op"), num_trials
        )

        command = ["tensorboard", "start", str(experiment_id), "--no-browser"]

        with cmd.interactive_command(*command) as tensorboard:
            for line in tensorboard.stdout:
                if SERVICE_READY in line:
                    break
            else:
                raise AssertionError(f"Did not find {SERVICE_READY} in output")

def test_task_logs(task_type: str, task_config: Dict[str, Any], log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth, certs.cli_cert)
    )
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (
        rps.resourcePools[0].type == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
        and task_type == command.TaskTypeCommand
    ):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)

def test_support_bundle() -> None:
    exp_id = exp.run_basic_test(
        config_file=conf.fixtures_path("no_op/single-one-short-step.yaml"),
        model_def_file=conf.fixtures_path("no_op"),
        expected_trials=1,
    )

    trial_id = exp.experiment_first_trial(exp_id)
    output_dir = f"e2etest_trial_{trial_id}"
    os.mkdir(output_dir)

    command = ["det", "trial", "support-bundle", str(trial_id), "-o", output_dir]

    completed_process = subprocess.run(
        command, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )

def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")

    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs, api.trial_log_fields)

    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)

def test_start_tensorboard_with_custom_image(tmp_path: Path) -> None:
    """
    Start a random experiment, start a TensorBoard instance pointed to the
    experiment with a custom image, verify that the image has been set, and
    kill the TensorBoard instance.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "tensorboard",
        "start",
        str(experiment_id),
        "--no-browser",
        "--detach",
        "--config",
        "environment.image=alpine",
    ]
    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)
    t_id = res.stdout.strip("\n")

    command = ["det", "-m", conf.make_master_url(), "tensorboard", "config", t_id]
    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)

    config = yaml.safe_load(res.stdout)
    assert (
        config["environment"]["image"]["cpu"] == "alpine"
        and config["environment"]["image"]["cuda"] == "alpine"
        and config["environment"]["image"]["rocm"] == "alpine"
    ), config

def test_gpu_restore() -> None:
    experiment = exp.run_basic_test(
        conf.fixtures_path("pytorch-rng-saver/const.yaml"),
        conf.fixtures_path("pytorch-rng-saver"),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial["steps"]) == 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path("pytorch-rng-saver/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.fixtures_path("pytorch-rng-saver"), 1
    )

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial["steps"]) == 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in ["np_rand", "rand_rand", "torch_rand", "gpu_rand"]:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"

def test_experiment_delete() -> None:
    user = create_test_user(ADMIN_CREDENTIALS)
    non_owner_user = create_test_user(ADMIN_CREDENTIALS)

    with logged_in_user(user):
        experiment_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
        )

    with logged_in_user(non_owner_user):
        # "det experiment delete" call should fail, because the user is not an admin and
        # doesn't own the experiment.
        child = det_spawn(["experiment", "delete", str(experiment_id), "--yes"])
        child.read()
        child.wait()
        assert child.exitstatus > 0

    with logged_in_user(user):
        child = det_spawn(["experiment", "delete", str(experiment_id), "--yes"])
        child.read()
        child.wait()
        assert child.exitstatus == 0

        experiment_delete_deadline = time.time() + 5 * 60
        while True:
            child = det_spawn(["experiment", "describe", str(experiment_id)])
            child.read()
            child.wait()
            # The "det experiment describe" call should fail once the
            # experiment is no longer in the database.
            if child.exitstatus > 0:
                return
            elif time.time() > experiment_delete_deadline:
                pytest.fail("experiment didn't delete after timeout")

def test_startup_hook() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/startup-hook.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )

def test_noop_long_train_step() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-long-train-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )