def test_create_test_mode() -> None:
    # test-mode should succeed with a valid experiment.
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        "--test-mode",
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
    ]
    output = subprocess.check_output(command, universal_newlines=True)
    assert "Model definition test succeeded" in output

    # test-mode should fail when an error is introduced into the trial
    # implementation.
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        "--test-mode",
        conf.fixtures_path("trial_error/const.yaml"),
        conf.fixtures_path("trial_error"),
    ]
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(command)


def test_experiment_delete() -> None:
    subprocess.check_call(["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    subprocess.check_call(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "experiment",
            "delete",
            str(experiment_id),
            "--yes",
        ],
        env={**os.environ, "DET_ADMIN": "1"},
    )

    # The "det experiment describe" call should fail, because the experiment
    # is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "describe", str(experiment_id)]
        )


def test_labels() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-one-short-step.yaml"), conf.fixtures_path("no_op"), None
    )

    label = "__det_test_dummy_label__"

    # Add a label and check that it shows up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "add", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label in output

    # Remove the label and check that it doesn't show up.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "e", "label", "remove", str(experiment_id), label]
    )
    output = subprocess.check_output(
        ["det", "-m", conf.make_master_url(), "e", "describe", str(experiment_id)]
    ).decode()
    assert label not in output


def run_list_cli_tests(experiment_id: int) -> None:
    """
    Runs list-related CLI commands on a finished experiment. Will raise an
    exception if the CLI command encounters a traceback failure.
    """
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "list-trials", str(experiment_id)]
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "list-checkpoints", str(experiment_id)]
    )
    subprocess.check_call(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "experiment",
            "list-checkpoints",
            "--best",
            str(1),
            str(experiment_id),
        ]
    )


def change_experiment_state(experiment_id: int, new_state: str) -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)
    r = api.patch(
        conf.make_master_url(),
        "experiments/{}".format(experiment_id),
        headers={"Content-Type": "application/merge-patch+json"},
        body={"state": new_state},
    )
    assert r.status_code == requests.codes.no_content, r.text


def get_num_running_commands() -> int:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "commands")
    assert r.status_code == requests.codes.ok, r.text
    return len([command for _id, command in r.json().items() if command["state"] == "RUNNING"])


def cluster_slots() -> Dict[str, Any]:
    """
    cluster_slots returns a dict of the slots that each agent has.

    :return: Dict[AgentID, List[Slot]]
    """
    auth.initialize_session(conf.make_master_url(), try_reauth=True)
    r = api.get(conf.make_master_url(), "agents")
    assert r.status_code == requests.codes.ok, r.text
    json = r.json()  # type: Dict[str, Any]
    return {agent["id"]: agent["slots"].values() for agent in json.values()}


def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]


def test_configs(tmp_path: Path) -> None:
    with FileTree(
        tmp_path,
        {
            "config.yaml": """
resources:
  slots: 1
environment:
  environment_variables:
    - TEST=TEST
"""
        },
    ) as tree:
        config_path = tree.joinpath("config.yaml")
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--config-file",
                str(config_path),
                "python",
                "-c",
                """
import os
import sys

test = os.environ["TEST"]
if test != "TEST":
    print("{} != {}".format(test, "TEST"))
    sys.exit(1)
""",
            ]
        )


def activate_experiment(experiment_id: int) -> None:
    command = ["det", "-m", conf.make_master_url(), "experiment", "activate", str(experiment_id)]
    subprocess.check_call(command)


def maybe_create_experiment(
    config_file: str, model_def_file: str, create_args: Optional[List[str]] = None
) -> subprocess.CompletedProcess:
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        config_file,
        model_def_file,
    ]

    if create_args is not None:
        command += create_args

    env = os.environ.copy()
    env["DET_DEBUG"] = "true"

    return subprocess.run(
        command,
        universal_newlines=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    )


def test_basic_workflows(tmp_path: Path) -> None:
    with FileTree(tmp_path, {"hello.py": "print('hello world')"}) as tree:
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                str(tree),
                "python",
                "hello.py",
            ]
        )

    with FileTree(tmp_path, {"hello.py": "print('hello world')"}) as tree:
        link = tree.joinpath("hello-link.py")
        link.symlink_to(tree.joinpath("hello.py"))
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                str(tree),
                "python",
                "hello-link.py",
            ]
        )

    _run_and_verify_exit_code_zero(
        ["det", "-m", conf.make_master_url(), "cmd", "run", "python", "-c", "print('hello world')"]
    )

    with pytest.raises(subprocess.CalledProcessError):
        _run_and_return_real_exit_status(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                "non-existent-path-here",
                "python",
                "hello.py",
            ]
        )


def experiment_has_active_workload(experiment_id: int) -> bool:
    r = api.get(conf.make_master_url(), "tasks").json()
    for task in r.values():
        if "Experiment {}".format(experiment_id) in task["name"] and len(task["containers"]) > 0:
            return True

    return False


def get_command_config(command_type: str, id: str) -> str:
    assert command_type in ["command", "notebook", "shell"]
    command = ["det", "-m", conf.make_master_url(), command_type, "config", id]
    completed_process = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE)
    assert completed_process.returncode == 0
    return str(completed_process.stdout)


def test_exit_code_reporting() -> None:
    """
    Confirm that failed commands are not reported as successful, and confirm
    that our test infrastructure is valid.
    """
    with pytest.raises(AssertionError):
        _run_and_verify_exit_code_zero(["det", "-m", conf.make_master_url(), "cmd", "run", "false"])


def test_large_uploads(tmp_path: Path) -> None:
    with pytest.raises(subprocess.CalledProcessError):
        with FileTree(tmp_path, {"hello.py": "print('hello world')"}) as tree:
            large = tree.joinpath("large-file.bin")
            large.touch()
            f = large.open(mode="w")
            f.seek(1024 * 1024 * 120)
            f.write("\0")
            f.close()

            _run_and_return_real_exit_status(
                [
                    "det",
                    "-m",
                    conf.make_master_url(),
                    "cmd",
                    "run",
                    "--context",
                    str(tree),
                    "python",
                    "hello.py",
                ]
            )

    with FileTree(tmp_path, {"hello.py": "print('hello world')", ".detignore": "*.bin"}) as tree:
        large = tree.joinpath("large-file.bin")
        large.touch()
        f = large.open(mode="w")
        f.seek(1024 * 1024 * 120)
        f.write("\0")
        f.close()

        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                str(tree),
                "python",
                "hello.py",
            ]
        )


def test_iris() -> None:
    config = conf.load_config(conf.official_examples_path("iris_tf_keras/const.yaml"))
    config = conf.set_max_steps(config, 2)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("iris_tf_keras"), 1
    )
    exp_ref = Determined(conf.make_master_url()).get_experiment(exp_id)
    model = exp_ref.top_checkpoint().load()
    model.summary()


def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1
    )

    nn = (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)


def maybe_set_template(template_name: str, template_file: str) -> subprocess.CompletedProcess:
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "template",
        "set",
        template_name,
        os.path.join(os.path.dirname(__file__), template_file),
    ]
    return subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE)


def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0].id).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)


def test_pytorch_cifar10_const() -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0].id)
        .select_checkpoint(latest=True)
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)


def run_describe_cli_tests(experiment_id: int) -> None:
    """
    Runs the `det experiment describe` CLI command on a finished experiment.
    Will raise an exception if `det experiment describe` encounters a
    traceback failure.
    """
    # "det experiment describe" without metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))

    # "det experiment describe" with metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--metrics",
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))


def test_absolute_bind_mount(tmp_path: Path) -> None:
    _run_and_verify_exit_code_zero(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--volume",
            "/bin:/foo-bar",
            "ls",
            "/foo-bar",
        ]
    )

    with FileTree(
        tmp_path,
        {
            "config.yaml": """
bind_mounts:
  - host_path: /bin
    container_path: /foo-bar
"""
        },
    ) as tree:
        config_path = tree.joinpath("config.yaml")
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--volume",
                "/bin:/foo-bar2",
                "--config-file",
                str(config_path),
                "ls",
                "/foo-bar",
                "/foo-bar2",
            ]
        )


def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
        None,
    )

    # Check that the validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.steps)
        last_step = trial.steps[-1]
        accuracy = last_step.validation.metrics["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that the ExperimentReference returns a sorted order of top
    # checkpoints without gaps. The top 2 checkpoints should be the first 2 of
    # the top k checkpoints if sorting is stable.
    exp_ref = Determined(conf.make_master_url()).get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that the metrics are truly in sorted order.
    metrics = [c.validation.metrics["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that changing smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]


def get_command_config(command_type: str, id: str) -> str:
    assert command_type in ["command", "notebook", "shell"]
    command = ["det", "-m", conf.make_master_url(), command_type, "config", id]
    env = os.environ.copy()
    env["DET_DEBUG"] = "true"
    completed_process = subprocess.run(
        command,
        universal_newlines=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    )
    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )
    return str(completed_process.stdout)


def maybe_create_native_experiment(context_dir: str, command: List[str]) -> Optional[int]:
    target_env = os.environ.copy()
    target_env["DET_MASTER"] = conf.make_master_url()

    with subprocess.Popen(
        command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=context_dir, env=target_env
    ) as p:
        for line in p.stdout:
            m = re.search(r"Created experiment (\d+)\n", line.decode())
            if m is not None:
                return int(m.group(1))

    return None


def maybe_create_experiment(implementation: NativeImplementation) -> typing.Optional[int]:
    logging.debug(implementation)

    target_env = os.environ.copy()
    target_env["DET_MASTER"] = conf.make_master_url()

    with subprocess.Popen(
        implementation.command + ["--config", json.dumps(implementation.configuration)],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        cwd=implementation.cwd,
        env=target_env,
    ) as p:
        for line in p.stdout:
            m = re.search(r"Created experiment (\d+)\n", line.decode())
            if m is not None:
                return int(m.group(1))

    return None


def test_image_pull_after_remove() -> None:
    """
    Remove a pulled image and verify that it will be pulled again with auth.
    """
    client = docker.from_env()
    try:
        client.images.remove("alpine:3.10")
    except docker.errors.ImageNotFound:
        pass

    _run_and_verify_exit_code_zero(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--config",
            "environment.image=alpine:3.10",
            "sleep 3; echo hello world",
        ]
    )


def maybe_create_experiment(
    config_file: str, model_def_file: str, create_args: Optional[List[str]] = None
) -> subprocess.CompletedProcess:
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        config_file,
        model_def_file,
    ]

    if create_args is not None:
        command += create_args

    return subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE)


def query() -> api.GraphQLQuery:
    """Return a GraphQL query object bound to the master, reauthenticating the session if needed."""
    auth.initialize_session(conf.make_master_url(), try_reauth=True)
    return api.GraphQLQuery(conf.make_master_url())