def test_large_uploads(tmp_path: Path) -> None:
    # Without a .detignore, the 120 MB file (sparse: seek past the end and write one
    # byte) should make the context upload fail with a nonzero exit status.
    with pytest.raises(subprocess.CalledProcessError):
        with FileTree(tmp_path, {"hello.py": "print('hello world')"}) as tree:
            large = tree.joinpath("large-file.bin")
            large.touch()
            f = large.open(mode="w")
            f.seek(1024 * 1024 * 120)
            f.write("\0")
            f.close()

            _run_and_return_real_exit_status(
                [
                    "det",
                    "-m",
                    conf.make_master_url(),
                    "cmd",
                    "run",
                    "--context",
                    str(tree),
                    "python",
                    "hello.py",
                ]
            )

    # With a .detignore excluding *.bin, the same command should succeed.
    with FileTree(
        tmp_path,
        {"hello.py": "print('hello world')", ".detignore": "*.bin"},
    ) as tree:
        large = tree.joinpath("large-file.bin")
        large.touch()
        f = large.open(mode="w")
        f.seek(1024 * 1024 * 120)
        f.write("\0")
        f.close()

        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                str(tree),
                "python",
                "hello.py",
            ]
        )

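# The CLI-runner helpers used above are defined elsewhere in this suite. As a
# hedged sketch of the behavior the test relies on (an assumption, not the real
# implementation), a minimal stand-in for _run_and_verify_exit_code_zero could be:
def _example_run_and_verify_exit_code_zero(cmd: List[str]) -> None:
    # Run the CLI command and fail the test on any nonzero exit status,
    # surfacing captured output for debugging.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    assert completed.returncode == 0, (completed.stdout, completed.stderr)
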
def run_describe_cli_tests(experiment_id: int) -> None:
    """
    Runs `det experiment describe` CLI command on a finished experiment. Will raise an
    exception if `det experiment describe` encounters a traceback failure.
    """
    # "det experiment describe" without metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))

    # "det experiment describe" with metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--metrics",
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))

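# Example usage (a sketch): run_describe_cli_tests expects a *finished*
# experiment, e.g. one produced by exp.run_basic_test on the no_op fixture,
# as done by other tests in this suite.
def _example_describe_finished_experiment() -> None:
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    run_describe_cli_tests(experiment_id)
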
def test_streaming_metrics_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=7)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )
    # To fully test the streaming APIs, the requests need to start running immediately after the
    # experiment, and then stay open until the experiment is complete. To accomplish this with all
    # of the API calls on a single experiment, we spawn them all in threads.
    metric_names_thread = pool.apply_async(request_metric_names, (experiment_id,))
    train_metric_batches_thread = pool.apply_async(request_train_metric_batches, (experiment_id,))
    valid_metric_batches_thread = pool.apply_async(request_valid_metric_batches, (experiment_id,))
    train_trials_snapshot_thread = pool.apply_async(request_train_trials_snapshot, (experiment_id,))
    valid_trials_snapshot_thread = pool.apply_async(request_valid_trials_snapshot, (experiment_id,))
    train_trials_sample_thread = pool.apply_async(request_train_trials_sample, (experiment_id,))
    valid_trials_sample_thread = pool.apply_async(request_valid_trials_sample, (experiment_id,))

    metric_names_results = metric_names_thread.get()
    train_metric_batches_results = train_metric_batches_thread.get()
    valid_metric_batches_results = valid_metric_batches_thread.get()
    train_trials_snapshot_results = train_trials_snapshot_thread.get()
    valid_trials_snapshot_results = valid_trials_snapshot_thread.get()
    train_trials_sample_results = train_trials_sample_thread.get()
    valid_trials_sample_results = valid_trials_sample_thread.get()

    if metric_names_results is not None:
        pytest.fail("metric-names: %s. Results: %s" % metric_names_results)
    if train_metric_batches_results is not None:
        pytest.fail("metric-batches (training): %s. Results: %s" % train_metric_batches_results)
    if valid_metric_batches_results is not None:
        pytest.fail("metric-batches (validation): %s. Results: %s" % valid_metric_batches_results)
    if train_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (training): %s. Results: %s" % train_trials_snapshot_results)
    if valid_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (validation): %s. Results: %s" % valid_trials_snapshot_results)
    if train_trials_sample_results is not None:
        pytest.fail("trials-sample (training): %s. Results: %s" % train_trials_sample_results)
    if valid_trials_sample_results is not None:
        pytest.fail("trials-sample (validation): %s. Results: %s" % valid_trials_sample_results)

def test_iris() -> None:
    config = conf.load_config(conf.official_examples_path("iris_tf_keras/const.yaml"))
    config = conf.set_max_steps(config, 2)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("iris_tf_keras"), 1
    )
    exp_ref = Determined(conf.make_master_url()).get_experiment(exp_id)
    model = exp_ref.top_checkpoint().load()
    model.summary()

def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)

def fetch_master_log() -> bool:
    command = ["det", "-m", conf.make_master_url(), "master", "logs"]
    try:
        output = subprocess.check_output(command)
    except Exception:
        traceback.print_exc()
        return False
    # NOTE: log_path is not defined here; it is expected to come from the
    # enclosing scope where this helper is used.
    with open(log_path, "wb") as log:
        log.write(output)
    return True

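# fetch_master_log returns False instead of raising so that it can be polled,
# e.g. while the master is still coming up. A minimal retry loop (a sketch;
# the surrounding suite may use its own wait helper):
def _example_wait_for_master_log(timeout_s: int = 60) -> None:
    deadline = time.time() + timeout_s
    while time.time() < deadline:
        if fetch_master_log():
            return
        time.sleep(1)
    raise TimeoutError("could not fetch master logs")
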
def test_pytorch_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1
    )

    nn = (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)

def test_noop_load() -> None:
    """
    Load the top checkpoint of a finished no_op experiment and verify that it
    carries the trial's task ID.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    assert checkpoint.task_id == trials[0].trial.taskId

def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

def wait_for_gc_to_finish(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )
    saw_gc = False
    # Don't wait longer than 5 minutes (as 600 half-seconds to improve our sampling resolution).
    for _ in range(600):
        r = api.get(conf.make_master_url(), "tasks").json()
        names = [task["name"] for task in r.values()]
        gc_name = f"Checkpoint GC (Experiment {experiment_id})"
        if gc_name in names:
            saw_gc = True
        elif saw_gc:
            # We previously saw checkpoint gc but now we don't, so it must have finished.
            return
        time.sleep(0.5)

    # It's possible that it ran really fast and we missed it, so just log this.
    print("Did not observe checkpoint gc start or finish!", file=sys.stderr)

def test_k8_mount(using_k8s: bool, sidecar: bool) -> None:
    if not using_k8s:
        pytest.skip("only need to run test on kubernetes")

    mount_path = "/ci/"

    with pytest.raises(subprocess.CalledProcessError):
        _run_and_verify_failure(
            ["det", "-m", conf.make_master_url(), "cmd", "run", f"sleep 3; touch {mount_path}"],
            "No such file or directory",
        )

    config = {
        "environment": {
            "pod_spec": {
                "spec": {
                    "containers": [
                        {
                            "name": "determined-container",
                            "volumeMounts": [{"name": "temp1", "mountPath": mount_path}],
                        }
                    ],
                    "volumes": [{"name": "temp1", "emptyDir": {}}],
                }
            }
        }
    }

    if sidecar:
        sidecar_container = {
            "name": "sidecar",
            "image": conf.TF1_CPU_IMAGE,
            "command": ["/bin/bash"],
            "args": ["-c", "exit 0"],
        }

        # We insert this as the first container, to make sure Determined can handle the case
        # where the `determined-container` is not the first one.
        config["environment"]["pod_spec"]["spec"]["containers"] = [
            sidecar_container,
            config["environment"]["pod_spec"]["spec"]["containers"][0],  # type: ignore
        ]

    _run_cmd_with_config_expecting_success(cmd=f"sleep 3; touch {mount_path}", config=config)

def test_absolute_bind_mount(tmp_path: Path) -> None:
    _run_and_verify_exit_code_zero(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--volume",
            "/bin:/foo-bar",
            "ls",
            "/foo-bar",
        ]
    )

    with FileTree(
        tmp_path,
        {
            "config.yaml": """
bind_mounts:
- host_path: /bin
  container_path: /foo-bar
"""
        },
    ) as tree:
        config_path = tree.joinpath("config.yaml")
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--volume",
                "/bin:/foo-bar2",
                "--config-file",
                str(config_path),
                "ls",
                "/foo-bar",
                "/foo-bar2",
            ]
        )

def test_task_logs(task_type: str, task_config: Dict[str, Any], log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth, certs.cli_cert)
    )
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (
        rps.resourcePools[0].type == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
        and task_type == command.TaskTypeCommand
    ):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)

def test_hp_importance_api() -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    pool = mp.pool.ThreadPool(processes=1)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/random.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    hp_importance_thread = pool.apply_async(request_hp_importance, (experiment_id,))
    hp_importance_results = hp_importance_thread.get()

    if hp_importance_results is not None:
        pytest.fail("hyperparameter-importance: %s. Results: %s" % hp_importance_results)

def test_environment_variables_command() -> None:
    _run_and_verify_exit_code_zero(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--config",
            "environment.environment_variables='THISISTRUE=true','WONTCAUSEPANIC'",
            'if [ "$THISISTRUE" != "true" ]; then exit 1; fi',
        ]
    )

def test_change_displayname(clean_auth: None) -> None:
    u_patch = create_test_user(ADMIN_CREDENTIALS, False)
    original_name = u_patch.username

    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), requested_user=original_name, password="", try_reauth=True
    )
    sess = session.Session(master_url, original_name, authentication.cli_auth, certs.cli_cert)

    # Get API bindings object for the created test user.
    all_users = bindings.get_GetUsers(sess).users
    assert all_users is not None
    current_user = list(filter(lambda u: u.username == original_name, all_users))[0]
    assert current_user is not None and current_user.id

    # Rename user using display name.
    patch_user = bindings.v1PatchUser(displayName="renamed")
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == "renamed"

    # Avoid display name of 'admin'.
    patch_user.displayName = "Admin"
    with pytest.raises(errors.APIException):
        bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    # Clear display name (UI will show username).
    patch_user.displayName = ""
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == ""

def request_valid_trials_sample(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/trials-sample".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]
    return check_trials_sample_result(results)

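# The streaming endpoints used by the request_* helpers return newline-delimited
# JSON, one {"result": ...} envelope per line. A generic parser for that framing
# (a sketch of the pattern used above, not a helper the suite defines):
def _example_parse_streaming_results(text: str) -> List[Any]:
    # Skip empty lines defensively; each remaining line is a complete JSON object.
    return [json.loads(line)["result"] for line in text.splitlines() if line]
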
def get_command_config(command_type: str, id: str) -> str:
    assert command_type in ["command", "notebook", "shell"]
    command = ["det", "-m", conf.make_master_url(), command_type, "config", id]
    env = os.environ.copy()
    env["DET_DEBUG"] = "true"
    completed_process = subprocess.run(
        command,
        universal_newlines=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=env,
    )
    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )
    return str(completed_process.stdout)

def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs, api.trial_log_fields)
    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs, api.task_log_fields)

def test_pytorch_cifar10_parallel() -> None:
    config = conf.load_config(conf.official_examples_path("trial/cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)
    config = conf.set_slots_per_trial(config, 8)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/cifar10_cnn_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0]["id"])
        .select_checkpoint(latest=True)
        .load()
    )
    assert isinstance(nn, torch.nn.Module)

def test_pytorch_load(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1
    )

    (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location="cpu")
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)

def test_pytorch_cifar10_const() -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0].id)
        .select_checkpoint(latest=True)
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)

def _fetch_slots() -> List[Dict[str, Any]]:
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "slot",
        "list",
        "--json",
    ]
    output = subprocess.check_output(command).decode()
    # Cast to the return type; values in the JSON are not all strings.
    slots = cast(List[Dict[str, Any]], json.loads(output))
    return slots

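# Example usage (a sketch): the slot listing can be used to assert cluster
# state. The "enabled" key is an assumption about the CLI's JSON output,
# not something this section verifies.
def _example_assert_all_slots_enabled() -> None:
    assert all(slot.get("enabled", False) for slot in _fetch_slots())
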
def test_pytorch_gan_parallel() -> None:
    config = conf.load_config(conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("gan_mnist_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0]["id"])
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )

def test_start_tensorboard_with_custom_image(tmp_path: Path) -> None:
    """
    Start a random experiment, start a detached TensorBoard instance pointed at
    the experiment with a custom image, and verify the image has been set.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "tensorboard",
        "start",
        str(experiment_id),
        "--no-browser",
        "--detach",
        "--config",
        "environment.image=alpine",
    ]
    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)
    t_id = res.stdout.strip("\n")

    command = ["det", "-m", conf.make_master_url(), "tensorboard", "config", t_id]
    res = subprocess.run(command, universal_newlines=True, stdout=subprocess.PIPE, check=True)
    config = yaml.safe_load(res.stdout)
    assert (
        config["environment"]["image"]["cpu"] == "alpine"
        and config["environment"]["image"]["cuda"] == "alpine"
        and config["environment"]["image"]["rocm"] == "alpine"
    ), config

def request_profiling_pytorch_timing_metrics(
    trial_id: int, metric_name: str, accumulated: bool = False
) -> None:
    def validate_timing_batch(batch: Dict[str, Any], batch_idx: int) -> int:
        values = batch["values"]
        batches = batch["batches"]
        num_values = len(values)
        num_batch_indexes = len(batches)
        num_timestamps = len(batch["timestamps"])
        if num_values != num_batch_indexes or num_batch_indexes != num_timestamps:
            pytest.fail(
                f"mismatched slices: not ({num_values} == {num_batch_indexes} == {num_timestamps})"
            )
        if not any(values):
            pytest.fail(f"received bad batch, something went wrong: {batch}")
        if batches[0] != batch_idx:
            pytest.fail(
                f"batch did not start at correct batch, {batches[0]} != {batch_idx}: {batch}"
            )
        # Check batches are monotonic with no gaps.
        if not all(x + 1 == y for x, y in zip(batches, batches[1:])):
            pytest.fail(f"skips in batches sampled: {batch}")
        # 10 is just a threshold past which it would be really strange for an accumulated
        # metric to still be strictly increasing by chance.
        if accumulated and len(values) > 10 and all(x < y for x, y in zip(values, values[1:])):
            pytest.fail(
                f"per batch accumulated metric was monotonic, which is really fishy: {batch}"
            )
        return int(batches[-1]) + 1

    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/metrics?{}".format(
            trial_id,
            to_query_params(PROFILER_METRIC_TYPE_TIMING, metric_name),
        ),
        stream=True,
    ) as r:
        batch_idx = 0
        have_batch = False
        for line in r.iter_lines():
            batch = json.loads(line)["result"]["batch"]
            batch_idx = validate_timing_batch(batch, batch_idx)
            have_batch = True
        if not have_batch:
            pytest.fail("no batch metrics at all")

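# For reference, a batch that passes validate_timing_batch has parallel
# "values", "batches", and "timestamps" slices with consecutive batch indexes.
# A hand-built example (illustrative only; the timestamp strings are placeholders
# for whatever format the real endpoint returns):
_EXAMPLE_TIMING_BATCH = {
    "values": [0.12, 0.11, 0.13],
    "batches": [0, 1, 2],
    "timestamps": ["t0", "t1", "t2"],
}
# validate_timing_batch(_EXAMPLE_TIMING_BATCH, 0) would return 3, the index
# the next streamed batch is expected to start at.
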
def request_profiling_metric_labels(trial_id: int, timing_enabled: bool, gpu_enabled: bool) -> None:
    def validate_labels(labels: Sequence[Dict[str, Any]]) -> None:
        # Check the reported labels against the expected labels, and fail if any
        # expected label is missing.
        expected = {
            "cpu_util_simple": PROFILER_METRIC_TYPE_SYSTEM,
            "dataloader_next": PROFILER_METRIC_TYPE_TIMING,
            "disk_iops": PROFILER_METRIC_TYPE_SYSTEM,
            "disk_throughput_read": PROFILER_METRIC_TYPE_SYSTEM,
            "disk_throughput_write": PROFILER_METRIC_TYPE_SYSTEM,
            "free_memory": PROFILER_METRIC_TYPE_SYSTEM,
            "from_device": PROFILER_METRIC_TYPE_TIMING,
            "net_throughput_recv": PROFILER_METRIC_TYPE_SYSTEM,
            "net_throughput_sent": PROFILER_METRIC_TYPE_SYSTEM,
            "reduce_metrics": PROFILER_METRIC_TYPE_TIMING,
            "step_lr_schedulers": PROFILER_METRIC_TYPE_TIMING,
            "to_device": PROFILER_METRIC_TYPE_TIMING,
            "train_batch": PROFILER_METRIC_TYPE_TIMING,
        }

        if gpu_enabled:
            expected.update(
                {
                    "gpu_free_memory": PROFILER_METRIC_TYPE_SYSTEM,
                    "gpu_util": PROFILER_METRIC_TYPE_SYSTEM,
                }
            )

        if not timing_enabled:
            expected = {k: v for k, v in expected.items() if v != PROFILER_METRIC_TYPE_TIMING}

        for label in labels:
            metric_name = label["name"]
            metric_type = label["metricType"]
            if expected.get(metric_name, None) == metric_type:
                del expected[metric_name]

        if len(expected) > 0:
            pytest.fail(
                f"expected completed experiment to have all labels but some are missing: {expected}"
            )

    with api.get(
        conf.make_master_url(),
        "api/v1/trials/{}/profiler/available_series".format(trial_id),
        stream=True,
    ) as r:
        for line in r.iter_lines():
            labels = simplejson.loads(line)["result"]["labels"]
            validate_labels(labels)
            # Just check the first streamed line.
            return

def maybe_create_native_experiment(context_dir: str, command: List[str]) -> Optional[int]:
    target_env = os.environ.copy()
    target_env["DET_MASTER"] = conf.make_master_url()

    with subprocess.Popen(
        command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=context_dir, env=target_env
    ) as p:
        for line in p.stdout:
            m = re.search(r"Created experiment (\d+)\n", line.decode())
            if m is not None:
                return int(m.group(1))

    return None

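# Example usage (a sketch): launch a native-API script and assert that it
# created an experiment. The entrypoint name "script.py" is hypothetical.
def _example_run_native_script(context_dir: str) -> None:
    experiment_id = maybe_create_native_experiment(context_dir, ["python", "script.py"])
    assert experiment_id is not None, "script did not create an experiment"
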
def using_k8s(request: SubRequest) -> bool:
    command = [
        "det",
        "-m",
        config.make_master_url(),
        "master",
        "config",
    ]

    output = subprocess.check_output(command, universal_newlines=True, stderr=subprocess.PIPE)
    rp = json.loads(output)["resource_manager"]["type"]
    return bool(rp == "kubernetes")

def _run_cmd_with_config_expecting_success(
    cmd: str, config: Dict[str, Any], context_path: Optional[str] = None
) -> None:
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config, f)
        command = ["det", "-m", conf.make_master_url(), "cmd", "run", "--config-file", tf.name]
        if context_path:
            command += ["-c", context_path]
        command.append(cmd)
        _run_and_verify_exit_code_zero(command)

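# Example usage (compare test_k8_mount above): run a shell command under an
# ad-hoc config dict, here a bind mount matching the schema used in
# test_absolute_bind_mount. The mount paths are illustrative.
def _example_cmd_with_bind_mount() -> None:
    _run_cmd_with_config_expecting_success(
        cmd="ls /foo-bar",
        config={"bind_mounts": [{"host_path": "/bin", "container_path": "/foo-bar"}]},
    )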