Example #1
def test_large_uploads(tmp_path: Path) -> None:
    with pytest.raises(subprocess.CalledProcessError):
        with FileTree(tmp_path, {"hello.py": "print('hello world')"}) as tree:
            # Create a sparse ~120 MiB file that is too large to upload as context.
            large = tree.joinpath("large-file.bin")
            with large.open(mode="w") as f:
                f.seek(1024 * 1024 * 120)
                f.write("\0")

            _run_and_return_real_exit_status([
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--context",
                str(tree),
                "python",
                "hello.py",
            ])

    with FileTree(tmp_path, {
            "hello.py": "print('hello world')",
            ".detignore": "*.bin"
    }) as tree:
        # Create the same sparse file; the .detignore above keeps *.bin out of the context.
        large = tree.joinpath("large-file.bin")
        with large.open(mode="w") as f:
            f.seek(1024 * 1024 * 120)
            f.write("\0")

        _run_and_verify_exit_code_zero([
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--context",
            str(tree),
            "python",
            "hello.py",
        ])
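Neither `_run_and_return_real_exit_status` nor `_run_and_verify_exit_code_zero` is defined in this excerpt. A minimal sketch, assuming both are thin `subprocess` wrappers (the real `_run_and_return_real_exit_status` presumably recovers the remote command's true exit status from the CLI output rather than trusting the CLI's own exit code):

import subprocess
from typing import Any, List


def _run_and_verify_exit_code_zero(args: List[str], **kwargs: Any) -> None:
    # Raise subprocess.CalledProcessError if the CLI exits non-zero.
    subprocess.check_call(args, **kwargs)


def _run_and_return_real_exit_status(args: List[str], **kwargs: Any) -> None:
    # Simplified stand-in: treat a non-zero CLI exit as the command's real exit status.
    subprocess.check_call(args, **kwargs)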
Example #2
def run_describe_cli_tests(experiment_id: int) -> None:
    """
    Runs `det experiment describe` CLI command on a finished
    experiment. Will raise an exception if `det experiment describe`
    encounters a traceback failure.
    """
    # "det experiment describe" without metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))

    # "det experiment describe" with metrics.
    with tempfile.TemporaryDirectory() as tmpdir:
        subprocess.check_call(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "experiment",
                "describe",
                str(experiment_id),
                "--metrics",
                "--outdir",
                tmpdir,
            ]
        )

        assert os.path.exists(os.path.join(tmpdir, "experiments.csv"))
        assert os.path.exists(os.path.join(tmpdir, "steps.csv"))
        assert os.path.exists(os.path.join(tmpdir, "trials.csv"))
Example #3
def test_streaming_metrics_api() -> None:
    auth.initialize_session(conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=7)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )
    # To fully test the streaming APIs, the requests need to start running immediately after the
    # experiment is created and stay open until the experiment is complete. To accomplish this
    # with all of the API calls on a single experiment, we spawn them all in threads.

    metric_names_thread = pool.apply_async(request_metric_names,
                                           (experiment_id, ))
    train_metric_batches_thread = pool.apply_async(
        request_train_metric_batches, (experiment_id, ))
    valid_metric_batches_thread = pool.apply_async(
        request_valid_metric_batches, (experiment_id, ))
    train_trials_snapshot_thread = pool.apply_async(
        request_train_trials_snapshot, (experiment_id, ))
    valid_trials_snapshot_thread = pool.apply_async(
        request_valid_trials_snapshot, (experiment_id, ))
    train_trials_sample_thread = pool.apply_async(request_train_trials_sample,
                                                  (experiment_id, ))
    valid_trials_sample_thread = pool.apply_async(request_valid_trials_sample,
                                                  (experiment_id, ))

    metric_names_results = metric_names_thread.get()
    train_metric_batches_results = train_metric_batches_thread.get()
    valid_metric_batches_results = valid_metric_batches_thread.get()
    train_trials_snapshot_results = train_trials_snapshot_thread.get()
    valid_trials_snapshot_results = valid_trials_snapshot_thread.get()
    train_trials_sample_results = train_trials_sample_thread.get()
    valid_trials_sample_results = valid_trials_sample_thread.get()

    if metric_names_results is not None:
        pytest.fail("metric-names: %s. Results: %s" % metric_names_results)
    if train_metric_batches_results is not None:
        pytest.fail("metric-batches (training): %s. Results: %s" %
                    train_metric_batches_results)
    if valid_metric_batches_results is not None:
        pytest.fail("metric-batches (validation): %s. Results: %s" %
                    valid_metric_batches_results)
    if train_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (training): %s. Results: %s" %
                    train_trials_snapshot_results)
    if valid_trials_snapshot_results is not None:
        pytest.fail("trials-snapshot (validation): %s. Results: %s" %
                    valid_trials_snapshot_results)
    if train_trials_sample_results is not None:
        pytest.fail("trials-sample (training): %s. Results: %s" %
                    train_trials_sample_results)
    if valid_trials_sample_results is not None:
        pytest.fail("trials-sample (validation): %s. Results: %s" %
                    valid_trials_sample_results)
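Each `request_*` helper above presumably opens one of the streaming endpoints and validates what it receives. A hedged sketch of `request_metric_names`, modeled on the `request_valid_trials_sample` helper shown later in Example #17; the endpoint path and the `check_metric_names_result` checker are assumptions:

def request_metric_names(experiment_id):  # type: ignore
    # Assumed endpoint path and params, following the trials-sample helper's pattern.
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/metric-names".format(experiment_id),
        params={"period_seconds": 1},
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]
    return check_metric_names_result(results)  # hypothetical checker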
Example #4
def test_iris() -> None:
    config = conf.load_config(
        conf.official_examples_path("iris_tf_keras/const.yaml"))
    config = conf.set_max_steps(config, 2)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("iris_tf_keras"), 1)
    exp_ref = Determined(conf.make_master_url()).get_experiment(exp_id)
    model = exp_ref.top_checkpoint().load()
    model.summary()
Example #5
def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)
Example #6
def fetch_master_log() -> bool:
    command = ["det", "-m", conf.make_master_url(), "master", "logs"]
    try:
        output = subprocess.check_output(command)
    except Exception:
        traceback.print_exc()
        return False
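    # log_path is assumed to come from an enclosing scope; it is not defined in this excerpt.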
    with open(log_path, "wb") as log:
        log.write(output)
    return True
Example #7
def test_pytorch_load() -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)

    nn = (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)
Example #8
def test_noop_load() -> None:
    """
    Load a checkpoint
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    assert checkpoint.task_id == trials[0].trial.taskId
Example #9
def test_experiment_archive_unarchive() -> None:
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), ["--paused"]
    )

    describe_args = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "describe",
        "--json",
        str(experiment_id),
    ]

    # Check that the experiment is initially unarchived.
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]

    # Check that archiving a non-terminal experiment fails, then terminate it.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
        )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "cancel", str(experiment_id)]
    )

    # Check that we can archive and unarchive the experiment and see the expected effects.
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "archive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert infos[0]["archived"]

    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "experiment", "unarchive", str(experiment_id)]
    )
    infos = json.loads(subprocess.check_output(describe_args))
    assert len(infos) == 1
    assert not infos[0]["archived"]
Example #10
def wait_for_gc_to_finish(experiment_id: int) -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)
    saw_gc = False
    # Wait at most 5 minutes, polling every half-second for better sampling resolution.
    for _ in range(600):
        r = api.get(conf.make_master_url(), "tasks").json()
        names = [task["name"] for task in r.values()]
        gc_name = f"Checkpoint GC (Experiment {experiment_id})"
        if gc_name in names:
            saw_gc = True
        elif saw_gc:
            # We previously saw checkpoint gc but now we don't, so it must have finished.
            return
        time.sleep(0.5)

    # It's possible that it ran really fast and we missed it, so just log this.
    print("Did not observe checkpoint gc start or finish!", file=sys.stderr)
Example #11
def test_k8_mount(using_k8s: bool, sidecar: bool) -> None:
    if not using_k8s:
        pytest.skip("only need to run test on kubernetes")

    mount_path = "/ci/"

    with pytest.raises(subprocess.CalledProcessError):
        _run_and_verify_failure(
            [
                "det", "-m",
                conf.make_master_url(), "cmd", "run",
                f"sleep 3; touch {mount_path}"
            ],
            "No such file or directory",
        )

    config = {
        "environment": {
            "pod_spec": {
                "spec": {
                    "containers": [
                        {
                            "name": "determined-container",
                            "volumeMounts": [{"name": "temp1", "mountPath": mount_path}],
                        }
                    ],
                    "volumes": [{"name": "temp1", "emptyDir": {}}],
                }
            }
        }
    }

    if sidecar:
        sidecar_container = {
            "name": "sidecar",
            "image": conf.TF1_CPU_IMAGE,
            "command": ["/bin/bash"],
            "args": ["-c", "exit 0"],
        }

        # We insert this as the first container, to make sure Determined can handle the case
        # where the `determined-container` is not the first one.
        config["environment"]["pod_spec"]["spec"]["containers"] = [
            sidecar_container,
            config["environment"]["pod_spec"]["spec"]["containers"]
            [0],  # type: ignore
        ]

    _run_cmd_with_config_expecting_success(cmd=f"sleep 3; touch {mount_path}",
                                           config=config)
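`_run_and_verify_failure` above is not defined in this excerpt. A rough sketch, assuming it runs the CLI command, checks for the expected error text, and re-raises so the surrounding `pytest.raises(subprocess.CalledProcessError)` still triggers:

import subprocess
from typing import List


def _run_and_verify_failure(args: List[str], message: str) -> None:
    try:
        subprocess.check_output(args, stderr=subprocess.STDOUT, universal_newlines=True)
    except subprocess.CalledProcessError as e:
        # Confirm the expected error text before propagating the failure.
        assert message in e.output, e.output
        raise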
Example #12
def test_absolute_bind_mount(tmp_path: Path) -> None:
    _run_and_verify_exit_code_zero(
        [
            "det",
            "-m",
            conf.make_master_url(),
            "cmd",
            "run",
            "--volume",
            "/bin:/foo-bar",
            "ls",
            "/foo-bar",
        ]
    )

    with FileTree(
        tmp_path,
        {
            "config.yaml": """
bind_mounts:
- host_path: /bin
  container_path: /foo-bar
"""
        },
    ) as tree:
        config_path = tree.joinpath("config.yaml")
        _run_and_verify_exit_code_zero(
            [
                "det",
                "-m",
                conf.make_master_url(),
                "cmd",
                "run",
                "--volume",
                "/bin:/foo-bar2",
                "--config-file",
                str(config_path),
                "ls",
                "/foo-bar",
                "/foo-bar2",
            ]
        )
Example #13
def test_task_logs(task_type: str, task_config: Dict[str, Any],
                   log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth,
                        certs.cli_cert))
    assert rps.resourcePools and len(rps.resourcePools) > 0, "missing resource pool"

    if (rps.resourcePools[0].type
            == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
            and task_type == command.TaskTypeCommand):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs,
                   api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)
Example #14
def test_hp_importance_api() -> None:
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    pool = mp.pool.ThreadPool(processes=1)

    experiment_id = exp.create_experiment(
        conf.fixtures_path("mnist_pytorch/random.yaml"),
        conf.tutorials_path("mnist_pytorch"),
    )

    hp_importance_thread = pool.apply_async(request_hp_importance,
                                            (experiment_id, ))

    hp_importance_results = hp_importance_thread.get()

    if hp_importance_results is not None:
        pytest.fail("hyperparameter-importance: %s. Results: %s" %
                    hp_importance_results)
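`request_hp_importance` is not shown here. By analogy with the metrics-stream helpers (see Example #17), a hedged sketch might look like the following; the endpoint path and the `check_hp_importance_result` checker are assumptions:

def request_hp_importance(experiment_id):  # type: ignore
    # Assumed streaming endpoint, mirroring the metrics-stream helpers.
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/hyperparameter-importance".format(experiment_id),
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]
    return check_hp_importance_result(results)  # hypothetical checker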
Example #15
def test_environment_variables_command() -> None:
    _run_and_verify_exit_code_zero([
        "det",
        "-m",
        conf.make_master_url(),
        "cmd",
        "run",
        "--config",
        "environment.environment_variables='THISISTRUE=true','WONTCAUSEPANIC'",
        'if [ "$THISISTRUE" != "true" ]; then exit 1; fi',
    ])
Example #16
def test_change_displayname(clean_auth: None) -> None:
    u_patch = create_test_user(ADMIN_CREDENTIALS, False)
    original_name = u_patch.username

    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(),
        requested_user=original_name,
        password="",
        try_reauth=True)
    sess = session.Session(master_url, original_name, authentication.cli_auth,
                           certs.cli_cert)

    # Get API bindings object for the created test user
    all_users = bindings.get_GetUsers(sess).users
    assert all_users is not None
    current_user = list(
        filter(lambda u: u.username == original_name, all_users))[0]
    assert current_user is not None and current_user.id

    # Rename user using display name
    patch_user = bindings.v1PatchUser(displayName="renamed")
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == "renamed"

    # A display name that collides with the 'admin' username should be rejected.
    patch_user.displayName = "Admin"
    with pytest.raises(errors.APIException):
        bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    # Clear display name (UI will show username)
    patch_user.displayName = ""
    bindings.patch_PatchUser(sess, body=patch_user, userId=current_user.id)

    modded_user = bindings.get_GetUser(sess, userId=current_user.id).user
    assert modded_user is not None
    assert modded_user.displayName == ""
Example #17
def request_valid_trials_sample(experiment_id):  # type: ignore
    response = api.get(
        conf.make_master_url(),
        "api/v1/experiments/{}/metrics-stream/trials-sample".format(experiment_id),
        params={
            "metric_name": "accuracy",
            "metric_type": "METRIC_TYPE_VALIDATION",
            "period_seconds": 1,
        },
    )
    results = [message["result"] for message in map(json.loads, response.text.splitlines())]
    return check_trials_sample_result(results)
Example #18
def get_command_config(command_type: str, id: str) -> str:
    assert command_type in ["command", "notebook", "shell"]
    command = ["det", "-m", conf.make_master_url(), command_type, "config", id]
    env = os.environ.copy()
    env["DET_DEBUG"] = "true"
    completed_process = subprocess.run(
        command, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env,
    )
    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )
    return str(completed_process.stdout)
Example #19
def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs,
               api.trial_log_fields)
    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs,
               api.task_log_fields)
Example #20
def test_pytorch_cifar10_parallel() -> None:
    config = conf.load_config(
        conf.official_examples_path("trial/cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)
    config = conf.set_slots_per_trial(config, 8)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/cifar10_cnn_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0]["id"])
        .select_checkpoint(latest=True)
        .load()
    )
    assert isinstance(nn, torch.nn.Module)
Example #21
def test_pytorch_load(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)

    (
        Determined(conf.make_master_url())
        .get_experiment(experiment_id)
        .top_checkpoint()
        .load(map_location="cpu")
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id
    collect_trial_profiles(trial_id)
Example #22
def test_pytorch_cifar10_const() -> None:
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    nn = (
        Determined(conf.make_master_url())
        .get_trial(trials[0].id)
        .select_checkpoint(latest=True)
        .load(map_location=torch.device("cpu"))
    )
    assert isinstance(nn, torch.nn.Module)
Example #23
def _fetch_slots() -> List[Dict[str, Any]]:
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "slot",
        "list",
        "--json",
    ]
    output = subprocess.check_output(command).decode()
    slots = cast(List[Dict[str, str]], json.loads(output))
    return slots
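As a usage sketch only (the `wait_for_agents` name and polling loop are illustrative, not part of the suite), `_fetch_slots` could be used to block until enough slots are available:

import time

import pytest


def wait_for_agents(min_slots: int, timeout_s: int = 300) -> None:
    # Poll the slot list once per second until at least min_slots slots appear.
    for _ in range(timeout_s):
        if len(_fetch_slots()) >= min_slots:
            return
        time.sleep(1)
    pytest.fail(f"expected at least {min_slots} slots within {timeout_s}s")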
Example #24
def test_pytorch_gan_parallel() -> None:
    config = conf.load_config(
        conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("gan_mnist_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0]["id"])
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )
Example #25
def test_start_tensorboard_with_custom_image(tmp_path: Path) -> None:
    """
    Start a random experiment, start a TensorBoard instance pointed
    to the experiment with custom image, verify the image has been set,
    and kill the TensorBoard instance.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "tensorboard",
        "start",
        str(experiment_id),
        "--no-browser",
        "--detach",
        "--config",
        "environment.image=alpine",
    ]
    res = subprocess.run(command,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         check=True)
    t_id = res.stdout.strip("\n")
    command = [
        "det", "-m",
        conf.make_master_url(), "tensorboard", "config", t_id
    ]
    res = subprocess.run(command,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         check=True)
    config = yaml.safe_load(res.stdout)
    assert (config["environment"]["image"]["cpu"] == "alpine"
            and config["environment"]["image"]["cuda"] == "alpine"
            and config["environment"]["image"]["rocm"] == "alpine"), config
Example #26
def request_profiling_pytorch_timing_metrics(trial_id: int,
                                             metric_name: str,
                                             accumulated: bool = False
                                             ) -> None:
    def validate_timing_batch(batch: Dict[str, Any], batch_idx: int) -> int:
        values = batch["values"]
        batches = batch["batches"]
        num_values = len(values)
        num_batch_indexes = len(batches)
        num_timestamps = len(batch["timestamps"])
        if num_values != num_batch_indexes or num_batch_indexes != num_timestamps:
            pytest.fail(
                f"mismatched slices: not ({num_values} == {num_batch_indexes} == {num_timestamps})"
            )

        if not any(values):
            pytest.fail(f"received bad batch, something went wrong: {batch}")

        if batches[0] != batch_idx:
            pytest.fail(
                f"batch did not start at correct batch, {batches[0]} != {batch_idx}: {batch}"
            )

        # Check batches are monotonic with no gaps.
        if not all(x + 1 == y for x, y in zip(batches, batches[1:])):
            pytest.fail(f"skips in batches sampled: {batch}")

        # 10 is just a threshold; beyond that many values, a strictly increasing series is very
        # unlikely to be a coincidence.
        if accumulated and len(values) > 10 and all(
                x < y for x, y in zip(values, values[1:])):
            pytest.fail(
                f"per batch accumulated metric was monotonic, which is really fishy: {batch}"
            )

        return int(batches[-1]) + 1

    with api.get(
            conf.make_master_url(),
            "api/v1/trials/{}/profiler/metrics?{}".format(
                trial_id,
                to_query_params(PROFILER_METRIC_TYPE_TIMING, metric_name),
            ),
            stream=True,
    ) as r:
        batch_idx = 0
        have_batch = False
        for line in r.iter_lines():
            batch = json.loads(line)["result"]["batch"]
            batch_idx = validate_timing_batch(batch, batch_idx)
            have_batch = True
        if not have_batch:
            pytest.fail("no batch metrics at all")
Example #27
def request_profiling_metric_labels(trial_id: int, timing_enabled: bool,
                                    gpu_enabled: bool) -> None:
    def validate_labels(labels: Sequence[Dict[str, Any]]) -> None:
        # Check the reported labels against the expected set; fail if any expected label is missing.
        expected = {
            "cpu_util_simple": PROFILER_METRIC_TYPE_SYSTEM,
            "dataloader_next": PROFILER_METRIC_TYPE_TIMING,
            "disk_iops": PROFILER_METRIC_TYPE_SYSTEM,
            "disk_throughput_read": PROFILER_METRIC_TYPE_SYSTEM,
            "disk_throughput_write": PROFILER_METRIC_TYPE_SYSTEM,
            "free_memory": PROFILER_METRIC_TYPE_SYSTEM,
            "from_device": PROFILER_METRIC_TYPE_TIMING,
            "net_throughput_recv": PROFILER_METRIC_TYPE_SYSTEM,
            "net_throughput_sent": PROFILER_METRIC_TYPE_SYSTEM,
            "reduce_metrics": PROFILER_METRIC_TYPE_TIMING,
            "step_lr_schedulers": PROFILER_METRIC_TYPE_TIMING,
            "to_device": PROFILER_METRIC_TYPE_TIMING,
            "train_batch": PROFILER_METRIC_TYPE_TIMING,
        }

        if gpu_enabled:
            expected.update({
                "gpu_free_memory": PROFILER_METRIC_TYPE_SYSTEM,
                "gpu_util": PROFILER_METRIC_TYPE_SYSTEM,
            })
        if not timing_enabled:
            expected = {
                k: v
                for k, v in expected.items()
                if v != PROFILER_METRIC_TYPE_TIMING
            }
        for label in labels:
            metric_name = label["name"]
            metric_type = label["metricType"]
            if expected.get(metric_name, None) == metric_type:
                del expected[metric_name]

        if len(expected) > 0:
            pytest.fail(
                f"expected completed experiment to have all labels but some are missing: {expected}"
            )

    with api.get(
            conf.make_master_url(),
            "api/v1/trials/{}/profiler/available_series".format(trial_id),
            stream=True,
    ) as r:
        for line in r.iter_lines():
            labels = simplejson.loads(line)["result"]["labels"]
            validate_labels(labels)
            # Just check 1 iter.
            return
Example #28
def maybe_create_native_experiment(context_dir: str, command: List[str]) -> Optional[int]:
    target_env = os.environ.copy()
    target_env["DET_MASTER"] = conf.make_master_url()

    with subprocess.Popen(
        command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, cwd=context_dir, env=target_env
    ) as p:
        for line in p.stdout:
            m = re.search(r"Created experiment (\d+)\n", line.decode())
            if m is not None:
                return int(m.group(1))

    return None
Example #29
def using_k8s(request: SubRequest) -> bool:
    command = [
        "det",
        "-m",
        config.make_master_url(),
        "master",
        "config",
    ]

    output = subprocess.check_output(command, universal_newlines=True, stderr=subprocess.PIPE)

    rp = json.loads(output)["resource_manager"]["type"]
    return bool(rp == "kubernetes")
Example #30
def _run_cmd_with_config_expecting_success(
    cmd: str, config: Dict[str, Any], context_path: Optional[str] = None
) -> None:
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config, f)

        command = ["det", "-m", conf.make_master_url(), "cmd", "run", "--config-file", tf.name]
        if context_path:
            command += ["-c", context_path]
        command.append(cmd)

        _run_and_verify_exit_code_zero(command)