Example #1
def test_non_root_experiment(auth: Authentication,
                             tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config = yaml.safe_load(f)

        # Use a user-owned path to ensure shared_fs uses the container_path and not host_path.
        with non_tmp_shared_fs_path() as host_path:
            config["checkpoint_storage"] = {
                "type": "shared_fs",
                "host_path": host_path,
            }

            # Call `det --version` in a startup hook to ensure that det is on the PATH.
            with FileTree(
                    tmp_path,
                {
                    "startup-hook.sh": "det --version || exit 77",
                    "const.yaml": yaml.dump(config),  # type: ignore
                    "model_def.py": model_def_content,
                },
            ) as tree:
                exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree),
                                   None)
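# The example above leans on two helpers that are not shown here: `non_tmp_shared_fs_path`,
# which yields a user-owned host directory, and `FileTree`, which materializes a mapping of
# file names to contents under `tmp_path` and yields a path supporting `joinpath`. A minimal
# sketch of a FileTree-like helper under those assumptions (not the real test utility):
import contextlib
import pathlib
from typing import Dict, Iterator


@contextlib.contextmanager
def file_tree(root: pathlib.Path, files: Dict[str, str]) -> Iterator[pathlib.Path]:
    for name, contents in files.items():
        path = root.joinpath(name)
        path.parent.mkdir(parents=True, exist_ok=True)
        path.write_text(contents)
    yield root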
Example #2
def test_experiment_creation_and_listing(auth: Authentication) -> None:
    # Create 2 users.
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)

    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    # Create an experiment as the first user.
    with logged_in_user(creds1):
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

    # Create another experiment, this time as the second user.
    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

    with logged_in_user(creds1):
        # By default, the listing should only contain the logged-in user's experiments.
        output = extract_id_and_owner_from_exp_list(det_run(["e", "list"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) not in output

        # Now use the -a flag to list all experiments.  The output should include both experiments.
        output = extract_id_and_owner_from_exp_list(
            det_run(["e", "list", "-a"]))
        assert (experiment_id1, creds1.username) in output
        assert (experiment_id2, creds2.username) in output

    # Clean up.
    delete_experiments(experiment_id1, experiment_id2)
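# The assertions rely on `extract_id_and_owner_from_exp_list`, which turns the CLI's tabular
# output into (experiment id, owner) pairs. A rough sketch, assuming `det e list` prints a
# header row containing "ID" and "Owner" columns followed by whitespace-separated data rows
# (the real helper and the exact column layout are not shown in this excerpt):
from typing import Set, Tuple


def extract_id_and_owner_sketch(output: str) -> Set[Tuple[int, str]]:
    lines = [line for line in output.splitlines() if line.strip()]
    header = lines[0].split()
    id_col, owner_col = header.index("ID"), header.index("Owner")
    rows = set()
    for line in lines[1:]:
        cols = line.split()
        if cols and cols[id_col].isdigit():
            rows.add((int(cols[id_col]), cols[owner_col]))
    return rows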
Example #3
def test_tensorboard_creation_and_listing(auth: Authentication) -> None:
    creds1 = create_test_user(ADMIN_CREDENTIALS, True)
    creds2 = create_test_user(ADMIN_CREDENTIALS, True)

    with logged_in_user(creds1):
        # Create an experiment.
        experiment_id1 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

    with logged_in_user(creds1):
        tensorboard_id1 = start_tensorboard(experiment_id1)

    with logged_in_user(creds2):
        experiment_id2 = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

    with logged_in_user(creds2):
        tensorboard_id2 = start_tensorboard(experiment_id2)

    with logged_in_user(creds1):
        output = extract_columns(det_run(["tensorboard", "list"]), [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) not in output

        output = extract_columns(det_run(["tensorboard", "list", "-a"]),
                                 [0, 1])
        assert (tensorboard_id1, creds1.username) in output
        assert (tensorboard_id2, creds2.username) in output

    kill_tensorboards(tensorboard_id1, tensorboard_id2)
    delete_experiments(experiment_id1, experiment_id2)
Example #4
def test_large_model_def_experiment() -> None:
    with tempfile.TemporaryDirectory() as td:
        shutil.copy(conf.fixtures_path("no_op/model_def.py"), td)
        # Write a 94MB file into the directory.  Use random data because it is not compressible.
        with open(os.path.join(td, "junk.txt"), "wb") as f:
            f.write(os.urandom(94 * 1024 * 1024))

        exp.run_basic_test(conf.fixtures_path("no_op/single-one-short-step.yaml"), td, 1)
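# The comment explains why the junk file is random: random bytes do not compress, so the
# packaged model definition really is ~94 MB when uploaded. A quick standard-library
# illustration of that claim, independent of the test harness:
import os
import zlib

random_blob = os.urandom(1024 * 1024)          # incompressible: ratio stays near 1.0
zero_blob = bytes(1024 * 1024)                 # trivially compressible

print(len(zlib.compress(random_blob)) / len(random_blob))   # ~1.0
print(len(zlib.compress(zero_blob)) / len(zero_blob))        # ~0.001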
def test_core_api_tutorials(stage: str, ntrials: int, expect_workloads: bool,
                            expect_checkpoints: bool) -> None:
    exp.run_basic_test(
        conf.tutorials_path(f"core_api/{stage}.yaml"),
        conf.tutorials_path("core_api"),
        ntrials,
        expect_workloads=expect_workloads,
        expect_checkpoints=expect_checkpoints,
    )
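# `test_core_api_tutorials` is clearly meant to be driven by pytest parametrization over the
# Core API tutorial stages. A hedged sketch of what the missing decorator could look like;
# the stage names and expectation flags below are illustrative placeholders, not the suite's
# actual parameter list:
import pytest

CORE_API_CASES = [
    ("0_start", 1, False, False),
    ("1_metrics", 1, True, False),
    ("2_checkpoints", 1, True, True),
]


@pytest.mark.parametrize("stage,ntrials,expect_workloads,expect_checkpoints", CORE_API_CASES)
def test_core_api_tutorials_sketch(stage: str, ntrials: int, expect_workloads: bool,
                                   expect_checkpoints: bool) -> None:
    ...  # body as in test_core_api_tutorials above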
def test_priortity_scheduler_noop_experiment(
    managed_cluster_priority_scheduler: ManagedCluster, ) -> None:
    managed_cluster_priority_scheduler.ensure_agent_ok()
    # uses the default priority set in cluster config
    exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                       conf.fixtures_path("no_op"), 1)
    # uses explicit priority
    exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                       conf.fixtures_path("no_op"),
                       1,
                       priority=50)
def test_start_tensorboard_for_multi_experiment(
        tmp_path: Path, secrets: Dict[str, str]) -> None:
    """
    Start 3 random experiments configured with the s3 and shared_fs backends,
    start a TensorBoard instance pointed to the experiments and some select
    trials, and kill the TensorBoard instance.
    """

    with FileTree(
            tmp_path,
        {
            "shared_fs_config.yaml": shared_fs_config(1),
            "s3_config.yaml": s3_config(1, secrets),
            "multi_trial_config.yaml": shared_fs_config(3),
        },
    ) as tree:
        shared_conf_path = tree.joinpath("shared_fs_config.yaml")
        shared_fs_exp_id = exp.run_basic_test(str(shared_conf_path),
                                              conf.fixtures_path("no_op"),
                                              num_trials)

        s3_conf_path = tree.joinpath("s3_config.yaml")
        s3_exp_id = exp.run_basic_test(str(s3_conf_path),
                                       conf.fixtures_path("no_op"), num_trials)

        multi_trial_config = tree.joinpath("multi_trial_config.yaml")
        multi_trial_exp_id = exp.run_basic_test(str(multi_trial_config),
                                                conf.fixtures_path("no_op"), 3)

        trial_ids = [
            str(t["id"]) for t in exp.experiment_trials(multi_trial_exp_id)
        ]

    command = [
        "tensorboard",
        "start",
        str(shared_fs_exp_id),
        str(s3_exp_id),
        "-t",
        *trial_ids,
        "--no-browser",
    ]

    with cmd.interactive_command(*command) as tensorboard:
        for line in tensorboard.stdout:
            if SERVICE_READY in line:
                break
            if AWAITING_METRICS in line:
                raise AssertionError("Tensorboard did not find metrics")
        else:
            raise AssertionError(f"Did not find {SERVICE_READY} in output")
def test_custom_etc() -> None:
    master_host = "localhost"
    master_port = "8080"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    etc_path = str(Path(__file__).parent.joinpath("etc/master.yaml").resolve())
    cluster_up(["--master-config-path", etc_path])
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-default-ckpt.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    assert os.path.exists("/tmp/ckpt-test/")
    cluster_down([])
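# The final assertion only holds if the custom etc/master.yaml overrides the cluster-wide
# default checkpoint storage to /tmp/ckpt-test. A hedged sketch of such a fragment, expressed
# here with yaml.dump; the real fixture's contents are not shown in this excerpt:
import yaml

master_config_fragment = {
    "checkpoint_storage": {"type": "shared_fs", "host_path": "/tmp/ckpt-test"},
}
print(yaml.dump(master_config_fragment))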
Example #9
def test_mask_rcnn_64_slots() -> None:
    experiment_id = exp.run_basic_test(
        conf.experimental_path("FasterRCNN_tp/64-gpus.yaml"),
        conf.experimental_path("FasterRCNN_tp/"),
        1,
        max_wait_secs=5 * 60 * 60,
    )

    validation_metric_name = "mAP(bbox)/IoU=0.5:0.95"
    validation_metric = exp.get_validation_metric_from_last_step(
        experiment_id, 0, validation_metric_name)
    durations = exp.get_experiment_durations(experiment_id, 0)
    wait_for_agents_time = (durations.experiment_duration -
                            durations.training_duration -
                            durations.validation_duration -
                            durations.checkpoint_duration)

    print(validation_metric_name, validation_metric)
    print(durations)
    print(f"wait for agents duration: {wait_for_agents_time}")

    assert validation_metric > 0.375
    assert durations.training_duration < datetime.timedelta(hours=2,
                                                            minutes=45)
    assert durations.validation_duration < datetime.timedelta(hours=1,
                                                              minutes=15)
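# The wait-for-agents figure is just the wall-clock time left over once training, validation,
# and checkpointing are subtracted from the whole experiment. A tiny worked example with
# made-up durations:
import datetime

experiment_duration = datetime.timedelta(hours=3, minutes=30)
training_duration = datetime.timedelta(hours=2, minutes=30)
validation_duration = datetime.timedelta(minutes=40)
checkpoint_duration = datetime.timedelta(minutes=5)

wait_for_agents_time = (experiment_duration - training_duration -
                        validation_duration - checkpoint_duration)
print(wait_for_agents_time)  # 0:15:00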
Example #10
def test_experiment_delete() -> None:
    user = create_test_user(ADMIN_CREDENTIALS, False)

    with logged_in_user(user):
        experiment_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

        # "det experiment delete" call should fail, because the user is not an admin.
        child = det_spawn(
            ["experiment", "delete",
             str(experiment_id), "--yes"])
        child.wait()
        assert child.exitstatus > 0

    with logged_in_user(ADMIN_CREDENTIALS):
        child = det_spawn(
            ["experiment", "delete",
             str(experiment_id), "--yes"])
        child.wait()
        assert child.exitstatus == 0

        # "det experiment describe" call should fail, because the
        # experiment is no longer in the database.
        child = det_spawn(["experiment", "describe", str(experiment_id)])
        child.wait()
        assert child.exitstatus > 0
def test_experiment_delete() -> None:
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "user", "whoami"])

    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)

    subprocess.check_call(
        [
            "det", "-m",
            conf.make_master_url(), "experiment", "delete",
            str(experiment_id), "--yes"
        ],
        env={
            **os.environ, "DET_ADMIN": "1"
        },
    )

    # "det experiment describe" call should fail, because the
    # experiment is no longer in the database.
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call([
            "det", "-m",
            conf.make_master_url(), "experiment", "describe",
            str(experiment_id)
        ])
def test_model_registry() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    d = Determined(conf.make_master_url())

    mnist = d.create_model("mnist", "simple computer vision model")
    assert mnist.metadata == {}

    mnist.add_metadata({"testing": "metadata"})
    assert mnist.metadata == {"testing": "metadata"}

    mnist.add_metadata({"some_key": "some_value"})
    assert mnist.metadata == {"testing": "metadata", "some_key": "some_value"}

    mnist.add_metadata({"testing": "override"})
    assert mnist.metadata == {"testing": "override", "some_key": "some_value"}

    mnist.remove_metadata(["some_key"])
    assert mnist.metadata == {"testing": "override"}

    checkpoint = d.get_experiment(exp_id).top_checkpoint()
    model_version = mnist.register_version(checkpoint)
    assert model_version == 1
    assert mnist.get_version().uuid == checkpoint.uuid

    d.create_model("transformer", "all you need is attention")
    d.create_model("object-detection", "a bounding box model")

    models = d.get_models(sort_by=ModelSortBy.NAME)
    assert [m.name
            for m in models] == ["mnist", "object-detection", "transformer"]
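# The metadata assertions above follow plain dict-merge semantics: add_metadata updates keys
# (overriding on conflict) and remove_metadata drops them. The same sequence modeled on an
# ordinary dict, for reference:
metadata: dict = {}

metadata.update({"testing": "metadata"})
metadata.update({"some_key": "some_value"})
metadata.update({"testing": "override"})   # conflicting key is overridden
for key in ["some_key"]:
    metadata.pop(key, None)                # removal by key

assert metadata == {"testing": "override"}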
Example #13
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]

    assert metrics == sorted(metrics)

    # Check that changing smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}
Example #14
def test_noop_load() -> None:
    """
    Load a checkpoint
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    assert checkpoint.task_id == trials[0].trial.taskId
def test_custom_port() -> None:
    name = "port_test"
    master_host = "localhost"
    master_port = "12321"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    arguments = [
        "--cluster-name",
        name,
        "--master-port",
        f"{master_port}",
    ]
    cluster_up(arguments)
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    cluster_down(["--cluster-name", name])
Example #16
def test_trial_logs() -> None:
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0]["id"]
    subprocess.check_call(["det", "-m", conf.make_master_url(), "trial", "logs", str(trial_id)])
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--head", "10", str(trial_id)],
    )
    subprocess.check_call(
        ["det", "-m", conf.make_master_url(), "trial", "logs", "--tail", "10", str(trial_id)],
    )
Example #17
def test_non_root_experiment(auth: Authentication,
                             tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config_content = f.read()

        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        # Call `det --version` in a startup hook to ensure that det is on the PATH.
        with FileTree(
                tmp_path,
            {
                "startup-hook.sh": "det --version || exit 77",
                "const.yaml": config_content,
                "model_def.py": model_def_content,
            },
        ) as tree:
            exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree),
                               None)
Example #18
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"),
        conf.fixtures_path("metric_maker"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"][
        "val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(
            scheduling_unit)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
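# The expected training metrics form an arithmetic sequence: each batch in a step adds
# gain_per_batch on top of base_value plus the number of batches already trained. A quick
# numeric illustration with made-up hyperparameters:
import numpy as np

base_value, gain_per_batch, scheduling_unit = 1.0, 0.1, 5
batches_trained = 10  # batches completed before this step

first_base_value = base_value + batches_trained
batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
print(batch_values)  # [11.  11.1 11.2 11.3 11.4]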
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"),
        conf.fixtures_path("metric_maker"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"][
        "val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        assert metrics["num_inputs"] == scheduling_unit

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + (step["id"] - 1) * scheduling_unit
        batch_values = first_base_value + gain_per_batch * np.arange(
            scheduling_unit)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)

    # Check validation metrics.
    for step in trials[0]["steps"]:
        validation = step["validation"]
        metrics = validation["metrics"]
        actual = metrics["validation_metrics"]

        value = base_value + step["id"] * scheduling_unit
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
Example #20
def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"
Example #21
def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(conf.fixtures_path("metric_maker/nans.yaml"),
                                conf.fixtures_path("metric_maker"), 1)
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed by the YAML->JSON deserializer,
    # so add them to the expected values here.
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]
    validation_structure = config["hyperparameters"]["validation_structure"][
        "val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]
        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches
        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)
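# The string sentinels above exist because strict JSON has no literal for NaN or Infinity:
# Python's json module emits them only as a non-standard extension and rejects them in strict
# mode, so the expected structures carry them as strings instead. A quick illustration:
import json

print(json.dumps(float("nan")))                # NaN  (non-standard extension)
try:
    json.dumps(float("inf"), allow_nan=False)  # strict JSON rejects it
except ValueError as err:
    print(err)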
Example #22
def test_start_tensorboard_for_shared_fs_experiment(tmp_path: Path) -> None:
    """
    Start a random experiment configured with the shared_fs backend, start a
    TensorBoard instance pointed to the experiment, and kill the TensorBoard
    instance.
    """
    with FileTree(tmp_path, {"config.yaml": shared_fs_config(1)}) as tree:
        config_path = tree.joinpath("config.yaml")
        experiment_id = exp.run_basic_test(str(config_path),
                                           conf.fixtures_path("no_op"),
                                           num_trials)

    command = ["tensorboard", "start", str(experiment_id), "--no-browser"]
    with cmd.interactive_command(*command) as tensorboard:
        for line in tensorboard.stdout:
            if SERVICE_READY in line:
                break
        else:
            raise AssertionError(f"Did not find {SERVICE_READY} in output")
Example #23
def test_task_logs(task_type: str, task_config: Dict[str, Any],
                   log_regex: Any) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    rps = bindings.get_GetResourcePools(
        session.Session(master_url, "determined", authentication.cli_auth,
                        certs.cli_cert))
    assert rps.resourcePools and len(
        rps.resourcePools) > 0, "missing resource pool"

    if (rps.resourcePools[0].type
            == bindings.v1ResourcePoolType.RESOURCE_POOL_TYPE_K8S
            and task_type == command.TaskTypeCommand):
        # TODO(DET-6712): Investigate intermittent slowness with K8s command logs.
        return

    body = {}
    if task_type == command.TaskTypeTensorBoard:
        exp_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            1,
        )
        body.update({"experiment_ids": [exp_id]})

    resp = command.launch_command(
        master_url,
        f"api/v1/{command.RemoteTaskNewAPIs[task_type]}",
        task_config,
        "",
        default_body=body,
    )
    task_id = resp[command.RemoteTaskName[task_type]]["id"]
    try:
        check_logs(master_url, task_id, log_regex, api.task_logs,
                   api.task_log_fields)
    finally:
        command._kill(master_url, task_type, task_id)
Example #24
def test_support_bundle() -> None:
    exp_id = exp.run_basic_test(
        config_file=conf.fixtures_path("no_op/single-one-short-step.yaml"),
        model_def_file=conf.fixtures_path("no_op"),
        expected_trials=1,
    )

    trial_id = exp.experiment_first_trial(exp_id)
    output_dir = f"e2etest_trial_{trial_id}"
    os.mkdir(output_dir)

    command = ["det", "trial", "support-bundle", str(trial_id), "-o", output_dir]

    completed_process = subprocess.run(
        command, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )

    assert completed_process.returncode == 0, "\nstdout:\n{} \nstderr:\n{}".format(
        completed_process.stdout, completed_process.stderr
    )
Example #25
def test_trial_logs() -> None:
    # TODO: refactor tests to not use cli singleton auth.
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True)

    experiment_id = exp.run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                       conf.fixtures_path("no_op"), 1)
    trial = exp.experiment_trials(experiment_id)[0].trial
    trial_id = trial.id
    task_id = trial.taskId
    assert task_id != ""

    log_regex = re.compile("^.*New trial runner.*$")
    # Trial-specific APIs should work just fine.
    check_logs(master_url, trial_id, log_regex, api.trial_logs,
               api.trial_log_fields)
    # And so should new task log APIs.
    check_logs(master_url, task_id, log_regex, api.task_logs,
               api.task_log_fields)
def test_start_tensorboard_with_custom_image(tmp_path: Path) -> None:
    """
    Start a random experiment, start a TensorBoard instance pointed
    to the experiment with a custom image, verify the image has been set,
    and kill the TensorBoard instance.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single-one-short-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
    command = [
        "det",
        "-m",
        conf.make_master_url(),
        "tensorboard",
        "start",
        str(experiment_id),
        "--no-browser",
        "--detach",
        "--config",
        "environment.image=alpine",
    ]
    res = subprocess.run(command,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         check=True)
    t_id = res.stdout.strip("\n")
    command = [
        "det", "-m",
        conf.make_master_url(), "tensorboard", "config", t_id
    ]
    res = subprocess.run(command,
                         universal_newlines=True,
                         stdout=subprocess.PIPE,
                         check=True)
    config = yaml.safe_load(res.stdout)
    assert (config["environment"]["image"]["cpu"] == "alpine"
            and config["environment"]["image"]["cuda"] == "alpine"
            and config["environment"]["image"]["rocm"] == "alpine"), config
Example #27
def test_gpu_restore() -> None:
    experiment = exp.run_basic_test(
        conf.fixtures_path("pytorch-rng-saver/const.yaml"),
        conf.fixtures_path("pytorch-rng-saver"),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial["steps"]) == 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path("pytorch-rng-saver/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.fixtures_path("pytorch-rng-saver"), 1
    )

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial["steps"]) == 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in ["np_rand", "rand_rand", "torch_rand", "gpu_rand"]:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"
def test_experiment_delete() -> None:
    user = create_test_user(ADMIN_CREDENTIALS)
    non_owner_user = create_test_user(ADMIN_CREDENTIALS)

    with logged_in_user(user):
        experiment_id = exp.run_basic_test(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"), 1)

    with logged_in_user(non_owner_user):
        # "det experiment delete" call should fail, because the user is not an admin and
        # doesn't own the experiment.
        child = det_spawn(
            ["experiment", "delete",
             str(experiment_id), "--yes"])
        child.read()
        child.wait()
        assert child.exitstatus > 0

    with logged_in_user(user):
        child = det_spawn(
            ["experiment", "delete",
             str(experiment_id), "--yes"])
        child.read()
        child.wait()
        assert child.exitstatus == 0

        experiment_delete_deadline = time.time() + 5 * 60
        while True:
            child = det_spawn(["experiment", "describe", str(experiment_id)])
            child.read()
            child.wait()
            # "det experiment describe" call should fail, because the
            # experiment is no longer in the database.
            if child.exitstatus > 0:
                return
            elif time.time() > experiment_delete_deadline:
                pytest.fail("experiment didn't delete after timeout")
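# The describe loop above polls because experiment deletion is asynchronous: keep asking until
# the call fails or a deadline passes. The same poll-until pattern as a generic helper, with
# illustrative names and timings:
import time
from typing import Callable


def wait_until(predicate: Callable[[], bool], deadline_secs: float = 300,
               interval_secs: float = 5) -> None:
    deadline = time.time() + deadline_secs
    while not predicate():
        if time.time() > deadline:
            raise TimeoutError("condition not met before the deadline")
        time.sleep(interval_secs)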
Example #29
def test_startup_hook() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/startup-hook.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )
Example #30
def test_noop_long_train_step() -> None:
    exp.run_basic_test(
        conf.fixtures_path("no_op/single-long-train-step.yaml"),
        conf.fixtures_path("no_op"),
        1,
    )