def test_disable_agent_experiment_resume() -> None:
    """
    Start an experiment with max_restarts=0 and ensure that being killed due to an explicit agent
    disable/enable (without draining) does not count toward the number of restarts.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    exp_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        ["--config", "max_restarts=0"],
    )
    exp.wait_for_experiment_workload_progress(exp_id)

    with _disable_agent(agent_id):
        # Wait for the allocation to go away.
        for _ in range(20):
            slots = _fetch_slots()
            print(slots)
            if not any(s["allocation_id"] != "FREE" for s in slots):
                break
            time.sleep(1)
        else:
            pytest.fail("Experiment stayed scheduled after agent was disabled")
    exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_COMPLETED)
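
# _fetch_slots and _disable_agent are not shown in this listing. The sketches
# below are assumptions about how they work, based on the `det` CLI calls made
# elsewhere in these tests (`det slot list --json`, `det agent disable/enable`,
# optionally with `--drain`). Assumes `import contextlib` plus the usual
# subprocess/json/typing imports used throughout.
def _fetch_slots() -> List[Dict[str, Any]]:
    command = ["det", "-m", conf.make_master_url(), "slot", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(List[Dict[str, Any]], json.loads(output))


@contextlib.contextmanager
def _disable_agent(agent_id: str, drain: bool = False) -> Iterator[None]:
    # Disable the agent (optionally draining it) and always re-enable it on exit.
    command = ["det", "-m", conf.make_master_url(), "agent", "disable"]
    if drain:
        command.append("--drain")
    subprocess.check_call(command + [agent_id])
    try:
        yield
    finally:
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "agent", "enable", agent_id]
        )
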
def test_master_restart_kill_works(
        managed_cluster_restarts: ManagedCluster) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
            [
                "--config", "searcher.max_length.batches=10000", "--config",
                "max_restarts=0"
            ],
        )

        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_master()
        time.sleep(0)
        managed_cluster_restarts.restart_master()

        command = [
            "det", "-m",
            conf.make_master_url(), "e", "kill",
            str(exp_id)
        ]
        subprocess.check_call(command)

        exp.wait_for_experiment_state(exp_id,
                                      EXP_STATE.STATE_CANCELED,
                                      max_wait_secs=10)

        managed_cluster_restarts.ensure_agent_ok()
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise

def test_master_restart_reattach_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(exp_id,
                                      EXP_STATE.STATE_COMPLETED,
                                      max_wait_secs=downtime + 60)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
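
# _sanity_check is not defined in this listing; based on how the other restart
# tests guard themselves (see test_agent_restart_recover_experiment below), it
# presumably skips when reattach is disabled and verifies the agent is healthy:
def _sanity_check(managed_cluster_restarts: ManagedCluster) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()
    managed_cluster_restarts.ensure_agent_ok()
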
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
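
# conf.set_profiling_enabled is assumed to just switch on the profiler in the
# experiment config; a minimal sketch of such a helper:
def set_profiling_enabled(config_obj: Dict[str, Any]) -> Dict[str, Any]:
    config_obj.setdefault("profiling", {})["enabled"] = True
    return config_obj
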
def test_agent_reconnect_keep_experiment(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise
def test_agent_restart_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
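
# The `downtime` argument above is assumed to come from pytest parametrization,
# with a negative value meaning "do not restart the agent at all"; illustrative
# values only:
#
#     @pytest.mark.parametrize("downtime", [-1, 0, 20])
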
def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(
        conf.fixtures_path("no_op/single-one-short-step.yaml"))
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name,
                                              conf.fixtures_path("no_op"),
                                              None)
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    for _ in range(5):
        assert exp.experiment_state(experiment_id) == "ACTIVE"
        time.sleep(1)

    exp.cancel_single(experiment_id)
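
# exp.pause_experiment and exp.activate_experiment are not shown here; they are
# assumed to be thin wrappers around the corresponding CLI commands, roughly:
def pause_experiment(experiment_id: int) -> None:
    command = ["det", "-m", conf.make_master_url(), "experiment", "pause", str(experiment_id)]
    subprocess.check_call(command)


def activate_experiment(experiment_id: int) -> None:
    command = ["det", "-m", conf.make_master_url(), "experiment", "activate", str(experiment_id)]
    subprocess.check_call(command)
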
def test_noop_nan_validations() -> None:
    """
    Ensure that NaN validation metric values don't prevent an experiment from completing.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-nan-validations.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
def test_agent_restart_exp_container_failure(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )
        # Get task id / allocation id
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]],
                       check=True,
                       stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()
        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after[
            "allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
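
# _task_list_json is not shown in this listing; a plausible sketch, assuming the
# master's task list is fetched as JSON keyed by task/allocation id:
def _task_list_json(master_url: str) -> Dict[str, Dict[str, Any]]:
    command = ["det", "-m", master_url, "task", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(Dict[str, Dict[str, Any]], json.loads(output))
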
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active, "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
def test_experimental_experiment_api_determined_disabled() -> None:
    context_path = pathlib.Path(conf.fixtures_path("no_op"))
    model_def_path = pathlib.Path(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"))

    model_context = context.Context.from_local(context_path)

    with model_def_path.open("r") as fin:
        dai_experiment_config = util.safe_load_yaml_with_exceptions(fin)

    determined_master = conf.make_master_url()
    requested_user, password = create_test_user(ADMIN_CREDENTIALS,
                                                add_password=True)
    a_username, _ = ADMIN_CREDENTIALS

    try:
        det_spawn(["-u", a_username, "user", "deactivate", "determined"])

        certs.cli_cert = certs.default_load(master_url=determined_master)
        determined_api.authentication.cli_auth = determined_api.authentication.Authentication(
            determined_master,
            requested_user=requested_user,
            password=password,
            try_reauth=True,
            cert=certs.cli_cert,
        )
        exp_id = determined_api.experiment.create_experiment_and_follow_logs(
            master_url=determined_master,
            config=dai_experiment_config,
            model_context=model_context,
            template=None,
            additional_body_fields={},
            activate=True,
            follow_first_trial_logs=False,
        )

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
    finally:
        det_spawn(["-u", a_username, "user", "activate", "determined"])
def test_drain_agent_sched() -> None:
    """
    Start an experiment, then drain the agent it is running on. Start a second one and make
    sure it schedules on the second agent *before* the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2,
                                      determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining")

        exp.wait_for_experiment_state(
            exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(
            exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7
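
# _wait_for_slots is assumed to poll _fetch_slots until at least the requested
# number of slots has registered with the master (e.g. while agents are still
# connecting); `max_wait_secs` is an illustrative parameter name:
def _wait_for_slots(min_slots: int, max_wait_secs: int = 60) -> List[Dict[str, Any]]:
    slots: List[Dict[str, Any]] = []
    for _ in range(max_wait_secs):
        slots = _fetch_slots()
        if len(slots) >= min_slots:
            return slots
        time.sleep(1)
    pytest.fail(f"expected at least {min_slots} slots, only found {len(slots)}")
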
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
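
# exp.wait_for_experiment_active_workload is assumed to wrap the polling loop
# written out inline in the older copy of test_noop_pause above, roughly:
def wait_for_experiment_active_workload(
    experiment_id: int, max_ticks: int = conf.MAX_TASK_SCHEDULED_SECS
) -> None:
    for _ in range(max_ticks):
        if exp.experiment_has_active_workload(experiment_id):
            return
        time.sleep(1)
    pytest.fail(f"The only trial cannot be scheduled within {max_ticks} seconds.")
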
def test_allocation_resources_incremental_release() -> None:
    """
    Start a two-container experiment and ensure one container exits before the other. Ensure
    resources are released and schedulable without the other container needing to be released.
    """
    cleanup_exp_ids = []

    try:
        slots = _wait_for_slots(2)
        assert len(slots) == 2

        with tempfile.TemporaryDirectory() as context_dir, open(
                os.path.join(context_dir, "const.yaml"), "w") as config_file:
            # Launch an experiment that has one resource (docker container) that exits immediately.
            config_obj = conf.load_config(
                conf.fixtures_path("no_op/single.yaml"))
            config_obj["resources"] = {
                **config_obj.get("resources", {}),
                **{
                    "slots": 2
                }
            }
            config_obj["hyperparameters"] = {
                **config_obj.get("hyperparameters", {}),
                **{
                    "non_chief_exit_immediately": True
                },
            }
            yaml.dump(config_obj, config_file)

            shutil.copy(conf.fixtures_path("no_op/model_def.py"),
                        os.path.join(context_dir, "model_def.py"))

            exp_id = exp.create_experiment(config_file.name, context_dir, None)
            cleanup_exp_ids.append(exp_id)

        # Wait for the experiment to start and run some.
        exp.wait_for_experiment_state(
            exp_id,
            determinedexperimentv1State.STATE_ACTIVE,
        )
        exp.wait_for_experiment_active_workload(exp_id)

        # And wait for exactly one of the resources to free, while one is still in use.
        confirmations = 0
        for _ in range(RANK_ONE_WAIT_TIME):
            free_agents = list_free_agents()
            if len(free_agents) == 1:
                confirmations += 1

            if confirmations == 2:
                # Just for the race where one container has exited and the other hasn't quite yet,
                # but is going to, make sure we see it at least twice.
                break

            # Still waiting on partial exit
            time.sleep(1)
        else:
            pytest.fail(
                "exactly one agent did not free after {} seconds".format(
                    RANK_ONE_WAIT_TIME))

        # Ensure we can schedule on the free slot, not just that the API says it's available.
        exp_id_2 = exp.create_experiment(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        cleanup_exp_ids.append(exp_id_2)

        exp.wait_for_experiment_workload_progress(exp_id_2)
        exp.wait_for_experiment_state(
            exp_id_2, determinedexperimentv1State.STATE_COMPLETED)
        cleanup_exp_ids = cleanup_exp_ids[:-1]

        # And check that the hung experiment is still holding on to its slot.
        free_agents = list_free_agents()
        if len(free_agents) != 1:
            pytest.fail(
                f"should still have exactly one agent scheduled: {free_agents}"
            )

    finally:
        for exp_id in cleanup_exp_ids:
            bindings.post_KillExperiment(determined_test_session(), id=exp_id)
            exp.wait_for_experiment_state(
                exp_id, determinedexperimentv1State.STATE_CANCELED)
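
# list_free_agents is not defined in this listing; it is assumed to return the
# agents that currently have no containers assigned, e.g. via the bindings API:
def list_free_agents() -> List[bindings.v1Agent]:
    agents = bindings.get_GetAgents(determined_test_session())
    if not agents.agents:
        pytest.fail(f"missing agents: {agents}")
    return [agent for agent in agents.agents if not agent.containers]
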
def test_stress_agents_reconnect(steps: int, num_agents: int, should_disconnect: bool) -> None:
    random.seed(42)
    master_host = "localhost"
    master_port = "8080"
    conf.MASTER_IP = master_host
    conf.MASTER_PORT = master_port
    master_up([])

    # Start all agents.
    agents_are_up = [True] * num_agents
    for i in range(num_agents):
        agent_up(["--agent-name", f"agent-{i}"], fluent_offset=i)
    time.sleep(3)

    for _ in range(steps):
        for agent_id, agent_is_up in enumerate(agents_are_up):
            if random.choice([True, False]):  # Randomly decide whether to flip this agent's status.
                continue

            if should_disconnect:
                # Can't just randomly deploy up/down, since that can hit Docker name conflicts.
                if agent_is_up:
                    agent_down(["--agent-name", f"agent-{agent_id}"])
                else:
                    agent_up(["--agent-name", f"agent-{agent_id}"], fluent_offset=agent_id)
                agents_are_up[agent_id] = not agents_are_up[agent_id]
            else:
                if random.choice([True, False]):
                    agent_disable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = False
                else:
                    agent_enable([f"agent-{agent_id}"])
                    agents_are_up[agent_id] = True
        time.sleep(10)

        # Validate that our master kept track of the agent reconnect spam.
        agent_list = json.loads(
            subprocess.check_output(
                [
                    "det",
                    "agent",
                    "list",
                    "--json",
                ]
            ).decode()
        )
        assert sum(agents_are_up) <= len(agent_list)
        for agent in agent_list:
            agent_id = int(agent["id"].replace("agent-", ""))
            assert agents_are_up[agent_id] == agent["enabled"]

        # Can we still schedule something?
        if any(agents_are_up):
            experiment_id = exp.create_experiment(
                conf.fixtures_path("no_op/single-one-short-step.yaml"),
                conf.fixtures_path("no_op"),
                None,
            )
            exp.wait_for_experiment_state(
                experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
            )

    for agent_id in range(num_agents):
        agent_down(["--agent-name", f"agent-{agent_id}"])
    master_down([])
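
# agent_disable and agent_enable above are assumed to shell out to the CLI, e.g.:
def agent_disable(args: List[str]) -> None:
    subprocess.check_call(["det", "agent", "disable"] + args)


def agent_enable(args: List[str]) -> None:
    subprocess.check_call(["det", "agent", "enable"] + args)
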
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value):
                {800, 900, 1000},
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    100,
                    200,
                    300,
                    400,
                    500,
                    600,
                    700,
                },
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                (bindings.determinedexperimentv1State.STATE_COMPLETED.value): {
                    100,
                    200,
                    300,
                    900,
                    1000,
                },
                (bindings.determinedexperimentv1State.STATE_DELETED.value): {
                    400,
                    500,
                    600,
                    700,
                    800,
                },
            },
        ),
    ]

    all_checkpoints: List[Tuple[Any, List[bindings.v1CheckpointWorkload]]] = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name,
                                                  conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(
            experiment_id,
            bindings.determinedexperimentv1State.STATE_COMPLETED)

        # In some configurations, checkpoint GC will run on an auxiliary machine, which may have to
        # be spun up still.  So we'll wait for it to run.
        wait_for_gc_to_finish(experiment_id)

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            cpoints = exp.workloads_with_checkpoint(trials[0].workloads)
            sorted_checkpoints = sorted(
                cpoints,
                key=lambda ckp: int(ckp.totalBatches),
            )
            assert len(sorted_checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for ckpt in sorted_checkpoints:
                by_state.setdefault(ckpt.state.value,
                                    set()).add(ckpt.totalBatches)

            if by_state == result:
                all_checkpoints.append((config, sorted_checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            storage_states = []
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                storage_manager = storage.build(checkpoint_config,
                                                container_path=None)
                storage_state = {}  # type: Dict[str, Any]
                for checkpoint in checkpoints:
                    assert checkpoint.uuid is not None
                    storage_id = checkpoint.uuid
                    storage_state[storage_id] = {}
                    if checkpoint.state == bindings.determinedcheckpointv1State.STATE_COMPLETED:
                        storage_state[storage_id]["found"] = False
                        try:
                            with storage_manager.restore_path(storage_id):
                                storage_state[storage_id]["found"] = True
                        except errors.CheckpointNotFound:
                            pass
                    elif checkpoint.state == bindings.determinedcheckpointv1State.STATE_DELETED:
                        storage_state[storage_id] = {
                            "deleted": False,
                            "checkpoint": checkpoint
                        }
                        try:
                            with storage_manager.restore_path(storage_id):
                                pass
                        except errors.CheckpointNotFound:
                            storage_state[storage_id]["deleted"] = True
                storage_states.append(storage_state)

            for storage_state in storage_states:
                for state in storage_state.values():
                    if state.get("deleted", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not deleted: JSON:{json_states}"
                        )
                    if state.get("found", None) is False:
                        json_states = json.dumps(storage_states)
                        raise AssertionError(
                            f"Some checkpoints were not found: JSON:{json_states}"
                        )
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def test_workspace_org() -> None:
    master_url = conf.make_master_url()
    authentication.cli_auth = authentication.Authentication(master_url,
                                                            try_reauth=True)
    sess = session.Session(master_url, None, None, None)

    test_experiments: List[bindings.v1Experiment] = []
    test_projects: List[bindings.v1Project] = []
    test_workspaces: List[bindings.v1Workspace] = []

    try:
        # Uncategorized workspace / project should exist already.
        r = bindings.get_GetWorkspaces(sess, name="Uncategorized")
        assert len(r.workspaces) == 1
        default_workspace = r.workspaces[0]
        assert default_workspace.immutable
        r2 = bindings.get_GetWorkspaceProjects(sess, id=default_workspace.id)
        assert len(r2.projects) == 1
        default_project = r2.projects[0]
        assert default_project.name == "Uncategorized"
        assert default_project.immutable

        # Add a test workspace.
        r3 = bindings.post_PostWorkspace(
            sess, body=bindings.v1PostWorkspaceRequest(name="_TestOnly"))
        made_workspace = r3.workspace
        test_workspaces.append(made_workspace)
        get_workspace = bindings.get_GetWorkspace(
            sess, id=made_workspace.id).workspace
        assert get_workspace.name == made_workspace.name
        assert not made_workspace.immutable and not get_workspace.immutable

        # Patch the workspace
        w_patch = bindings.v1PatchWorkspace.from_json(made_workspace.to_json())
        w_patch.name = "_TestPatched"
        bindings.patch_PatchWorkspace(sess, body=w_patch, id=made_workspace.id)
        get_workspace = bindings.get_GetWorkspace(
            sess, id=made_workspace.id).workspace
        assert get_workspace.name == "_TestPatched"

        # Archive the workspace
        assert not made_workspace.archived
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        get_workspace_2 = bindings.get_GetWorkspace(
            sess, id=made_workspace.id).workspace
        assert get_workspace_2.archived
        with pytest.raises(errors.APIException):
            # Cannot patch archived workspace
            bindings.patch_PatchWorkspace(sess,
                                          body=w_patch,
                                          id=made_workspace.id)
        with pytest.raises(errors.APIException):
            # Cannot create project inside archived workspace
            bindings.post_PostProject(
                sess,
                body=bindings.v1PostProjectRequest(
                    name="Nope2", workspaceId=made_workspace.id),
                workspaceId=made_workspace.id,
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)
        get_workspace_3 = bindings.get_GetWorkspace(
            sess, id=made_workspace.id).workspace
        assert not get_workspace_3.archived

        # Refuse to patch, archive, unarchive, or delete the default workspace
        with pytest.raises(errors.APIException):
            bindings.patch_PatchWorkspace(sess,
                                          body=w_patch,
                                          id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveWorkspace(sess, id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveWorkspace(sess, id=default_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.delete_DeleteWorkspace(sess, id=default_workspace.id)

        # Sort test and default workspaces.
        workspace2 = bindings.post_PostWorkspace(
            sess,
            body=bindings.v1PostWorkspaceRequest(name="_TestWS")).workspace
        test_workspaces.append(workspace2)
        list_test_1 = bindings.get_GetWorkspaces(sess).workspaces
        assert ["Uncategorized", "_TestPatched",
                "_TestWS"] == [w.name for w in list_test_1]
        list_test_2 = bindings.get_GetWorkspaces(
            sess, orderBy=bindings.v1OrderBy.ORDER_BY_DESC).workspaces
        assert ["_TestWS", "_TestPatched",
                "Uncategorized"] == [w.name for w in list_test_2]
        list_test_3 = bindings.get_GetWorkspaces(
            sess, sortBy=bindings.v1GetWorkspacesRequestSortBy.SORT_BY_NAME
        ).workspaces
        assert ["_TestPatched", "_TestWS",
                "Uncategorized"] == [w.name for w in list_test_3]

        # Test pinned workspaces.
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 2
        bindings.post_UnpinWorkspace(sess, id=made_workspace.id)
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 1
        bindings.post_PinWorkspace(sess, id=made_workspace.id)
        pinned = bindings.get_GetWorkspaces(
            sess,
            pinned=True,
        ).workspaces
        assert len(pinned) == 2

        # Add a test project to a workspace.
        r4 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestOnly",
                                               workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        )
        made_project = r4.project
        test_projects.append(made_project)
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.name == made_project.name
        assert not made_project.immutable and not get_project.immutable

        # Project cannot be created in the default workspace.
        with pytest.raises(errors.APIException):
            bindings.post_PostProject(
                sess,
                body=bindings.v1PostProjectRequest(
                    name="Nope", workspaceId=default_workspace.id),
                workspaceId=default_workspace.id,
            )

        # Patch the project
        p_patch = bindings.v1PatchProject.from_json(made_project.to_json())
        p_patch.name = "_TestPatchedProject"
        bindings.patch_PatchProject(sess, body=p_patch, id=made_project.id)
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.name == "_TestPatchedProject"

        # Archive the project
        assert not made_project.archived
        bindings.post_ArchiveProject(sess, id=made_project.id)
        get_project_2 = bindings.get_GetProject(sess,
                                                id=made_project.id).project
        assert get_project_2.archived

        # Cannot patch or move an archived project
        with pytest.raises(errors.APIException):
            bindings.patch_PatchProject(sess, body=p_patch, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=made_project.id,
                ),
            )

        # Unarchive the project
        bindings.post_UnarchiveProject(sess, id=made_project.id)
        get_project_3 = bindings.get_GetProject(sess,
                                                id=made_project.id).project
        assert not get_project_3.archived

        # Can't archive, un-archive, or move while parent workspace is archived
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        get_project_4 = bindings.get_GetProject(sess,
                                                id=made_project.id).project
        assert get_project_4.archived
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=made_project.id,
                ),
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)

        # Refuse to patch, archive, unarchive, or delete the default project
        with pytest.raises(errors.APIException):
            bindings.patch_PatchProject(sess,
                                        body=p_patch,
                                        id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_ArchiveProject(sess, id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_UnarchiveProject(sess, id=default_project.id)
        with pytest.raises(errors.APIException):
            bindings.delete_DeleteProject(sess, id=default_project.id)

        # Sort workspaces' projects.
        p1 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestPRJ",
                                               workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        ).project
        p2 = bindings.post_PostProject(
            sess,
            body=bindings.v1PostProjectRequest(name="_TestEarly",
                                               workspaceId=made_workspace.id),
            workspaceId=made_workspace.id,
        ).project
        test_projects += [p1, p2]
        list_test_4 = bindings.get_GetWorkspaceProjects(
            sess, id=made_workspace.id).projects
        assert ["_TestPatchedProject", "_TestPRJ",
                "_TestEarly"] == [p.name for p in list_test_4]
        list_test_5 = bindings.get_GetWorkspaceProjects(
            sess,
            id=made_workspace.id,
            orderBy=bindings.v1OrderBy.ORDER_BY_DESC).projects
        assert ["_TestEarly", "_TestPRJ",
                "_TestPatchedProject"] == [p.name for p in list_test_5]
        list_test_6 = bindings.get_GetWorkspaceProjects(
            sess,
            id=made_workspace.id,
            sortBy=bindings.v1GetWorkspaceProjectsRequestSortBy.SORT_BY_NAME,
        ).projects
        assert ["_TestEarly", "_TestPatchedProject",
                "_TestPRJ"] == [p.name for p in list_test_6]

        # Move a project to another workspace
        bindings.post_MoveProject(
            sess,
            projectId=made_project.id,
            body=bindings.v1MoveProjectRequest(
                destinationWorkspaceId=workspace2.id,
                projectId=made_project.id,
            ),
        )
        get_project = bindings.get_GetProject(sess, id=made_project.id).project
        assert get_project.workspaceId == workspace2.id

        # Default project cannot be moved.
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=default_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=workspace2.id,
                    projectId=default_project.id,
                ),
            )

        # Project cannot be moved into the default workspace.
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=default_workspace.id,
                    projectId=made_project.id,
                ),
            )

        # Project cannot be moved into an archived workspace.
        bindings.post_ArchiveWorkspace(sess, id=made_workspace.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveProject(
                sess,
                projectId=made_project.id,
                body=bindings.v1MoveProjectRequest(
                    destinationWorkspaceId=made_workspace.id,
                    projectId=made_project.id,
                ),
            )
        bindings.post_UnarchiveWorkspace(sess, id=made_workspace.id)

        # Add a test note to a project.
        note = bindings.v1Note(name="Hello", contents="Hello World")
        note2 = bindings.v1Note(name="Hello 2", contents="Hello World")
        bindings.post_AddProjectNote(
            sess,
            body=note,
            projectId=made_project.id,
        )
        r5 = bindings.post_AddProjectNote(
            sess,
            body=note2,
            projectId=made_project.id,
        )
        returned_notes = r5.notes
        assert len(returned_notes) == 2

        # Put notes
        r6 = bindings.put_PutProjectNotes(
            sess,
            body=bindings.v1PutProjectNotesRequest(notes=[note],
                                                   projectId=made_project.id),
            projectId=made_project.id,
        )
        returned_notes = r6.notes
        assert len(returned_notes) == 1

        # Create an experiment in the default project.
        test_exp_id = run_basic_test(conf.fixtures_path("no_op/single.yaml"),
                                     conf.fixtures_path("no_op"), 1)
        test_exp = bindings.get_GetExperiment(
            sess, experimentId=test_exp_id).experiment
        test_experiments.append(test_exp)
        wait_for_experiment_state(
            test_exp_id, bindings.determinedexperimentv1State.STATE_COMPLETED)
        assert test_exp.projectId == default_project.id

        # Move the test experiment into a user-made project
        dproj_exp = bindings.get_GetProjectExperiments(
            sess, id=default_project.id).experiments
        exp_count = len(
            bindings.get_GetProjectExperiments(sess,
                                               id=made_project.id).experiments)
        assert exp_count == 0
        mbody = bindings.v1MoveExperimentRequest(
            destinationProjectId=made_project.id, experimentId=test_exp_id)
        bindings.post_MoveExperiment(sess,
                                     experimentId=test_exp_id,
                                     body=mbody)
        modified_exp = bindings.get_GetExperiment(
            sess, experimentId=test_exp_id).experiment
        assert modified_exp.projectId == made_project.id

        # Confirm the test experiment is in the new project, no longer in old project.
        exp_count = len(
            bindings.get_GetProjectExperiments(sess,
                                               id=made_project.id).experiments)
        assert exp_count == 1
        dproj_exp2 = bindings.get_GetProjectExperiments(
            sess, id=default_project.id).experiments
        assert len(dproj_exp2) == len(dproj_exp) - 1

        # Cannot move an experiment out of an archived project
        bindings.post_ArchiveProject(sess, id=made_project.id)
        mbody2 = bindings.v1MoveExperimentRequest(
            destinationProjectId=default_project.id, experimentId=test_exp_id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveExperiment(sess,
                                         experimentId=test_exp_id,
                                         body=mbody2)
        bindings.post_UnarchiveProject(sess, id=made_project.id)

        # Move the experiment back into the default project.
        bindings.post_MoveExperiment(sess,
                                     experimentId=test_exp_id,
                                     body=mbody2)

        # Cannot move an experiment into an archived project
        bindings.post_ArchiveProject(sess, id=made_project.id)
        with pytest.raises(errors.APIException):
            bindings.post_MoveExperiment(sess,
                                         experimentId=test_exp_id,
                                         body=mbody)

    finally:
        # Clean out experiments, projects, workspaces.
        # In dependency order:
        for e in test_experiments:
            bindings.delete_DeleteExperiment(sess, experimentId=e.id)
        for p in test_projects:
            bindings.delete_DeleteProject(sess, id=p.id)
        for w in test_workspaces:
            bindings.delete_DeleteWorkspace(sess, id=w.id)
def test_pytorch_native_api() -> None:
    exp_id = exp.create_native_experiment(conf.fixtures_path("pytorch_no_op"),
                                          [sys.executable, "model_def.py"])
    exp.wait_for_experiment_state(exp_id, "COMPLETED")
def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running,
    make sure the experiment still finishes, but the new ones won't schedule.
    """

    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable the agent and quickly re-enable it.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE)

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds it doesn't get scheduled into the same slot.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        exp.cancel_single(experiment_id_no_start)

def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                "COMPLETED": {8, 9, 10},
                "DELETED": {1, 2, 3, 4, 5, 6, 7}
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                "COMPLETED": {1, 2, 3, 9, 10},
                "DELETED": {4, 5, 6, 7, 8}
            },
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name,
                                                  conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step["checkpoint"] for step in trials[0]["steps"]),
                key=operator.itemgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint["state"],
                                    set()).add(checkpoint["step_id"])

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for i in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs":
                    deleted_exception = check.CheckFailedError
                elif checkpoint_config["type"] == "s3":
                    deleted_exception = botocore.exceptions.ClientError
                else:
                    raise NotImplementedError(
                        f'unsupported storage type {checkpoint_config["type"]}'
                    )

                storage_manager = storage.build(checkpoint_config,
                                                container_path=None)
                for checkpoint in checkpoints:
                    metadata = storage.StorageMetadata.from_json(checkpoint)
                    if checkpoint["state"] == "COMPLETED":
                        with storage_manager.restore_path(metadata):
                            pass
                    elif checkpoint["state"] == "DELETED":
                        try:
                            with storage_manager.restore_path(metadata):
                                raise AssertionError("checkpoint not deleted")
                        except deleted_exception:
                            pass
        except AssertionError:
            if i == max_checks - 1:
                raise
        else:
            break
def run_gc_checkpoints_test(checkpoint_storage: Dict[str, str]) -> None:
    fixtures = [
        (
            conf.fixtures_path("no_op/gc_checkpoints_decreasing.yaml"),
            {
                "COMPLETED": {8, 9, 10},
                "DELETED": {1, 2, 3, 4, 5, 6, 7}
            },
        ),
        (
            conf.fixtures_path("no_op/gc_checkpoints_increasing.yaml"),
            {
                "COMPLETED": {1, 2, 3, 9, 10},
                "DELETED": {4, 5, 6, 7, 8}
            },
        ),
    ]

    all_checkpoints = []
    for base_conf_path, result in fixtures:
        config = conf.load_config(str(base_conf_path))
        config["checkpoint_storage"].update(checkpoint_storage)

        with tempfile.NamedTemporaryFile() as tf:
            with open(tf.name, "w") as f:
                yaml.dump(config, f)

            experiment_id = exp.create_experiment(tf.name,
                                                  conf.fixtures_path("no_op"))

        exp.wait_for_experiment_state(experiment_id, "COMPLETED")

        # Checkpoints are not marked as deleted until gc_checkpoint task starts.
        retries = 5
        for retry in range(retries):
            trials = exp.experiment_trials(experiment_id)
            assert len(trials) == 1

            checkpoints = sorted(
                (step.checkpoint for step in trials[0].steps),
                key=operator.attrgetter("step_id"),
            )
            assert len(checkpoints) == 10
            by_state = {}  # type: Dict[str, Set[int]]
            for checkpoint in checkpoints:
                by_state.setdefault(checkpoint.state,
                                    set()).add(checkpoint.step_id)

            if by_state == result:
                all_checkpoints.append((config, checkpoints))
                break

            if retry + 1 == retries:
                assert by_state == result

            time.sleep(1)

    # Check that the actual checkpoint storage (for shared_fs) reflects the
    # deletions. We want to wait for the GC containers to exit, so check
    # repeatedly with a timeout.
    max_checks = 30
    for check in range(max_checks):
        time.sleep(1)
        try:
            for config, checkpoints in all_checkpoints:
                checkpoint_config = config["checkpoint_storage"]

                if checkpoint_config["type"] == "shared_fs" and (
                        "storage_path" not in checkpoint_config):
                    if "tensorboard_path" in checkpoint_config:
                        checkpoint_config[
                            "storage_path"] = checkpoint_config.get(
                                "tensorboard_path", None)
                    else:
                        checkpoint_config[
                            "storage_path"] = checkpoint_config.get(
                                "checkpoint_path", None)

                    root = os.path.join(checkpoint_config["host_path"],
                                        checkpoint_config["storage_path"])

                    for checkpoint in checkpoints:
                        dirname = os.path.join(root, checkpoint.uuid)
                        if checkpoint.state == "COMPLETED":
                            assert os.path.isdir(dirname)
                        elif checkpoint.state == "DELETED":
                            assert not os.path.exists(dirname)
        except AssertionError:
            if check == max_checks - 1:
                raise
        else:
            break