# Shared imports assumed by the examples in this listing (module paths follow
# the Determined e2e test suite; treat them as assumptions of this listing).
import contextlib
import json
import os
import shutil
import subprocess
import tempfile
import time
from typing import Any, Dict, Iterator, List, Protocol, cast

import pytest
import yaml

from determined.common import api, check
from determined.common.api import authentication, bindings, certs
from determined.common.api.bindings import determinedexperimentv1State
from tests import config as conf
from tests import experiment as exp
from tests.cluster.managed_cluster import ManagedCluster

# The examples refer to the experiment-state enum under several aliases.
EXP_STATE = determinedexperimentv1State


def test_master_restart_reattach_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(exp_id,
                                      EXP_STATE.STATE_COMPLETED,
                                      max_wait_secs=downtime + 60)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
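
# `downtime` in the test above (and in test_agent_restart_recover_experiment
# below) comes from pytest parametrization; a negative value means "no master
# downtime". A plausible decorator (the exact values are an assumption):
#
#     @pytest.mark.parametrize("downtime", [-1, 0, 20, 60])
#
# `_sanity_check` is not shown in this listing. A minimal sketch consistent
# with how the tests use it: skip unless agent reattach is enabled, then
# verify the agent is healthy.
def _sanity_check(managed_cluster_restarts: ManagedCluster) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip("agent reattach is not enabled on this cluster")
    managed_cluster_restarts.ensure_agent_ok()
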
def test_disable_agent_experiment_resume() -> None:
    """
    Start an experiment with max_restarts=0 and ensure that being killed due to an explicit agent
    disable/enable (without draining) does not count toward the number of restarts.
    """
    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    exp_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        ["--config", "max_restarts=0"],
    )
    exp.wait_for_experiment_workload_progress(exp_id)

    with _disable_agent(agent_id):
        # Wait for the allocation to go away.
        for _ in range(20):
            slots = _fetch_slots()
            print(slots)
            if not any(s["allocation_id"] != "FREE" for s in slots):
                break
            time.sleep(1)
        else:
            pytest.fail("Experiment stayed scheduled after agent was disabled")
    exp.wait_for_experiment_state(exp_id, determinedexperimentv1State.STATE_COMPLETED)
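
# `_fetch_slots` and `_disable_agent` are helpers not shown in this listing.
# Minimal sketches, assuming the `det slot list --json` and
# `det agent disable/enable [--drain]` CLI subcommands; the suite's real
# helper bodies may differ.
def _fetch_slots() -> List[Dict[str, Any]]:
    command = ["det", "-m", conf.make_master_url(), "slot", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(List[Dict[str, Any]], json.loads(output))


@contextlib.contextmanager
def _disable_agent(agent_id: str, drain: bool = False) -> Iterator[None]:
    # Disable the agent (optionally draining its running allocations) and
    # re-enable it when the `with` block exits, even on failure.
    command = ["det", "-m", conf.make_master_url(), "agent", "disable"]
    if drain:
        command.append("--drain")
    subprocess.check_call(command + [agent_id])
    try:
        yield
    finally:
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "agent", "enable", agent_id])
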
def test_master_restart_kill_works(
        managed_cluster_restarts: ManagedCluster) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-many-long-steps.yaml"),
            conf.fixtures_path("no_op"),
            [
                "--config", "searcher.max_length.batches=10000", "--config",
                "max_restarts=0"
            ],
        )

        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_master()
        time.sleep(0)  # zero "downtime": bring the master right back up
        managed_cluster_restarts.restart_master()

        command = [
            "det", "-m",
            conf.make_master_url(), "e", "kill",
            str(exp_id)
        ]
        subprocess.check_call(command)

        exp.wait_for_experiment_state(exp_id,
                                      EXP_STATE.STATE_CANCELED,
                                      max_wait_secs=10)

        managed_cluster_restarts.ensure_agent_ok()
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
def test_agent_reconnect_keep_experiment(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise
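
# `ManagedCluster` is the pytest fixture driving a local master + agent that
# these tests kill and restart. A minimal Protocol capturing just the surface
# the examples rely on (an illustrative sketch, not the fixture's real
# definition):
class ManagedClusterInterface(Protocol):
    reattach: bool
    master_url: str

    def kill_master(self) -> None: ...
    def restart_master(self) -> None: ...
    def kill_agent(self) -> None: ...
    def restart_agent(self, wait_for_amnesia: bool = True) -> None: ...
    def ensure_agent_ok(self) -> None: ...
    # kill_proxy/restart_proxy sever and restore the agent's connection to
    # the master, simulating a network partition rather than a process crash.
    def kill_proxy(self) -> None: ...
    def restart_proxy(self, wait_for_reconnect: bool = True) -> None: ...
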
def test_agent_restart_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip("agent reattach is not enabled on this cluster")

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
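
# `_wait_for_slots` (used below) is also not shown. A plausible sketch: poll
# `_fetch_slots` until at least `min_slots` slots are reported, failing the
# test on timeout (the parameter names and timeout are assumptions).
def _wait_for_slots(min_slots: int, max_ticks: int = 60) -> List[Dict[str, Any]]:
    for _ in range(max_ticks):
        slots = _fetch_slots()
        if len(slots) >= min_slots:
            return slots
        time.sleep(1)
    pytest.fail(f"fewer than {min_slots} slots appeared after {max_ticks} seconds")
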
def test_drain_agent_sched() -> None:
    """
    Start an experiment, then drain the agent it is scheduled on. Start a
    second experiment and make sure it schedules on the second agent *before*
    the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2,
                                      determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining")

        exp.wait_for_experiment_state(
            exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(
            exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7
def test_agent_restart_exp_container_failure(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)
        container_ids = list(_local_container_ids_for_experiment(exp_id))
        if len(container_ids) != 1:
            pytest.fail(
                f"unexpected number of local containers for the experiment: {len(container_ids)}"
            )
        # Get task id / allocation id
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_before = list(tasks_data.values())[0]

        managed_cluster_restarts.kill_agent()
        subprocess.run(["docker", "kill", container_ids[0]],
                       check=True,
                       stdout=subprocess.PIPE)
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
    else:
        managed_cluster_restarts.restart_agent()
        # As soon as the agent is back, the original allocation should be considered dead,
        # but the new one should be allocated.
        state = exp.experiment_state(exp_id)
        assert state == EXP_STATE.STATE_ACTIVE
        tasks_data = _task_list_json(managed_cluster_restarts.master_url)
        assert len(tasks_data) == 1
        exp_task_after = list(tasks_data.values())[0]

        assert exp_task_before["task_id"] == exp_task_after["task_id"]
        assert exp_task_before["allocation_id"] != exp_task_after[
            "allocation_id"]

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
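
# Sketches of the two lookup helpers used above. `_task_list_json` assumes
# `det task list --json` returns a mapping of allocation id to task summary
# (matching how the test indexes it), and the docker label filter in
# `_local_container_ids_for_experiment` is an assumption about how Determined
# tags its task containers.
def _task_list_json(master_url: str) -> Dict[str, Any]:
    command = ["det", "-m", master_url, "task", "list", "--json"]
    output = subprocess.check_output(command).decode()
    return cast(Dict[str, Any], json.loads(output))


def _local_container_ids_for_experiment(exp_id: int) -> Iterator[str]:
    # List running containers whose (assumed) label marks them as belonging
    # to this experiment.
    output = subprocess.check_output([
        "docker", "ps", "-q", "--filter",
        f"label=determined.experiment_id={exp_id}"
    ]).decode()
    yield from (line for line in output.splitlines() if line)
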
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    # Wait for the only trial to get scheduled.
    exp.wait_for_experiment_active_workload(experiment_id)

    # Wait for the only trial to show progress, indicating the image is built and running.
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment could not be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(
        experiment_id, bindings.determinedexperimentv1State.STATE_COMPLETED
    )
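
# `exp.pause_experiment` / `exp.activate_experiment` wrap the corresponding
# REST endpoints. The same calls expressed directly through the generated
# bindings, as a sketch (`determined_test_session` returns an authenticated
# session; see the sketch at the end of this listing):
def pause_then_resume(experiment_id: int) -> None:
    session = determined_test_session()
    bindings.post_PauseExperiment(session, id=experiment_id)
    bindings.post_ActivateExperiment(session, id=experiment_id)
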
def test_drain_agent() -> None:
    """
    Start an experiment, `disable --drain` the agent once the trial is running,
    and make sure the experiment still finishes but new experiments don't get
    scheduled.
    """

    slots = _fetch_slots()
    assert len(slots) == 1
    agent_id = slots[0]["agent_id"]

    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_ACTIVE)
    exp.wait_for_experiment_active_workload(experiment_id)
    exp.wait_for_experiment_workload_progress(experiment_id)

    # Disable the agent and quickly re-enable it.
    with _disable_agent(agent_id, drain=True):
        pass

    # Try to launch another experiment. It shouldn't get scheduled because the
    # slot is still busy with the first experiment.
    experiment_id_no_start = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    time.sleep(5)
    exp.wait_for_experiment_state(experiment_id_no_start, determinedexperimentv1State.STATE_ACTIVE)

    with _disable_agent(agent_id, drain=True):
        # Check for 15 seconds that it doesn't get scheduled into the same slot.
        for _ in range(15):
            trials = exp.experiment_trials(experiment_id_no_start)
            assert len(trials) == 0
            time.sleep(1)

        # Ensure the first one has finished with the correct number of workloads.
        exp.wait_for_experiment_state(experiment_id, determinedexperimentv1State.STATE_COMPLETED)
        trials = exp.experiment_trials(experiment_id)
        assert len(trials) == 1
        assert len(trials[0].workloads) == 7

        # Ensure the slot is empty.
        slots = _fetch_slots()
        assert len(slots) == 1
        assert slots[0]["enabled"] is False
        assert slots[0]["draining"] is True
        assert slots[0]["allocation_id"] == "FREE"

        # Check agent state.
        command = ["det", "-m", conf.make_master_url(), "agent", "list", "--json"]
        output = subprocess.check_output(command).decode()
        agent_data = cast(List[Dict[str, Any]], json.loads(output))[0]
        assert agent_data["id"] == agent_id
        assert agent_data["enabled"] is False
        assert agent_data["draining"] is True

        exp.cancel_single(experiment_id_no_start)
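
# `list_free_agents` (used in the next example) is not shown in this listing.
# A minimal sketch via the agents endpoint: an agent counts as free when none
# of its slots report a running container (field names follow the generated
# bindings; treat the details as assumptions).
def list_free_agents() -> List[bindings.v1Agent]:
    agents = bindings.get_GetAgents(determined_test_session()).agents or []
    return [
        agent for agent in agents
        if not any(slot.container for slot in (agent.slots or {}).values())
    ]
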
def test_allocation_resources_incremental_release() -> None:
    """
    Start an two container experiment and ensure one container exits before the other. Ensure
    resources are released and schedule-able without the other needing to be released.
    """
    cleanup_exp_ids = []

    try:
        slots = _wait_for_slots(2)
        assert len(slots) == 2

        with tempfile.TemporaryDirectory() as context_dir, open(
                os.path.join(context_dir, "const.yaml"), "w") as config_file:
            # Launch an experiment that has one resource (docker container) that exits immediately.
            config_obj = conf.load_config(
                conf.fixtures_path("no_op/single.yaml"))
            config_obj["resources"] = {
                **config_obj.get("resources", {}),
                **{
                    "slots": 2
                }
            }
            config_obj["hyperparameters"] = {
                **config_obj.get("hyperparameters", {}),
                **{
                    "non_chief_exit_immediately": True
                },
            }
            yaml.dump(config_obj, config_file)

            shutil.copy(conf.fixtures_path("no_op/model_def.py"),
                        os.path.join(context_dir, "model_def.py"))

            exp_id = exp.create_experiment(config_file.name, context_dir, None)
            cleanup_exp_ids.append(exp_id)

        # Wait for the experiment to start and run some.
        exp.wait_for_experiment_state(
            exp_id,
            determinedexperimentv1State.STATE_ACTIVE,
        )
        exp.wait_for_experiment_active_workload(exp_id)

        # And wait for exactly one of the resources to free, while one is still in use.
        confirmations = 0
        for _ in range(RANK_ONE_WAIT_TIME):
            free_agents = list_free_agents()
            if len(free_agents) == 1:
                confirmations += 1

            if confirmations == 2:
                # Guard against the race where one container has exited and
                # the other is about to (but hasn't yet): require seeing the
                # state twice before concluding.
                break

            # Still waiting on partial exit
            time.sleep(1)
        else:
            pytest.fail(
                "never saw exactly one free agent within {} seconds".format(
                    RANK_ONE_WAIT_TIME))

        # Ensure we can schedule on the free slot, not just that the API says it's available.
        exp_id_2 = exp.create_experiment(
            conf.fixtures_path("no_op/single.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        cleanup_exp_ids.append(exp_id_2)

        exp.wait_for_experiment_workload_progress(exp_id_2)
        exp.wait_for_experiment_state(
            exp_id_2, determinedexperimentv1State.STATE_COMPLETED)
        cleanup_exp_ids = cleanup_exp_ids[:-1]  # exp_id_2 completed; it needs no cleanup

        # And check that the hung experiment is still holding on to its slot.
        free_agents = list_free_agents()
        if len(free_agents) != 1:
            pytest.fail(
                f"should still have exactly one agent scheduled: {free_agents}"
            )

    finally:
        for exp_id in cleanup_exp_ids:
            bindings.post_KillExperiment(determined_test_session(), id=exp_id)
            exp.wait_for_experiment_state(
                exp_id, determinedexperimentv1State.STATE_CANCELED)
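
# Two remaining names the last example assumes but this listing never defines.
# `RANK_ONE_WAIT_TIME` bounds how long we poll for the non-chief container to
# exit, and `determined_test_session` builds an authenticated session for the
# generated bindings. Both are sketches: the timeout value, credentials, and
# helper internals are assumptions.
RANK_ONE_WAIT_TIME = 300  # seconds


def determined_test_session() -> api.Session:
    master_url = conf.make_master_url()
    certs.cli_cert = certs.default_load(master_url)
    authentication.cli_auth = authentication.Authentication(
        master_url, requested_user="determined", password="", try_reauth=True)
    return api.Session(master_url, "determined", authentication.cli_auth,
                       certs.cli_cert)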