Exemple #1
0
def verify_completed_experiment_metadata(
    experiment_id: int, num_expected_trials: Optional[int], has_zeroth_step: bool = False
) -> None:
    """Sanity-check the recorded metadata of an experiment that has finished.

    The checks run in this order:

    1. If ``num_expected_trials`` is given, both the total and the completed
       trial counts must equal it.
    2. The experiment has at least one trial; every trial is ``COMPLETED``
       and has at least one step; step IDs are sorted and contiguous; every
       step is ``COMPLETED``; any validation present is ``COMPLETED`` and any
       checkpoint present is ``COMPLETED`` or ``DELETED``.
    3. The last step of every trial has a checkpoint.
    4. All cluster slots become free within 30 one-second polls.
    5. The describe/list CLI helpers are run against the experiment.

    Args:
        experiment_id: ID of the experiment to inspect.
        num_expected_trials: Exact number of trials expected, or ``None`` to
            skip the count checks.
        has_zeroth_step: When true, step IDs must start at 0; otherwise they
            must start at 1.

    Raises:
        AssertionError: If any check fails or the slots are not freed in
            time. A trial that is not ``COMPLETED`` is instead reported via
            ``report_failed_trial`` and failed through ``pytest.fail``.
    """
    # A `num_expected_trials` of None means the number of trials is
    # non-deterministic, so the count checks are skipped.
    if num_expected_trials is not None:
        assert num_trials(experiment_id) == num_expected_trials
        assert num_completed_trials(experiment_id) == num_expected_trials

    all_trials = experiment_trials(experiment_id)
    assert len(all_trials) > 0

    # Step numbering is either 0-based or 1-based depending on the caller.
    first_step_id = 0 if has_zeroth_step else 1

    for current in all_trials:
        trial_state = current["state"]
        if trial_state != "COMPLETED":
            trial_id = current["id"]
            report_failed_trial(trial_id, trial_state)
            pytest.fail(f"Trial {trial_id} was not COMPLETED but {trial_state}")

        steps = current["steps"]
        assert len(steps) > 0

        # Step IDs must be in increasing order, start at the expected first
        # ID, and have no gaps.
        ids = [entry["id"] for entry in steps]
        assert ids == sorted(ids)
        assert ids == list(range(first_step_id, first_step_id + len(ids)))

        for entry in steps:
            assert entry["state"] == "COMPLETED"

            # Validations and checkpoints are optional per step; only the
            # ones that are present are inspected.
            validation = entry["validation"]
            if validation:
                assert validation["state"] == "COMPLETED"

            checkpoint = entry["checkpoint"]
            if checkpoint:
                assert checkpoint["state"] in {"COMPLETED", "DELETED"}

    # Every trial must end on a step that carries a checkpoint.
    for current in all_trials:
        assert current["steps"][-1]["checkpoint"]

    # Once the experiment is done all slots should be free again, but the
    # last container may take a while to terminate, so poll once per second.
    max_secs_to_free_slots = 30
    slots_freed = False
    for _ in range(max_secs_to_free_slots):
        if cluster.num_free_slots() != cluster.num_slots():
            time.sleep(1)
            continue
        slots_freed = True
        break
    if not slots_freed:
        raise AssertionError("Slots failed to free after experiment {}".format(experiment_id))

    # Smoke-test the basic CLI commands against the finished experiment to
    # make sure they do not raise.
    run_describe_cli_tests(experiment_id)
    run_list_cli_tests(experiment_id)
Exemple #2
0
def verify_completed_experiment_metadata(
    experiment_id: int, num_expected_trials: Optional[int]
) -> None:
    """Sanity-check the recorded metadata of an experiment that has finished.

    The checks run in this order:

    1. If ``num_expected_trials`` is given, both the total and the completed
       trial counts must equal it.
    2. The experiment has at least one trial; every trial is ``COMPLETED``
       and has at least one step; step IDs are sorted, start at 1 and are
       contiguous; every step is ``COMPLETED``; any validation present is
       ``COMPLETED`` and any checkpoint present is ``COMPLETED`` or
       ``DELETED``.
    3. The last step of every trial has a checkpoint.
    4. All cluster slots become free within 30 one-second polls.
    5. The describe/list CLI helpers are run against the experiment.

    Args:
        experiment_id: ID of the experiment to inspect.
        num_expected_trials: Exact number of trials expected, or ``None`` to
            skip the count checks.

    Raises:
        AssertionError: If any check fails or the slots are not freed in
            time.
    """
    # A `num_expected_trials` of None means the number of trials is
    # non-deterministic, so the count checks are skipped.
    if num_expected_trials is not None:
        assert num_trials(experiment_id) == num_expected_trials
        assert num_completed_trials(experiment_id) == num_expected_trials

    finished_trials = experiment_trials(experiment_id)
    assert len(finished_trials) > 0

    for item in finished_trials:
        assert item.state == "COMPLETED"

        item_steps = item.steps
        assert len(item_steps) > 0

        # Step IDs must be in increasing order, start at 1, and have no gaps.
        observed_ids = [one_step.id for one_step in item_steps]
        assert observed_ids == sorted(observed_ids)
        assert observed_ids == list(range(1, len(observed_ids) + 1))

        for one_step in item_steps:
            assert one_step.state == "COMPLETED"

            # Validations and checkpoints are optional per step; only the
            # ones that are present are inspected.
            validation = one_step.validation
            if validation:
                assert validation.state == "COMPLETED"

            checkpoint = one_step.checkpoint
            if checkpoint:
                assert checkpoint.state in {"COMPLETED", "DELETED"}

    # Every trial must end on a step that carries a checkpoint.
    for item in finished_trials:
        final_step = item.steps[-1]
        assert final_step.checkpoint

    # Once the experiment is done all slots should be free again, but the
    # last container may take a while to terminate, so poll once per second
    # until the budget of checks is used up.
    max_secs_to_free_slots = 30
    checks_remaining = max_secs_to_free_slots
    while checks_remaining > 0:
        if cluster.num_free_slots() == cluster.num_slots():
            break
        time.sleep(1)
        checks_remaining -= 1
    else:
        # Reached only when the loop ran out of checks without a break.
        raise AssertionError(
            "Slots failed to free after experiment {}".format(experiment_id)
        )

    # Smoke-test the basic CLI commands against the finished experiment to
    # make sure they do not raise.
    run_describe_cli_tests(experiment_id)
    run_list_cli_tests(experiment_id)