Example #1
def test_noop_pause_of_experiment_without_trials() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment
    which will never schedule a trial.
    """
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
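    # Requesting far more slots per trial than the cluster can provide ensures
    # that no trial can ever be scheduled (see the docstring above).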
    impossibly_large = 100
    config_obj["max_restarts"] = 0
    config_obj["resources"] = {"slots_per_trial": impossibly_large}
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(tf.name, conf.fixtures_path("no_op"), None)
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_PAUSED)

    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, bindings.determinedexperimentv1State.STATE_ACTIVE)

    for _ in range(5):
        assert (
            exp.experiment_state(experiment_id) == bindings.determinedexperimentv1State.STATE_ACTIVE
        )
        time.sleep(1)

    exp.cancel_single(experiment_id)
Example #2
def run_failure_test_with_temp_config(
    config: Dict[Any, Any],
    model_def_path: str,
    error_str: Optional[str] = None,
) -> None:
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config, f)
        run_failure_test(tf.name, model_def_path, error_str=error_str)
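
A minimal usage sketch for this helper, assuming a no-op fixture config like the ones used elsewhere on this page; the expected error substring is purely illustrative:

config = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
config["max_restarts"] = 0
run_failure_test_with_temp_config(
    config,
    conf.fixtures_path("no_op"),
    error_str="illustrative expected error substring",  # assumption: whatever message the failing experiment logs
)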
Example #3
def test_noop_experiment_config_override() -> None:
    config_obj = conf.load_config(conf.fixtures_path("no_op/single-one-short-step.yaml"))
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            conf.fixtures_path("no_op"),
            ["--config", "reproducibility.experiment_seed=8200"],
        )
        exp_config = exp.experiment_config_json(experiment_id)
        assert exp_config["reproducibility"]["experiment_seed"] == 8200
        exp.cancel_single(experiment_id)
Example #4
def test_non_root_experiment(clean_auth: None, tmp_path: pathlib.Path) -> None:
    user = create_linked_user(65534, "nobody", 65534, "nogroup")

    with logged_in_user(user):
        with open(conf.fixtures_path("no_op/model_def.py")) as f:
            model_def_content = f.read()

        with open(conf.fixtures_path("no_op/single-one-short-step.yaml")) as f:
            config = yaml.safe_load(f)

        # Use a user-owned path to ensure shared_fs uses the container_path and not host_path.
        with non_tmp_shared_fs_path() as host_path:
            config["checkpoint_storage"] = {
                "type": "shared_fs",
                "host_path": host_path,
            }

            # Call `det --version` in a startup hook to ensure that det is on the PATH.
            with FileTree(
                tmp_path,
                {
                    "startup-hook.sh": "det --version || exit 77",
                    "const.yaml": yaml.dump(config),  # type: ignore
                    "model_def.py": model_def_content,
                },
            ) as tree:
                exp.run_basic_test(str(tree.joinpath("const.yaml")), str(tree), None)
Example #5
def run_basic_test_with_temp_config(
    config: Dict[Any, Any],
    model_def_path: str,
    expected_trials: Optional[int],
    create_args: Optional[List[str]] = None,
    max_wait_secs: int = conf.DEFAULT_MAX_WAIT_SECS,
) -> int:
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config, f)
        experiment_id = run_basic_test(
            tf.name,
            model_def_path,
            expected_trials,
            create_args,
            max_wait_secs=max_wait_secs,
        )
    return experiment_id
Example #6
def master_up(
    port: int,
    master_config_path: Optional[Path],
    storage_host_path: Path,
    master_name: str,
    image_repo_prefix: Optional[str],
    version: Optional[str],
    db_password: str,
    delete_db: bool,
    autorestart: bool,
    cluster_name: str,
    auto_work_dir: Optional[Path],
) -> None:
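    # "up -d" starts the docker-compose services in detached mode; this command
    # list is passed to docker_compose() at the bottom of this function.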
    command = ["up", "-d"]
    if image_repo_prefix is None:
        image_repo_prefix = "determinedai"
    if version is None:
        version = determined.__version__
    if autorestart:
        restart_policy = "unless-stopped"
    else:
        restart_policy = "no"

    env = {
        "INTEGRATIONS_HOST_PORT": str(port),
        "DET_DB_PASSWORD": db_password,
        "IMAGE_REPO_PREFIX": image_repo_prefix,
        "DET_VERSION": version,
        "DET_RESTART_POLICY": restart_policy,
    }

    # Some CLI flags for `det deploy local` will cause us to write a temporary master.yaml.
    master_conf = {}
    make_temp_conf = False

    if master_config_path is not None:
        with master_config_path.open() as f:
            master_conf = yaml.safe_load(f)
    else:
        # These defaults come from master/packaging/master.yaml (except for host_path).
        master_conf = {
            "db": {
                "user": "******",
                "host": "determined-db",
                "port": 5432,
                "name": "determined",
            },
            "checkpoint_storage": {
                "type": "shared_fs",
                "host_path": appdirs.user_data_dir("determined"),
                "save_experiment_best": 0,
                "save_trial_best": 1,
                "save_trial_latest": 1,
            },
        }
        make_temp_conf = True

    if storage_host_path is not None:
        master_conf["checkpoint_storage"] = {
            "type": "shared_fs",
            "host_path": str(storage_host_path.resolve()),
        }
        make_temp_conf = True

    if auto_work_dir is not None:
        work_dir = str(auto_work_dir.resolve())
        master_conf.setdefault("task_container_defaults", {})["work_dir"] = work_dir
        master_conf["task_container_defaults"].setdefault("bind_mounts", []).append(
            {"host_path": work_dir, "container_path": work_dir}
        )
        make_temp_conf = True

    # Ensure checkpoint storage directory exists.
    final_storage_host_path = master_conf.get("checkpoint_storage", {}).get("host_path")
    if final_storage_host_path is not None:
        final_storage_host_path = Path(final_storage_host_path)
        if not final_storage_host_path.exists():
            final_storage_host_path.mkdir(parents=True)

    if make_temp_conf:
        fd, temp_path = tempfile.mkstemp(prefix="det-deploy-local-master-config-")
        with open(fd, "w") as f:
            yaml.dump(master_conf, f)
        master_config_path = Path(temp_path)

    # This is always true by now, but mypy needs help.
    assert master_config_path is not None

    env["DET_MASTER_CONFIG"] = str(master_config_path.resolve())

    master_down(master_name, delete_db)
    docker_compose(command, master_name, env)
    _wait_for_master("localhost", port, cluster_name)
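
A minimal sketch of how this helper might be invoked; every argument value below is illustrative, and the comments describe the fallbacks the function itself applies when given None:

master_up(
    port=8080,                          # illustrative host port
    master_config_path=None,            # use the built-in defaults above
    storage_host_path=Path("/tmp/determined-checkpoints"),  # illustrative path
    master_name="determined",
    image_repo_prefix=None,             # falls back to "determinedai"
    version=None,                       # falls back to determined.__version__
    db_password="postgres",             # illustrative password
    delete_db=False,
    autorestart=True,                   # restart policy "unless-stopped"
    cluster_name="determined",
    auto_work_dir=None,
)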
Example #7
def test_noop_single_warm_start() -> None:
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0].trial
    first_trial_id = first_trial.id

    first_workloads = trials[0].workloads
    assert len(first_workloads) == 90
    checkpoints = exp.workloads_with_checkpoint(first_workloads)
    assert len(checkpoints) == 30
    first_checkpoint_uuid = checkpoints[0].uuid
    last_checkpoint_uuid = checkpoints[-1].uuid
    last_validation = exp.workloads_with_validation(first_workloads)[-1]
    assert last_validation.metrics["validation_error"] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1

    second_trial = trials[0]
    assert len(second_trial.workloads) == 90

    # Second trial should have a warm start checkpoint id.
    assert second_trial.trial.warmStartCheckpointUuid == last_checkpoint_uuid

    val_workloads = exp.workloads_with_validation(second_trial.workloads)
    assert val_workloads[-1].metrics["validation_error"] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source checkpoint UUID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = checkpoints[0].uuid

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)

        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1

    third_trial = trials[0]
    assert len(third_trial.workloads) == 90

    assert third_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    validations = exp.workloads_with_validation(third_trial.workloads)
    assert validations[1].metrics["validation_error"] == pytest.approx(0.9 ** 3)
Example #8
def test_noop_single_warm_start() -> None:
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 30
    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]
    last_step = first_trial["steps"][29]
    last_checkpoint_id = last_step["checkpoint"]["id"]
    assert last_step["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1

    second_trial = trials[0]
    assert len(second_trial["steps"]) == 30

    # Second trial should have a warm start checkpoint id.
    assert second_trial["warm_start_checkpoint_id"] == last_checkpoint_id

    assert second_trial["steps"][29]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source checkpoint UUID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)

        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1

    third_trial = trials[0]
    assert len(third_trial["steps"]) == 30

    assert third_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    assert third_trial["steps"][1]["validation"]["metrics"]["validation_metrics"][
        "validation_error"
    ] == pytest.approx(0.9 ** 3)