Example 1
def test_tf_keras_const_warm_start(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new trial should have a warm-start checkpoint UUID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)
Example 2
def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 3
    checkpoint_workloads = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoint_workloads[0].uuid

    config_obj = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))

    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(
        config_obj)

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.cv_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0].trial.warmStartCheckpointUuid == first_checkpoint_uuid
Example 3
def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new trial should have a warm-start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id
Example 4
def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 1
    first_checkpoint_id = first_trial["steps"][0]["checkpoint"]["id"]

    config_obj = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))

    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(
        config_obj)

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.official_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0]["warm_start_checkpoint_id"] == first_checkpoint_id
Example 5
def _test_rng_restore(fixture: str, metrics: list, tf2: Union[None, bool] = None) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment = exp.run_basic_test_with_temp_config(
        config,
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial.workloads) >= 4

    first_checkpoint = exp.workloads_with_checkpoint(first_trial.workloads)[0]
    first_checkpoint_uuid = first_checkpoint.uuid

    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config["searcher"]["source_checkpoint_uuid"] = first_checkpoint.uuid

    experiment2 = exp.run_basic_test_with_temp_config(config, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial.workloads) >= 4
    assert second_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    first_trial_validations = exp.workloads_with_validation(first_trial.workloads)
    second_trial_validations = exp.workloads_with_validation(second_trial.workloads)

    for wl in range(0, 2):
        for metric in metrics:
            first_trial_val = first_trial_validations[wl + 1]
            first_metric = first_trial_val.metrics[metric]
            second_trial_val = second_trial_validations[wl]
            second_metric = second_trial_val.metrics[metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {wl} with metric: {metric}"
Example 6
def test_drain_agent_sched() -> None:
    """
    Start an experiment, drain it. Start a second one and make sure it schedules
    on the second agent *before* the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2,
                                      determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining")

        exp.wait_for_experiment_state(
            exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(
            exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7
Example 7
def test_epoch_sync(num_workers: int, global_batch_size: int,
                    dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the
    number of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    batches_per_epoch = (dataset_len + global_batch_size -
                         1) // global_batch_size  # ceil

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id,
                f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}")
Example 8
def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0]["steps"]
    assert len(steps) == 1

    step = steps[0]
    assert "validation" in step

    v_metrics = step["validation"]["metrics"]["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100

        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0
Example 9
def test_mnist_tp_accuracy() -> None:
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    # TODO (DET-3082): The validation metric names were modified by our trial reporting
    # from accuracy to val_accuracy.  We should probably remove the added prefix so
    # the metric name is as specified.
    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["val_accuracy"]
        for step in trial_metrics["steps"]
        if step.get("validation")
    ]

    target_accuracy = 0.95
    assert max(validation_accuracies) > target_accuracy, (
        "mnist_tp did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]), validation_accuracies
        )
    )
Example 10
def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active, "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
Example 11
def test_master_restart_reattach_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(exp_id,
                                      EXP_STATE.STATE_COMPLETED,
                                      max_wait_secs=downtime + 60)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise
Example 12
def test_agent_reconnect_keep_experiment(
        managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise
Example 13
def test_agent_restart_recover_experiment(
        managed_cluster_restarts: ManagedCluster, downtime: int) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise
Example 14
def test_pytorch_parallel() -> None:
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    exp_val_batches = (validation_size +
                       (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)
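For reference, the arithmetic behind those expected log counts, with assumed values (the test itself reads global_batch_size and slots_per_trial from the loaded YAML):
# Assumed values for illustration only: a global batch of 64 split across 8
# slots gives per-slot batches of 8; the 10,000-record validation set then
# needs ceil(10000 / 8) = 1250 per-slot batches, and each training report of
# 100 batches covers 100 * 64 = 6400 records.
validation_size, global_batch_size, num_workers, scheduling_unit = 10000, 64, 8, 100
per_slot_batch_size = global_batch_size // num_workers
exp_val_batches = (validation_size + per_slot_batch_size - 1) // per_slot_batch_size
records_per_report = scheduling_unit * global_batch_size
assert (per_slot_batch_size, exp_val_batches, records_per_report) == (8, 1250, 6400)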
Example 15
def test_cifar10_byol_pytorch_accuracy() -> None:
    config = conf.load_config(
        conf.cv_examples_path("byol_pytorch/const-cifar10.yaml"))
    # Limit convergence time, since the full run exceeded the 30-minute limit.
    config["searcher"]["max_length"]["epochs"] = 20
    config["hyperparameters"]["classifier"]["train_epochs"] = 1
    config = conf.set_random_seed(config, 1591280374)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("byol_pytorch"), 1)

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0].trial.id)

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["test_accuracy"]
        for step in trial_metrics["steps"] if step.get("validation")
    ]

    # Accuracy reachable within limited convergence time -- goes higher given full training.
    target_accuracy = 0.40
    assert max(validation_accuracies) > target_accuracy, (
        "cifar10_byol_pytorch did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]),
            validation_accuracies))
Example 16
def test_launch_layer_cifar(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    (Determined(conf.make_master_url()).get_trial(
        trials[0].trial.id).select_checkpoint(latest=True).load(
            map_location="cpu"))

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )
Example 17
def run_dataset_experiment(
    searcher_max_steps: int,
    batches_per_step: int,
    secrets: Dict[str, str],
    tf2: bool,
    slots_per_trial: int = 1,
    source_trial_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    config = conf.load_config(
        conf.fixtures_path("estimator_dataset/const.yaml"))
    config.setdefault("searcher", {})
    config["searcher"]["max_steps"] = searcher_max_steps
    config["batches_per_step"] = batches_per_step
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    if source_trial_id is not None:
        config["searcher"]["source_trial_id"] = source_trial_id

    config.setdefault("resources", {})
    config["resources"]["slots_per_trial"] = slots_per_trial

    if cluster.num_agents() > 1:
        config["checkpoint_storage"] = exp.s3_checkpoint_config(secrets)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1)
    return exp.experiment_trials(experiment_id)
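A hedged usage sketch for the helper above; the function name is hypothetical, secrets stands in for the same pytest fixture the helper expects, and the trial-dict keys follow the other examples in this collection.
def _example_warm_started_dataset_run(secrets: Dict[str, str]) -> None:
    # Run a short experiment, then warm-start a second one from its trial.
    first_trials = run_dataset_experiment(
        searcher_max_steps=2, batches_per_step=2, secrets=secrets, tf2=False)
    warm_trials = run_dataset_experiment(
        searcher_max_steps=4,
        batches_per_step=2,
        secrets=secrets,
        tf2=False,
        source_trial_id=str(first_trials[0]["id"]),
    )
    # As in the other warm-start examples here, the new trial should carry a
    # warm-start checkpoint reference.
    assert warm_trials[0]["warm_start_checkpoint_id"] is not None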
Example 18
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(conf.make_master_url(), try_reauth=True)

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")

    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
Example 19
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]

    assert metrics == sorted(metrics)

    # Check that changing smaller is better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}
Example 20
def test_tensorpack_const() -> None:
    config = conf.load_config(
        conf.official_examples_path("trial/mnist_tp/const.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_tp"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
Example 21
def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"
Example 22
def test_tf_keras_mnist_data_layer_s3(
    tf2: bool,
    storage_type: str,
    secrets: Dict[str, str],
    collect_trial_profiles: Callable[[int], None],
) -> None:
    exp_id = run_tf_keras_mnist_data_layer_test(tf2, storage_type)
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    collect_trial_profiles(trial_id)
Example 23
def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)
Example 24
def test_noop_load() -> None:
    """
    Load a checkpoint
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    assert checkpoint.task_id == trials[0].trial.taskId
Example 25
def test_tf_keras_mnist_parallel() -> None:
    config = conf.load_config(
        conf.tutorials_path("fashion_mnist_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("fashion_mnist_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
Example 26
def test_log_null_bytes() -> None:
    config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
    config_obj["hyperparameters"]["write_null"] = True
    config_obj["max_restarts"] = 0
    config_obj["searcher"]["max_length"] = {"batches": 1}
    experiment_id = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    logs = exp.trial_logs(trials[0]["id"])
    assert len(logs) > 0
Example 27
def test_tf_keras_tf2_disabled() -> None:
    """Keras on tf2 with tf2 and eager execution disabled."""
    config = conf.load_config(
        conf.fixtures_path("keras_tf2_disabled_no_op/const.yaml"))
    config = conf.set_max_length(config, {"batches": 1})
    config = conf.set_tf2_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("keras_tf2_disabled_no_op"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    exp.export_and_load_model(experiment_id)
Example 28
def test_tf_keras_single_gpu(tf2: bool) -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
Example 29
def test_tensorpack_native_parallel() -> None:
    config = conf.load_config(
        conf.official_examples_path("trial/mnist_tp/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, True)
    config = conf.set_max_length(config, {"batches": 32})

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_tp"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
Example 30
def test_tf_keras_mnist_parallel() -> None:
    config = conf.load_config(conf.official_examples_path("fashion_mnist_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, False)
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("fashion_mnist_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1