Example #1
def _test_rng_restore(fixture: str, metrics: list, tf2: Union[None, bool] = None) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints). A usage sketch follows this helper.
    """
    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment = exp.run_basic_test_with_temp_config(
        config,
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]

    assert len(first_trial.workloads) >= 4

    first_checkpoint = exp.workloads_with_checkpoint(first_trial.workloads)[0]
    first_checkpoint_uuid = first_checkpoint.uuid

    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config["searcher"]["source_checkpoint_uuid"] = first_checkpoint.uuid

    experiment2 = exp.run_basic_test_with_temp_config(config, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]

    assert len(second_trial.workloads) >= 4
    assert second_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    first_trial_validations = exp.workloads_with_validation(first_trial.workloads)
    second_trial_validations = exp.workloads_with_validation(second_trial.workloads)

    # The second trial warm-started from the first trial's first checkpoint, so
    # its k-th validation should reproduce the first trial's (k+1)-th validation.
    for wl in range(0, 2):
        for metric in metrics:
            first_metric = first_trial_validations[wl + 1].metrics[metric]
            second_metric = second_trial_validations[wl].metrics[metric]
            assert (
                first_metric == second_metric
            ), f"mismatch at validation index {wl} for metric {metric}"
def test_pytorch_gradient_aggregation() -> None:
    config = conf.load_config(
        conf.fixtures_path("pytorch_identity/distributed.yaml"))

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("pytorch_identity"), 1)
    trials = exp.experiment_trials(exp_id)
    assert len(trials) == 1
    workloads = exp.workloads_with_validation(trials[0].workloads)
    actual_weights = []
    for wl in workloads:
        if wl.metrics:
            actual_weights.append(wl.metrics["weight"])

    # independently compute expected metrics
    batch_size = 4
    epoch_size = 64
    num_epochs = 3
    # Build the batches used for the expected-weight computation: data and label
    # are identical copies of the same values, and the starting index wraps
    # around once per epoch.
    batches = []
    for start in range(0, epoch_size * num_epochs, batch_size):
        y = start % epoch_size
        values = [x * 0.1 + 1.0 for x in range(y, y + batch_size)]
        batches.append((values[:], values[:]))

    lr = 0.001

    def compute_expected_weight(data: List[float], label: List[float], w: float) -> float:
        # One SGD step on the MSE loss of a single scalar weight:
        #   loss = mean((label - w * data) ** 2)
        #   dloss/dw = -2 * mean(data * (label - w * data))
        #   w' = w - lr * dloss/dw
        n = len(data)
        expected_step = 2.0 * lr * sum(d * (l - d * w) for d, l in zip(data, label)) / n
        return w + expected_step

    expected_weights = []
    weight = 0.0
    data: List[float] = []
    label: List[float] = []
    for i, batch in enumerate(batches):
        if i % 2 == 0:
            # For even-numbered batches the optimizer step is a no-op: gradients
            # are only accumulated and the weight does not change. (A standalone
            # check of this equivalence follows the test.)
            data, label = batch
        else:
            additional_data, additional_label = batch
            data += additional_data
            label += additional_label
            weight = compute_expected_weight(data, label, weight)
        expected_weights.append(weight)

    assert actual_weights == pytest.approx(
        expected_weights), f"{actual_weights} != {expected_weights}"
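The loop above applies one optimizer step to the concatenated data of each pair of batches, which is what gradient aggregation over two equally sized batches amounts to. A standalone numerical check of that equivalence (made-up numbers, independent of the test harness):

lr = 0.001
w = 0.0
b1 = [1.0, 1.1, 1.2, 1.3]
b2 = [1.4, 1.5, 1.6, 1.7]

def mse_grad(data, w):
    # d/dw of mean((label - w * data) ** 2) for the identity task (label == data).
    return -2.0 * sum(d * (d - w * d) for d in data) / len(data)

# One step on the concatenated batch ...
combined = w - lr * mse_grad(b1 + b2, w)
# ... equals one step with the two per-batch gradients averaged (the batches are
# the same size, so the average of the two means is the mean over all samples).
aggregated = w - lr * (mse_grad(b1, w) + mse_grad(b2, w)) / 2.0
assert abs(combined - aggregated) < 1e-12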
Example #3
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"),
        conf.fixtures_path("metric_maker"), 1)

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        # Each step's batch metrics start at base_value plus the batches already
        # trained and increase by gain_per_batch per batch (see the standalone
        # sketch after this test).
        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
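For intuition: each step's expected batch metrics start at base_value plus the batches already trained and grow by gain_per_batch per batch, while each validation metric is simply base_value plus the total batches trained. A standalone sketch with made-up hyperparameter values (not the fixture's real config):

import numpy as np

scheduling_unit = 100
base_value, gain_per_batch = 0.0, 1.0  # made-up values for illustration

batches_trained = 0
for step in (1, 2):
    first_value = base_value + batches_trained
    batch_values = first_value + gain_per_batch * np.arange(scheduling_unit)
    print(f"step {step}: batch metrics {batch_values[0]} .. {batch_values[-1]}")
    batches_trained += scheduling_unit
    print(f"step {step}: validation metric {base_value + batches_trained}")
Example #4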
def test_custom_reducer_distributed(secrets: Dict[str, str],
                                    tf2: bool) -> None:
    config = conf.load_config(
        conf.fixtures_path("estimator_dataset/distributed.yaml"))
    # Run with multiple steps to verify we are resetting reducers right.
    config = conf.set_max_length(config, {"batches": 2})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1)

    trial = exp.experiment_trials(experiment_id)[0]
    last_validation = exp.workloads_with_validation(trial.workloads)[-1]
    metrics = last_validation.metrics
    label_sum = 2 * sum(range(16))
    assert metrics["label_sum_fn"] == label_sum
    assert metrics["label_sum_cls"] == label_sum
Example #5
def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(conf.fixtures_path("metric_maker/nans.yaml"),
                                conf.fixtures_path("metric_maker"), 1)
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed by the YAML->JSON deserializer, so
    # add them to the expected values here instead (a short demonstration of
    # the underlying JSON limitation follows this test).
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]
        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [
            structure_to_metrics(value, training_structure)
            for value in batch_values
        ]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches
        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)
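The deserializer comment above reflects a general constraint: strict JSON has no literal for NaN or Infinity, which is why the test adds them to the expected structures in Python rather than in the YAML config. A quick standalone illustration with Python's json module (only to show the constraint; the test itself goes through Determined's own serialization path):

import json

# The default encoder emits NaN/Infinity only as a non-standard extension.
print(json.dumps({"inf": float("inf"), "nan": float("nan")}))  # {"inf": Infinity, "nan": NaN}

# A strictly compliant encoder rejects them outright.
try:
    json.dumps({"nan": float("nan")}, allow_nan=False)
except ValueError as err:
    print(err)  # "Out of range float values are not JSON compliant"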
Example #6
def test_noop_single_warm_start() -> None:
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0].trial
    first_trial_id = first_trial.id

    first_workloads = trials[0].workloads
    assert len(first_workloads) == 90
    checkpoints = exp.workloads_with_checkpoint(first_workloads)
    assert len(checkpoints) == 30
    first_checkpoint_uuid = checkpoints[0].uuid
    last_checkpoint_uuid = checkpoints[-1].uuid
    last_validation = exp.workloads_with_validation(first_workloads)[-1]
    # The no_op fixture decays validation_error by a factor of 0.9 per training step.
    assert last_validation.metrics["validation_error"] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1

    second_trial = trials[0]
    assert len(second_trial.workloads) == 90

    # Second trial should have a warm start checkpoint id.
    assert second_trial.trial.warmStartCheckpointUuid == last_checkpoint_uuid

    val_workloads = exp.workloads_with_validation(second_trial.workloads)
    # Warm-started from the parent's last checkpoint (30 steps) and trained 30 more: 0.9 ** 60.
    assert val_workloads[-1].metrics["validation_error"] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source checkpoint UUID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = checkpoints[0].uuid

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)

        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1

    third_trial = trials[0]
    assert len(third_trial.workloads) == 90

    assert third_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    validations = exp.workloads_with_validation(third_trial.workloads)
    # Warm-started from the checkpoint after step 1, so the second validation
    # reflects 3 steps of training in total: 0.9 ** 3.
    assert validations[1].metrics["validation_error"] == pytest.approx(0.9 ** 3)
Example #7
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that the validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.workloads) > 0
        last_validation = exp.workloads_with_validation(trial.workloads)[-1]
        accuracy = last_validation.metrics["accuracy"]
        if best is None or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=True
    )

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    assert all(c.training is not None for c in top_k)
    metrics = [
        c.training.validation_metrics["avgMetrics"]["validation_loss"]
        for c in top_k
        if c.training is not None
    ]

    assert metrics == sorted(metrics)

    # Check that flipping smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    # Beginning with 0.18 the system contributes a few items to the dict
    assert checkpoint.metadata.get("testing") == "metadata"
    assert checkpoint.metadata.keys() == {"format", "framework", "steps_completed", "testing"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "metadata", "some_key": "some_value"}.items()
    assert checkpoint.metadata.keys() == {
        "format",
        "framework",
        "steps_completed",
        "testing",
        "some_key",
    }
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "override", "some_key": "some_value"}.items()
    assert checkpoint.metadata == db_check.metadata

    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert "some_key" not in checkpoint.metadata
    assert checkpoint.metadata["testing"] == "override"
    assert checkpoint.metadata == db_check.metadata