def _test_rng_restore(fixture: str, metrics: list, tf2: Union[None, bool] = None) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint with the same
    RNG state. It requires a test fixture that will emit random numbers from all of the RNGs
    used in the relevant framework as metrics. The experiment must have a const.yaml, run for
    at least 3 steps, checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all checkpoints).
    """
    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment = exp.run_basic_test_with_temp_config(
        config,
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial.workloads) >= 4

    first_checkpoint = exp.workloads_with_checkpoint(first_trial.workloads)[0]
    first_checkpoint_uuid = first_checkpoint.uuid

    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config["searcher"]["source_checkpoint_uuid"] = first_checkpoint_uuid

    experiment2 = exp.run_basic_test_with_temp_config(config, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial.workloads) >= 4
    assert second_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    first_trial_validations = exp.workloads_with_validation(first_trial.workloads)
    second_trial_validations = exp.workloads_with_validation(second_trial.workloads)

    for wl in range(0, 2):
        for metric in metrics:
            first_trial_val = first_trial_validations[wl + 1]
            first_metric = first_trial_val.metrics[metric]
            second_trial_val = second_trial_validations[wl]
            second_metric = second_trial_val.metrics[metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {wl} with metric: {metric}"
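
# For illustration only: a hypothetical shape of the validation metrics such a fixture might
# emit (the metric names and frameworks here are assumptions made for the example, not the
# actual fixtures passed to this helper):
#
#     {
#         "np_rand": float(np.random.rand()),      # numpy RNG
#         "rand_rand": random.random(),            # Python stdlib RNG
#         "framework_rand": <framework RNG value>, # e.g. a torch or tf random draw
#     }
#
# Because checkpointing captures and restores every RNG state, the warm-started trial's
# validations compared in the loop above should reproduce these values exactly.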
def test_pytorch_gradient_aggregation() -> None:
    config = conf.load_config(conf.fixtures_path("pytorch_identity/distributed.yaml"))

    exp_id = exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_identity"), 1)
    trials = exp.experiment_trials(exp_id)
    assert len(trials) == 1
    workloads = exp.workloads_with_validation(trials[0].workloads)
    actual_weights = []
    for wl in workloads:
        if wl.metrics:
            actual_weights.append(wl.metrics["weight"])

    # Independently compute the expected metrics.
    batch_size = 4
    epoch_size = 64
    num_epochs = 3
    batches = [
        (v[:], v[:])
        for v in (
            [x * 0.1 + 1.0 for x in range(y, y + batch_size)]
            for y in (z % epoch_size for z in range(0, epoch_size * num_epochs, batch_size))
        )
    ]

    lr = 0.001

    def compute_expected_weight(data: List[float], label: List[float], w: float) -> float:
        n = len(data)
        expected_step = 2.0 * lr * sum((d * (l - d * w) for d, l in zip(data, label))) / n
        return w + expected_step

    expected_weights = []
    weight = 0.0
    data: List[float] = []
    label: List[float] = []
    for i, batch in enumerate(batches):
        if i % 2 == 0:
            # For even-numbered batches the optimizer step is a no-op:
            # the weights don't change.
            data, label = batch
        else:
            additional_data, additional_label = batch
            data += additional_data
            label += additional_label
            weight = compute_expected_weight(data, label, weight)
        expected_weights.append(weight)

    assert actual_weights == pytest.approx(
        expected_weights
    ), f"{actual_weights} != {expected_weights}"
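
# A sketch of where the closed-form step in compute_expected_weight above comes from, assuming
# the pytorch_identity fixture trains a single scalar weight w on y = w * x with a mean-reduced
# squared-error loss (an assumption inferred from the formula itself, not from the fixture code):
#
#     loss(w)   = (1 / n) * sum_i (label_i - w * data_i) ** 2
#     dloss/dw  = -(2 / n) * sum_i data_i * (label_i - w * data_i)
#     SGD step: w <- w - lr * dloss/dw
#                  = w + (2 * lr / n) * sum_i data_i * (label_i - w * data_i)
#
# Aggregating gradients over two batches is therefore equivalent to taking a single step on the
# concatenation of their data and labels, which is exactly what the expected-weight loop above
# does for odd-numbered batches.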
def test_metric_gathering() -> None:
    """
    Confirm that metrics are gathered from the trial the way that we expect.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/const.yaml"), conf.fixtures_path("metric_maker"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Read the structure of the metrics directly from the config file.
    config = conf.load_config(conf.fixtures_path("metric_maker/const.yaml"))

    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]

    scheduling_unit = 100

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]

        actual = metrics["batch_metrics"]
        assert len(actual) == scheduling_unit

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(scheduling_unit)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        value = base_value + batches_trained
        expected = structure_to_metrics(value, validation_structure)
        assert structure_equal(expected, actual)
def test_custom_reducer_distributed(secrets: Dict[str, str], tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/distributed.yaml"))

    # Run with multiple steps to verify we are resetting reducers right.
    config = conf.set_max_length(config, {"batches": 2})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )

    trial = exp.experiment_trials(experiment_id)[0]
    last_validation = exp.workloads_with_validation(trial.workloads)[-1]
    metrics = last_validation.metrics

    label_sum = 2 * sum(range(16))
    assert metrics["label_sum_fn"] == label_sum
    assert metrics["label_sum_cls"] == label_sum
def test_nan_metrics() -> None:
    """
    Confirm that NaN and Infinity metrics are gathered from the trial.
    """
    exp_id = exp.run_basic_test(
        conf.fixtures_path("metric_maker/nans.yaml"), conf.fixtures_path("metric_maker"), 1
    )
    trials = exp.experiment_trials(exp_id)
    config = conf.load_config(conf.fixtures_path("metric_maker/nans.yaml"))
    base_value = config["hyperparameters"]["starting_base_value"]
    gain_per_batch = config["hyperparameters"]["gain_per_batch"]

    # Infinity and NaN cannot be processed in the YAML->JSON deserializer.
    # Add them to expected values here.
    training_structure = config["hyperparameters"]["training_structure"]["val"]
    training_structure["inf"] = "Infinity"
    training_structure["nan"] = "NaN"
    training_structure["nanarray"] = ["NaN", "NaN"]
    validation_structure = config["hyperparameters"]["validation_structure"]["val"]
    validation_structure["neg_inf"] = "-Infinity"

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0].trial.id)
    batches_trained = 0
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        actual = metrics["batch_metrics"]

        first_base_value = base_value + batches_trained
        batch_values = first_base_value + gain_per_batch * np.arange(5)
        expected = [structure_to_metrics(value, training_structure) for value in batch_values]
        assert structure_equal(expected, actual)
        batches_trained = step["total_batches"]

    # Check validation metrics.
    validation_workloads = exp.workloads_with_validation(trials[0].workloads)
    for validation in validation_workloads:
        actual = validation.metrics
        batches_trained = validation.totalBatches

        expected = structure_to_metrics(base_value, validation_structure)
        assert structure_equal(expected, actual)
def test_noop_single_warm_start() -> None:
    experiment_id1 = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0].trial
    first_trial_id = first_trial.id
    first_workloads = trials[0].workloads
    assert len(first_workloads) == 90
    checkpoints = exp.workloads_with_checkpoint(first_workloads)
    assert len(checkpoints) == 30
    first_checkpoint_uuid = checkpoints[0].uuid
    last_checkpoint_uuid = checkpoints[-1].uuid
    last_validation = exp.workloads_with_validation(first_workloads)[-1]
    assert last_validation.metrics["validation_error"] == pytest.approx(0.9 ** 30)

    config_base = conf.load_config(conf.fixtures_path("no_op/single.yaml"))

    # Test source_trial_id.
    config_obj = copy.deepcopy(config_base)
    # Add a source trial ID to warm start from.
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1

    second_trial = trials[0]
    assert len(second_trial.workloads) == 90

    # The second trial should have a warm-start checkpoint UUID.
    assert second_trial.trial.warmStartCheckpointUuid == last_checkpoint_uuid

    val_workloads = exp.workloads_with_validation(second_trial.workloads)
    assert val_workloads[-1].metrics["validation_error"] == pytest.approx(0.9 ** 60)

    # Now test source_checkpoint_uuid.
    config_obj = copy.deepcopy(config_base)
    # Add a source checkpoint UUID to warm start from.
    config_obj["searcher"]["source_checkpoint_uuid"] = first_checkpoint_uuid

    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id3 = exp.run_basic_test(tf.name, conf.fixtures_path("no_op"), 1)

    trials = exp.experiment_trials(experiment_id3)
    assert len(trials) == 1

    third_trial = trials[0]
    assert len(third_trial.workloads) == 90
    assert third_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    validations = exp.workloads_with_validation(third_trial.workloads)
    assert validations[1].metrics["validation_error"] == pytest.approx(0.9 ** 3)
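
# A note on the expected values above, assuming the no_op fixture reports
# validation_error = 0.9 ** n after n total completed steps (an inference from the assertions
# rather than from the fixture itself): the fresh trial ends at 0.9 ** 30, warm starting from
# its last checkpoint and training 30 more steps ends at 0.9 ** 60, and the 0.9 ** 3 check on
# the third trial confirms that it resumed from the very first checkpoint rather than the last.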
def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.tutorials_path("mnist_pytorch"),
        None,
    )

    # Check that the validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.workloads) > 0
        last_validation = exp.workloads_with_validation(trial.workloads)[-1]
        accuracy = last_validation.metrics["accuracy"]
        if not best or accuracy > best:
            best = accuracy
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=True
    )

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that the metrics are truly in sorted order.
    assert all(c.training is not None for c in top_k)
    metrics = [
        c.training.validation_metrics["avgMetrics"]["validation_loss"]
        for c in top_k
        if c.training is not None
    ]

    assert metrics == sorted(metrics)

    # Check that changing smaller_is_better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    # Make sure the checkpoint metadata is correct and correctly saved to the db.
    # Beginning with 0.18, the system contributes a few items to the dict.
    assert checkpoint.metadata.get("testing") == "metadata"
    assert checkpoint.metadata.keys() == {"format", "framework", "steps_completed", "testing"}
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"some_key": "some_value"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "metadata", "some_key": "some_value"}.items()
    assert checkpoint.metadata.keys() == {
        "format",
        "framework",
        "steps_completed",
        "testing",
        "some_key",
    }
    assert checkpoint.metadata == db_check.metadata

    checkpoint.add_metadata({"testing": "override"})
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert checkpoint.metadata.items() > {"testing": "override", "some_key": "some_value"}.items()
    assert checkpoint.metadata == db_check.metadata

    checkpoint.remove_metadata(["some_key"])
    db_check = d.get_checkpoint(checkpoint.uuid)
    assert "some_key" not in checkpoint.metadata
    assert checkpoint.metadata["testing"] == "override"
    assert checkpoint.metadata == db_check.metadata
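
# The metadata assertions above exercise the following behavior of the checkpoint metadata
# calls, stated here only as observed by this test rather than as a formal API contract:
# add_metadata() merges keys into the existing dict and overrides keys that already exist,
# remove_metadata() deletes the named keys, the system itself contributes "format",
# "framework", and "steps_completed" beginning with 0.18, and get_checkpoint() always
# reflects the state persisted in the db.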