# NOTE: import block reconstructed for context; `conf`, `exp`, and `cluster`
# are Determined's project-internal test helpers, and names such as
# NativeImplementation, run_warm_start_test, and export_and_load_model are
# assumed to be provided by sibling test modules.
import copy
from typing import Any, Callable, Dict, List, Optional, Union

import pytest
from determined.experimental import Determined
from tensorflow.python.training.tracking.tracking import AutoTrackable

from tests import cluster
from tests import config as conf
from tests import experiment as exp


def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1
    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 3
    checkpoint_workloads = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoint_workloads[0].uuid

    config_obj = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(config_obj)
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.cv_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0].trial.warmStartCheckpointUuid == first_checkpoint_uuid

def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1
    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 1
    first_checkpoint_id = first_trial["steps"][0]["checkpoint"]["id"]

    config_obj = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(config_obj)
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0]["warm_start_checkpoint_id"] == first_checkpoint_id

def _test_rng_restore(fixture: str, metrics: list, tf2: Union[None, bool] = None) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit
    random numbers from all of the RNGs used in the relevant framework as
    metrics. The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep
    all checkpoints).
    """
    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment = exp.run_basic_test_with_temp_config(
        config,
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial.workloads) >= 4

    first_checkpoint = exp.workloads_with_checkpoint(first_trial.workloads)[0]
    first_checkpoint_uuid = first_checkpoint.uuid

    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config["searcher"]["source_checkpoint_uuid"] = first_checkpoint.uuid

    experiment2 = exp.run_basic_test_with_temp_config(config, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial.workloads) >= 4
    assert second_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    first_trial_validations = exp.workloads_with_validation(first_trial.workloads)
    second_trial_validations = exp.workloads_with_validation(second_trial.workloads)

    # The second trial was forked from the first checkpoint, so its validation
    # at index wl lines up with the parent's validation at index wl + 1.
    for wl in range(0, 2):
        for metric in metrics:
            first_trial_val = first_trial_validations[wl + 1]
            first_metric = first_trial_val.metrics[metric]
            second_trial_val = second_trial_validations[wl]
            second_metric = second_trial_val.metrics[metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {wl} with metric: {metric}"

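# A hypothetical caller of the helper above, sketching how a framework-specific
# test would wire it up; the fixture name and metric keys are illustrative and
# not real fixtures in this repo.
def test_estimator_rng_restore_example(tf2: bool) -> None:
    _test_rng_restore("estimator_rng_saver", ["np_rand", "tf_rand"], tf2=tf2)
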
def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0]["steps"]
    assert len(steps) == 1
    step = steps[0]
    assert "validation" in step

    v_metrics = step["validation"]["metrics"]["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100

        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0

def test_tf_keras_const_warm_start(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm-start checkpoint UUID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)

def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id

def run_dataset_experiment(
    searcher_max_steps: int,
    batches_per_step: int,
    secrets: Dict[str, str],
    tf2: bool,
    slots_per_trial: int = 1,
    source_trial_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/const.yaml"))
    config.setdefault("searcher", {})
    config["searcher"]["max_steps"] = searcher_max_steps
    config["batches_per_step"] = batches_per_step
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    if source_trial_id is not None:
        config["searcher"]["source_trial_id"] = source_trial_id

    config.setdefault("resources", {})
    config["resources"]["slots_per_trial"] = slots_per_trial

    if cluster.num_agents() > 1:
        config["checkpoint_storage"] = exp.s3_checkpoint_config(secrets)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )
    return exp.experiment_trials(experiment_id)

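# A hypothetical usage sketch for run_dataset_experiment: run a short
# experiment, then fork a second one from its first trial via source_trial_id.
# The argument values are illustrative; `secrets` comes from the test harness.
def example_dataset_warm_start(secrets: Dict[str, str]) -> None:
    trials = run_dataset_experiment(1, 100, secrets, tf2=False)
    run_dataset_experiment(
        2, 100, secrets, tf2=False, source_trial_id=str(trials[0]["id"])
    )
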
def test_mnist_estimator_adaptive(tf2: bool) -> None:
    # Only test tf1 here, because a tf2 test would add no extra coverage.
    config = conf.load_config(conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), None
    )

def test_tf_estimator_warm_start(implementation: NativeImplementation, tf2: bool) -> None:
    implementation = implementation._replace(
        configuration=(
            conf.set_tf2_image(implementation.configuration)
            if tf2
            else conf.set_tf1_image(implementation.configuration)
        )
    )
    run_warm_start_test(implementation)

def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)

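# A minimal sketch of what the loaded object supports, assuming the checkpoint
# was written as a TF SavedModel; the signature name is the TF default and is
# illustrative here, not something this test asserts.
def example_use_loaded_model(model: AutoTrackable) -> None:
    # SavedModel loads expose their serving functions via `signatures`.
    infer = model.signatures["serving_default"]
    print(infer.structured_outputs)
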
def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1
    )
    exp.assert_performed_initial_validation(exp_id)

def test_tf_keras_single_gpu(tf2: bool) -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def run_mnist_estimator_data_layer_test(tf2: bool, storage_type: str) -> None:
    config = conf.load_config(
        conf.data_layer_examples_path("data_layer_mnist_estimator/const.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.data_layer_examples_path("data_layer_mnist_estimator"), 1
    )

def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def run_mnist_estimator_data_layer_test(tf2: bool, storage_type: str) -> None:
    config = conf.load_config(conf.experimental_path("data_layer_mnist_estimator/const.yaml"))
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("data_layer_mnist_estimator"), 1
    )

def test_tf_keras_native_parallel(tf2: bool) -> None:
    config = conf.load_config(
        conf.official_examples_path("trial/cifar10_cnn_tf_keras/const.yaml")
    )
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, True)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/cifar10_cnn_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def test_mnist_estimator_data_layer_parallel(storage_type: str, secrets: Dict[str, str]) -> None:
    config = conf.load_config(
        conf.experimental_path("trial/data_layer_mnist_estimator/const.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/data_layer_mnist_estimator"), 1
    )

def test_mnist_estimator_data_layer_parallel(storage_type: str) -> None:
    config = conf.load_config(conf.experimental_path("data_layer_mnist_estimator/const.yaml"))
    config = conf.set_max_steps(config, 2)
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("data_layer_mnist_estimator"), 1
    )

def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> None:
    config = conf.load_config(
        conf.features_examples_path("data_layer_mnist_tf_keras/const.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.features_examples_path("data_layer_mnist_tf_keras"), 1
    )

def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, False)
    config = conf.set_max_steps(config, 2)
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def test_mnist_estimator_const_parallel(native_parallel: bool, tf2: bool) -> None:
    if tf2 and native_parallel:
        pytest.skip("TF2 native parallel training is not currently supported.")

    config = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, native_parallel)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

def test_tf_keras_single_gpu(tf2: bool) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    exp.export_and_load_model(experiment_id)

def test_tf_keras_mnist_data_layer_parallel(
    tf2: bool, storage_type: str, secrets: Dict[str, str]
) -> None:
    config = conf.load_config(
        conf.features_examples_path("data_layer_mnist_tf_keras/const.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp.run_basic_test_with_temp_config(
        config, conf.features_examples_path("data_layer_mnist_tf_keras"), 1
    )

def test_mnist_estimator_const_parallel(native_parallel: bool, tf2: bool) -> None:
    if tf2 and native_parallel:
        pytest.skip("TF2 native parallel training is not currently supported.")

    config = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, native_parallel)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1, has_zeroth_step=True
    )
    exp.assert_performed_initial_validation(exp_id)

def test_custom_reducer_distributed(secrets: Dict[str, str], tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/distributed.yaml"))
    # Run with multiple steps to verify we are resetting reducers right.
    config = conf.set_max_length(config, {"batches": 2})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )

    trial = exp.experiment_trials(experiment_id)[0]
    last_validation = trial["steps"][-1]["validation"]
    metrics = last_validation["metrics"]["validation_metrics"]
    label_sum = 2 * sum(range(16))
    assert metrics["label_sum_fn"] == label_sum
    assert metrics["label_sum_cls"] == label_sum

def test_tf_keras_mnist_data_layer_parallel(
    tf2: bool,
    storage_type: str,
    secrets: Dict[str, str],
    collect_trial_profiles: Callable[[int], None],
) -> None:
    config = conf.load_config(conf.fixtures_path("data_layer_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)
    if storage_type == "lfs":
        config = conf.set_shared_fs_data_layer(config)
    else:
        config = conf.set_s3_data_layer(config)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("data_layer_tf_keras"), 1
    )
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    collect_trial_profiles(trial_id)

def test_tf_keras_parallel(
    aggregation_frequency: int, tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    exp_val_batches = (validation_size + (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    exp.assert_patterns_in_trial_logs(trials[0].trial.id, patterns)

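# A worked instance of the batch arithmetic above, with illustrative numbers:
# a global batch size of 32 split across 8 slots gives a per-slot batch of 4,
# so validating 10000 records takes ceil(10000 / 4) = 2500 per-slot batches.
def example_expected_val_batches() -> int:
    validation_size, global_batch_size, num_workers = 10000, 32, 8
    per_slot_batch_size = global_batch_size // num_workers  # 4
    return (validation_size + per_slot_batch_size - 1) // per_slot_batch_size  # 2500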