def test_tf_keras_const_warm_start(tf2: bool) -> None:
    """Warm-start a cifar10 tf.keras trial from an earlier trial's checkpoint
    and verify the new trial records that checkpoint's ID."""
    example_dir = conf.official_examples_path("cifar10_cnn_tf_keras")

    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_max_steps(config, 2)
    if tf2:
        config = conf.set_tf2_image(config)
    else:
        config = conf.set_tf1_image(config)

    parent_exp_id = exp.run_basic_test_with_temp_config(config, example_dir, 1)
    parent_trials = exp.experiment_trials(parent_exp_id)
    assert len(parent_trials) == 1
    parent = parent_trials[0]
    assert len(parent["steps"]) == 2
    checkpoint_id = parent["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = parent["id"]
    child_exp_id = exp.run_basic_test_with_temp_config(config, example_dir, 1)

    # The new trials should have a warm start checkpoint ID.
    child_trials = exp.experiment_trials(child_exp_id)
    assert len(child_trials) == 1
    for trial in child_trials:
        assert trial["warm_start_checkpoint_id"] == checkpoint_id
def test_mnist_estimator_warm_start(tf2: bool) -> None:
    """Warm-start an estimator trial from an earlier trial and verify the
    child trial's ``warm_start_checkpoint_id`` matches the parent's checkpoint."""
    example_dir = conf.official_examples_path("mnist_estimator")
    # Select the TF image setter once; used for both experiments.
    set_image = conf.set_tf2_image if tf2 else conf.set_tf1_image

    parent_config = set_image(
        conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml")))
    parent_exp_id = exp.run_basic_test_with_temp_config(
        parent_config, example_dir, 1)

    parent_trials = exp.experiment_trials(parent_exp_id)
    assert len(parent_trials) == 1
    parent = parent_trials[0]
    assert len(parent.steps) == 1
    checkpoint_id = parent.steps[0].checkpoint.id

    # Build a fresh config that warm starts from the parent trial.
    child_config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single.yaml"))
    child_config["searcher"]["source_trial_id"] = parent.id
    child_config = set_image(child_config)

    child_exp_id = exp.run_basic_test_with_temp_config(
        child_config, example_dir, 1)
    child_trials = exp.experiment_trials(child_exp_id)
    assert len(child_trials) == 1
    assert child_trials[0].warm_start_checkpoint_id == checkpoint_id
def test_tensorpack_const() -> None:
    """Run the tensorpack MNIST const example and expect exactly one trial.

    NOTE(review): another ``test_tensorpack_const`` is defined later in this
    module; the later definition shadows this one at import time, so only one
    of the two actually runs under pytest — one should be renamed.
    """
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1)
    assert len(exp.experiment_trials(exp_id)) == 1
def test_pytorch_parallel() -> None:
    """Distributed mnist_pytorch run (8 slots, Horovod-style parallelism)
    with tensor auto-tuning enabled."""
    config = conf.load_config(
        conf.official_examples_path("mnist_pytorch/const.yaml"))
    for setter, value in (
        (conf.set_slots_per_trial, 8),
        (conf.set_native_parallel, False),
        (conf.set_max_steps, 2),
        (conf.set_tensor_auto_tuning, True),
    ):
        config = setter(config, value)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)
def test_pytorch_const_with_amp() -> None:
    """Run mnist_pytorch for two steps with O1 mixed-precision (AMP)."""
    base = conf.load_config(
        conf.official_examples_path("mnist_pytorch/const.yaml"))
    config = conf.set_amp_level(conf.set_max_steps(base, 2), "O1")
    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)
def test_tensorpack_const() -> None:
    """Run the tensorpack MNIST const example with shared-FS checkpoint
    storage and a root-home bind mount; expect exactly one trial.

    NOTE(review): another ``test_tensorpack_const`` is defined earlier in this
    module; this later definition shadows it, so only one of the two runs
    under pytest — one should be renamed.
    """
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    config["checkpoint_storage"] = exp.shared_fs_checkpoint_config()
    # BUG FIX: `config.get("bind_mounts", []).append(...)` appended to a
    # throwaway list whenever the key was absent, silently dropping the bind
    # mount. `setdefault` stores the list back into the config.
    config.setdefault("bind_mounts", []).append(exp.root_user_home_bind_mount())

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
def test_iris() -> None:
    """Run the iris tf.keras example and load the top checkpoint's model."""
    config = conf.set_max_steps(
        conf.load_config(conf.official_examples_path("iris_tf_keras/const.yaml")), 2)
    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("iris_tf_keras"), 1)

    experiment = Determined(conf.make_master_url()).get_experiment(exp_id)
    model = experiment.top_checkpoint().load()
    model.summary()
def test_tensorpack_native_parallel() -> None:
    """Run tensorpack MNIST with native parallelism across 8 slots."""
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    for setter, value in (
        (conf.set_slots_per_trial, 8),
        (conf.set_native_parallel, True),
        (conf.set_max_steps, 2),
    ):
        config = setter(config, value)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1)
    assert len(exp.experiment_trials(exp_id)) == 1
def test_tf_keras_mnist_parallel() -> None:
    """Distributed fashion-MNIST tf.keras run over 8 slots.

    NOTE(review): another ``test_tf_keras_mnist_parallel`` is defined later in
    this module and shadows this one at import time — rename one so both run.
    """
    config = conf.load_config(
        conf.official_examples_path("fashion_mnist_tf_keras/const.yaml"))
    for setter, value in (
        (conf.set_slots_per_trial, 8),
        (conf.set_native_parallel, False),
        (conf.set_max_steps, 2),
    ):
        config = setter(config, value)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("fashion_mnist_tf_keras"), 1)
    assert len(exp.experiment_trials(exp_id)) == 1
def test_s3_no_creds(secrets: Dict[str, str]) -> None:
    """Run mnist_pytorch with credential-less S3 checkpoint storage, passing
    AWS keys via environment variables instead of the storage config."""
    pytest.skip("Temporarily skipping this until we find a more secure way of testing this.")
    config = conf.load_config(conf.official_examples_path("mnist_pytorch/const.yaml"))
    config["checkpoint_storage"] = exp.s3_checkpoint_config_no_creds()

    environment = config.setdefault("environment", {})
    env_vars = environment.setdefault("environment_variables", [])
    env_vars += [
        f"AWS_ACCESS_KEY_ID={secrets['INTEGRATIONS_S3_ACCESS_KEY']}",
        f"AWS_SECRET_ACCESS_KEY={secrets['INTEGRATIONS_S3_SECRET_KEY']}",
    ]

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)
def test_pytorch_cifar10_const() -> None:
    """Run cifar10_cnn_pytorch and load its latest checkpoint as an nn.Module."""
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1)
    trials = exp.experiment_trials(exp_id)

    trial_ref = Determined().get_trial(trials[0].id)
    model = trial_ref.select_checkpoint(latest=True).load()
    assert isinstance(model, torch.nn.Module)
def test_tf_keras_single_gpu(tf2: bool) -> None:
    """Single-slot cifar10 tf.keras run producing exactly one trial.

    NOTE(review): a second ``test_tf_keras_single_gpu`` defined later in this
    module shadows this one at import time — rename one so both run.
    """
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_steps(config, 2)
    if tf2:
        config = conf.set_tf2_image(config)
    else:
        config = conf.set_tf1_image(config)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1)
    assert len(exp.experiment_trials(exp_id)) == 1
def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    """Distributed cifar10 tf.keras run (8 slots) at the given gradient
    aggregation frequency."""
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    for setter, value in (
        (conf.set_slots_per_trial, 8),
        (conf.set_native_parallel, False),
        (conf.set_max_steps, 2),
        (conf.set_aggregation_frequency, aggregation_frequency),
    ):
        config = setter(config, value)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1)
    assert len(exp.experiment_trials(exp_id)) == 1
def test_tf_keras_single_gpu(tf2: bool) -> None:
    """Single-slot cifar10 tf.keras run with shared-FS checkpoint storage and
    a root-home bind mount; expect exactly one trial.

    NOTE(review): this definition shadows the earlier
    ``test_tf_keras_single_gpu`` in the module — rename one so both run.
    """
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config["checkpoint_storage"] = exp.shared_fs_checkpoint_config()
    # BUG FIX: `config.get("bind_mounts", []).append(...)` appended to a
    # throwaway list whenever the key was absent, silently dropping the bind
    # mount. `setdefault` stores the list back into the config.
    config.setdefault("bind_mounts", []).append(exp.root_user_home_bind_mount())
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
def test_pytorch_const_parallel(aggregation_frequency: int, use_amp: bool) -> None:
    """Distributed mnist_pytorch run (8 slots), optionally with O1 AMP."""
    config = conf.load_config(
        conf.official_examples_path("mnist_pytorch/const.yaml"))
    for setter, value in (
        (conf.set_slots_per_trial, 8),
        (conf.set_native_parallel, False),
        (conf.set_max_steps, 2),
        (conf.set_aggregation_frequency, aggregation_frequency),
    ):
        config = setter(config, value)
    if use_amp:
        config = conf.set_amp_level(config, "O1")

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)
def test_pytorch_cifar10_parallel() -> None:
    """Distributed cifar10_cnn_pytorch run; load the latest checkpoint onto CPU."""
    config = conf.load_config(
        conf.official_examples_path("cifar10_cnn_pytorch/const.yaml"))
    config = conf.set_max_steps(config, 2)
    config = conf.set_slots_per_trial(config, 8)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_pytorch"), 1)
    first_trial_id = exp.experiment_trials(exp_id)[0].id

    checkpoint = (
        Determined(conf.make_master_url())
        .get_trial(first_trial_id)
        .select_checkpoint(latest=True)
    )
    model = checkpoint.load(map_location=torch.device("cpu"))
    assert isinstance(model, torch.nn.Module)
def test_tf_keras_mnist_parallel() -> None:
    """Distributed fashion-MNIST tf.keras run (8 slots) with shared-FS
    checkpoint storage and a root-home bind mount; expect exactly one trial.

    NOTE(review): this definition shadows the earlier
    ``test_tf_keras_mnist_parallel`` in the module — rename one so both run.
    """
    config = conf.load_config(
        conf.official_examples_path("fashion_mnist_tf_keras/const.yaml"))
    config["checkpoint_storage"] = exp.shared_fs_checkpoint_config()
    # BUG FIX: `config.get("bind_mounts", []).append(...)` appended to a
    # throwaway list whenever the key was absent, silently dropping the bind
    # mount. `setdefault` stores the list back into the config.
    config.setdefault("bind_mounts", []).append(exp.root_user_home_bind_mount())
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, False)
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("fashion_mnist_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
def test_tensorpack_parallel(aggregation_frequency: int) -> None:
    """Distributed tensorpack MNIST run (8 slots) at the given gradient
    aggregation frequency, with shared-FS checkpoint storage and a root-home
    bind mount; expect exactly one trial."""
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    config["checkpoint_storage"] = exp.shared_fs_checkpoint_config()
    # BUG FIX: `config.get("bind_mounts", []).append(...)` appended to a
    # throwaway list whenever the key was absent, silently dropping the bind
    # mount. `setdefault` stores the list back into the config.
    config.setdefault("bind_mounts", []).append(exp.root_user_home_bind_mount())
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, False)
    config = conf.set_max_steps(config, 2)
    config = conf.set_aggregation_frequency(config, aggregation_frequency)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1)
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
def test_pytorch_11_const(aggregation_frequency: int) -> None:
    """Run the pytorch-1.1-pinned const fixture at the given gradient
    aggregation frequency."""
    base = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    config = conf.set_aggregation_frequency(base, aggregation_frequency)
    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)
def test_create_test_mode() -> None:
    """`det experiment create --test-mode` succeeds for a valid trial and
    exits nonzero when the trial implementation raises."""
    base_command = [
        "det",
        "-m",
        conf.make_master_url(),
        "experiment",
        "create",
        "--test-mode",
    ]

    # test-mode should succeed with a valid experiment.
    ok_command = base_command + [
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
    ]
    output = subprocess.check_output(ok_command, universal_newlines=True)
    assert "Model definition test succeeded" in output

    # test-mode should fail when an error is introduced into the trial
    # implementation.
    bad_command = base_command + [
        conf.fixtures_path("trial_error/const.yaml"),
        conf.fixtures_path("trial_error"),
    ]
    with pytest.raises(subprocess.CalledProcessError):
        subprocess.check_call(bad_command)
def test_mnist_estimator_const(tf2: bool) -> None:
    """Single-trial estimator run; sanity-check validation and training metrics."""
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)
    trials = exp.experiment_trials(exp_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0].steps
    assert len(steps) == 1
    step = steps[0]
    assert "validation" in step
    validation_metrics = step.validation.metrics["validation_metrics"]
    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert validation_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics: every batch should report a positive loss.
    full_trial_metrics = exp.trial_metrics(trials[0].id)
    for training_step in full_trial_metrics.steps:
        batch_metrics = training_step.metrics["batch_metrics"]
        assert len(batch_metrics) == 100
        assert all(bm["loss"] > 0 for bm in batch_metrics)
def test_mnist_estimator_adaptive(tf2: bool) -> None:
    """Run the adaptive-searcher MNIST estimator fixture to completion.

    NOTE(review): the original comment claimed "Only test tf1 here, because a
    tf2 test would add no extra coverage", yet the body selects the TF2 or TF1
    image from the ``tf2`` parameter — presumably the fixture parametrization
    restricts this to tf1; verify against the conftest parametrization.
    """
    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/adaptive.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), None)
def test_pytorch_load() -> None:
    """The top checkpoint of a pytorch experiment loads as an nn.Module."""
    config = conf.load_config(
        conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_pytorch"), 1)

    model = Determined().get_experiment(exp_id).top_checkpoint().load()
    assert isinstance(model, torch.nn.Module)
def test_mnist_estimator_load() -> None:
    """The top checkpoint of a TF1 estimator experiment loads as a TF
    AutoTrackable object."""
    config = conf.set_tf1_image(
        conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml")))
    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)

    first_trial = exp.experiment_trials(exp_id)[0]
    model = Determined().get_trial(first_trial.id).top_checkpoint().load()
    assert isinstance(model, AutoTrackable)
def test_pytorch_const_warm_start() -> None:
    """
    Test that specifying an earlier trial checkpoint to warm-start from
    correctly populates the later trials' `warm_start_checkpoint_id` fields.
    """
    model_dir = conf.official_examples_path("mnist_pytorch")

    parent_config = conf.load_config(
        conf.official_examples_path("mnist_pytorch/const.yaml"))
    parent_config = conf.set_max_steps(parent_config, 2)
    parent_exp_id = exp.run_basic_test_with_temp_config(parent_config, model_dir, 1)

    parent_trials = exp.experiment_trials(parent_exp_id)
    assert len(parent_trials) == 1
    parent = parent_trials[0]
    assert len(parent["steps"]) == 2
    checkpoint_id = parent["steps"][-1]["checkpoint"]["id"]

    # Change the search method to random, and add a source trial ID to warm
    # start from.
    child_config = conf.load_config(
        conf.official_examples_path("mnist_pytorch/const.yaml"))
    child_config["searcher"]["source_trial_id"] = parent["id"]
    child_config["searcher"]["name"] = "random"
    child_config["searcher"]["max_steps"] = 1
    child_config["searcher"]["max_trials"] = 3
    child_exp_id = exp.run_basic_test_with_temp_config(child_config, model_dir, 3)

    child_trials = exp.experiment_trials(child_exp_id)
    assert len(child_trials) == 3
    for trial in child_trials:
        assert trial["warm_start_checkpoint_id"] == checkpoint_id
def test_mnist_estimmator_const_parallel(native_parallel: bool, tf2: bool) -> None:
    """Multi-slot estimator run, either natively parallel or via Horovod.

    NOTE(review): "estimmator" in the test name is a typo for "estimator";
    renaming would change the pytest test ID, so it is only flagged here.
    """
    if tf2 and native_parallel:
        pytest.skip("TF2 native parallel training is not currently supported.")

    config = conf.load_config(
        conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, native_parallel)
    config = conf.set_max_steps(config, 2)
    if tf2:
        config = conf.set_tf2_image(config)
    else:
        config = conf.set_tf1_image(config)

    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1)
def test_end_to_end_adaptive() -> None:
    """Adaptive-search pytorch run: check accuracy and checkpoint ordering."""
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial.steps)
        accuracy = trial.steps[-1].validation.metrics["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy
    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    exp_ref = Determined().get_experiment(exp_id)
    top_2_uuids = [c.uuid for c in exp_ref.top_n_checkpoints(2)]
    top_k = exp_ref.top_n_checkpoints(len(trials))
    top_k_uuids = [c.uuid for c in top_k]
    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    losses = [
        c.validation.metrics["validation_metrics"]["validation_loss"]
        for c in top_k
    ]
    assert losses == sorted(losses)

    # Check that changing smaller is better reverses the checkpoint ordering.
    reversed_uuids = [
        c.uuid
        for c in exp_ref.top_n_checkpoints(
            len(trials), sort_by="validation_loss", smaller_is_better=False
        )
    ]
    assert top_k_uuids == reversed_uuids[::-1]
def test_mnist_tf1_15() -> None:
    """Run the TF 1.15 MNIST fixture on a hand-pinned task image."""
    pytest.skip("Ignore until we have official support for tf1.15.")
    config = conf.load_config(conf.fixtures_path("mnist_tf/const.yaml"))
    # TODO(brian + yoni): don't hardcode the TF1.15 image once a TF1.15
    # golden image is built.
    config.setdefault("environment", {})
    config["environment"]["image"] = (
        "573932760021.dkr.ecr.us-west-2.amazonaws.com"
        "/determinedai/task-environment:"
        "c8750377f18ff0a738229adcf16a50685ef41631779616cdc86c0655fc554704"
    )
    # This particular configuration takes a long time to build, so wait longer
    # than normal.
    exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tf"), None, max_wait_secs=3000
    )
def test_invalid_experiment() -> None:
    """Creating an experiment from an invalid config must exit nonzero."""
    result = exp.maybe_create_experiment(
        conf.fixtures_path("invalid_experiment/const.yaml"),
        conf.official_examples_path("mnist_tf"),
    )
    assert result.returncode != 0