def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    """Run a profiling-enabled experiment and hit the metric-streaming APIs.

    Creates a const experiment for *framework_base_experiment*, waits for it to
    complete, then requests profiling metric labels, GPU system metrics (when a
    GPU is available), and PyTorch timing metrics (when the framework reports
    timings).
    """
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")
    config_obj = conf.load_config(config_path)
    # Profiling must be enabled for the streaming endpoints to have any data.
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        # Submit while the temp file still exists on disk.
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")

    trials = exp.experiment_trials(experiment_id)
    # NOTE(review): trials are dicts here ("id"), unlike the `.trial.id`
    # accessors used elsewhere in this file — presumably an older API; verify.
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")
def test_tf_keras_const_warm_start(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    """Warm-start a second cifar10 tf_keras run from the first run's checkpoint.

    Runs a const experiment, grabs its first checkpoint, then launches a second
    experiment whose searcher warm-starts from the first trial and asserts the
    warm-start checkpoint UUID matches.
    """
    cfg = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 1000})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    first_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    first_trials = exp.experiment_trials(first_exp_id)
    assert len(first_trials) == 1
    source_trial = first_trials[0]
    source_trial_id = source_trial.trial.id

    assert len(source_trial.workloads) == 4
    ckpt_workloads = exp.workloads_with_checkpoint(source_trial.workloads)
    source_ckpt_uuid = ckpt_workloads[0].uuid

    # Add a source trial ID to warm start from.
    cfg["searcher"]["source_trial_id"] = source_trial_id
    second_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    warm_trials = exp.experiment_trials(second_exp_id)
    assert len(warm_trials) == 1
    for warm_trial in warm_trials:
        assert warm_trial.trial.warmStartCheckpointUuid != ""
        assert warm_trial.trial.warmStartCheckpointUuid == source_ckpt_uuid
    collect_trial_profiles(warm_trials[0].trial.id)
def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run cifar10_pytorch via the horovod launch layer and verify shutdown.

    Loads the latest checkpoint on CPU and checks the trial logs contain the
    clean-exit allocation message.
    """
    cfg = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_profiling_enabled(cfg)
    cfg = conf.set_entrypoint(
        cfg, "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id

    # Confirm the latest checkpoint can be loaded onto the CPU.
    latest_ckpt = Determined(conf.make_master_url()).get_trial(trial_id).select_checkpoint(
        latest=True
    )
    latest_ckpt.load(map_location="cpu")

    collect_trial_profiles(trial_id)
    assert exp.check_if_string_present_in_trial_logs(
        trial_id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )
def test_pytorch_const_with_amp(
    api_style: str, collect_trial_profiles: Callable[[int], None]
) -> None:
    """Run the pytorch AMP fixture for the given API style and collect profiles."""
    cfg = conf.load_config(conf.fixtures_path(f"pytorch_amp/{api_style}_amp.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("pytorch_amp"), 1
    )
    collect_trial_profiles(exp.experiment_trials(experiment_id)[0].trial.id)
def test_pytorch_load(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run mnist_pytorch and verify its top checkpoint loads on CPU."""
    cfg = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.tutorials_path("mnist_pytorch"), 1
    )

    client = Determined(conf.make_master_url())
    client.get_experiment(experiment_id).top_checkpoint().load(map_location="cpu")

    collect_trial_profiles(exp.experiment_trials(experiment_id)[0].trial.id)
def test_tf_keras_mnist_parallel(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run fashion_mnist_tf_keras distributed across 8 slots."""
    cfg = conf.load_config(conf.tutorials_path("fashion_mnist_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.tutorials_path("fashion_mnist_tf_keras"), 1
    )
    completed_trials = exp.experiment_trials(experiment_id)
    assert len(completed_trials) == 1
    collect_trial_profiles(completed_trials[0].trial.id)
def run_tf_keras_dcgan_example(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run the dcgan_tf_keras example on 8 slots and collect trial profiles."""
    cfg = conf.load_config(conf.gan_examples_path("dcgan_tf_keras/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_tf2_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.gan_examples_path("dcgan_tf_keras"), 1
    )
    collect_trial_profiles(exp.experiment_trials(experiment_id)[0].trial.id)
def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> int:
    """Run the data_layer_tf_keras fixture with the requested storage backend.

    storage_type "lfs" uses a shared-fs data layer; anything else uses S3.
    Returns the completed experiment's ID.
    """
    cfg = conf.load_config(conf.fixtures_path("data_layer_tf_keras/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 1000})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)
    if storage_type == "lfs":
        cfg = conf.set_shared_fs_data_layer(cfg)
    else:
        cfg = conf.set_s3_data_layer(cfg)

    return exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("data_layer_tf_keras"), 1
    )
def test_tf_keras_tf2_disabled(collect_trial_profiles: Callable[[int], None]) -> None:
    """Keras on tf2 with tf2 and eager execution disabled."""
    cfg = conf.load_config(conf.fixtures_path("keras_tf2_disabled_no_op/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 1})
    cfg = conf.set_tf2_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("keras_tf2_disabled_no_op"), 1
    )
    completed_trials = exp.experiment_trials(experiment_id)
    assert len(completed_trials) == 1

    # Exporting and reloading the model must work even with tf2 disabled.
    export_and_load_model(experiment_id)
    collect_trial_profiles(completed_trials[0].trial.id)
def test_pytorch_gan_parallel(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run gan_mnist_pytorch on 8 slots and load its latest checkpoint on CPU."""
    cfg = conf.load_config(conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.gan_examples_path("gan_mnist_pytorch"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id

    # Confirm the latest checkpoint can be loaded onto the CPU.
    latest_ckpt = Determined(conf.make_master_url()).get_trial(trial_id).select_checkpoint(
        latest=True
    )
    latest_ckpt.load(map_location="cpu")

    collect_trial_profiles(trial_id)
def test_tf_keras_single_gpu(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    """Run cifar10 tf_keras on a single slot and export its checkpoint."""
    cfg = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    completed_trials = exp.experiment_trials(experiment_id)
    assert len(completed_trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(completed_trials[0].trial.id)
def test_tf_keras_mnist_data_layer_parallel(
    tf2: bool,
    storage_type: str,
    secrets: Dict[str, str],
    collect_trial_profiles: Callable[[int], None],
) -> None:
    """Run the data_layer_tf_keras fixture distributed on 8 slots.

    storage_type "lfs" uses a shared-fs data layer; anything else uses S3.
    """
    cfg = conf.load_config(conf.fixtures_path("data_layer_tf_keras/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)
    if storage_type == "lfs":
        cfg = conf.set_shared_fs_data_layer(cfg)
    else:
        cfg = conf.set_s3_data_layer(cfg)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("data_layer_tf_keras"), 1
    )
    collect_trial_profiles(exp.experiment_trials(experiment_id)[0].trial.id)
def test_tf_keras_parallel(
    aggregation_frequency: int, tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    """Run cifar10 tf_keras on 8 slots and verify record/batch counts in the logs.

    Also exports the resulting checkpoint and collects trial profiles.
    Fix: removed a duplicated `global_batch_size` assignment (the value was
    read from the config twice in a row).
    """
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_aggregation_frequency(config, aggregation_frequency)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Test exporting a checkpoint.
    export_and_load_model(experiment_id)
    collect_trial_profiles(trials[0].trial.id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    # Ceiling division: per-slot batches needed to cover the validation set.
    exp_val_batches = (validation_size + (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    exp.assert_patterns_in_trial_logs(trials[0].trial.id, patterns)
def test_pytorch_11_const(
    aggregation_frequency: int,
    using_k8s: bool,
    collect_trial_profiles: Callable[[int], None],
) -> None:
    """Run mnist_pytorch (pytorch 1.1 config), optionally with a k8s pod spec."""
    cfg = conf.load_config(conf.fixtures_path("mnist_pytorch/const-pytorch11.yaml"))
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    cfg = conf.set_profiling_enabled(cfg)

    if using_k8s:
        # Mount an emptyDir volume into the trial container to exercise
        # pod-spec plumbing end to end.
        pod_spec = {
            "metadata": {"labels": {"ci": "testing"}},
            "spec": {
                "containers": [
                    {
                        "name": "determined-container",
                        "volumeMounts": [{"name": "temp1", "mountPath": "/random"}],
                    }
                ],
                "volumes": [{"name": "temp1", "emptyDir": {}}],
            },
        }
        cfg = conf.set_pod_spec(cfg, pod_spec)

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.tutorials_path("mnist_pytorch"), 1
    )
    collect_trial_profiles(exp.experiment_trials(experiment_id)[0].trial.id)