def test_tf_keras_const_warm_start(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id
    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)

def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id
    assert len(first_trial.workloads) == 3
    checkpoint_workloads = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoint_workloads[0].uuid

    config_obj = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(config_obj)
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.cv_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0].trial.warmStartCheckpointUuid == first_checkpoint_uuid

def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]
    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id

def test_mnist_estimator_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]
    assert len(first_trial["steps"]) == 1
    first_checkpoint_id = first_trial["steps"][0]["checkpoint"]["id"]

    config_obj = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config_obj["searcher"]["source_trial_id"] = first_trial_id
    config_obj = conf.set_tf2_image(config_obj) if tf2 else conf.set_tf1_image(config_obj)
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config_obj, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    assert trials[0]["warm_start_checkpoint_id"] == first_checkpoint_id

def _test_rng_restore(fixture: str, metrics: list, tf2: Union[None, bool] = None) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit random
    numbers from all of the RNGs used in the relevant framework as metrics.
    The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    experiment = exp.run_basic_test_with_temp_config(
        config,
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial.workloads) >= 4

    first_checkpoint = exp.workloads_with_checkpoint(first_trial.workloads)[0]
    first_checkpoint_uuid = first_checkpoint.uuid

    config = copy.deepcopy(config_base)
    if tf2 is not None:
        config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config["searcher"]["source_checkpoint_uuid"] = first_checkpoint.uuid

    experiment2 = exp.run_basic_test_with_temp_config(config, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial.workloads) >= 4
    assert second_trial.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    first_trial_validations = exp.workloads_with_validation(first_trial.workloads)
    second_trial_validations = exp.workloads_with_validation(second_trial.workloads)

    for wl in range(0, 2):
        for metric in metrics:
            first_trial_val = first_trial_validations[wl + 1]
            first_metric = first_trial_val.metrics[metric]
            second_trial_val = second_trial_validations[wl]
            second_metric = second_trial_val.metrics[metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {wl} with metric: {metric}"

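# A minimal sketch (not an actual fixture in this suite) of the metric-emitting
# side that _test_rng_restore relies on: each validation reports the next draw
# from every framework RNG as a metric, so equal metrics across the original
# and warm-started runs imply the checkpoint restored identical RNG state.
# The helper name and metric keys are illustrative assumptions.
def _example_rng_metrics() -> Dict[str, float]:
    import random

    import numpy as np
    import torch

    return {
        "rand_rand": random.random(),  # Python stdlib RNG
        "np_rand": float(np.random.rand()),  # NumPy global RNG
        "torch_rand": torch.rand(1).item(),  # PyTorch CPU RNG
    }
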
def test_drain_agent_sched() -> None:
    """
    Start an experiment, drain it. Start a second one and make sure it
    schedules on the second agent *before* the first one has finished.
    """
    slots = _wait_for_slots(2)
    assert len(slots) == 2

    exp_id1 = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_workload_progress(exp_id1)

    slots = _fetch_slots()
    used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
    assert len(used_slots) == 1
    agent_id1 = used_slots[0]["agent_id"]

    with _disable_agent(agent_id1, drain=True):
        exp_id2 = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_ACTIVE)

        # Wait for a state when *BOTH* experiments are scheduled.
        for _ in range(20):
            slots = _fetch_slots()
            assert len(slots) == 2
            used_slots = [s for s in slots if s["allocation_id"] != "FREE"]
            if len(used_slots) == 2:
                # All good.
                break
            time.sleep(1)
        else:
            pytest.fail(
                "Second experiment didn't schedule on the second agent "
                "while the first agent was draining"
            )

        exp.wait_for_experiment_state(exp_id1, determinedexperimentv1State.STATE_COMPLETED)
        exp.wait_for_experiment_state(exp_id2, determinedexperimentv1State.STATE_COMPLETED)

        trials1 = exp.experiment_trials(exp_id1)
        trials2 = exp.experiment_trials(exp_id2)
        assert len(trials1) == len(trials2) == 1
        assert len(trials1[0].workloads) == len(trials2[0].workloads) == 7

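# _disable_agent is a helper defined elsewhere in this suite. A plausible
# sketch of it follows, assuming the `det agent disable` / `det agent enable`
# CLI commands and the disable command's `--drain` flag; it is illustrative,
# not the suite's actual implementation.
import contextlib
import subprocess
from typing import Iterator


@contextlib.contextmanager
def _disable_agent_sketch(agent_id: str, drain: bool = False) -> Iterator[None]:
    disable_cmd = ["det", "-m", conf.make_master_url(), "agent", "disable"]
    if drain:
        # Draining lets running allocations finish while refusing new ones.
        disable_cmd.append("--drain")
    subprocess.check_call(disable_cmd + [agent_id])
    try:
        yield
    finally:
        # Always re-enable the agent so later tests get the slot back.
        subprocess.check_call(
            ["det", "-m", conf.make_master_url(), "agent", "enable", agent_id]
        )
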
def test_epoch_sync(num_workers: int, global_batch_size: int, dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether
    the number of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size  # ceil

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id, f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}"
            )

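# The assertion above greps trial logs for a fixed format string, so the
# pytorch_no_op fixture must print one such line per rank per batch. A sketch
# of the reporting side (the name is illustrative; the fixture's real hook may
# differ):
def _report_epoch_sync(rank: int, batch_idx: int, epoch_idx: int) -> None:
    # Must match the f-string asserted in test_epoch_sync.
    print(f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}")
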
def test_mnist_estimator_const(tf2: bool) -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

    # Check validation metrics.
    steps = trials[0]["steps"]
    assert len(steps) == 1

    step = steps[0]
    assert "validation" in step

    v_metrics = step["validation"]["metrics"]["validation_metrics"]

    # GPU training is non-deterministic, but on CPU we can validate that we
    # reach a consistent result.
    if not cluster.running_on_gpu():
        assert v_metrics["accuracy"] == 0.9125999808311462

    # Check training metrics.
    full_trial_metrics = exp.trial_metrics(trials[0]["id"])
    for step in full_trial_metrics["steps"]:
        metrics = step["metrics"]
        batch_metrics = metrics["batch_metrics"]
        assert len(batch_metrics) == 100
        for batch_metric in batch_metrics:
            assert batch_metric["loss"] > 0

def test_mnist_tp_accuracy() -> None:
    config = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("mnist_tp"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0]["id"])

    # TODO (DET-3082): The validation metric names were modified by our trial
    # reporting from accuracy to val_accuracy. We should probably remove the
    # added prefix so the metric name is as specified.
    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["val_accuracy"]
        for step in trial_metrics["steps"]
        if step.get("validation")
    ]

    target_accuracy = 0.95
    assert max(validation_accuracies) > target_accuracy, (
        "mnist_tp did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]), validation_accuracies
        )
    )

def test_noop_pause() -> None:
    """
    Walk through starting, pausing, and resuming a single no-op experiment.
    """
    experiment_id = exp.create_experiment(
        conf.fixtures_path("no_op/single-medium-train-step.yaml"),
        conf.fixtures_path("no_op"),
        None,
    )
    exp.wait_for_experiment_state(experiment_id, "ACTIVE")

    # Wait for the only trial to get scheduled.
    workload_active = False
    for _ in range(conf.MAX_TASK_SCHEDULED_SECS):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        workload_active,
        f"The only trial cannot be scheduled within {conf.MAX_TASK_SCHEDULED_SECS} seconds.",
    )

    # Wait for the only trial to show progress, indicating the image is built and running.
    num_steps = 0
    for _ in range(conf.MAX_TRIAL_BUILD_SECS):
        trials = exp.experiment_trials(experiment_id)
        if len(trials) > 0:
            only_trial = trials[0]
            num_steps = len(only_trial["steps"])
            if num_steps > 1:
                break
        time.sleep(1)
    check.true(
        num_steps > 1,
        f"The only trial cannot start training within {conf.MAX_TRIAL_BUILD_SECS} seconds.",
    )

    # Pause the experiment. Note that Determined does not currently differentiate
    # between a "stopping paused" and a "paused" state, so we follow this check
    # up by ensuring the experiment cleared all scheduled workloads.
    exp.pause_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "PAUSED")

    # Wait at most 20 seconds for the experiment to clear all workloads (each
    # train step should take 5 seconds).
    for _ in range(20):
        workload_active = exp.experiment_has_active_workload(experiment_id)
        if not workload_active:
            break
        else:
            time.sleep(1)
    check.true(
        not workload_active,
        "The experiment cannot be paused within 20 seconds.",
    )

    # Resume the experiment and wait for completion.
    exp.activate_experiment(experiment_id)
    exp.wait_for_experiment_state(experiment_id, "COMPLETED")

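# test_noop_pause repeats the same poll-until-timeout loop three times. A small
# helper like this (hypothetical, not currently in the suite) would capture the
# pattern:
def _wait_until(predicate: Callable[[], bool], timeout_secs: int) -> bool:
    for _ in range(timeout_secs):
        if predicate():
            return True
        time.sleep(1)
    return False


# Example: _wait_until(lambda: not exp.experiment_has_active_workload(experiment_id), 20)
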
def test_master_restart_reattach_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    _sanity_check(managed_cluster_restarts)

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )

        # TODO(ilia): don't wait for progress.
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_master()
            time.sleep(downtime)
            managed_cluster_restarts.restart_master()

        exp.wait_for_experiment_state(
            exp_id, EXP_STATE.STATE_COMPLETED, max_wait_secs=downtime + 60
        )
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_master()
        managed_cluster_restarts.restart_agent()
        raise

def test_agent_reconnect_keep_experiment(managed_cluster_restarts: ManagedCluster) -> None:
    managed_cluster_restarts.ensure_agent_ok()

    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        managed_cluster_restarts.kill_proxy()
        time.sleep(1)
        managed_cluster_restarts.restart_proxy()

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_proxy(wait_for_reconnect=False)
        managed_cluster_restarts.restart_agent()
        raise

def test_agent_restart_recover_experiment(
    managed_cluster_restarts: ManagedCluster, downtime: int
) -> None:
    if not managed_cluster_restarts.reattach:
        pytest.skip()

    managed_cluster_restarts.ensure_agent_ok()
    try:
        exp_id = exp.create_experiment(
            conf.fixtures_path("no_op/single-medium-train-step.yaml"),
            conf.fixtures_path("no_op"),
            None,
        )
        exp.wait_for_experiment_workload_progress(exp_id)

        if downtime >= 0:
            managed_cluster_restarts.kill_agent()
            time.sleep(downtime)
            managed_cluster_restarts.restart_agent(wait_for_amnesia=False)

        exp.wait_for_experiment_state(exp_id, EXP_STATE.STATE_COMPLETED)
        trials = exp.experiment_trials(exp_id)

        assert len(trials) == 1
        train_wls = exp.workloads_with_training(trials[0].workloads)
        assert len(train_wls) == 5
    except Exception:
        managed_cluster_restarts.restart_agent()
        raise

def test_pytorch_parallel() -> None:
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(config, conf.tutorials_path("mnist_pytorch"), 1)
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    # Ceiling division: a partial final batch still counts as a batch.
    exp_val_batches = (validation_size + (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)

def test_cifar10_byol_pytorch_accuracy() -> None:
    config = conf.load_config(conf.cv_examples_path("byol_pytorch/const-cifar10.yaml"))
    # Limit convergence time, since it was running over the 30-minute limit.
    config["searcher"]["max_length"]["epochs"] = 20
    config["hyperparameters"]["classifier"]["train_epochs"] = 1
    config = conf.set_random_seed(config, 1591280374)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("byol_pytorch"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    trial_metrics = exp.trial_metrics(trials[0].trial.id)

    validation_accuracies = [
        step["validation"]["metrics"]["validation_metrics"]["test_accuracy"]
        for step in trial_metrics["steps"]
        if step.get("validation")
    ]

    # Accuracy reachable within the limited convergence time -- goes higher
    # given full training.
    target_accuracy = 0.40
    assert max(validation_accuracies) > target_accuracy, (
        "cifar10_byol_pytorch did not reach minimum target accuracy {} in {} steps."
        " full validation accuracy history: {}".format(
            target_accuracy, len(trial_metrics["steps"]), validation_accuracies
        )
    )

def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config, "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)

    # Verify the latest checkpoint can be loaded on CPU.
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )

def run_dataset_experiment(
    searcher_max_steps: int,
    batches_per_step: int,
    secrets: Dict[str, str],
    tf2: bool,
    slots_per_trial: int = 1,
    source_trial_id: Optional[str] = None,
) -> List[Dict[str, Any]]:
    config = conf.load_config(conf.fixtures_path("estimator_dataset/const.yaml"))
    config.setdefault("searcher", {})
    config["searcher"]["max_steps"] = searcher_max_steps
    config["batches_per_step"] = batches_per_step
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    if source_trial_id is not None:
        config["searcher"]["source_trial_id"] = source_trial_id

    config.setdefault("resources", {})
    config["resources"]["slots_per_trial"] = slots_per_trial

    if cluster.num_agents() > 1:
        config["checkpoint_storage"] = exp.s3_checkpoint_config(secrets)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("estimator_dataset"), 1
    )
    return exp.experiment_trials(experiment_id)

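# Hypothetical usage of the helper above, chaining a warm start from the first
# run's trial (mirrors the warm-start tests earlier in this file):
def _example_dataset_warm_start(secrets: Dict[str, str]) -> None:
    first_trials = run_dataset_experiment(
        searcher_max_steps=2, batches_per_step=2, secrets=secrets, tf2=False
    )
    run_dataset_experiment(
        searcher_max_steps=2,
        batches_per_step=2,
        secrets=secrets,
        tf2=False,
        source_trial_id=str(first_trials[0]["id"]),
    )
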
def test_streaming_observability_metrics_apis(
    framework_base_experiment: str, framework_timings_enabled: bool
) -> None:
    # TODO: refactor tests to not use cli singleton auth.
    certs.cli_cert = certs.default_load(conf.make_master_url())
    authentication.cli_auth = authentication.Authentication(
        conf.make_master_url(), try_reauth=True
    )

    config_path = conf.tutorials_path(f"../{framework_base_experiment}/const.yaml")
    model_def_path = conf.tutorials_path(f"../{framework_base_experiment}")
    config_obj = conf.load_config(config_path)
    config_obj = conf.set_profiling_enabled(config_obj)
    with tempfile.NamedTemporaryFile() as tf:
        with open(tf.name, "w") as f:
            yaml.dump(config_obj, f)
        experiment_id = exp.create_experiment(
            tf.name,
            model_def_path,
        )

    exp.wait_for_experiment_state(experiment_id, "COMPLETED")
    trials = exp.experiment_trials(experiment_id)
    trial_id = trials[0]["id"]

    gpu_enabled = conf.GPU_ENABLED

    request_profiling_metric_labels(trial_id, framework_timings_enabled, gpu_enabled)
    if gpu_enabled:
        request_profiling_system_metrics(trial_id, "gpu_util")
    if framework_timings_enabled:
        request_profiling_pytorch_timing_metrics(trial_id, "train_batch")

def test_end_to_end_adaptive() -> None:
    exp_id = exp.run_basic_test(
        conf.fixtures_path("mnist_pytorch/adaptive_short.yaml"),
        conf.official_examples_path("trial/mnist_pytorch"),
        None,
    )

    # Check that validation accuracy looks sane (more than 93% on MNIST).
    trials = exp.experiment_trials(exp_id)
    best = None
    for trial in trials:
        assert len(trial["steps"])
        last_step = trial["steps"][-1]
        accuracy = last_step["validation"]["metrics"]["validation_metrics"]["accuracy"]
        if not best or accuracy > best:
            best = accuracy

    assert best is not None
    assert best > 0.93

    # Check that ExperimentReference returns a sorted order of top checkpoints
    # without gaps. The top 2 checkpoints should be the first 2 of the top k
    # checkpoints if sorting is stable.
    d = Determined(conf.make_master_url())
    exp_ref = d.get_experiment(exp_id)

    top_2 = exp_ref.top_n_checkpoints(2)
    top_k = exp_ref.top_n_checkpoints(len(trials))

    top_2_uuids = [c.uuid for c in top_2]
    top_k_uuids = [c.uuid for c in top_k]

    assert top_2_uuids == top_k_uuids[:2]

    # Check that metrics are truly in sorted order.
    metrics = [c.validation["metrics"]["validation_metrics"]["validation_loss"] for c in top_k]
    assert metrics == sorted(metrics)

    # Check that changing smaller-is-better reverses the checkpoint ordering.
    top_k_reversed = exp_ref.top_n_checkpoints(
        len(trials), sort_by="validation_loss", smaller_is_better=False
    )
    top_k_reversed_uuids = [c.uuid for c in top_k_reversed]

    assert top_k_uuids == top_k_reversed_uuids[::-1]

    checkpoint = top_k[0]
    checkpoint.add_metadata({"testing": "metadata"})
    assert checkpoint.metadata == {"testing": "metadata"}

    checkpoint.add_metadata({"some_key": "some_value"})
    assert checkpoint.metadata == {"testing": "metadata", "some_key": "some_value"}

    checkpoint.add_metadata({"testing": "override"})
    assert checkpoint.metadata == {"testing": "override", "some_key": "some_value"}

    checkpoint.remove_metadata(["some_key"])
    assert checkpoint.metadata == {"testing": "override"}

def test_tensorpack_const() -> None:
    config = conf.load_config(conf.official_examples_path("trial/mnist_tp/const.yaml"))

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_tp"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def _test_rng_restore(fixture: str, metrics: list) -> None:
    """
    This test confirms that an experiment can be restarted from a checkpoint
    with the same RNG state. It requires a test fixture that will emit random
    numbers from all of the RNGs used in the relevant framework as metrics.
    The experiment must have a const.yaml, run for at least 3 steps,
    checkpoint every step, and keep the first checkpoint (either by having
    metrics get worse over time, or by configuring the experiment to keep all
    checkpoints).
    """
    experiment = exp.run_basic_test(
        conf.fixtures_path(fixture + "/const.yaml"),
        conf.fixtures_path(fixture),
        1,
    )

    first_trial = exp.experiment_trials(experiment)[0]
    assert len(first_trial["steps"]) >= 3

    first_step = first_trial["steps"][0]
    first_checkpoint_id = first_step["checkpoint"]["id"]

    config_base = conf.load_config(conf.fixtures_path(fixture + "/const.yaml"))
    config_obj = copy.deepcopy(config_base)
    config_obj["searcher"]["source_checkpoint_uuid"] = first_step["checkpoint"]["uuid"]

    experiment2 = exp.run_basic_test_with_temp_config(config_obj, conf.fixtures_path(fixture), 1)

    second_trial = exp.experiment_trials(experiment2)[0]
    assert len(second_trial["steps"]) >= 3
    assert second_trial["warm_start_checkpoint_id"] == first_checkpoint_id

    for step in range(0, 2):
        for metric in metrics:
            first_metric = first_trial["steps"][step + 1]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            second_metric = second_trial["steps"][step]["validation"]["metrics"][
                "validation_metrics"
            ][metric]
            assert (
                first_metric == second_metric
            ), f"failures on iteration: {step} with metric: {metric}"

def test_tf_keras_mnist_data_layer_s3(
    tf2: bool,
    storage_type: str,
    secrets: Dict[str, str],
    collect_trial_profiles: Callable[[int], None],
) -> None:
    exp_id = run_tf_keras_mnist_data_layer_test(tf2, storage_type)
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    collect_trial_profiles(trial_id)

def test_mnist_estimator_load() -> None:
    config = conf.load_config(conf.fixtures_path("mnist_estimator/single.yaml"))
    config = conf.set_tf1_image(config)
    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_estimator"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    model = (
        Determined(conf.make_master_url()).get_trial(trials[0]["id"]).top_checkpoint().load()
    )
    assert isinstance(model, AutoTrackable)

def test_noop_load() -> None:
    """
    Load a checkpoint.
    """
    experiment_id = exp.run_basic_test(
        conf.fixtures_path("no_op/single.yaml"), conf.fixtures_path("no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    checkpoint = (
        Determined(conf.make_master_url()).get_trial(trials[0].trial.id).top_checkpoint()
    )
    assert checkpoint.task_id == trials[0].trial.taskId

def test_tf_keras_mnist_parallel() -> None:
    config = conf.load_config(conf.tutorials_path("fashion_mnist_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("fashion_mnist_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def test_log_null_bytes() -> None:
    config_obj = conf.load_config(conf.fixtures_path("no_op/single.yaml"))
    config_obj["hyperparameters"]["write_null"] = True
    config_obj["max_restarts"] = 0
    config_obj["searcher"]["max_length"] = {"batches": 1}
    experiment_id = exp.run_basic_test_with_temp_config(
        config_obj, conf.fixtures_path("no_op"), 1
    )

    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    logs = exp.trial_logs(trials[0]["id"])
    assert len(logs) > 0

def test_tf_keras_tf2_disabled() -> None:
    """Keras on the TF2 image, with TF2 behavior and eager execution disabled."""
    config = conf.load_config(conf.fixtures_path("keras_tf2_disabled_no_op/const.yaml"))
    config = conf.set_max_length(config, {"batches": 1})
    config = conf.set_tf2_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("keras_tf2_disabled_no_op"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1
    exp.export_and_load_model(experiment_id)

def test_tf_keras_single_gpu(tf2: bool) -> None:
    config = conf.load_config(conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_max_steps(config, 2)
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("cifar10_cnn_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def test_tensorpack_native_parallel() -> None:
    config = conf.load_config(conf.official_examples_path("trial/mnist_tp/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, True)
    config = conf.set_max_length(config, {"batches": 32})

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("trial/mnist_tp"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1

def test_tf_keras_mnist_parallel() -> None:
    config = conf.load_config(conf.official_examples_path("fashion_mnist_tf_keras/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_native_parallel(config, False)
    config = conf.set_max_steps(config, 2)

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.official_examples_path("fashion_mnist_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id)
    assert len(trials) == 1