def test_tf_keras_const_warm_start(tf2: bool) -> None:
    """Verify that a second experiment warm-starts from the first one's checkpoint.

    The ``cifar10_tf_keras`` const config is run once, then run again with
    ``searcher.source_trial_id`` set to the first experiment's trial id. The
    trial of the second experiment must report the checkpoint id found on the
    second step of the first trial.

    Args:
        tf2: When true the config goes through ``conf.set_tf2_image``,
            otherwise through ``conf.set_tf1_image``.
    """
    yaml_path = conf.cv_examples_path("cifar10_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 1000})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)

    source_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    source_trials = exp.experiment_trials(source_exp_id)
    assert len(source_trials) == 1

    source_trial = source_trials[0]
    source_trial_id = source_trial["id"]
    assert len(source_trial["steps"]) == 2
    expected_checkpoint_id = source_trial["steps"][1]["checkpoint"]["id"]

    # Point the searcher at the finished trial so the rerun warm-starts from it.
    cfg["searcher"]["source_trial_id"] = source_trial_id

    warm_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # Every trial of the rerun must carry the source trial's checkpoint id.
    warm_trials = exp.experiment_trials(warm_exp_id)
    assert len(warm_trials) == 1
    for warm_trial in warm_trials:
        assert warm_trial["warm_start_checkpoint_id"] == expected_checkpoint_id
def test_tf_keras_const_warm_start(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    """Verify warm-starting from a prior trial's checkpoint, with profiling enabled.

    The ``cifar10_tf_keras`` const config is run once, then run again with
    ``searcher.source_trial_id`` set to the first experiment's trial id. The
    trial of the second experiment must report a non-empty warm-start
    checkpoint UUID equal to the first checkpoint found among the first
    trial's workloads.

    Args:
        tf2: When true the config goes through ``conf.set_tf2_image``,
            otherwise through ``conf.set_tf1_image``.
        collect_trial_profiles: Callable invoked with the id of the second
            experiment's trial once all assertions have passed.
    """
    yaml_path = conf.cv_examples_path("cifar10_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 1000})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    source_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    source_trials = exp.experiment_trials(source_exp_id)
    assert len(source_trials) == 1

    source_trial = source_trials[0]
    source_trial_id = source_trial.trial.id
    assert len(source_trial.workloads) == 4
    checkpointed = exp.workloads_with_checkpoint(source_trial.workloads)
    expected_uuid = checkpointed[0].uuid

    # Point the searcher at the finished trial so the rerun warm-starts from it.
    cfg["searcher"]["source_trial_id"] = source_trial_id

    warm_exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # Every trial of the rerun must carry the source trial's checkpoint UUID.
    warm_trials = exp.experiment_trials(warm_exp_id)
    assert len(warm_trials) == 1
    for warm_trial in warm_trials:
        assert warm_trial.trial.warmStartCheckpointUuid != ""
        assert warm_trial.trial.warmStartCheckpointUuid == expected_uuid

    warm_trial_id = warm_trials[0].trial.id
    collect_trial_profiles(warm_trial_id)
def test_deepspeed_pipeline_parallel() -> None:
    """Run the DeepSpeed ``pipeline_parallelism`` distributed example.

    The example config is loaded, its max length is set to 200 batches and
    its minimum validation period to 100 batches, and it is then submitted
    through ``exp.run_basic_test_with_temp_config`` with a trailing argument
    of 1.
    """
    yaml_path = conf.deepspeed_examples_path("pipeline_parallelism/distributed.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 100})

    model_dir = conf.deepspeed_examples_path("pipeline_parallelism")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def run_tf_keras_dcgan_example() -> None:
    """Run the ``dcgan_tf_keras`` const example.

    The config is adjusted to a max length and minimum validation period of
    200 batches each, 8 slots per trial, and passed through
    ``conf.set_tf2_image`` before being submitted through
    ``exp.run_basic_test_with_temp_config`` with a trailing argument of 1.
    """
    yaml_path = conf.gan_examples_path("dcgan_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_tf2_image(cfg)

    model_dir = conf.gan_examples_path("dcgan_tf_keras")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> None:
    """Run the ``data_layer_mnist_tf_keras`` const example with a chosen data layer.

    Args:
        tf2: When true the config goes through ``conf.set_tf2_image``,
            otherwise through ``conf.set_tf1_image``.
        storage_type: ``"lfs"`` routes the config through
            ``conf.set_shared_fs_data_layer``; any other value routes it
            through ``conf.set_s3_data_layer``.
    """
    yaml_path = conf.features_examples_path("data_layer_mnist_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 1000})

    # Only the selected setter is looked up, so the untaken branch is never touched.
    apply_image = conf.set_tf2_image if tf2 else conf.set_tf1_image
    cfg = apply_image(cfg)

    apply_data_layer = (
        conf.set_shared_fs_data_layer if storage_type == "lfs" else conf.set_s3_data_layer
    )
    cfg = apply_data_layer(cfg)

    model_dir = conf.features_examples_path("data_layer_mnist_tf_keras")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def run_tf_keras_dcgan_example(
    collect_trial_profiles: Callable[[int], None]
) -> None:
    """Run the ``dcgan_tf_keras`` const example with profiling enabled.

    The config is adjusted to a max length and minimum validation period of
    200 batches each, 8 slots per trial, and passed through
    ``conf.set_tf2_image`` and ``conf.set_profiling_enabled`` before being
    submitted through ``exp.run_basic_test_with_temp_config``.

    Args:
        collect_trial_profiles: Callable invoked with the id of the first
            trial of the resulting experiment.
    """
    yaml_path = conf.gan_examples_path("dcgan_tf_keras/const.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_tf2_image(cfg)
    cfg = conf.set_profiling_enabled(cfg)

    model_dir = conf.gan_examples_path("dcgan_tf_keras")
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)

    first_trial = exp.experiment_trials(experiment_id)[0]
    collect_trial_profiles(first_trial.trial.id)
def test_gpt_neox_zero_3D_parallel() -> None:
    """Run the DeepSpeed ``gpt_neox`` example using ``zero1_3d_parallel.yaml``.

    The max length and the minimum validation period are both set to 100
    batches before the config is submitted through
    ``exp.run_basic_test_with_temp_config`` with a trailing argument of 1.
    """
    yaml_path = conf.deepspeed_examples_path("gpt_neox/zero1_3d_parallel.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 100})
    cfg = conf.set_min_validation_period(cfg, {"batches": 100})

    model_dir = conf.deepspeed_examples_path("gpt_neox")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_deepspeed_zero() -> None:
    """Run the DeepSpeed ``cifar10_moe`` example using ``zero_stages.yaml``.

    The max length is set to 200 batches and the minimum validation period
    to 100 batches before the config is submitted through
    ``exp.run_basic_test_with_temp_config`` with a trailing argument of 1.
    """
    yaml_path = conf.deepspeed_examples_path("cifar10_moe/zero_stages.yaml")
    cfg = conf.load_config(yaml_path)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 100})

    model_dir = conf.deepspeed_examples_path("cifar10_moe")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)