def test_nas_search() -> None:
    config = conf.load_config(conf.experimental_path("trial/rsws_nas/train_one_arch.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.experimental_path("trial/rsws_nas"), 1)

def test_mnist_estimator_distributed() -> None:
    config = conf.load_config(conf.cv_examples_path("mnist_estimator/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path("mnist_estimator"), 1)

def test_resnet50() -> None:
    config = conf.load_config(conf.experimental_path("trial/resnet50_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/resnet50_tf_keras"), 1
    )

def test_deformabledetr_coco_pytorch_const() -> None:
    config = conf.load_config(conf.cv_examples_path("deformabledetr_coco_pytorch/const_fake.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("deformabledetr_coco_pytorch"), 1
    )

def test_fashion_mnist_tf_keras_distributed() -> None:
    config = conf.load_config(conf.tutorials_path("fashion_mnist_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.tutorials_path("fashion_mnist_tf_keras"), 1)

def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config, "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trials = exp.experiment_trials(experiment_id)

    # Verify that the latest checkpoint produced by the launch-layer run can be loaded onto CPU.
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )
    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )

def test_unets_tf_keras_distributed() -> None:
    config = conf.load_config(conf.cv_examples_path("unets_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path("unets_tf_keras"), 1)

def test_tf_keras_const_warm_start(
    tf2: bool, collect_trial_profiles: Callable[[int], None]
) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1
    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid

    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)

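# A minimal sketch (not part of the test above) of the config change that drives the warm
# start: the second experiment only needs `searcher.source_trial_id` to point at an existing
# trial. The searcher fields and trial ID below are hypothetical placeholder values.
def _warm_start_config_sketch() -> None:
    config = {"searcher": {"name": "single", "metric": "val_accuracy", "max_length": {"batches": 200}}}
    config["searcher"]["source_trial_id"] = 123  # hypothetical ID of the trial to warm start from
    assert config["searcher"]["source_trial_id"] == 123
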
def test_word_language_transformer_const() -> None:
    config = conf.load_config(conf.nlp_examples_path("word_language_model/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = config.copy()
    config["hyperparameters"]["model_cls"] = "Transformer"
    exp.run_basic_test_with_temp_config(config, conf.nlp_examples_path("word_language_model"), 1)

def test_pytorch_parallel() -> None:
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(config, conf.tutorials_path("mnist_pytorch"), 1)
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    exp_val_batches = (validation_size + (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)

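# A standalone sketch of the arithmetic behind the log patterns above, with hypothetical
# values (the real test reads them from the experiment config): each training report covers
# `scheduling_unit` batches, i.e. scheduling_unit * global_batch_size records, and the
# number of validation batches is the ceiling of validation_size / per_slot_batch_size.
def _mnist_parallel_log_arithmetic_sketch() -> None:
    validation_size = 10000
    global_batch_size, num_workers, scheduling_unit = 64, 8, 100
    per_slot_batch_size = global_batch_size // num_workers  # 64 // 8 == 8
    exp_val_batches = (validation_size + per_slot_batch_size - 1) // per_slot_batch_size
    assert scheduling_unit * global_batch_size == 6400  # records covered by one training report
    assert exp_val_batches == 1250  # ceil(10000 / 8)
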
def test_pl_mnist() -> None:
    exp_dir = "mnist_pl"
    config = conf.load_config(conf.cv_examples_path(exp_dir + "/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path(exp_dir), 1)

def test_epoch_sync(num_workers: int, global_batch_size: int, dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the
    number of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size  # ceil

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id, f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}"
            )

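# A sketch of the epoch bookkeeping the test above relies on, using hypothetical values:
# ceiling division turns a dataset length and global batch size into a per-epoch batch
# count, and integer division maps a batch index back to its epoch.
def _epoch_sync_arithmetic_sketch() -> None:
    dataset_len, global_batch_size = 50, 16
    batches_per_epoch = (dataset_len + global_batch_size - 1) // global_batch_size  # ceil(50 / 16)
    assert batches_per_epoch == 4
    # Batches 0-3 fall in epoch 0, batches 4-7 in epoch 1.
    assert [b // batches_per_epoch for b in range(8)] == [0, 0, 0, 0, 1, 1, 1, 1]
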
def test_pix2pix_facades_const() -> None:
    config = conf.load_config(conf.gan_examples_path("pix2pix_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.gan_examples_path("pix2pix_tf_keras"), 1)

def test_pytorch_const_with_amp(api_style: str) -> None:
    config = conf.load_config(conf.fixtures_path("pytorch_amp/" + api_style + "_amp.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.fixtures_path("pytorch_amp"), 1)

def test_imagenet_nas() -> None:
    config = conf.load_config(conf.experimental_path("trial/imagenet_nas_arch_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/imagenet_nas_arch_pytorch"), 1
    )

def test_protein_pytorch_geometric() -> None:
    config = conf.load_config(conf.graphs_examples_path("proteins_pytorch_geometric/const.yaml"))
    config = conf.set_max_length(config, {"epochs": 50})
    exp.run_basic_test_with_temp_config(
        config, conf.graphs_examples_path("proteins_pytorch_geometric"), 1
    )

def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1
    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id
    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1
    )

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id

def test_text_classification_glue() -> None:
    example_path = conf.model_hub_examples_path("huggingface/text-classification")
    config = conf.load_config(os.path.join(example_path, "glue_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    exp.run_basic_test_with_temp_config(config, example_path, 1)

def test_gaea_pytorch_distributed() -> None:
    config = conf.load_config(
        conf.nas_examples_path("gaea_pytorch/eval/distributed_no_data_download.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.nas_examples_path("gaea_pytorch/eval"), 1)

def test_language_modeling_plm() -> None:
    example_path = conf.model_hub_examples_path("huggingface/language-modeling")
    config = conf.load_config(os.path.join(example_path, "plm_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    exp.run_basic_test_with_temp_config(config, example_path, 1)

def test_gan_mnist_pytorch_const() -> None:
    config = conf.load_config(conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.gan_examples_path("gan_mnist_pytorch"), 1)

def test_bert_glue() -> None:
    config = conf.load_config(conf.experimental_path("trial/bert_glue_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/bert_glue_pytorch/"), 1
    )

def test_mmdetection_pytorch_const() -> None:
    config = conf.load_config(conf.cv_examples_path("mmdetection_pytorch/const_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path("mmdetection_pytorch"), 1)

def test_mnist_tp_to_estimator() -> None:
    config = conf.load_config(conf.experimental_path("trial/mnist_tp_to_estimator/const.yaml"))
    config = conf.set_max_length(config, {"batches": 32})
    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/mnist_tp_to_estimator"), 1
    )

def test_cifar10_pytorch_distributed() -> None:
    config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.cv_examples_path("cifar10_pytorch"), 1)

def test_mnist_pytorch_multi_output() -> None:
    config = conf.load_config(conf.experimental_path("trial/mnist_pytorch_multi_output/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/mnist_pytorch_multi_output"), 1
    )

def test_maskrcnn_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)
    exp.run_basic_test_with_temp_config(config, example_path, 1)

def test_deepspeed_pipeline_parallel() -> None:
    config = conf.load_config(conf.deepspeed_examples_path("pipeline_parallelism/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 100})
    exp.run_basic_test_with_temp_config(
        config, conf.deepspeed_examples_path("pipeline_parallelism"), 1
    )

def test_gaea_pytorch_const() -> None:
    config = conf.load_config(conf.nas_examples_path("gaea_pytorch/eval/const.yaml"))
    config = conf.set_global_batch_size(config, 32)
    config = conf.set_max_length(config, {"batches": 200})
    exp.run_basic_test_with_temp_config(config, conf.nas_examples_path("gaea_pytorch/eval"), 1)

def test_word_language_lstm_const() -> None:
    config = conf.load_config(conf.nlp_examples_path("word_language_model/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = config.copy()
    config["hyperparameters"]["model_cls"] = "LSTM"
    config["hyperparameters"]["tied"] = False
    exp.run_basic_test_with_temp_config(config, conf.nlp_examples_path("word_language_model"), 1)