def test_launch_layer_cifar(collect_trial_profiles: Callable[[int], None]) -> None:
    """Run the CIFAR-10 PyTorch example through the horovod launch layer.

    Verifies that the latest checkpoint loads on CPU, collects trial
    profiles, and checks that the allocation exited cleanly.
    """
    cfg = conf.load_config(conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_profiling_enabled(cfg)
    cfg = conf.set_entrypoint(
        cfg,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial",
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("cifar10_pytorch"), 1
    )
    trial_id = exp.experiment_trials(experiment_id)[0].trial.id

    # Make sure the latest checkpoint is loadable without a GPU.
    client = Determined(conf.make_master_url())
    checkpoint = client.get_trial(trial_id).select_checkpoint(latest=True)
    checkpoint.load(map_location="cpu")

    collect_trial_profiles(trial_id)

    assert exp.check_if_string_present_in_trial_logs(
        trial_id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )
def test_pytorch_parallel() -> None:
    """Run mnist_pytorch on 8 slots with an initial validation.

    After the run, asserts that the trial logs report the expected
    trained-record and validated-record counts.
    """
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1
    )
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    # BUG FIX: global_batch_size was assigned twice from the identical
    # expression; the redundant second assignment has been removed.
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    # Ceiling division: a final partial validation batch still counts as one.
    exp_val_batches = (validation_size + (per_slot_batch_size - 1)) // per_slot_batch_size
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)
def test_epoch_sync(num_workers: int, global_batch_size: int, dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the
    number of batches is evenly divisible by the number of workers.
    """
    max_len_batches = 10
    cfg = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, num_workers)
    cfg = conf.set_max_length(cfg, {"batches": max_len_batches})
    cfg = conf.set_hparam(cfg, "dataset_len", dataset_len)
    cfg = conf.set_global_batch_size(cfg, global_batch_size)

    exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("pytorch_no_op"), 1
    )
    trial_id = exp.experiment_trials(exp_id)[0].trial.id

    # Ceiling division: a trailing partial batch still counts as one batch.
    batches_per_epoch = -(-dataset_len // global_batch_size)

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(cfg["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                trial_id, f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}"
            )
def test_detr_coco_pytorch_distributed() -> None:
    """Short two-slot run of the DETR COCO PyTorch example on fake data."""
    example_dir = conf.cv_examples_path("detr_coco_pytorch")
    cfg = conf.load_config(conf.cv_examples_path("detr_coco_pytorch/const_fake.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 2)
    exp.run_basic_test_with_temp_config(cfg, example_dir, 1)
def test_faster_rcnn() -> None:
    """Short FasterRCNN tensorpack run on one slot, with an extended wait budget."""
    model_dir = conf.experimental_path("trial/FasterRCNN_tp")
    cfg = conf.load_config(conf.experimental_path("trial/FasterRCNN_tp/16-gpus.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 128})
    cfg = conf.set_slots_per_trial(cfg, 1)
    # Long training steps; allow up to 80 minutes before timing out.
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1, max_wait_secs=4800)
def test_pytorch_const_native_parallel() -> None:
    """mnist_pytorch const config with native parallelism across 8 slots."""
    model_dir = conf.tutorials_path("mnist_pytorch")
    cfg = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_language_modeling_mlm() -> None:
    """Smoke-test the HF masked-language-modeling example on 8 slots."""
    example_path = conf.model_hub_examples_path("huggingface/language-modeling")
    cfg = conf.load_config(os.path.join(example_path, "mlm_config.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 16)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def test_multiple_choice_swag() -> None:
    """Smoke-test the HF multiple-choice (SWAG) example on 8 slots."""
    example_path = conf.model_hub_examples_path("huggingface/multiple-choice")
    cfg = conf.load_config(os.path.join(example_path, "swag_config.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 64)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def test_pytorch_const_native_parallel() -> None:
    """mnist_pytorch (official examples) with native parallelism, 2 steps."""
    model_dir = conf.official_examples_path("mnist_pytorch")
    cfg = conf.load_config(conf.official_examples_path("mnist_pytorch/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_steps(cfg, 2)
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_squad_v2_with_beam_search() -> None:
    """Smoke-test the HF SQuAD v2 question-answering example with beam search."""
    example_path = conf.model_hub_examples_path("huggingface/question-answering")
    cfg = conf.load_config(os.path.join(example_path, "squad_v2_beam_search.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 16)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def test_token_classification_ner() -> None:
    """Smoke-test the HF token-classification (NER) example on 8 slots."""
    example_path = conf.model_hub_examples_path("huggingface/token-classification")
    cfg = conf.load_config(os.path.join(example_path, "ner_config.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 32)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def run_tf_keras_dcgan_example() -> None:
    """Run the tf.keras DCGAN example on 8 slots with the TF2 image."""
    model_dir = conf.gan_examples_path("dcgan_tf_keras")
    cfg = conf.load_config(conf.gan_examples_path("dcgan_tf_keras/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_min_validation_period(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_tf2_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_tf_keras_mnist_parallel() -> None:
    """fashion_mnist_tf_keras on 8 slots; expect exactly one trial."""
    model_dir = conf.tutorials_path("fashion_mnist_tf_keras")
    cfg = conf.load_config(conf.tutorials_path("fashion_mnist_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_squad_amp() -> None:
    """Smoke-test the HF SQuAD example with Apex mixed precision enabled."""
    example_path = conf.model_hub_examples_path("huggingface/question-answering")
    cfg = conf.load_config(os.path.join(example_path, "squad.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 64)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    # Turn on Apex AMP via the model's hyperparameter.
    cfg = conf.set_hparam(cfg, "use_apex_amp", True)
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def test_text_classification_xnli_amp() -> None:
    """Smoke-test the HF XNLI text-classification example with Apex AMP."""
    example_path = conf.model_hub_examples_path("huggingface/text-classification")
    cfg = conf.load_config(os.path.join(example_path, "xnli_config.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_global_batch_size(cfg, 128)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    # Turn on Apex AMP via the model's hyperparameter.
    cfg = conf.set_hparam(cfg, "use_apex_amp", True)
    cfg = set_docker_image(cfg)
    exp.run_basic_test_with_temp_config(cfg, example_path, 1)
def test_pytorch_parallel() -> None:
    """mnist_pytorch on 8 slots with tensor auto-tuning and an initial validation."""
    model_dir = conf.tutorials_path("mnist_pytorch")
    cfg = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_tensor_auto_tuning(cfg, True)
    cfg = conf.set_perform_initial_validation(cfg, True)

    exp_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1, has_zeroth_step=True)
    exp.assert_performed_initial_validation(exp_id)
def test_pytorch_parallel() -> None:
    """mnist_pytorch (official examples) on 8 slots without native parallelism."""
    model_dir = conf.official_examples_path("trial/mnist_pytorch")
    cfg = conf.load_config(conf.official_examples_path("trial/mnist_pytorch/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_tensor_auto_tuning(cfg, True)
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_tensorpack_native_parallel() -> None:
    """mnist_tp with native parallelism across 8 slots; expect one trial."""
    model_dir = conf.official_examples_path("trial/mnist_tp")
    cfg = conf.load_config(conf.official_examples_path("trial/mnist_tp/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 32})
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_estimator_when_detecting_gpus() -> None:
    """Run the estimator GPU-detection fixture on 8 slots."""
    cfg = conf.load_config(conf.fixtures_path("estimator_gpu_detection/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    exp.run_basic_test_with_temp_config(
        cfg, conf.fixtures_path("estimator_gpu_detection/"), 1, has_zeroth_step=False
    )
def test_tf_keras_mnist_parallel() -> None:
    """fashion_mnist_tf_keras (official examples) on 8 slots; expect one trial."""
    model_dir = conf.official_examples_path("fashion_mnist_tf_keras")
    cfg = conf.load_config(conf.official_examples_path("fashion_mnist_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_steps(cfg, 2)
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_tf_keras_single_gpu(tf2: bool) -> None:
    """cifar10_cnn_tf_keras on a single slot under either the TF1 or TF2 image."""
    model_dir = conf.official_examples_path("cifar10_cnn_tf_keras")
    cfg = conf.load_config(conf.official_examples_path("cifar10_cnn_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 1)
    cfg = conf.set_max_steps(cfg, 2)
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_on_trial_close_callback() -> None:
    """Verify rank 0 logs its on_trial_close completion for the no-op estimator."""
    cfg = conf.load_config(conf.fixtures_path("estimator_no_op/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 3})
    exp_id = exp.run_basic_test_with_temp_config(cfg, conf.fixtures_path("estimator_no_op"), 1)

    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    assert exp.check_if_string_present_in_trial_logs(
        trial_id, "rank 0 has completed on_trial_close"
    )
def test_mnist_estimator_const_parallel(tf2: bool) -> None:
    """mnist_estimator on 8 slots with an initial validation, TF1 or TF2 image."""
    cfg = conf.load_config(conf.fixtures_path("mnist_estimator/single-multi-slot.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    cfg = conf.set_perform_initial_validation(cfg, True)

    # NOTE(review): the config comes from fixtures_path but the model dir from
    # cv_examples_path — confirm this pairing is intentional.
    exp_id = exp.run_basic_test_with_temp_config(
        cfg, conf.cv_examples_path("mnist_estimator"), 1
    )
    exp.assert_performed_initial_validation(exp_id)
def test_pytorch_gan_parallel() -> None:
    """Run the PyTorch GAN example on 8 slots and load its latest checkpoint."""
    model_dir = conf.gan_examples_path("gan_mnist_pytorch")
    cfg = conf.load_config(conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_slots_per_trial(cfg, 8)

    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    trials = exp.experiment_trials(experiment_id)

    # Make sure the latest checkpoint is loadable without a GPU.
    client = Determined(conf.make_master_url())
    checkpoint = client.get_trial(trials[0]["id"]).select_checkpoint(latest=True)
    checkpoint.load(map_location="cpu")
def test_tensorpack_parallel(aggregation_frequency: int) -> None:
    """mnist_tp on 8 slots with a parameterized aggregation frequency."""
    model_dir = conf.official_examples_path("mnist_tp")
    cfg = conf.load_config(conf.official_examples_path("mnist_tp/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, False)
    cfg = conf.set_max_steps(cfg, 2)
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_pytorch_const_parallel(aggregation_frequency: int, use_amp: bool) -> None:
    """mnist_pytorch on 8 slots, parameterized over aggregation frequency and AMP.

    Skips the incompatible AMP + aggregation-frequency > 1 combination.
    """
    if use_amp and aggregation_frequency > 1:
        pytest.skip("Mixed precision is not support with aggregation frequency > 1.")

    model_dir = conf.tutorials_path("mnist_pytorch")
    cfg = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    if use_amp:
        cfg = conf.set_amp_level(cfg, "O1")
    exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
def test_tf_keras_parallel(aggregation_frequency: int, tf2: bool) -> None:
    """cifar10_tf_keras on 8 slots, parameterized over aggregation frequency and TF image."""
    model_dir = conf.cv_examples_path("cifar10_tf_keras")
    cfg = conf.load_config(conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    cfg = conf.set_aggregation_frequency(cfg, aggregation_frequency)
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1
def test_pytorch_cifar10_parallel() -> None:
    """cifar10_cnn_pytorch on 8 slots; the latest checkpoint must load as an nn.Module."""
    model_dir = conf.official_examples_path("trial/cifar10_cnn_pytorch")
    cfg = conf.load_config(conf.official_examples_path("trial/cifar10_cnn_pytorch/const.yaml"))
    cfg = conf.set_max_steps(cfg, 2)
    cfg = conf.set_slots_per_trial(cfg, 8)

    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    trials = exp.experiment_trials(experiment_id)

    client = Determined(conf.make_master_url())
    checkpoint = client.get_trial(trials[0]["id"]).select_checkpoint(latest=True)
    model = checkpoint.load()
    assert isinstance(model, torch.nn.Module)
def test_distributed_logging() -> None:
    """Every worker rank must log completion of the single training batch."""
    cfg = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_max_length(cfg, {"batches": 1})

    exp_id = exp.run_basic_test_with_temp_config(cfg, conf.fixtures_path("pytorch_no_op"), 1)
    trial_id = exp.experiment_trials(exp_id)[0]["id"]

    for rank in range(cfg["resources"]["slots_per_trial"]):
        assert exp.check_if_string_present_in_trial_logs(
            trial_id, "finished train_batch for rank {}".format(rank)
        )
def test_tf_keras_native_parallel(tf2: bool) -> None:
    """cifar10_cnn_tf_keras with native parallelism on 8 slots, TF1 or TF2 image."""
    model_dir = conf.official_examples_path("trial/cifar10_cnn_tf_keras")
    cfg = conf.load_config(conf.official_examples_path("trial/cifar10_cnn_tf_keras/const.yaml"))
    cfg = conf.set_slots_per_trial(cfg, 8)
    cfg = conf.set_native_parallel(cfg, True)
    cfg = conf.set_max_length(cfg, {"batches": 200})
    if tf2:
        cfg = conf.set_tf2_image(cfg)
    else:
        cfg = conf.set_tf1_image(cfg)
    experiment_id = exp.run_basic_test_with_temp_config(cfg, model_dir, 1)
    assert len(exp.experiment_trials(experiment_id)) == 1