def test_nas_search() -> None:
    config = conf.load_config(
        conf.experimental_path("trial/rsws_nas/train_one_arch.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/rsws_nas"), 1)
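Every example on this page follows the same shape: load a YAML experiment config, cap the searcher's max_length, and run the experiment once through the shared harness. As a rough illustration, here is a minimal sketch of what a set_max_length-style helper plausibly does, assuming the loaded config is a plain dict with a searcher section; the real helper in the test utilities may differ.

import copy
from typing import Any, Dict

def set_max_length(config: Dict[str, Any], max_length: Dict[str, int]) -> Dict[str, Any]:
    # Work on a deep copy so the caller's config is left untouched.
    config = copy.deepcopy(config)
    config.setdefault("searcher", {})["max_length"] = max_length
    return config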
Example #2
def test_mnist_estimator_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mnist_estimator/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mnist_estimator"), 1)
Example #3
def test_resnet50() -> None:
    config = conf.load_config(conf.experimental_path("trial/resnet50_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/resnet50_tf_keras"), 1
    )
Example #4
def test_deformabledetr_coco_pytorch_const() -> None:
    config = conf.load_config(
        conf.cv_examples_path("deformabledetr_coco_pytorch/const_fake.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("deformabledetr_coco_pytorch"), 1)
Example #5
def test_fashion_mnist_tf_keras_distributed() -> None:
    config = conf.load_config(
        conf.tutorials_path("fashion_mnist_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("fashion_mnist_tf_keras"), 1)
Example #6
def test_launch_layer_cifar(
        collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_slots_per_trial(config, 1)
    config = conf.set_profiling_enabled(config)
    config = conf.set_entrypoint(
        config,
        "python3 -m determined.launch.horovod --autohorovod --trial model_def:CIFARTrial"
    )

    experiment_id = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
    trials = exp.experiment_trials(experiment_id)
    # Sanity check: the latest checkpoint from the trial should load on CPU.
    (
        Determined(conf.make_master_url())
        .get_trial(trials[0].trial.id)
        .select_checkpoint(latest=True)
        .load(map_location="cpu")
    )

    collect_trial_profiles(trials[0].trial.id)

    assert exp.check_if_string_present_in_trial_logs(
        trials[0].trial.id,
        "allocation stopped after resources exited successfully with a zero exit code",
    )
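The final assertion boils down to a substring scan over the trial's log lines. A minimal sketch, assuming the logs are already available as an iterable of strings; how exp.check_if_string_present_in_trial_logs actually fetches them from the master is a detail of the test utilities.

from typing import Iterable

def string_present_in_logs(logs: Iterable[str], needle: str) -> bool:
    # True as soon as any log line contains the needle verbatim.
    return any(needle in line for line in logs)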
Example #7
def test_unets_tf_keras_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("unets_tf_keras/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("unets_tf_keras"), 1)
Example #8
def test_tf_keras_const_warm_start(
        tf2: bool, collect_trial_profiles: Callable[[int], None]) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)
    config = conf.set_profiling_enabled(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial.trial.id

    assert len(first_trial.workloads) == 4
    checkpoints = exp.workloads_with_checkpoint(first_trial.workloads)
    first_checkpoint_uuid = checkpoints[0].uuid

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for t in trials:
        assert t.trial.warmStartCheckpointUuid != ""
        assert t.trial.warmStartCheckpointUuid == first_checkpoint_uuid
    trial_id = trials[0].trial.id
    collect_trial_profiles(trial_id)
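The checkpoint lookup near the top of this test filters the first trial's workloads down to those that produced a checkpoint. A hypothetical sketch, assuming each workload record exposes an optional checkpoint attribute; the real exp.workloads_with_checkpoint may key off a different field.

from typing import Any, List

def workloads_with_checkpoint(workloads: List[Any]) -> List[Any]:
    # Keep only the checkpoint records from workloads that have one.
    return [w.checkpoint for w in workloads if getattr(w, "checkpoint", None)]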
Example #9
def test_word_language_transformer_const() -> None:
    config = conf.load_config(conf.nlp_examples_path("word_language_model/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = config.copy()
    config["hyperparameters"]["model_cls"] = "Transformer"

    exp.run_basic_test_with_temp_config(config, conf.nlp_examples_path("word_language_model"), 1)
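Example #10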
def test_pytorch_parallel() -> None:
    config = conf.load_config(conf.tutorials_path("mnist_pytorch/const.yaml"))
    config = conf.set_slots_per_trial(config, 8)
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_tensor_auto_tuning(config, True)
    config = conf.set_perform_initial_validation(config, True)

    exp_id = exp.run_basic_test_with_temp_config(
        config, conf.tutorials_path("mnist_pytorch"), 1)
    exp.assert_performed_initial_validation(exp_id)

    # Check on record/batch counts we emitted in logs.
    validation_size = 10000
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    num_workers = config.get("resources", {}).get("slots_per_trial", 1)
    global_batch_size = config["hyperparameters"]["global_batch_size"]
    scheduling_unit = config.get("scheduling_unit", 100)
    per_slot_batch_size = global_batch_size // num_workers
    exp_val_batches = (validation_size + per_slot_batch_size - 1) // per_slot_batch_size  # ceil
    patterns = [
        # Expect two copies of matching training reports.
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"trained: {scheduling_unit * global_batch_size} records.*in {scheduling_unit} batches",
        f"validated: {validation_size} records.*in {exp_val_batches} batches",
    ]
    trial_id = exp.experiment_trials(exp_id)[0].trial.id
    exp.assert_patterns_in_trial_logs(trial_id, patterns)
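To make the log-pattern arithmetic concrete, here is a worked example with an illustrative global_batch_size of 64; the actual value is read from the mnist_pytorch const.yaml and may differ.

validation_size = 10000
global_batch_size = 64   # illustrative; the test reads it from the config
num_workers = 8          # slots_per_trial, as set above
scheduling_unit = 100    # the default when the config omits it

per_slot_batch_size = global_batch_size // num_workers    # 8
records_per_report = scheduling_unit * global_batch_size  # 6400
exp_val_batches = (validation_size + per_slot_batch_size - 1) // per_slot_batch_size  # 1250
assert (per_slot_batch_size, records_per_report, exp_val_batches) == (8, 6400, 1250)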
Example #11
def test_pl_mnist() -> None:
    exp_dir = "mnist_pl"
    config = conf.load_config(conf.cv_examples_path(exp_dir + "/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path(exp_dir), 1)
Example #12
def test_epoch_sync(num_workers: int, global_batch_size: int,
                    dataset_len: int) -> None:
    """
    Test that epoch_idx is synchronized across all workers regardless of whether the
    number of batches is evenly divisible by the number of workers.
    """
    config = conf.load_config(conf.fixtures_path("pytorch_no_op/const.yaml"))
    config = conf.set_slots_per_trial(config, num_workers)
    max_len_batches = 10
    config = conf.set_max_length(config, {"batches": max_len_batches})
    config = conf.set_hparam(config, "dataset_len", dataset_len)
    config = conf.set_global_batch_size(config, global_batch_size)

    e_id = exp.run_basic_test_with_temp_config(
        config, conf.fixtures_path("pytorch_no_op"), 1)
    t_id = exp.experiment_trials(e_id)[0].trial.id

    batches_per_epoch = (dataset_len + global_batch_size -
                         1) // global_batch_size  # ceil

    for batch_idx in range(max_len_batches):
        epoch_idx = batch_idx // batches_per_epoch
        for rank in range(config["resources"]["slots_per_trial"]):
            assert exp.check_if_string_present_in_trial_logs(
                t_id,
                f"rank {rank} finished batch {batch_idx} in epoch {epoch_idx}")
def test_pix2pix_facades_const() -> None:
    config = conf.load_config(
        conf.gan_examples_path("pix2pix_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("pix2pix_tf_keras"), 1)
Example #14
def test_pytorch_const_with_amp(api_style: str) -> None:
    config = conf.load_config(
        conf.fixtures_path("pytorch_amp/" + api_style + "_amp.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(config,
                                        conf.fixtures_path("pytorch_amp"), 1)
Example #15
def test_imagenet_nas() -> None:
    config = conf.load_config(conf.experimental_path("trial/imagenet_nas_arch_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/imagenet_nas_arch_pytorch"), 1
    )
Example #16
def test_protein_pytorch_geometric() -> None:
    config = conf.load_config(conf.graphs_examples_path("proteins_pytorch_geometric/const.yaml"))
    config = conf.set_max_length(config, {"epochs": 50})

    exp.run_basic_test_with_temp_config(
        config, conf.graphs_examples_path("proteins_pytorch_geometric"), 1
    )
Example #17
def test_tf_keras_const_warm_start(tf2: bool) -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_tf_keras/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 1000})
    config = conf.set_tf2_image(config) if tf2 else conf.set_tf1_image(config)

    experiment_id1 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)
    trials = exp.experiment_trials(experiment_id1)
    assert len(trials) == 1

    first_trial = trials[0]
    first_trial_id = first_trial["id"]

    assert len(first_trial["steps"]) == 2
    first_checkpoint_id = first_trial["steps"][1]["checkpoint"]["id"]

    # Add a source trial ID to warm start from.
    config["searcher"]["source_trial_id"] = first_trial_id

    experiment_id2 = exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_tf_keras"), 1)

    # The new trials should have a warm start checkpoint ID.
    trials = exp.experiment_trials(experiment_id2)
    assert len(trials) == 1
    for trial in trials:
        assert trial["warm_start_checkpoint_id"] == first_checkpoint_id
def test_text_classification_glue() -> None:
    example_path = conf.model_hub_examples_path("huggingface/text-classification")
    config = conf.load_config(os.path.join(example_path, "glue_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)
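The model-hub tests additionally swap in a docker image via set_docker_image. A hypothetical sketch of such an override, assuming the experiment config carries the image under environment.image and that the image name is passed in explicitly; the real helper in these tests may instead pin a fixed model-hub image.

import copy
from typing import Any, Dict

def set_docker_image(config: Dict[str, Any], image: str) -> Dict[str, Any]:
    # Deep-copy first so the caller's config is not mutated.
    config = copy.deepcopy(config)
    config.setdefault("environment", {})["image"] = image
    return config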
Example #19
def test_gaea_pytorch_distributed() -> None:
    config = conf.load_config(
        conf.nas_examples_path("gaea_pytorch/eval/distributed_no_data_download.yaml")
    )
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(config, conf.nas_examples_path("gaea_pytorch/eval"), 1)
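Example #20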
def test_language_modeling_plm() -> None:
    example_path = conf.model_hub_examples_path("huggingface/language-modeling")
    config = conf.load_config(os.path.join(example_path, "plm_config.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)
Example #21
def test_gan_mnist_pytorch_const() -> None:
    config = conf.load_config(
        conf.gan_examples_path("gan_mnist_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.gan_examples_path("gan_mnist_pytorch"), 1)
Example #22
def test_bert_glue() -> None:
    config = conf.load_config(conf.experimental_path("trial/bert_glue_pytorch/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/bert_glue_pytorch/"), 1
    )
Example #23
def test_mmdetection_pytorch_const() -> None:
    config = conf.load_config(
        conf.cv_examples_path("mmdetection_pytorch/const_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("mmdetection_pytorch"), 1)
Example #24
def test_mnist_tp_to_estimator() -> None:
    config = conf.load_config(conf.experimental_path("trial/mnist_tp_to_estimator/const.yaml"))
    config = conf.set_max_length(config, {"batches": 32})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/mnist_tp_to_estimator"), 1
    )
Example #25
def test_cifar10_pytorch_distributed() -> None:
    config = conf.load_config(
        conf.cv_examples_path("cifar10_pytorch/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.cv_examples_path("cifar10_pytorch"), 1)
Example #26
def test_mnist_pytorch_multi_output() -> None:
    config = conf.load_config(conf.experimental_path("trial/mnist_pytorch_multi_output/const.yaml"))
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.experimental_path("trial/mnist_pytorch_multi_output"), 1
    )
Example #27
def test_maskrcnn_distributed_fake() -> None:
    example_path = conf.fixtures_path("mmdetection")
    config = conf.load_config(os.path.join(example_path, "distributed_fake_data.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = set_docker_image(config)

    exp.run_basic_test_with_temp_config(config, example_path, 1)
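Example #28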
def test_deepspeed_pipeline_parallel() -> None:
    config = conf.load_config(conf.deepspeed_examples_path("pipeline_parallelism/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = conf.set_min_validation_period(config, {"batches": 100})

    exp.run_basic_test_with_temp_config(
        config, conf.deepspeed_examples_path("pipeline_parallelism"), 1
    )
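Example #29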
def test_gaea_pytorch_const() -> None:
    config = conf.load_config(
        conf.nas_examples_path("gaea_pytorch/eval/const.yaml"))
    config = conf.set_global_batch_size(config, 32)
    config = conf.set_max_length(config, {"batches": 200})

    exp.run_basic_test_with_temp_config(
        config, conf.nas_examples_path("gaea_pytorch/eval"), 1)
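Example #30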
def test_word_language_lstm_const() -> None:
    config = conf.load_config(conf.nlp_examples_path("word_language_model/distributed.yaml"))
    config = conf.set_max_length(config, {"batches": 200})
    config = config.copy()
    config["hyperparameters"]["model_cls"] = "LSTM"
    config["hyperparameters"]["tied"] = False

    exp.run_basic_test_with_temp_config(config, conf.nlp_examples_path("word_language_model"), 1)