def test_mask_rcnn_64_slots() -> None:
    """Run the 64-GPU FasterRCNN_tp experiment and check accuracy and timing.

    The experiment is launched with a five-hour wait budget. Afterwards the
    "mAP(bbox)/IoU=0.5:0.95" validation metric from the last step and the
    experiment durations are fetched, printed, and checked against fixed
    thresholds.
    """
    config_file = conf.experimental_path("FasterRCNN_tp/64-gpus.yaml")
    model_dir = conf.experimental_path("FasterRCNN_tp/")
    experiment_id = exp.run_basic_test(
        config_file,
        model_dir,
        1,
        max_wait_secs=5 * 60 * 60,
    )

    metric_name = "mAP(bbox)/IoU=0.5:0.95"
    metric_value = exp.get_validation_metric_from_last_step(
        experiment_id, 0, metric_name
    )
    durations = exp.get_experiment_durations(experiment_id, 0)

    # Whatever part of the total experiment duration is not spent training,
    # validating, or checkpointing is reported as time waiting for agents.
    wait_for_agents = durations.experiment_duration
    for accounted in (
        durations.training_duration,
        durations.validation_duration,
        durations.checkpoint_duration,
    ):
        wait_for_agents = wait_for_agents - accounted

    print(metric_name, metric_value)
    print(durations)
    print(f"wait for agents duration: {wait_for_agents}")

    training_limit = datetime.timedelta(hours=2, minutes=45)
    validation_limit = datetime.timedelta(hours=1, minutes=15)

    assert metric_value > 0.375
    assert durations.training_duration < training_limit
    assert durations.validation_duration < validation_limit
def test_pytorch_const_multi_output() -> None:
    """Run the mnist_pytorch_multi_output const config with max steps set to 2."""
    config_file = conf.experimental_path("mnist_pytorch_multi_output/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("mnist_pytorch_multi_output")
    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
def test_resnet50() -> None:
    """Run the resnet50_tf_keras const config with max steps set to 2."""
    config_file = conf.experimental_path("resnet50_tf_keras/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("resnet50_tf_keras")
    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
def test_bert_glue() -> None:
    """Run the bert_glue_pytorch const config with max steps set to 2."""
    config_file = conf.experimental_path("bert_glue_pytorch/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("bert_glue_pytorch/")
    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
def test_mnist_tp_to_estimator() -> None:
    """Run the mnist_tp_to_estimator const config with max steps set to 2."""
    config_file = conf.experimental_path("mnist_tp_to_estimator/const.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("mnist_tp_to_estimator")
    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
def test_nas_search() -> None:
    """Run the nas_search train_one_arch config with max steps set to 2."""
    config_file = conf.experimental_path("nas_search/train_one_arch.yaml")
    short_config = conf.set_max_steps(conf.load_config(config_file), 2)
    model_dir = conf.experimental_path("nas_search")
    exp.run_basic_test_with_temp_config(short_config, model_dir, 1)
def test_faster_rcnn() -> None:
    """Run the FasterRCNN_tp 16-GPU config cut down to 2 max steps and 1 slot.

    The run is given a wait budget of 4800 seconds.
    """
    config_file = conf.experimental_path("FasterRCNN_tp/16-gpus.yaml")
    reduced = conf.load_config(config_file)
    reduced = conf.set_max_steps(reduced, 2)
    reduced = conf.set_slots_per_trial(reduced, 1)

    model_dir = conf.experimental_path("FasterRCNN_tp")
    exp.run_basic_test_with_temp_config(
        reduced,
        model_dir,
        1,
        max_wait_secs=4800,
    )
def run_tf_keras_mnist_data_layer_test(tf2: bool, storage_type: str) -> None:
    """Run the data_layer_mnist_tf_keras const config for 2 max steps.

    Args:
        tf2: When true the TF2 image is applied to the config, otherwise the
            TF1 image.
        storage_type: "lfs" applies the shared-filesystem data layer; any
            other value applies the S3 data layer.
    """
    config_file = conf.experimental_path("data_layer_mnist_tf_keras/const.yaml")
    prepared = conf.set_max_steps(conf.load_config(config_file), 2)

    if tf2:
        prepared = conf.set_tf2_image(prepared)
    else:
        prepared = conf.set_tf1_image(prepared)

    # Only "lfs" selects the shared filesystem; everything else falls to S3.
    apply_data_layer = (
        conf.set_shared_fs_data_layer
        if storage_type == "lfs"
        else conf.set_s3_data_layer
    )
    prepared = apply_data_layer(prepared)

    model_dir = conf.experimental_path("data_layer_mnist_tf_keras")
    exp.run_basic_test_with_temp_config(prepared, model_dir, 1)
def test_mnist_estimator_data_layer_parallel(storage_type: str) -> None:
    """Run the data_layer_mnist_estimator const config on 8 slots per trial.

    The config is limited to 2 max steps and uses the TF1 image.

    Args:
        storage_type: "lfs" applies the shared-filesystem data layer; any
            other value applies the S3 data layer.
    """
    config_file = conf.experimental_path("data_layer_mnist_estimator/const.yaml")
    prepared = conf.load_config(config_file)
    prepared = conf.set_max_steps(prepared, 2)
    prepared = conf.set_slots_per_trial(prepared, 8)
    prepared = conf.set_tf1_image(prepared)

    # Only "lfs" selects the shared filesystem; everything else falls to S3.
    apply_data_layer = (
        conf.set_shared_fs_data_layer
        if storage_type == "lfs"
        else conf.set_s3_data_layer
    )
    prepared = apply_data_layer(prepared)

    model_dir = conf.experimental_path("data_layer_mnist_estimator")
    exp.run_basic_test_with_temp_config(prepared, model_dir, 1)
def test_mnist_estimator_adaptive_with_data_layer() -> None:
    """Run the mnist_estimator adaptive fixture config with the data layer.

    The config comes from the fixtures path, while the model directory is the
    experimental data_layer_mnist_estimator example. The TF2 image and the
    shared-filesystem data layer are applied, and None is passed as the final
    positional argument of the runner.
    """
    config_file = conf.fixtures_path("mnist_estimator/adaptive.yaml")
    prepared = conf.load_config(config_file)
    prepared = conf.set_tf2_image(prepared)
    prepared = conf.set_shared_fs_data_layer(prepared)

    model_dir = conf.experimental_path("data_layer_mnist_estimator")
    exp.run_basic_test_with_temp_config(prepared, model_dir, None)
class NativeImplementations:
    """Catalogue of NativeImplementation entries for the native examples.

    Each entry records the working directory, the command to launch, the
    experiment configuration, the expected number of steps per trial and of
    trials, and the minimum number of GPUs required.
    """

    PytorchMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_pytorch"),
        command=[
            "python",
            conf.experimental_path("native_mnist_pytorch/trial_impl.py"),
        ],
        configuration={
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "validation_error",
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    TFEstimatorMNISTCNNSingle = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_estimator"),
        command=[
            "python",
            conf.experimental_path("native_mnist_estimator/native_impl.py"),
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "accuracy",
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    TFEstimatorMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_mnist_estimator"),
        command=[
            "python",
            conf.experimental_path("native_mnist_estimator/trial_impl.py"),
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "accuracy",
            },
            "max_restarts": 0,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # Train a single tf.keras model using fit().
    TFKerasMNISTCNNSingleFit = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path("native_fashion_mnist_tf_keras/native_impl.py"),
            "--use-fit",
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy",
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    # Train a single tf.keras model using fit() on multiple GPUs.
    TFKerasMNISTCNNSingleFitParallel = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path("native_fashion_mnist_tf_keras/native_impl.py"),
            "--use-fit",
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy",
            },
            "resources": {
                "slots_per_trial": 2,
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=2,
    )

    # Train a single tf.keras model using fit_generator().
    TFKerasMNISTCNNSingleFitGenerator = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path("native_fashion_mnist_tf_keras/native_impl.py"),
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy",
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )

    TFKerasMNISTCNNSingleGeneric = NativeImplementation(
        cwd=conf.experimental_path("native_fashion_mnist_tf_keras"),
        command=[
            "python",
            conf.experimental_path("native_fashion_mnist_tf_keras/trial_impl.py"),
        ],
        configuration={
            "batches_per_step": 4,
            "checkpoint_storage": experiment.shared_fs_checkpoint_config(),
            "searcher": {
                "name": "single",
                "max_steps": 1,
                "metric": "val_accuracy",
            },
            "max_restarts": 2,
        },
        num_expected_steps_per_trial=1,
        num_expected_trials=1,
        min_num_gpus_required=0,
    )