Beispiel #1
0
def test_pytorch_training_torchaudio_cpu(
    pytorch_training, ec2_connection, cpu_only, ec2_instance_type, pt111_and_above_only
):
    """Run PT_TORCHAUDIO_CMD against the PyTorch training image on EC2.

    The test is skipped when ``test_utils`` reports the image as incompatible
    with ``ec2_instance_type``. ``cpu_only`` and ``pt111_and_above_only`` are
    not referenced in the body; presumably they are pytest fixtures that gate
    the test -- confirm against conftest.
    """
    # The framework version used to be parsed here but was never consulted,
    # so the lookup has been dropped.
    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_TORCHAUDIO_CMD)
Beispiel #2
0
def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run PT_AMP_CMD against the PyTorch training image on EC2.

    Skipped for framework versions below 1.6, and when the image is reported
    incompatible with ``ec2_instance_type``.
    """
    _framework, pt_version = get_framework_and_version_from_tag(pytorch_training)
    minimum_amp_version = Version("1.6")
    if Version(pt_version) < minimum_amp_version:
        pytest.skip("Native AMP was introduced in PyTorch 1.6")

    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")

    execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD)
def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run PT_APEX_CMD against the PyTorch training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD)
Beispiel #4
0
def test_pytorch_telemetry_gpu(pytorch_training, ec2_connection, gpu_only,
                               ec2_instance_type, pt15_and_above_only):
    """Run PT_TELEMETRY_CMD against the PyTorch training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    ``gpu_only`` and ``pt15_and_above_only`` are not referenced in the body;
    presumably they are pytest fixtures that gate the test -- confirm against
    conftest.
    """
    if test_utils.is_image_incompatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    # Previously the function ended after the skip check, so it passed without
    # exercising anything. Run the telemetry command the same way the
    # TensorFlow telemetry training test does.
    execute_ec2_training_test(ec2_connection, pytorch_training,
                              PT_TELEMETRY_CMD)
def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type,
                     gpu_only, py3_only):
    """Run the smdebug test for a training image via ``run_smdebug_test``.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    TF1 images only run in the nightly context, where they get a 7200 timeout;
    every other image uses 2400.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    timeout = 2400
    if is_tf_version("1", training):
        # pytest.skip raises, so the longer timeout is only reached in nightly runs.
        if not is_nightly_context():
            pytest.skip(
                "TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context"
            )
        timeout = 7200

    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=timeout,
    )
Beispiel #6
0
def test_mxnet_train_mnist_gpu(mxnet_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run MX_MNIST_CMD against the MXNet training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(mxnet_training, ec2_instance_type):
        skip_reason = f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_MNIST_CMD)
Beispiel #7
0
def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only, ec2_instance_type):
    """Run ``ec2_pytorch_inference`` in "gpu" mode for the PyTorch inference image.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(pytorch_inference, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}")
    ec2_pytorch_inference(pytorch_inference, "gpu", ec2_connection, region)
Beispiel #8
0
def test_pytorch_mpi_gpu(
    pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type
):
    """
    Tests mpi backend by running the ``testPyTorchMpi`` container test script.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")

    mpi_script = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchMpi")
    execute_ec2_training_test(ec2_connection, pytorch_training, mpi_script)
def test_ec2_tensorflow_inference_gpu(
    tensorflow_inference, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run ``run_ec2_tensorflow_inference`` for the TensorFlow inference image.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_inference, ec2_instance_type):
        skip_reason = f"Image {tensorflow_inference} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region)
def test_pytorch_standalone_gpu(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run PT_STANDALONE_CMD against the PyTorch training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_STANDALONE_CMD)
Beispiel #11
0
def test_pytorch_s3_plugin_cpu(
    pytorch_training, ec2_connection, cpu_only, ec2_instance_type, pt17_and_above_only
):
    """Run PT_S3_PLUGIN_CMD against the PyTorch training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
        skip_reason = f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_S3_PLUGIN_CMD)
def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
    """Run CURAND_CMD against a training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``,
    when it is a TF1 image, or when "mxnet" appears in the image URI.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {training} is incompatible with instance type {ec2_instance_type}")

    unsupported_framework = is_tf_version("1", training) or "mxnet" in training
    if unsupported_framework:
        pytest.skip("Test is not configured for TF1 and MXNet")

    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
def test_mxnet_inference_telemetry_gpu(mxnet_inference, ec2_connection, gpu_only, ec2_instance_type):
    """Run MX_TELEMETRY_CMD against the MXNet inference image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(mxnet_inference, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)
Beispiel #14
0
def test_tensorflow_telemetry_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run TF_TELEMETRY_CMD against the TensorFlow training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type):
        skip_reason = f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_TELEMETRY_CMD)
def test_pytorch_linear_regression_gpu(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run PT_REGRESSION_CMD against the PyTorch training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)
Beispiel #16
0
def test_pytorch_train_dgl_gpu(
    pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type
):
    """Run PT_DGL_CMD against the PyTorch training image on EC2.

    Skipped for framework versions >= 1.10 whose tag carries CUDA "cu113", and
    when the image is reported incompatible with ``ec2_instance_type``.
    """
    _framework, pt_version = get_framework_and_version_from_tag(pytorch_training)
    cuda_version = get_cuda_version_from_tag(pytorch_training)

    # TODO: Remove when DGL gpu test on ecs get fixed
    is_pt110_or_newer = Version(pt_version) >= Version("1.10")
    if is_pt110_or_newer and cuda_version == "cu113":
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")

    execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
Beispiel #17
0
def test_tensorflow_keras_horovod_fp32(
    tensorflow_training, ec2_connection, tf2_only, gpu_only, ec2_instance_type
):
    """Run TF_KERAS_HVD_CMD_FP32 against the TensorFlow training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type):
        skip_reason = f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_FP32)
Beispiel #18
0
def test_tensorflow_addons_gpu(
    tensorflow_training, ec2_connection, tf2_only, gpu_only, ec2_instance_type
):
    """Run TF_ADDONS_CMD against the TensorFlow training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        tensorflow_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_ADDONS_CMD)
Beispiel #19
0
def test_pytorch_nccl_version(
    pytorch_training,
    ec2_connection,
    gpu_only,
    py3_only,
    ec2_instance_type,
    pt17_and_above_only,
):
    """
    Tests nccl version by running the ``testPyTorchNcclVersion`` container test script.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")

    nccl_script = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchNcclVersion")
    execute_ec2_training_test(ec2_connection, pytorch_training, nccl_script)
Beispiel #20
0
def test_pytorch_training_torchdata_cpu(
    pytorch_training, ec2_connection, cpu_only, ec2_instance_type, pt111_and_above_only
):
    """Run the torchdata command against the PyTorch training image on EC2.

    Framework versions matching ``==1.11.*`` run PT_TORCHDATA_DEV_CMD; all
    others run PT_TORCHDATA_CMD. Skipped when the image is reported
    incompatible with ``ec2_instance_type``.
    """
    _framework, pt_version = get_framework_and_version_from_tag(pytorch_training)

    incompatible = test_utils.is_image_incompatible_with_instance_type(
        pytorch_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}")

    is_pt_1_11 = Version(pt_version) in SpecifierSet("==1.11.*")
    torchdata_cmd = PT_TORCHDATA_DEV_CMD if is_pt_1_11 else PT_TORCHDATA_CMD
    execute_ec2_training_test(ec2_connection, pytorch_training, torchdata_cmd)
Beispiel #21
0
def test_pytorch_inference_telemetry_gpu(
    pytorch_inference, ec2_connection, gpu_only, ec2_instance_type, pt15_and_above_only
):
    """Run PT_TELEMETRY_CMD against the PyTorch inference image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(pytorch_inference, ec2_instance_type):
        skip_reason = f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    execute_ec2_inference_test(ec2_connection, pytorch_inference, PT_TELEMETRY_CMD)
Beispiel #22
0
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the standalone command against the TensorFlow training image on EC2.

    TF1 images run TF1_STANDALONE_CMD; all others run TF2_STANDALONE_CMD.
    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        tensorflow_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        standalone_cmd = TF1_STANDALONE_CMD
    else:
        standalone_cmd = TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_cmd)
Beispiel #23
0
def test_tensorflow_dataservice_gpu(
    tensorflow_training,
    ec2_connection,
    ec2_instance_ami,
    tf24_and_above_only,
    gpu_only,
    ec2_instance_type,
):
    """Run TF_DATASERVICE_TEST_CMD through ``run_data_service_test``.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        tensorflow_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    run_data_service_test(ec2_connection, ec2_instance_ami, tensorflow_training, TF_DATASERVICE_TEST_CMD)
def test_pytorch_with_horovod(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the ``testPTHVD`` container test script against the PyTorch training image.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
        skip_reason = f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)

    hvd_script = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPTHVD")
    execute_ec2_training_test(ec2_connection, pytorch_training, hvd_script)
def test_ec2_mxnet_gluonnlp_inference_gpu(
    mxnet_inference, ec2_connection, region, gpu_only, py3_only, ec2_instance_type
):
    """Run ``run_ec2_mxnet_inference`` with BERT_MODEL / "gluonnlp" in "gpu" mode.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(mxnet_inference, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}")
    run_ec2_mxnet_inference(
        mxnet_inference, BERT_MODEL, "gluonnlp", ec2_connection, "gpu", region, 90, 9091
    )
def test_mxnet_train_dgl_gpu(
    mxnet_training, ec2_connection, gpu_only, py3_only, ec2_instance_type
):
    """Run MX_DGL_CMD against the MXNet training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``
    or when "cu110" appears in the image URI.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(mxnet_training, ec2_instance_type)
    if incompatible:
        pytest.skip(f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}")

    is_cuda_110_image = "cu110" in mxnet_training
    if is_cuda_110_image:
        pytest.skip("Skipping dgl tests on cuda 11.0 until available")

    execute_ec2_training_test(ec2_connection, mxnet_training, MX_DGL_CMD)
def test_ec2_mxnet_squeezenet_inference_gpu(
    mxnet_inference, ec2_connection, region, gpu_only, ec2_instance_type
):
    """Run ``run_ec2_mxnet_inference`` with SQUEEZENET_MODEL / "squeezenet" in "gpu" mode.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    if test_utils.is_image_incompatible_with_instance_type(mxnet_inference, ec2_instance_type):
        skip_reason = f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        pytest.skip(skip_reason)
    run_ec2_mxnet_inference(
        mxnet_inference, SQUEEZENET_MODEL, "squeezenet", ec2_connection, "gpu", region, 80, 8081
    )
Beispiel #28
0
def test_tensorflow_with_horovod_gpu(
    tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only
):
    """Run the Horovod command against the TensorFlow training image on EC2.

    TF1 images use TF1_HVD_CMD, all others TF2_HVD_CMD; the instance type is
    appended to the command. ``large_shm`` is enabled when the instance type
    starts with ``p2.8xlarge`` or ``g3.16xlarge``. Skipped when the image is
    reported incompatible with ``ec2_instance_type``.
    """
    incompatible = test_utils.is_image_incompatible_with_instance_type(
        tensorflow_training, ec2_instance_type
    )
    if incompatible:
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")

    if is_tf_version("1", tensorflow_training):
        test_script = TF1_HVD_CMD
    else:
        test_script = TF2_HVD_CMD

    # re.match anchors at the start only, so this is a prefix test.
    shm_match = re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type)
    needs_large_shm = shm_match is not None

    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=f"{test_script} {ec2_instance_type}",
        large_shm=needs_large_shm,
    )
Beispiel #29
0
def test_mxnet_keras_gpu(mxnet_training, ec2_connection, gpu_only,
                         ec2_instance_type):
    """Run MX_KERAS_CMD against the MXNet training image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``
    or when the framework version parsed from the image tag is >= 1.9.0.
    """
    if test_utils.is_image_incompatible_with_instance_type(
            mxnet_training, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        )
    _, framework_version = test_utils.get_framework_and_version_from_tag(
        mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        # Plain string: the message has no placeholders, so no f-prefix is needed.
        pytest.skip("Keras support has been deprecated MXNet 1.9.0 onwards")
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_KERAS_CMD)
Beispiel #30
0
def test_pytorch_inference_torchdata_gpu(pytorch_inference, ec2_connection,
                                         gpu_only, ec2_instance_type,
                                         pt111_and_above_only):
    """Run PT_TORCHDATA_CMD against the PyTorch inference image on EC2.

    Skipped when the image is reported incompatible with ``ec2_instance_type``.
    """
    # The framework version used to be parsed here but was never consulted,
    # so the lookup has been dropped.
    # NOTE(review): the training torchdata test picks PT_TORCHDATA_DEV_CMD for
    # 1.11.* images; confirm whether this inference test should do the same.
    if test_utils.is_image_incompatible_with_instance_type(
            pytorch_inference, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_inference_test(ec2_connection, pytorch_inference,
                               PT_TORCHDATA_CMD)