def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type,
                     gpu_only, py3_only):
    """Run the smdebug test suite against a GPU training image on EC2.

    TF1 images run only in the nightly context (with an extended timeout)
    because their smdebug run can take up to two hours.
    """
    # Fix: skip when the image is NOT compatible with the instance type.
    # The original condition was inverted — it skipped compatible
    # combinations, contradicting its own skip message.
    if not test_utils.is_image_compatible_with_instance_type(
            training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    smdebug_test_timeout = 2400
    if is_tf_version("1", training):
        if is_nightly_context():
            # TF1 needs a longer budget: up to 2 hours.
            smdebug_test_timeout = 7200
        else:
            pytest.skip(
                "TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context"
            )

    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=smdebug_test_timeout,
    )
def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the NVIDIA Apex training test against a PyTorch GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD)
def test_mxnet_keras_gpu(mxnet_training, ec2_connection, gpu_only,
                         ec2_instance_type):
    """Run the Keras training test against an MXNet GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            mxnet_training, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_KERAS_CMD)
# Example #4
def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region,
                                   gpu_only, ec2_instance_type):
    """Run the PyTorch inference smoke test against a GPU inference image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_inference, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}"
        )
    ec2_pytorch_inference(pytorch_inference, "gpu", ec2_connection, region)
# Example #5
def test_ec2_tensorflow_inference_gpu(tensorflow_inference, ec2_connection,
                                      region, gpu_only, ec2_instance_type):
    """Run the TensorFlow Serving inference test (port 8500) on a GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            tensorflow_inference, ec2_instance_type):
        pytest.skip(
            f"Image {tensorflow_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500",
                                 region)
def test_mxnet_inference_telemetry_gpu(mxnet_inference, ec2_connection,
                                       gpu_only, ec2_instance_type):
    """Run the telemetry check against an MXNet GPU inference image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_inference_test(ec2_connection, mxnet_inference,
                               MX_TELEMETRY_CMD)
# Example #7
def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the cuRAND test on a GPU training image (TF2/PyTorch only)."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    if is_tf_version("1", training) or "mxnet" in training:
        pytest.skip("Test is not configured for TF1 and MXNet")
    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
def test_pytorch_linear_regression_gpu(pytorch_training, ec2_connection,
                                       gpu_only, ec2_instance_type):
    """Run the linear-regression training test on a PyTorch GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training,
                              PT_REGRESSION_CMD)
def test_pytorch_standalone_gpu(pytorch_training, ec2_connection, gpu_only,
                                ec2_instance_type):
    """Run the standalone training test on a PyTorch GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training,
                              PT_STANDALONE_CMD)
def test_mxnet_train_dgl_gpu(mxnet_training, ec2_connection, gpu_only,
                             py3_only, ec2_instance_type):
    """Run the DGL training test on an MXNet GPU image (skipped on CUDA 11.0)."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            mxnet_training, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        )
    if "cu110" in mxnet_training:
        pytest.skip("Skipping dgl tests on cuda 11.0 until available")
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_DGL_CMD)
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """Run the Horovod training test on a TensorFlow GPU image.

    Chooses the TF1 or TF2 Horovod script based on the image version, and
    requests a larger shared-memory segment on instance types known to need it.
    """
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    test_script = TF1_HVD_CMD if is_tf_version("1", tensorflow_training) else TF2_HVD_CMD
    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=test_script,
        # p2.8xlarge / g3.16xlarge need --shm-size increased for Horovod.
        large_shm=bool(re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type))
    )
def test_ec2_mxnet_gluonnlp_inference_gpu(mxnet_inference, ec2_connection,
                                          region, gpu_only, py3_only,
                                          ec2_instance_type):
    """Serve the GluonNLP BERT model from an MXNet GPU inference image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_mxnet_inference(mxnet_inference, BERT_MODEL, "gluonnlp",
                            ec2_connection, "gpu", region, 90, 9091)
def test_ec2_mxnet_squeezenet_inference_gpu(mxnet_inference, ec2_connection,
                                            region, gpu_only,
                                            ec2_instance_type):
    """Serve the SqueezeNet model from an MXNet GPU inference image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_mxnet_inference(mxnet_inference, SQUEEZENET_MODEL, "squeezenet",
                            ec2_connection, "gpu", region, 80, 8081)
def test_pytorch_with_horovod(pytorch_training, ec2_connection, gpu_only,
                              ec2_instance_type):
    """Run the Horovod training test on a PyTorch GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    test_cmd = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests",
                            "testPTHVD")
    execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only,
                     ec2_instance_type):
    """Run the native AMP training test on a PyTorch GPU image (PT >= 1.6)."""
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    if Version(image_framework_version) < Version("1.6"):
        pytest.skip("Native AMP was introduced in PyTorch 1.6")
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD)
def test_pytorch_mpi(pytorch_training, ec2_connection, gpu_only, py3_only,
                     ec2_instance_type):
    """Test the MPI distributed backend on a PyTorch GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    test_cmd = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests",
                            "testPyTorchMpi")
    execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
def test_smprofiler_gpu(training, ec2_connection, region, ec2_instance_type,
                        gpu_only, py3_only, tf23_and_above_only,
                        pt17_and_above_only):
    """Run the SageMaker profiler tests on a GPU training image.

    Running the profiler tests for pytorch and tensorflow2 frameworks only.
    This code needs to be modified past reInvent 2020.
    """
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(
            training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    framework = get_framework_from_image_uri(training)
    if framework not in ["pytorch", "tensorflow2"]:
        return
    smdebug_test_timeout = 2400
    run_smprofiler_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=smdebug_test_timeout,
    )
def test_tensorflow_train_mnist_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the MNIST training test on a TensorFlow GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_MNIST_CMD)
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the standalone training test (TF1 or TF2 variant) on a GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    test_script = TF1_STANDALONE_CMD if is_tf_version("1", tensorflow_training) else TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, test_script)
def test_tensorflow_distribute_dataservice_gpu(
        tensorflow_training, ec2_connection, tf24_and_above_only, gpu_only, ec2_instance_type
):
    """Run the tf.data service distribute test on a TF >= 2.4 GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    run_data_service_test(ec2_connection, tensorflow_training, TF_DATASERVICE_DISTRIBUTE_TEST_CMD)
def test_tensorflow_keras_horovod_fp32(tensorflow_training, ec2_connection, tf2_only, gpu_only, ec2_instance_type):
    """Run the Keras + Horovod FP32 training test on a TF2 GPU image."""
    # Fix: skip when the image is NOT compatible with the instance type
    # (original condition was inverted relative to its skip message).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_FP32)