def test_framework_and_cuda_version_gpu(gpu, ec2_connection):
    """
    Check that the framework and CUDA versions in the image tag match the ones on a running container.

    :param gpu: ECR image URI with "gpu" in the name
    :param ec2_connection: fixture to establish connection with an ec2 instance
    """
    image = gpu
    tested_framework, tag_framework_version = get_framework_and_version_from_tag(
        image)

    # Framework Version Check #
    # Skip framework version test for tensorflow-inference, since it doesn't have core TF installed
    if "tensorflow-inference" not in image:
        # Module name is "torch"
        if tested_framework == "pytorch":
            tested_framework = "torch"
        cmd = f"import {tested_framework}; print({tested_framework}.__version__)"
        output = ec2.execute_ec2_training_test(ec2_connection,
                                               image,
                                               cmd,
                                               executable="python")

        if is_canary_context():
            assert tag_framework_version in output.stdout.strip()
        else:
            assert tag_framework_version == output.stdout.strip()

    # CUDA Version Check #
    cuda_version = re.search(r"-cu(\d+)-", image).group(1)

    # MXNet inference containers do not currently have nvcc in /usr/local/cuda/bin, so check symlink
    if "mxnet-inference" in image:
        cuda_cmd = "readlink /usr/local/cuda"
    else:
        cuda_cmd = "nvcc --version"
    cuda_output = ec2.execute_ec2_training_test(
        ec2_connection, image, cuda_cmd, container_name="cuda_version_test")

    # Ensure that cuda version in tag is in the container
    assert cuda_version in cuda_output.stdout.replace(".", "")
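Several of the tests in this collection lean on a shared tag-parsing helper. Below is a minimal sketch of what get_framework_and_version_from_tag could look like, assuming image URIs shaped like "pytorch-training:1.8.1-gpu-py36-cu111-ubuntu18.04"; the real helper lives in the repo's test utilities, so treat this stand-in as illustrative only.

import re

def sketch_get_framework_and_version_from_tag(image_uri):
    # Hypothetical tag layout: "<framework>-<job_type>:<version>-<processor>-<py>-<cuda>-<os>"
    repo, tag = image_uri.split("/")[-1].split(":")
    framework = repo.rsplit("-", 1)[0]  # drop the "-training"/"-inference" suffix
    version = re.match(r"(\d+(\.\d+)+)", tag).group(1)
    return framework, version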
def framework_version_within_limit(metafunc_obj, image):
    """
    Check all pytest fixtures that encode framework version limits, and return True if all requirements are satisfied

    :param metafunc_obj: pytest metafunc object from which fixture names used by test function will be obtained
    :param image: Image URI for which the validation must be performed
    :return: True if all validation succeeds, else False
    """
    image_framework_name, _ = get_framework_and_version_from_tag(image)
    if image_framework_name == "tensorflow":
        tf2_requirement_failed = "tf2_only" in metafunc_obj.fixturenames and not is_tf_version(
            "2", image)
        tf25_requirement_failed = "tf25_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "2.5", image, "tensorflow")
        tf24_requirement_failed = "tf24_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "2.4", image, "tensorflow")
        tf23_requirement_failed = "tf23_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "2.3", image, "tensorflow")
        tf21_requirement_failed = "tf21_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "2.1", image, "tensorflow")
        if (tf2_requirement_failed or tf21_requirement_failed
                or tf24_requirement_failed or tf25_requirement_failed
                or tf23_requirement_failed):
            return False
    if image_framework_name == "mxnet":
        mx18_requirement_failed = "mx18_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "1.8", image, "mxnet")
        if mx18_requirement_failed:
            return False
    if image_framework_name == "pytorch":
        pt17_requirement_failed = "pt17_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "1.7", image, "pytorch")
        pt16_requirement_failed = "pt16_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "1.6", image, "pytorch")
        pt15_requirement_failed = "pt15_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "1.5", image, "pytorch")
        pt14_requirement_failed = "pt14_and_above_only" in metafunc_obj.fixturenames and is_below_framework_version(
            "1.4", image, "pytorch")
        if pt17_requirement_failed or pt16_requirement_failed or pt15_requirement_failed or pt14_requirement_failed:
            return False
    return True
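The gating above also depends on is_below_framework_version. A hedged sketch of such a check, reusing get_framework_and_version_from_tag and packaging.version (an assumption about its behavior, not the repo's actual implementation):

from packaging.version import Version

def sketch_is_below_framework_version(version_upper_bound, image_uri, framework):
    # True when the image belongs to the named framework and its tag version is below the bound
    image_framework, image_version = get_framework_and_version_from_tag(image_uri)
    return image_framework == framework and Version(image_version) < Version(version_upper_bound)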
Example #3
def test_ecs_pytorch_s3_plugin_training_cpu(cpu_only, ecs_container_instance,
                                            pytorch_training, training_cmd,
                                            ecs_cluster_name,
                                            pt17_and_above_only):
    """
    CPU resnet18 test for PyTorch Training using S3 plugin

    Instance Type - c5.9xlarge

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")
    instance_id, cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn,
                                         training_cmd, pytorch_training,
                                         instance_id)
Example #4
def test_smdataparallel_mnist_script_mode_multigpu(ecr_image, instance_type,
                                                   py_version,
                                                   sagemaker_session, tmpdir):
    """
    Tests SM Distributed DataParallel single-node via script mode
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    if Version(image_framework_version) in SpecifierSet("<1.6"):
        pytest.skip("Data Parallelism is supported on PyTorch v1.6 and above")

    instance_type = "ml.p3.16xlarge"
    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='smdataparallel_mnist_script_mode.sh',
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=1,
                          instance_type=instance_type,
                          sagemaker_session=sagemaker_session)

        pytorch.fit()
Example #5
def test_ecs_pytorch_training_dgl_cpu(
    cpu_only, py3_only, ecs_container_instance, pytorch_training, training_cmd, ecs_cluster_name
):
    """
    CPU DGL test for PyTorch Training

    Instance Type - c5.12xlarge

    DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
    on this function.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    # TODO: Remove when the DGL GPU test on ECS gets fixed
    if Version(image_framework_version) in SpecifierSet("==1.10.*"):
        pytest.skip("ECS test for DGL on GPU fails for PT 1.10")
    instance_id, cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn, training_cmd, pytorch_training, instance_id)
Example #6
def execute_pytorch_gpu_py3_imagenet_ec2_training_performance_test(
        connection, ecr_uri, test_cmd, region=DEFAULT_REGION):
    _, framework_version = get_framework_and_version_from_tag(ecr_uri)
    threshold = get_threshold_for_image(
        framework_version, PYTORCH_TRAINING_GPU_IMAGENET_THRESHOLD)
    repo_name, image_tag = ecr_uri.split("/")[-1].split(":")
    container_test_local_dir = os.path.join("$HOME", "container_tests")

    container_name = f"{repo_name}-performance-{image_tag}-ec2"

    # Make sure we are logged into ECR so we can pull the image
    connection.run(
        f"$(aws ecr get-login --no-include-email --region {region})",
        hide=True)
    connection.run(f"nvidia-docker pull -q {ecr_uri}")
    timestamp = time.strftime("%Y-%m-%d-%H-%M-%S")
    log_name = f"imagenet_{os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')}_{timestamp}.txt"
    log_location = os.path.join(container_test_local_dir, "benchmark", "logs",
                                log_name)
    # Run training command, display benchmark results to console
    try:
        connection.run(
            f"nvidia-docker run --user root "
            f"-e LOG_FILE={os.path.join(os.sep, 'test', 'benchmark', 'logs', log_name)} "
            f"-e PR_CONTEXT={1 if is_pr_context() else 0} "
            f"--shm-size 8G --env OMP_NUM_THREADS=1 --name {container_name} "
            f"-v {container_test_local_dir}:{os.path.join(os.sep, 'test')} "
            f"-v /home/ubuntu/:/root/:delegated "
            f"{ecr_uri} {os.path.join(os.sep, 'bin', 'bash')} -c {test_cmd}")
    finally:
        connection.run(f"docker rm -f {container_name}", warn=True, hide=True)
    ec2_performance_upload_result_to_s3_and_validate(
        connection,
        ecr_uri,
        log_location,
        "imagenet",
        {"Cost": threshold},
        post_process_pytorch_gpu_py3_imagenet_ec2_training_performance,
        log_name,
    )
def test_optimized_tensorflow_sagemaker_training_performance_singlenode(tensorflow_training, region, gpu_only, tf25_and_above_only):
    throughput_without_xla = run_sm_perf_test(
                                image_uri=tensorflow_training,
                                xla=False,
                                num_nodes=1,
                                region=region,
                                threshold=None
                                )
    # TODO: Add an absolute threshold for the XLA-accelerated benchmark to guard against regression.
    throughput_with_xla = run_sm_perf_test(
                                image_uri=tensorflow_training,
                                xla=True,
                                num_nodes=1,
                                region=region,
                                threshold=None
                                )
   
    _, framework_version = get_framework_and_version_from_tag(tensorflow_training)
    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"
Example #8
def test_framework_version_cpu(image):
    """
    Check that the framework version in the image tag is the same as the one on a running container.
    This function tests CPU, EIA, and Neuron images.

    :param image: ECR image URI
    """
    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    image_repo_name, _ = get_repository_and_tag_from_image_uri(image)
    if re.fullmatch(r"(pr-|beta-|nightly-)?tensorflow-inference(-eia)?",
                    image_repo_name):
        pytest.skip(
            "TF inference for CPU/GPU/EIA does not have core tensorflow installed"
        )

    tested_framework, tag_framework_version = get_framework_and_version_from_tag(
        image)

    # Framework name may include the "huggingface_" prefix; strip it to get the module name
    tested_framework = tested_framework.replace("huggingface_", "")
    # Module name is torch
    if tested_framework == "pytorch":
        tested_framework = "torch"
    ctx = Context()
    container_name = get_container_name("framework-version", image)
    start_container(container_name, image, ctx)
    output = run_cmd_on_container(
        container_name,
        ctx,
        f"import {tested_framework}; print({tested_framework}.__version__)",
        executable="python")
    if is_canary_context():
        assert tag_framework_version in output.stdout.strip()
    else:
        assert tag_framework_version == output.stdout.strip()
def test_tf_serving_version_cpu(tensorflow_inference):
    """
    For non-huggingface non-GPU TF inference images, check that the tag version matches the version of TF serving
    in the container.

    Huggingface includes MMS and core TF, hence the versioning scheme is based on the underlying
    tensorflow framework version, rather than the TF serving version.

    GPU inference images will be tested alongside `test_framework_and_cuda_version_gpu` in order to be
    judicious about GPU resources. This test can run directly on the host, and thus does not require
    additional resources to be spun up.

    @param tensorflow_inference: ECR image URI
    """
    # Set local variable to clarify contents of fixture
    image = tensorflow_inference

    if "gpu" in image:
        pytest.skip(
            "GPU images will have their framework version tested in test_framework_and_cuda_version_gpu"
        )
    if "neuron" in image:
        pytest.skip(
            "Neuron images will have their framework version tested in test_framework_and_neuron_sdk_version"
        )

    _, tag_framework_version = get_framework_and_version_from_tag(image)

    ctx = Context()
    container_name = get_container_name("tf-serving-version", image)
    start_container(container_name, image, ctx)
    output = run_cmd_on_container(container_name,
                                  ctx,
                                  "tensorflow_model_server --version",
                                  executable="bash")
    assert re.match(rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", output.stdout), \
        f"Cannot find model server version {tag_framework_version} in {output.stdout}"

    stop_and_remove_container(container_name, ctx)
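For clarity, the regex above accepts any banner that begins with the tag version followed by an optional non-digit suffix. A small self-contained check under that assumed output format (the sample banner is hypothetical):

import re

tag_framework_version = "2.4.1"
sample = "TensorFlow ModelServer: 2.4.1-rc2"
assert re.match(rf"TensorFlow ModelServer: {tag_framework_version}(\D+)?", sample)
# Note: re.match anchors only at the start, so "2.4.10" would also pass;
# a stricter pattern could append a word boundary after the version.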
def test_torchvision_nms_training(pytorch_training):
    """
    Check that the internally built torchvision binary is used to resolve the missing nms issue.
    :param pytorch_training: framework fixture for pytorch training
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(framework_version) >= Version("1.10.0"):
        pytest.skip(
            "Skipping this test for PT 1.10.0 and onward, since torch.ops.torchvision.nms api is outdated."
        )
    if Version(framework_version) == Version(
            "1.5.1") and get_processor_from_image_uri(
                pytorch_training) == "gpu":
        pytest.skip("Skipping this test for PT 1.5.1 GPU Training DLC images")
    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_training)
    start_container(container_name, pytorch_training, ctx)
    run_cmd_on_container(
        container_name,
        ctx,
        f"import torch; import torchvision; print(torch.ops.torchvision.nms)",
        executable="python")
Example #11
def test_sm_profiler_pt(pytorch_training):
    processor = get_processor_from_image_uri(pytorch_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) in SpecifierSet(">=1.12"):
        pytest.skip("sm profiler ZCC test is not supported in PT 1.12 and above")

    ctx = Context()

    profiler_tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"), get_container_name("smprof", pytorch_training), "smprofiler_tests"
    )
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    ctx.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True,
    )

    # PT test setup requirements
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        ctx.run(f"unzip {sm_tests_zip}", hide=True)
        with ctx.prefix("cd sagemaker-tests/tests/scripts/pytorch_scripts"):
            ctx.run("mkdir -p data", hide=True)
            ctx.run(
                "aws s3 cp s3://smdebug-testing/datasets/cifar-10-python.tar.gz data/cifar-10-batches-py.tar.gz",
                hide=True,
            )
            ctx.run("aws s3 cp s3://smdebug-testing/datasets/MNIST_pytorch.tar.gz data/MNIST_pytorch.tar.gz", hide=True)
            with ctx.prefix("cd data"):
                ctx.run("tar -zxf MNIST_pytorch.tar.gz", hide=True)
                ctx.run("tar -zxf cifar-10-batches-py.tar.gz", hide=True)

    run_sm_profiler_tests(pytorch_training, profiler_tests_dir, "test_profiler_pytorch.py", processor)
def _run_tag_success(image_uri, ec2_client, ec2_instance, ec2_connection):
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_tag_instance_success-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.remove_tags(Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}])

    if "tensorflow" in framework and job_type == "inference":
        env_vars_list = ecs_utils.get_ecs_tensorflow_environment_variables(processor, "saved_model_half_plus_two")
        env_vars = " ".join([f"-e {entry['name']}={entry['value']}" for entry in env_vars_list])
        ec2_connection.run(f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri}")
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True
        )

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id, ec2_client=ec2_client)
    assert expected_tag_key in ec2_instance_tags, f"{expected_tag_key} was not applied as an instance tag"
Example #13
def ec2_performance_tensorflow_inference(image_uri, processor, ec2_connection,
                                         ec2_instance_ami, region, threshold):
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    tf_version = "1" if is_tf_version("1", image_uri) else "2"
    _, tf_api_version = get_framework_and_version_from_tag(image_uri)

    num_iterations = 500 if is_pr_context() else 1000
    # Make sure we are logged into ECR so we can pull the image
    ec2_connection.run(
        f"$(aws ecr get-login --no-include-email --region {region})",
        hide=True)

    ec2_connection.run(f"{docker_cmd} pull -q {image_uri} ")

    # Run performance inference command, display benchmark results to console
    ec2_connection.run(f"pip3 install -U pip")
    ec2_connection.run(
        f"pip3 install boto3 grpcio 'tensorflow-serving-api<={tf_api_version}' --user --no-warn-script-location"
    )
    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    log_file = f"synthetic_{commit_info}_{time_str}.log"
    python_invoker = get_python_invoker(ec2_instance_ami)
    ec2_connection.run(
        f"{python_invoker} {container_test_local_dir}/bin/benchmark/tf{tf_version}_serving_perf.py "
        f"--processor {processor} --docker_image_name {image_uri} "
        f"--run_all_s3 --binary /usr/bin/tensorflow_model_server --get_perf --iterations {num_iterations} "
        f"2>&1 | tee {log_file}")
    ec2_performance_upload_result_to_s3_and_validate(
        ec2_connection,
        image_uri,
        log_file,
        "synthetic",
        threshold,
        post_process_inference,
        log_file,
    )
def test_sagemaker_studio_analytics_extension(training, package_name):
    framework, framework_version = test_utils.get_framework_and_version_from_tag(
        training)
    utility_package_minimum_framework_version = {
        "pytorch": "1.7",
        "tensorflow": "2.4"
    }
    utility_package_maximum_framework_version = {
        "pytorch": "1.8",
        "tensorflow": "2.6"
    }

    if (
        framework not in utility_package_minimum_framework_version
        or Version(framework_version) < Version(utility_package_minimum_framework_version[framework])
        or Version(framework_version) > Version(utility_package_maximum_framework_version[framework])
    ):
        pytest.skip(
            f"sagemaker_studio_analytics_extension is not installed in {framework} {framework_version} DLCs"
        )

    ctx = Context()
    container_name = test_utils.get_container_name(
        f"sagemaker_studio_analytics_extension-{package_name}", training)
    test_utils.start_container(container_name, training, ctx)

    # Optionally add version validation in the following steps, rather than just printing it.
    test_utils.run_cmd_on_container(container_name, ctx,
                                    f"pip list | grep -i {package_name}")
    import_package = package_name.replace("-", "_")
    import_test_cmd = (f"import {import_package}" if package_name in [
        "sagemaker-studio-sparkmagic-lib",
        "sagemaker-studio-analytics-extension"
    ] else f"import {import_package}; print({import_package}.__version__)")
    test_utils.run_cmd_on_container(container_name,
                                    ctx,
                                    import_test_cmd,
                                    executable="python")
def test_ecs_mxnet_training_dgl_gpu(gpu_only, py3_only, ecs_container_instance, mxnet_training, training_cmd,
                                    ecs_cluster_name):
    """
    GPU DGL test for MXNet Training

    Instance Type - p2.xlarge

    DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
    on this function.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    if Version(framework_version) >= Version('1.9.0'):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")
    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn, training_cmd, mxnet_training, instance_id,
                                         num_gpus=num_gpus)
Example #16
def test_utility_packages_using_import(training):
    """
    Verify that utility packages are installed in the Training DLC image

    :param training: training ECR image URI
    """
    ctx = Context()
    container_name = test_utils.get_container_name(
        "utility_packages_using_import", training)
    test_utils.start_container(container_name, training, ctx)

    framework, framework_version = test_utils.get_framework_and_version_from_tag(
        training)
    utility_package_minimum_framework_version = {
        "mxnet": "1.8",
        "pytorch": "1.7",
        "tensorflow2": "2.4",
        "tensorflow1": "1.15",
    }

    framework = "tensorflow1" if framework == "tensorflow" and framework_version.startswith(
        "1.") else "tensorflow2"
    if Version(framework_version) < Version(
            utility_package_minimum_framework_version[framework]):
        pytest.skip("Extra utility packages will be added going forward.")

    for package in UTILITY_PACKAGES_IMPORT:
        version = test_utils.run_cmd_on_container(
            container_name,
            ctx,
            f"import {package}; print({package}.__version__)",
            executable="python").stdout.strip()
        if package == "sagemaker":
            assert Version(version) > Version(
                "2"
            ), f"Sagemaker version should be > 2.0. Found version {sm_version}"
Example #17
def test_smmodelparallel_mnist_multigpu(ecr_image, instance_type, py_version,
                                        sagemaker_session, tmpdir):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if not (Version(image_framework_version)
            in SpecifierSet(">=1.6,<1.8")) or image_cuda_version != "cu110":
        pytest.skip(
            "Model Parallelism only supports CUDA 11 on PyTorch 1.6 and PyTorch 1.7"
        )

    with timeout(minutes=DEFAULT_TIMEOUT):
        pytorch = PyTorch(entry_point='smmodelparallel_pt_mnist.sh',
                          role='SageMakerRole',
                          image_uri=ecr_image,
                          source_dir=mnist_path,
                          instance_count=1,
                          instance_type=instance_type,
                          sagemaker_session=sagemaker_session)

        pytorch.fit()
def test_framework_version_gpu(gpu, ec2_connection):
    """
    Check that the framework version in the image tag is the same as the one on a running container.

    :param gpu: ECR image URI with "gpu" in the name
    :param ec2_connection: fixture to establish connection with an ec2 instance
    """
    image = gpu
    if "tensorflow-inference" in image:
        pytest.skip(msg="TF inference does not have core tensorflow installed")

    tested_framework, tag_framework_version = get_framework_and_version_from_tag(
        image)

    # Module name is "torch"
    if tested_framework == "pytorch":
        tested_framework = "torch"
    cmd = f'import {tested_framework}; print({tested_framework}.__version__)'
    output = ec2.execute_ec2_training_test(ec2_connection,
                                           image,
                                           cmd,
                                           executable="python")

    assert tag_framework_version == output.stdout.strip()
def test_smdataparallel_mnist(instance_types, ecr_image, py_version,
                              sagemaker_session, tmpdir):
    """
    Tests smddprun command via Estimator API distribution parameter
    """
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if Version(image_framework_version) < Version(
            "2.3.1") or image_cuda_version != "cu110":
        pytest.skip(
            "Data Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher"
        )
    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
    estimator = TensorFlow(entry_point='smdataparallel_mnist.py',
                           role='SageMakerRole',
                           image_uri=ecr_image,
                           source_dir=MNIST_PATH,
                           instance_count=2,
                           instance_type=instance_types,
                           sagemaker_session=sagemaker_session,
                           distribution=distribution)

    estimator.fit(
        job_name=unique_name_from_base('test-tf-smdataparallel-multi'))
Example #20
def test_ecs_pytorch_training_dgl_gpu(gpu_only, py3_only, ecs_container_instance, pytorch_training, training_cmd,
                                      ecs_cluster_name):
    """
    GPU DGL test for PyTorch Training

    Instance Type - p3.8xlarge

    DGL is only supported in py3, hence we have used the "py3_only" fixture to ensure py2 images don't run
    on this function.

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")

    instance_id, cluster_arn = ecs_container_instance

    num_gpus = ec2_utils.get_instance_num_gpus(instance_id)

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn, training_cmd, pytorch_training, instance_id,
                                         num_gpus=num_gpus)
Example #21
def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image,
                                   tmpdir, framework_version, test_script,
                                   num_processes):
    """
    Tests SM Modelparallel in sagemaker
    """
    instance_type = "ml.p3.16xlarge"
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    if Version(image_framework_version) < Version(
            "2.3.1") or image_cuda_version != "cu110":
        pytest.skip(
            "Model Parallelism is only supported on CUDA 11, and on TensorFlow 2.3.1 or higher"
        )
    smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
    estimator = TensorFlow(entry_point=test_script,
                           role='SageMakerRole',
                           instance_count=2,
                           instance_type=instance_type,
                           source_dir=smmodelparallel_path,
                           distributions={
                               "mpi": {
                                   "enabled":
                                   True,
                                   "processes_per_host":
                                   num_processes,
                                   "custom_mpi_options":
                                   "-verbose --mca orte_base_help_aggregate 0",
                               }
                           },
                           sagemaker_session=sagemaker_session,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           base_job_name='smp-test2')
    estimator.fit()
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    :param image: <str> ECR image URI
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0"
    }
    if test_utils.is_tf_version("1", image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf_version("2", image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and (
            reference_fw in references
            and Version(fw_version) < Version(references[reference_fw])):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                dockerfile_path = os.path.join(root_dir, root, filename)
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(
                        ".", "")

            # Raise errors if dlc major version label and python version arg are not found in Dockerfile
            if not dlc_version:
                raise DLCMajorVersionLabelNotFound(
                    f"Cannot find dlc_major_version label in {dockerfile}")
            if not python_version:
                raise DLCPythonVersionNotFound(
                    f"Cannot find PYTHON_VERSION arg in {dockerfile}")
            if python_version == python_major_minor_version:
                versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "tensorflow",
                "2.3",
                "gpu",
                "37",
                "training",
            ):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Test case explicitly for PyTorch 1.6.0 training gpu, since v2.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "pytorch",
                "1.6",
                "gpu",
                "36",
                "training",
            ):
        expected_versions = [v + 1 for v in expected_versions]
        expected_versions[0] = 1
        assert 2 not in actual_versions, (
            f"DLC v2.0 is deprecated in PyTorch 1.6.0 gpu containers, but found major version 2 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
Example #23
def can_run_smdataparallel(ecr_image):
    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
    image_cuda_version = get_cuda_version_from_tag(ecr_image)
    return Version(image_framework_version) in SpecifierSet(
        ">=1.6") and Version(image_cuda_version.strip("cu")) >= Version("110")
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = ((TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes
                        == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
                       if processor == "cpu" else
                       TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes
                       == 1 else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
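get_threshold_for_image appears here and in the EC2 benchmark above; a minimal sketch under the assumption that the threshold tables map version specifiers to throughput numbers (the table contents below are hypothetical):

from packaging.specifiers import SpecifierSet
from packaging.version import Version

def sketch_get_threshold_for_image(framework_version, threshold_table):
    # e.g. threshold_table = {">=2.5": 3900, "<2.5": 3600}
    for specifier, threshold in threshold_table.items():
        if Version(framework_version) in SpecifierSet(specifier):
            return threshold
    return None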
def test_eks_pytorch_dgl_single_node_training(pytorch_training, py3_only):
    """
    Function to create a pod using kubectl and given container image, and run
    DGL training with PyTorch backend
    Args:
        :param pytorch_training: the ECR URI
    """
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    if Version(image_framework_version) == Version(
            "1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when DGL gpu test on ecs get fixed
    if Version(image_framework_version) >= Version("1.10"):
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    training_result = False
    rand_int = random.randint(4001, 6000)

    yaml_path = os.path.join(
        os.sep, "tmp", f"pytorch_single_node_training_dgl_{rand_int}.yaml")
    pod_name = f"pytorch-single-node-training-dgl-{rand_int}"

    if is_below_framework_version("1.7", pytorch_training, "pytorch"):
        dgl_branch = "0.4.x"
    else:
        dgl_branch = "0.7.x"

    args = (
        f"git clone -b {dgl_branch} https://github.com/dmlc/dgl.git && "
        f"cd /dgl/examples/pytorch/gcn/ && DGLBACKEND=pytorch python train.py --dataset cora"
    )

    # TODO: Change hardcoded value to read a mapping from the EKS cluster instance.
    cpu_limit = 72
    cpu_limit = str(int(cpu_limit) // 2)  # integer division so the limit stays a whole number of cores

    if "gpu" in pytorch_training:
        args = args + " --gpu 0"
    else:
        args = args + " --gpu -1"

    search_replace_dict = {
        "<POD_NAME>": pod_name,
        "<CONTAINER_NAME>": pytorch_training,
        "<ARGS>": args,
        "<CPU_LIMIT>": cpu_limit,
    }

    eks_utils.write_eks_yaml_file_from_template(
        eks_utils.SINGLE_NODE_TRAINING_TEMPLATE_PATH, yaml_path,
        search_replace_dict)

    try:
        run("kubectl create -f {}".format(yaml_path))

        if eks_utils.is_eks_training_complete(pod_name):
            dgl_out = run("kubectl logs {}".format(pod_name)).stdout
            if "Test accuracy" in dgl_out:
                training_result = True
            else:
                eks_utils.LOGGER.info("**** training output ****")
                eks_utils.LOGGER.debug(dgl_out)

        assert training_result, "Training failed"
    finally:
        run("kubectl delete pods {}".format(pod_name))
Example #26
def test_pytorch_train_dgl_cpu(pytorch_training, ec2_connection, cpu_only, py3_only):
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    # TODO: Remove when the DGL GPU test on ECS gets fixed
    if Version(image_framework_version) >= Version("1.10"):
        pytest.skip("ECS test for DGL on GPU fails since PT 1.10")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Testrunner to execute SM profiler tests from DLC repo
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    try:
        ctx.run(
            "pip install -r "
            "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
            "pip install smdebug && "
            "pip uninstall -y pytest-rerunfailures",
            hide=True,
        )
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    enable_sm_data_parallel_tests = "true"
    if framework == "pytorch" and Version(version) < Version("1.6"):
        enable_sm_data_parallel_tests = "false"
    if framework == "tensorflow" and Version(version) < Version("2.3"):
        enable_sm_data_parallel_tests = "false"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release"
    }

    # Command to set all necessary environment variables
    export_cmd = " && ".join(f"export {key}={val}"
                             for key, val in smprof_configs.items())
    export_cmd = f"{export_cmd} && export ENV_CPU_TRAIN_IMAGE=test && export ENV_GPU_TRAIN_IMAGE=test && " \
                 f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}"

    test_results_outfile = os.path.join(
        os.getcwd(), f"{get_container_name('smprof', image)}.txt")
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        with ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
            try:
                ctx.run(
                    f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                    f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                    hide=True,
                )
                with open(test_results_outfile) as outfile:
                    result_data = json.load(outfile)
                    LOGGER.info(
                        f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}"
                    )
            except Exception as e:
                if os.path.exists(test_results_outfile):
                    with open(test_results_outfile) as outfile:
                        result_data = json.load(outfile)
                    raise SMProfilerRCTestFailure(
                        f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
                    ) from e
                raise
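For reference, the environment-variable chain assembled above expands to a single shell command; a short sketch with assumed values:

configs = {"use_current_branch": "false", "framework": "pytorch", "build_type": "release"}
export_cmd = " && ".join(f"export {key}={val}" for key, val in configs.items())
print(export_cmd)
# export use_current_branch=false && export framework=pytorch && export build_type=release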
Example #28
def test_pytorch_train_dgl_gpu(pytorch_training, ec2_connection, gpu_only, py3_only):
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    image_cuda_version = get_cuda_version_from_tag(pytorch_training)
    if Version(image_framework_version) == Version("1.6") and image_cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_DGL_CMD)
Example #29
def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only):
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) < Version("1.6"):
        pytest.skip("Native AMP was introduced in PyTorch 1.6")
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD)
Example #30
def test_pytorch_inference_torchdata_cpu(pytorch_inference, ec2_connection,
                                         cpu_only, pt111_and_above_only):
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_inference)
    execute_ec2_inference_test(ec2_connection, pytorch_inference,
                               PT_TORCHDATA_CMD)