Code Example #1
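Runs the SageMaker Profiler test suite against a TensorFlow training image: stages the sagemaker-tests.zip bundle from S3, appends tensorflow-datasets to the test requirements, and dispatches to run_sm_profiler_tests. TF1 images and processors other than cpu/gpu are skipped.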
def test_sm_profiler_tf(tensorflow_training):
    if is_tf_version("1", tensorflow_training):
        pytest.skip(
            "Skipping test on TF1, since there are no smprofiler config files for TF1"
        )
    processor = get_processor_from_image_uri(tensorflow_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    ctx = Context()

    profiler_tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"),
        get_container_name("smprof", tensorflow_training), "smprofiler_tests")
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    ctx.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True)
    ctx.run(f"cd {profiler_tests_dir} && unzip {sm_tests_zip}", hide=True)

    # Install tf datasets
    ctx.run(
        f"echo 'tensorflow-datasets==4.0.1' >> "
        f"{profiler_tests_dir}/sagemaker-tests/tests/scripts/tf_scripts/requirements.txt",
        hide=True,
    )

    run_sm_profiler_tests(tensorflow_training, profiler_tests_dir,
                          "test_profiler_tensorflow.py", processor)
Code Example #2
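Checks that a PyTorch inference image resolves torch.ops.torchvision.nms, confirming the internally built torchvision binary is in place. PT 1.5.1 GPU, pre-1.5.1 EIA, and Neuron images are skipped.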
def test_torchvision_nms_inference(pytorch_inference):
    """
    Check that the internally built torchvision binary is used to resolve the missing nms issue.
    :param pytorch_inference: framework fixture for pytorch inference
    """
    _, framework_version = get_framework_and_version_from_tag(
        pytorch_inference)
    if (Version(framework_version) == Version("1.5.1")
            and get_processor_from_image_uri(pytorch_inference) == "gpu"):
        pytest.skip("Skipping this test for PT 1.5.1 GPU Inference DLC images")
    if "eia" in pytorch_inference and Version(framework_version) < Version(
            "1.5.1"):
        pytest.skip(
            "This test does not apply to PT EIA images for PT versions less than 1.5.1"
        )
    if "neuron" in pytorch_inference:
        pytest.skip(
            "Skipping because this is not relevant to PT Neuron images")
    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_inference)
    start_container(container_name, pytorch_inference, ctx)
    run_cmd_on_container(
        container_name,
        ctx,
        f"import torch; import torchvision; print(torch.ops.torchvision.nms)",
        executable="python")
Code Example #3
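Telemetry negative test: with the EC2 instance-metadata endpoint blocked (so no instance-role credentials are available), importing the framework or serving a TensorFlow model must still succeed, and the auto-generated telemetry tag must not be applied to the instance.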
def _run_instance_role_disabled(image_uri, ec2_client, ec2_instance,
                                ec2_connection):
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(
        image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_bad_instance_role-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.delete_tags(Resources=[ec2_instance_id],
                               Tags=[{"Key": expected_tag_key}])

    # Disable access to EC2 instance metadata
    ec2_connection.run(f"sudo route add -host 169.254.169.254 reject")

    if "tensorflow" in framework and job_type == "inference":
        model_name = "saved_model_half_plus_two"
        model_base_path = test_utils.get_tensorflow_model_base_path(image_uri)
        env_vars_list = test_utils.get_tensorflow_inference_environment_variables(
            model_name, model_base_path)
        env_vars = " ".join([
            f"-e {entry['name']}={entry['value']}" for entry in env_vars_list
        ])
        inference_command = get_tensorflow_inference_command_tf27_above(
            image_uri, model_name)
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri} {inference_command}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True)
        assert output.ok, f"'import {framework_to_import}' fails when credentials are not configured"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id,
                                                        ec2_client=ec2_client)
    assert expected_tag_key not in ec2_instance_tags, (
        f"{expected_tag_key} was applied as an instance tag."
        "EC2 create_tags went through even though it should not have")
Code Example #4
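Verifies that a DLC image carries the expected com.amazonaws.ml.engines.* labels for framework, device, Python version, job type, architecture, and OS, plus contributor and transformers labels where applicable; EC2 images are asserted to carry none of these labels yet.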
def test_dlc_standard_labels(image, region):
    customer_type_label_prefix = (
        "ec2" if test_utils.is_ec2_image(image) else "sagemaker")

    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    framework = framework.replace('_', '-')
    fw_version = fw_version.replace('.', '-')
    device_type = test_utils.get_processor_from_image_uri(image)
    if device_type == "gpu":
        cuda_version = test_utils.get_cuda_version_from_tag(image)
        device_type = f"{device_type}.{cuda_version}"
    python_version = test_utils.get_python_version_from_image_uri(image)
    job_type = test_utils.get_job_type_from_image(image)
    transformers_version = test_utils.get_transformers_version_from_image_uri(
        image).replace('.', '-')
    os_version = test_utils.get_os_version_from_image_uri(image).replace(
        '.', '-')

    # TODO: Add x86 env variable to check explicitly for x86, instead of assuming that everything not graviton is x86
    arch_type = "graviton" if test_utils.is_graviton_architecture() else "x86"

    contributor = test_utils.get_contributor_from_image_uri(image)

    expected_labels = [
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.framework.{framework}.{fw_version}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.device.{device_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.python.{python_version}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.job.{job_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.arch.{arch_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.os.{os_version}",
    ]

    if contributor:
        expected_labels.append(
            f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.contributor.{contributor}"
        )
    if transformers_version:
        expected_labels.append(
            f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.lib.transformers.{transformers_version}"
        )

    actual_labels = test_utils.get_labels_from_ecr_image(image, region)

    missing_labels = []

    for label in expected_labels:
        if label not in actual_labels:
            missing_labels.append(label)

    # TODO: Remove this when ec2 labels are added. For now, ensure they are not added.
    if customer_type_label_prefix == "ec2":
        assert set(missing_labels) == set(expected_labels), \
            f"EC2 labels are not supported yet, and should not be added to containers. " \
            f"{set(expected_labels) - set(missing_labels)} should not be present."
    else:
        assert not missing_labels, \
            f"Labels {missing_labels} are expected in image {image}, but cannot be found. " \
            f"All labels on image: {actual_labels}"
Code Example #5
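Telemetry positive test, the counterpart of Code Example #3: with instance-metadata access intact, running the container is expected to apply the auto-generated telemetry tag to the EC2 instance.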
def _run_tag_success(image_uri, ec2_client, ec2_instance, ec2_connection):
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(
        image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_tag_instance_success-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.delete_tags(Resources=[ec2_instance_id],
                               Tags=[{"Key": expected_tag_key}])

    if framework == "tensorflow" and job_type == "inference":
        env_vars_list = ecs_utils.get_ecs_tensorflow_environment_variables(
            processor, "saved_model_half_plus_two")
        env_vars = " ".join([
            f"-e {entry['name']}={entry['value']}" for entry in env_vars_list
        ])
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True)
        assert output.ok, f"'import {framework_to_import}' failed inside the container"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id,
                                                        ec2_client=ec2_client)
    assert expected_tag_key in ec2_instance_tags, f"{expected_tag_key} was not applied as an instance tag"
Code Example #6
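Training-image counterpart of Code Example #2: confirms torch.ops.torchvision.nms resolves inside a PyTorch training container; PT 1.5.1 GPU images are skipped.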
def test_torchvision_nms_training(pytorch_training):
    """
    Check that the internally built torchvision binary is used to resolve the missing nms issue.
    :param pytorch_training: framework fixture for pytorch training
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    if (Version(framework_version) == Version("1.5.1")
            and get_processor_from_image_uri(pytorch_training) == "gpu"):
        pytest.skip("Skipping this test for PT 1.5.1 GPU Training DLC images")
    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_training)
    start_container(container_name, pytorch_training, ctx)
    run_cmd_on_container(
        container_name,
        ctx,
        f"import torch; import torchvision; print(torch.ops.torchvision.nms)",
        executable="python")
Code Example #7
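PyTorch counterpart of Code Example #1: stages the SageMaker Profiler test bundle plus the CIFAR-10 and MNIST datasets, then runs test_profiler_pytorch.py. Skipped on PT 1.12 and above, where the SM profiler ZCC test is unsupported.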
def test_sm_profiler_pt(pytorch_training):
    processor = get_processor_from_image_uri(pytorch_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) in SpecifierSet(">=1.12"):
        pytest.skip("sm profiler ZCC test is not supported in PT 1.12 and above")

    ctx = Context()

    profiler_tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"), get_container_name("smprof", pytorch_training), "smprofiler_tests"
    )
    ctx.run(f"mkdir -p {profiler_tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    ctx.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {profiler_tests_dir}/{sm_tests_zip}",
        hide=True,
    )

    # PT test setup requirements
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        ctx.run(f"unzip {sm_tests_zip}", hide=True)
        with ctx.prefix("cd sagemaker-tests/tests/scripts/pytorch_scripts"):
            ctx.run("mkdir -p data", hide=True)
            ctx.run(
                "aws s3 cp s3://smdebug-testing/datasets/cifar-10-python.tar.gz data/cifar-10-batches-py.tar.gz",
                hide=True,
            )
            ctx.run("aws s3 cp s3://smdebug-testing/datasets/MNIST_pytorch.tar.gz data/MNIST_pytorch.tar.gz", hide=True)
            with ctx.prefix("cd data"):
                ctx.run("tar -zxf MNIST_pytorch.tar.gz", hide=True)
                ctx.run("tar -zxf cifar-10-batches-py.tar.gz", hide=True)

    run_sm_profiler_tests(pytorch_training, profiler_tests_dir, "test_profiler_pytorch.py", processor)
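Note that invoke's Context.prefix prepends its argument (joined with &&) to every ctx.run call inside the with block, and prefixes nest; the tar commands above therefore execute as cd {profiler_tests_dir} && cd sagemaker-tests/tests/scripts/pytorch_scripts && cd data && tar -zxf ....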
Code Example #8
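Validates the dlc_major_version labelling scheme: for every Dockerfile matching the image's framework, job type, processor, and Python version, the collected major versions must form the contiguous sequence 1..N, with explicit carve-outs where v1.0 (TF 2.3 GPU) or v2.0 (PyTorch 1.6 GPU training) is banned.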
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    :param image: <str> ECR image URI
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0"
    }
    if test_utils.is_tf_version("1", image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf_version("2", image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and (
            reference_fw in references
            and Version(fw_version) < Version(references[reference_fw])):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                # roots yielded by os.walk already live under root_dir
                dockerfile_path = os.path.join(root, filename)
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(
                        ".", "")

            # Raise errors if dlc major version label and python version arg are not found in Dockerfile
            if not dlc_version:
                raise DLCMajorVersionLabelNotFound(
                    f"Cannot find dlc_major_version label in {dockerfile}")
            if not python_version:
                raise DLCPythonVersionNotFound(
                    f"Cannot find PYTHON_VERSION arg in {dockerfile}")
            if python_version == python_major_minor_version:
                versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "tensorflow", "2.3", "gpu", "37", "training"):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Test case explicitly for PyTorch 1.6.0 training gpu, since v2.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "pytorch", "1.6", "gpu", "36", "training"):
        expected_versions = [v + 1 for v in expected_versions]
        expected_versions[0] = 1
        assert 2 not in actual_versions, (
            f"DLC v2.0 is deprecated in PyTorch 1.6.0 gpu containers, but found major version 2 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
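As a quick illustration of the two regexes above (the Dockerfile lines below are hypothetical, used only to demonstrate the matching):

import re

dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")

assert dlc_label_regex.match('LABEL dlc_major_version="2"').group(1) == "2"
assert python_version_regex.match("ARG PYTHON_VERSION=3.7").group(1) == "3.7"
# "3.7" becomes "37", matching the -pyXX digits parsed from the image tag
assert "3.7".replace(".", "") == "37"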
Code Example #9
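Runs a dependency-check scan inside the image on an EC2 instance, uploads the HTML report to S3, and fails if the report contains any CRITICAL or HIGH severity CVE that is not on the allow-list.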
def _run_dependency_check_test(image, ec2_connection):
    # Record any whitelisted medium/low severity CVEs; i.e. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # These vulnerabilities are fixed in the current openssl version (1.1.1g); they are false positives.
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: found in the Apache Velocity package, which is a dependency of the dependency-check tool itself; hence ignored.
        "CVE-2020-13936",
    }

    processor = get_processor_from_image_uri(image)

    # Whitelist CVE #CVE-2021-3711 for DLCs where openssl is installed using apt-get
    framework, _ = get_framework_and_version_from_tag(image)
    short_fw_version = re.search(r"(\d+\.\d+)", image).group(1)

    # Check that these versions have been matched on https://ubuntu.com/security/CVE-2021-3711 before adding
    allow_openssl_cve_fw_versions = {
        "tensorflow": {
            "1.15": ["cpu", "gpu", "neuron"],
            "2.3": ["cpu", "gpu"],
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu", "neuron"],
            "2.6": ["cpu", "gpu"],
            "2.7": ["cpu", "gpu"],
        },
        "mxnet": {
            "1.8": ["neuron"],
            "1.9": ["cpu", "gpu"]
        },
        "pytorch": {
            "1.10": ["cpu"]
        },
        "huggingface_pytorch": {
            "1.8": ["cpu", "gpu"],
            "1.9": ["cpu", "gpu"]
        },
        "huggingface_tensorflow": {
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu"]
        },
        "autogluon": {
            "0.3": ["cpu"]
        },
    }

    if processor in allow_openssl_cve_fw_versions.get(framework, {}).get(
            short_fw_version, []):
        allowed_vulnerabilities.add("CVE-2021-3711")

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection,
                                  image,
                                  test_script,
                                  container_name=container_name,
                                  bin_bash_entrypoint=True)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(
        f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}",
                                     hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for vulnerability severity to provide this useful info in error message.
        for vulnerability in vulnerabilities:
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(
                        total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    severity = (
                        response.json()
                        .get("result", {})
                        .get("CVE_Items", [{}])[0]
                        .get("impact", {})
                        .get("baseMetricV2", {})
                        .get("severity", "UNKNOWN"))
                    vulnerability_severity.setdefault(severity, []).append(vulnerability)
            except ConnectionError:
                LOGGER.exception(
                    f"Failed to load NIST data for CVE {vulnerability}")

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL")
                or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details.")