Example #1
def test_dataclasses_check(image):
    """
    Ensure that the dataclasses pip package is not installed for Python 3.7 and above.
    The Python version retrieved from the ECR image URI is expected in the format `py<major_version><minor_version>`.
    :param image: ECR image URI
    """
    ctx = Context()
    pip_package = "dataclasses"

    container_name = get_container_name("dataclasses-check", image)

    python_version = get_python_version_from_image_uri(image).replace("py", "")
    python_version = int(python_version)

    if python_version >= 37:
        start_container(container_name, image, ctx)
        output = run_cmd_on_container(container_name,
                                      ctx,
                                      f"pip show {pip_package}",
                                      warn=True)

        if output.return_code == 0:
            pytest.fail(
                f"{pip_package} package exists in the DLC image {image} that has py{python_version} version which is greater than py36 version"
            )
        else:
            LOGGER.info(
                f"{pip_package} package does not exists in the DLC image {image}"
            )
    else:
        pytest.skip(
            f"Skipping test for DLC image {image} that has py36 version as {pip_package} is not included in the python framework"
        )
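
Note: get_python_version_from_image_uri above comes from the shared test utilities and is not shown on this page. The sketch below is a hypothetical illustration of the parsing such a helper performs, assuming the `py<major><minor>` component appears verbatim in the image tag.

import re


def _python_tag_from_image_uri(image_uri):
    # Hypothetical sketch, not the DLC repo's implementation: pull the first
    # "py<digits>" component out of a URI such as
    # "<account>.dkr.ecr.us-west-2.amazonaws.com/pytorch-training:1.8.1-cpu-py36".
    match = re.search(r"py\d{2,3}", image_uri)
    if not match:
        raise ValueError(f"No pyXY tag found in image URI {image_uri}")
    return match.group()  # e.g. "py37", which the test above strips to the int 37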
Example #2
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    throughput = 0
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
                throughput = float(
                    re.search(
                        r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                        line).group("throughput"))
                break
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
                if throughput == 0:
                    throughput = float(
                        re.search(
                            r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
        result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(result)
    return result, throughput
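
Note: as a quick sanity check of the CPU regex above, this is what it extracts from a plausible log line. The sample line is an assumption for illustration; the real format is produced by the benchmark training script.

import re

# Illustrative sample only; the actual line is written by the benchmark script.
sample = "Total img/sec on 32 CPU(s): 183.52"
match = re.search(r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", sample)
print(match.group("throughput"))  # -> "183.52"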
def _print_results_of_test(file_path, processor):
    result = ""
    throughput = 0
    if processor == "cpu":
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "Total img/sec on " in line:
                    result = line + "\n"
                    throughput += float(
                        re.search(
                            r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
    elif processor == "gpu":
        """calculate average throughput"""
        result_list, throughput_list = [], []
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "images/sec: " in line:
                    result_list.append(line.strip("\n"))
                    throughput = float(
                        re.search(
                            r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
                    throughput_list.append(throughput)
        result = "\n".join(result_list[-100:]) + "\n"
        if len(throughput_list) == 0:
            raise Exception(
                "Cannot find throughput lines. Looks like SageMaker job was not run successfully. Please check"
            )
        # Take average of last 100 throughput lines
        throughput = sum(throughput_list[-100:]) / len(throughput_list[-100:])
    LOGGER.info(result)
    return result, throughput
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    # This sleep has been inserted because the parametrized training jobs would otherwise be created
    # by SageMaker with the same name, since they start around the same time with the same image URI.
    time.sleep(
        random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[1,2](\.\d+){2}",
                                  tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

        ctx.run(
            f"aws s3 cp {log_file} {os.path.join(target_upload_location, log_file)}"
        )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
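
Note: the `timeout 45m` prefix and the `return_code == 124` check above go together. GNU coreutils `timeout` exits with status 124 when the wrapped command is killed for exceeding the time limit, so a timed-out benchmark is still uploaded as a normal (non-failure) log. A small sketch of that behaviour, assuming invoke's Context as used in these tests:

from invoke import Context

# `timeout 1s sleep 5` is killed after one second; GNU timeout then exits 124.
out = Context().run("timeout 1s sleep 5", warn=True)
print(out.return_code)  # -> 124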
Example #5
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
        result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(result)
    return result
def test_dlc_major_version_label(image, region):
    """
    Test to ensure that all DLC images have the LABEL "dlc_major_version"

    :param image: <str> Image URI
    :param region: <str> region where ECR repository holding the image resides
    :return:
    """
    ecr_client = boto3.client("ecr", region_name=region)

    image_repository, image_tag = get_repository_and_tag_from_image_uri(image)
    # Using "acceptedMediaTypes" on the batch_get_image request allows the returned image information to
    # provide the ECR Image Manifest in the specific format that we need, so that the image LABELS can be found
    # on the manifest. The default format does not return the image LABELs.
    response = ecr_client.batch_get_image(
        repositoryName=image_repository,
        imageIds=[{
            "imageTag": image_tag
        }],
        acceptedMediaTypes=[
            "application/vnd.docker.distribution.manifest.v1+json"
        ],
    )
    if not response.get("images"):
        raise KeyError(
            f"Failed to get images through ecr_client.batch_get_image response for image {image_repository}:{image_tag}"
        )
    elif not response["images"][0].get("imageManifest"):
        raise KeyError(
            f"imageManifest not found in ecr_client.batch_get_image response:\n{response['images']}"
        )

    manifest_str = response["images"][0]["imageManifest"]
    # manifest_str is a json-format string
    manifest = json.loads(manifest_str)
    image_metadata = json.loads(manifest["history"][0]["v1Compatibility"])
    major_version = image_metadata["config"]["Labels"].get(
        "dlc_major_version", None)

    assert major_version, f"{image} has no LABEL named 'dlc_major_version'. Please insert label."

    LOGGER.info(f"{image} has 'dlc_major_version' = {major_version}")
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # Have a default framework to test on
    framework = "pytorch"
    for fw in frameworks:
        if fw in os.getenv("CODEBUILD_INITIATOR", ""):
            framework = fw
            break

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    if not images:
        return
    for image in images.split(" "):
        ctx.run(f"docker pull -q {image}")
        LOGGER.info(f"Canary image {image} is available")
Example #8
def _print_results_of_test(file_path):
    last_n_lines = Context().run(f"tail -500 {file_path}").stdout.split("\n")
    result_dict = dict()
    accuracy = 0
    time_cost = 0
    accuracy_key = "Train-accuracy"
    time_cost_key = "Time cost"
    reversed_log = reversed(last_n_lines)
    for line in reversed_log:
        if all(key in result_dict for key in ("Train-accuracy", "Time cost")):
            break
        if accuracy_key in line:
            if accuracy_key in result_dict:
                continue
            accuracy_str = line.split("=")[1]
            result_dict[accuracy_key] = accuracy_str
            accuracy = float(accuracy_str)
        if time_cost_key in line:
            if time_cost_key in result_dict:
                continue
            time_str = line.split("=")[1]
            result_dict[time_cost_key] = time_str
            time_cost = float(time_str)
    result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(f'Result is {result}')
    LOGGER.info(f'{accuracy_key} is {accuracy}')
    LOGGER.info(f'{time_cost_key} is {time_cost}')
    return result, time_cost, accuracy
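
Note: for reference, these are the kinds of lines the parser above looks for and what `line.split("=")[1]` yields for them. The sample lines are assumptions for illustration; the real format is emitted by the MXNet training script.

# Illustrative samples only; the actual lines are written by the training script.
sample_lines = [
    "INFO:root:Epoch[39] Time cost=123.456",
    "INFO:root:Epoch[39] Train-accuracy=0.912000",
]
for line in sample_lines:
    print(float(line.split("=")[1]))  # -> 123.456, then 0.912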
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Testrunner to execute SM profiler tests from DLC repo
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    try:
        ctx.run(
            "pip install -r "
            "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
            "pip install smdebug && "
            "pip uninstall -y pytest-rerunfailures",
            hide=True,
        )
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    enable_sm_data_parallel_tests = "true"
    if framework == "pytorch" and Version(version) < Version("1.6"):
        enable_sm_data_parallel_tests = "false"
    if framework == "tensorflow" and Version(version) < Version("2.3"):
        enable_sm_data_parallel_tests = "false"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release"
    }

    # Command to set all necessary environment variables
    export_cmd = " && ".join(f"export {key}={val}"
                             for key, val in smprof_configs.items())
    export_cmd = f"{export_cmd} && export ENV_CPU_TRAIN_IMAGE=test && export ENV_GPU_TRAIN_IMAGE=test && " \
                 f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}"

    test_results_outfile = os.path.join(
        os.getcwd(), f"{get_container_name('smprof', image)}.txt")
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        with ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
            try:
                ctx.run(
                    f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                    f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                    hide=True,
                )
                with open(test_results_outfile) as outfile:
                    result_data = json.load(outfile)
                    LOGGER.info(
                        f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}"
                    )
            except Exception as e:
                if os.path.exists(test_results_outfile):
                    with open(test_results_outfile) as outfile:
                        result_data = json.load(outfile)
                    raise SMProfilerRCTestFailure(
                        f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
                    ) from e
                raise
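
Note: to make the env-var plumbing above concrete, this is roughly what the chained export command evaluates to for a toy config. The values below are placeholders, not real test settings.

# Placeholder values for illustration only.
smprof_configs = {"use_current_branch": "false", "framework": "pytorch"}
export_cmd = " && ".join(f"export {key}={val}" for key, val in smprof_configs.items())
print(export_cmd)
# -> export use_current_branch=false && export framework=pytorch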
Example #10
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[1,2](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Example #11
def test_oss_compliance(image):
    """
    Run the OSS compliance check on a container to verify that license attribution files exist,
    and upload the source of third-party packages to an S3 bucket.
    """
    THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses"
    THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code"
    file = "THIRD_PARTY_SOURCE_CODE_URLS"
    container_name = get_container_name("oss_compliance", image)
    context = Context()
    local_repo_path = get_repository_local_path()
    start_container(container_name, image, context)

    # Run the compliance test to make sure license attribution files exist. testOSSCompliance is copied into the image as part of the Dockerfile.
    run_cmd_on_container(container_name, context,
                         "/usr/local/bin/testOSSCompliance /root")

    try:
        context.run(
            f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}"
        )
    finally:
        context.run(f"docker rm -f {container_name}", hide=True)

    s3_resource = boto3.resource("s3")

    with open(os.path.join(local_repo_path, file)) as source_code_file:
        for line in source_code_file:
            name, version, url = line.split(" ")
            file_name = f"{name}_v{version}_source_code"
            s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz"
            local_file_path = os.path.join(local_repo_path, file_name)

            for i in range(3):
                try:
                    if not os.path.isdir(local_file_path):
                        context.run(
                            f"git clone {url.rstrip()} {local_file_path}")
                        context.run(
                            f"tar -czvf {local_file_path}.tar.gz {local_file_path}"
                        )
                except Exception as e:
                    time.sleep(1)
                    if i == 2:
                        LOGGER.error(f"Unable to clone git repo. Error: {e}")
                        raise
                    continue
            try:
                if os.path.exists(f"{local_file_path}.tar.gz"):
                    LOGGER.info(f"Uploading package to s3 bucket: {line}")
                    s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET,
                                       s3_object_path).load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    try:
                        # Use the AWS CLI here; uploading with boto3 would require iterating through each file instead of copying in one command.
                        context.run(
                            f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}"
                        )
                        object = s3_resource.Bucket(
                            THIRD_PARTY_SOURCE_CODE_BUCKET).Object(
                                s3_object_path)
                        object.Acl().put(ACL="public-read")
                    except ClientError as e:
                        LOGGER.error(
                            f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                        )
                        raise
                else:
                    LOGGER.error(
                        f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                    )
                    raise
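
Note: the loop above assumes each line of the THIRD_PARTY_SOURCE_CODE_URLS file copied out of the container is a space-separated `<name> <version> <git-url>` triple. An illustrative line (the values are placeholders) and how it is unpacked:

# Placeholder line for illustration only.
line = "somepackage 1.2.3 https://github.com/example/somepackage.git\n"
name, version, url = line.split(" ")
print(name, version, url.rstrip())  # -> somepackage 1.2.3 https://github.com/example/somepackage.git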
Example #12
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run the ECR Scan Tool on the image being tested, and raise an error if vulnerabilities are found
    1. Start Scan.
    2. For 5 minutes (Run DescribeImages):
       (We run this for 5 minutes because the Scan is expected to complete in about 2 minutes, though no
        analysis has been performed on exactly how long the Scan takes for a DLC image. Therefore we also
        have a 3 minute buffer beyond the expected amount of time taken.)
    3.1. If imageScanStatus == COMPLETE: exit loop
    3.2. If imageScanStatus == IN_PROGRESS or AttributeNotFound(imageScanStatus): continue loop
    3.3. If imageScanStatus == FAILED: raise RuntimeError
    4. If DescribeImages.imageScanStatus != COMPLETE: raise TimeOutError
    5. assert imageScanFindingsSummary.findingSeverityCounts.HIGH/CRITICAL == 0

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    if image_account_id != test_account_id:
        image_repo_uri, image_tag = image.split(":")
        _, image_repo_name = image_repo_uri.split("/")
        target_image_repo_name = f"beta-{image_repo_name}"
        image = ecr_utils.reupload_image_to_test_ecr(image,
                                                     target_image_repo_name,
                                                     region)

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(
        ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(
        image, scan_results)
    ecr_image_vulnerability_list = ScanVulnerabilityList(
        minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(
        scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    # TODO: Once this feature is enabled, remove "if" condition and second assertion statement
    # TODO: Ensure this works on the canary tags before removing feature flag
    if is_image_covered_by_allowlist_feature(image):
        upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
            image, ecr_client, minimum_sev_threshold)
        s3_bucket_name = ECR_SCAN_HELPER_BUCKET

        # If new vulnerabilities are found, conduct the failure routine
        newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
        if newly_found_vulnerabilities:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(
                failure_routine_summary, s3_bucket_name)
        assert not newly_found_vulnerabilities, (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )

        # If there are no new vulnerabilities but the allowlist is outdated, conduct the failure routine
        vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list
        if vulnerabilities_that_can_be_fixed:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(
                failure_routine_summary, s3_bucket_name)
        assert not vulnerabilities_that_can_be_fixed, (
            f"""Allowlist is Outdated!! Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        return

    common_ecr_scan_allowlist = ScanVulnerabilityList(
        minimum_severity=CVESeverity[minimum_sev_threshold])
    common_ecr_scan_allowlist_path = os.path.join(
        os.sep, get_repository_local_path(), "data",
        "common-ecr-scan-allowlist.json")
    if os.path.exists(common_ecr_scan_allowlist_path):
        common_ecr_scan_allowlist.construct_allowlist_from_file(
            common_ecr_scan_allowlist_path)

    remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

    if remaining_vulnerabilities:
        assert not remaining_vulnerabilities.vulnerability_list, (
            f"The following vulnerabilities need to be fixed on {image}:\n"
            f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
        )
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py

    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)

    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"
    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
Example #14
    def test_resnet101_at_fp16(self, instance_type, num_gpus, total_n_gpus,
                               instance_count, distribution_strategy, caching,
                               tensorflow_training, sagemaker_session, capsys,
                               framework_version):
        epochs = int(100 * total_n_gpus)
        batches = np.array([224]) * total_n_gpus
        for batch in np.array(batches, dtype=int):
            train_steps = int(10240 * epochs / batch)
            steps_per_loop = train_steps // 10
            overrides = (
                f"runtime.enable_xla=True,"
                f"runtime.num_gpus={num_gpus},"
                f"runtime.distribution_strategy={distribution_strategy},"
                f"runtime.mixed_precision_dtype=float16,"
                f"task.train_data.global_batch_size={batch},"
                f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
                f"task.train_data.cache={caching},"
                f"trainer.train_steps={train_steps},"
                f"trainer.steps_per_loop={steps_per_loop},"
                f"trainer.summary_interval={steps_per_loop},"
                f"trainer.checkpoint_interval={train_steps},"
                f"task.model.backbone.type=resnet,"
                f"task.model.backbone.resnet.model_id=101"
            )
            estimator = TensorFlow(
                sagemaker_session=sagemaker_session,
                git_config={
                    'repo': 'https://github.com/tensorflow/models.git',
                    'branch': 'v2.9.2',
                },
                source_dir='.',
                entry_point='official/vision/train.py',
                model_dir=False,
                instance_type=instance_type,
                instance_count=instance_count,
                image_uri=tensorflow_training,
                hyperparameters={
                    TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
                    'experiment': 'resnet_imagenet',
                    'config_file':
                    'official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml',
                    'mode': 'train',
                    'model_dir': '/opt/ml/model',
                    'params_override': overrides,
                },
                debugger_hook_config=None,
                disable_profiler=True,
                max_run=60 * 60 * 1,  # Timeout after 1 hour
                base_job_name=
                f"tf{framework_version.replace('.','')}-trcomp-bench-resnet101",
                role="SageMakerRole",
            )
            estimator.fit(
                inputs=
                's3://collection-of-ml-datasets/Imagenet/TFRecords/validation',
                logs=True,
                wait=True)

            captured = capsys.readouterr()
            logs = captured.out + captured.err
            match = re.search('Billable seconds: ([0-9]*)', logs)
            billable = int(match.group(1))

            short_version = '.'.join(framework_version.split('.')[:2])
            threshold = TRCOMP_THRESHOLD['tensorflow'][short_version][
                'resnet101'][instance_type][instance_count][batch]
            result = (
                f"tensorflow-trcomp {framework_version} resnet101 fp16 XLA "
                f"imagenet {instance_type} {instance_count} {batch} Billable: {billable} secs threshold: {threshold} secs "
                f"{estimator.latest_training_job.name}")
            LOGGER.info(result)
            assert billable >= 1000, 'False Positive ' + result
            assert billable <= threshold, result
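
Note: the billable-time parsing above relies on the training-job summary that the SageMaker SDK prints when estimator.fit(..., logs=True, wait=True) completes. An illustrative check of the regex, with the captured text standing in for the real log output:

import re

captured = "Training seconds: 2520\nBillable seconds: 2520\n"  # stand-in log text
match = re.search("Billable seconds: ([0-9]*)", captured)
print(int(match.group(1)))  # -> 2520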
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    framework_version = re.search(r"[1,2](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 > {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Example #16
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes,
                                              region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh.
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET, "mxnet",
                                          framework_version, "sagemaker",
                                          "training", device_cuda_str,
                                          py_version)
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True)

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(
        os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    threshold_table = (
        (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD
         if num_nodes == 1
         else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
        if processor == "cpu"
        else (TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD
              if num_nodes == 1
              else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
Example #18
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run the ECR Scan Tool on the image being tested, and raise an error if vulnerabilities are found
    1. Start Scan.
    2. For 5 minutes (Run DescribeImages):
       (We run this for 5 minutes because the Scan is expected to complete in about 2 minutes, though no
        analysis has been performed on exactly how long the Scan takes for a DLC image. Therefore we also
        have a 3 minute buffer beyond the expected amount of time taken.)
    3.1. If imageScanStatus == COMPLETE: exit loop
    3.2. If imageScanStatus == IN_PROGRESS or AttributeNotFound(imageScanStatus): continue loop
    3.3. If imageScanStatus == FAILED: raise RuntimeError
    4. If DescribeImages.imageScanStatus != COMPLETE: raise TimeOutError
    5. assert imageScanFindingsSummary.findingSeverityCounts.HIGH/CRITICAL == 0

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    image_region = get_region_from_image_uri(image)
    image_repo_name, original_image_tag = get_repository_and_tag_from_image_uri(image)
    additional_image_tags = get_all_the_tags_of_an_image_from_ecr(ecr_client, image)
    if not is_image_available_locally(image):
        LOGGER.info(f"Image {image} not available locally!! Pulling the image...")
        login_to_ecr_registry(Context(), image_account_id, image_region)
        run(f"docker pull {image}")
        if not is_image_available_locally(image):
            raise RuntimeError("Image shown as not available even after pulling")
    for additional_tag in additional_image_tags:
        image_uri_with_new_tag = image.replace(original_image_tag, additional_tag)
        run(f"docker tag {image} {image_uri_with_new_tag}", hide=True)

    if image_account_id != test_account_id:
        original_image = image
        target_image_repo_name = f"beta-{image_repo_name}"
        for additional_tag in additional_image_tags:
            image_uri_with_new_tag = original_image.replace(original_image_tag, additional_tag)
            new_image_uri = ecr_utils.reupload_image_to_test_ecr(image_uri_with_new_tag, target_image_repo_name, region)
            if image_uri_with_new_tag == original_image:
                image = new_image_uri

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, scan_results)
    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    if not is_image_covered_by_allowlist_feature(image):
        if is_canary_context():
            pytest.skip("Skipping the test on the canary.")
        
        common_ecr_scan_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
        common_ecr_scan_allowlist_path = os.path.join(
            os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
        )
        if os.path.exists(common_ecr_scan_allowlist_path):
            common_ecr_scan_allowlist.construct_allowlist_from_file(common_ecr_scan_allowlist_path)

        remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

        if remaining_vulnerabilities:
            assert not remaining_vulnerabilities.vulnerability_list, (
                f"The following vulnerabilities need to be fixed on {image}:\n"
                f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
            )
        return

    upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
        image, ecr_client, minimum_sev_threshold
    )
    s3_bucket_name = ECR_SCAN_HELPER_BUCKET

    # If new vulnerabilities (fixable or non-fixable) are found, conduct the failure routine
    newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
    # In case there is no new vulnerability but the allowlist is outdated
    vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list

    if newly_found_vulnerabilities or vulnerabilities_that_can_be_fixed:
        failure_routine_summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        (
            s3_filename_for_fixable_list,
            s3_filename_for_non_fixable_list,
        ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        prepend_message = "Found new vulnerabilities in image." if newly_found_vulnerabilities else "Allowlist is outdated."
        display_message = prepend_message + " " + (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        if is_canary_context():
            LOGGER.error(display_message)
            pytest.skip("Skipping the test failure on the canary.")
        else:
            raise RuntimeError(display_message)
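
Note: is_image_available_locally is a shared helper that is not shown on this page. The sketch below is a hypothetical illustration of such a check, using the fact that `docker images -q <uri>` prints an image ID only when the image is present locally.

from invoke import run

def _is_image_available_locally(image_uri):
    # Hypothetical sketch, not the repo's implementation.
    result = run(f"docker images -q {image_uri}", hide=True, warn=True)
    return bool(result.stdout.strip())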