def _print_results_of_test(file_path, processor):
    result = ""
    throughput = 0
    if processor == "cpu":
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "Total img/sec on " in line:
                    result = line + "\n"
                    throughput += float(
                        re.search(
                            r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
    elif processor == "gpu":
        # Calculate average throughput
        result_list, throughput_list = [], []
        with open(file_path, 'r') as f:
            lines = f.readlines()
            for line in lines:
                if "images/sec: " in line:
                    result_list.append(line.strip("\n"))
                    throughput = float(
                        re.search(
                            r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
                    throughput_list.append(throughput)
        result = "\n".join(result_list[-100:]) + "\n"
        if len(throughput_list) == 0:
            raise Exception(
                "Cannot find throughput lines. Looks like SageMaker job was not run successfully. Please check"
            )
        # Take average of last 100 throughput lines
        throughput = sum(throughput_list[-100:]) / len(throughput_list[-100:])
    LOGGER.info(result)
    return result, throughput
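
Both branches above pull the numeric throughput out of a matched log line with a named-group regex. A minimal standalone sketch of that extraction, run against a hypothetical log line (the exact benchmark log format is an assumption):

import re

sample_line = "images/sec: 1523.4"  # hypothetical log line containing the expected marker
match = re.search(r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)", sample_line)
if match:
    print(float(match.group("throughput")))  # 1523.4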
Code Example #2
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    logfile="output.log",
):
    framework = get_framework_from_image_uri(image_uri)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)

    ec2_connection.run(
        f"{docker_executable} run --name {container_name} -v "
        f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} -itd {image_uri}",
        hide=True,
    )

    try:
        test_output = ec2_connection.run(
            f"{docker_executable} exec --user root {container_name} "
            f"/bin/bash -c '{test_script} {framework}' | tee {logfile}",
            hide=True,
            warn=True,
            timeout=3000,
        )
    except Exception:
        debug_output = ec2_connection.run(f"cat {logfile}")
        LOGGER.error(f"Caught exception while trying to run test via fabric. Output: {debug_output.stdout}")
        raise

    # LOGGER.info(test_output.stdout)  # Uncomment this line for a complete log dump

    assert test_output.ok, f"SMDebug tests failed. Output:\n{test_output.stdout}"
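
A hypothetical invocation of run_smdebug_test through a fabric Connection; the host, key path, and image URI below are placeholders, and SMDEBUG_SCRIPT is assumed to be defined at module level as in the snippet above:

from fabric import Connection

conn = Connection(
    host="ec2-0-0-0-0.compute-1.amazonaws.com",  # placeholder EC2 host
    user="ubuntu",
    connect_kwargs={"key_filename": "/path/to/key.pem"},  # placeholder key file
)
run_smdebug_test("123456789012.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:example-tag", conn, "us-east-1")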
Code Example #3
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    throughput = 0
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
                throughput = float(
                    re.search(
                        r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                        line).group("throughput"))
                break
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
                if throughput == 0:
                    throughput = float(
                        re.search(
                            r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)",
                            line).group("throughput"))
        result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(result)
    return result, throughput
Code Example #4
def test_dataclasses_check(image):
    """
    Ensure the dataclasses pip package is not installed for Python 3.7 and above.
    The Python version retrieved from the ECR image URI is expected in the format `py<major_version><minor_version>`
    :param image: ECR image URI
    """
    ctx = Context()
    pip_package = "dataclasses"

    container_name = get_container_name("dataclasses-check", image)

    python_version = get_python_version_from_image_uri(image).replace("py", "")
    python_version = int(python_version)

    if python_version >= 37:
        start_container(container_name, image, ctx)
        output = run_cmd_on_container(container_name,
                                      ctx,
                                      f"pip show {pip_package}",
                                      warn=True)

        if output.return_code == 0:
            pytest.fail(
                f"{pip_package} package exists in the DLC image {image}, which has py{python_version} (greater than py36)"
            )
        else:
            LOGGER.info(
                f"{pip_package} package does not exist in the DLC image {image}"
            )
    else:
        pytest.skip(
            f"Skipping test for DLC image {image} with py{python_version}, as {pip_package} is not part of the standard library for this Python version"
        )
Code Example #5
def test_generate_coverage_doc():
    """
    Test generating the test coverage doc
    """
    test_coverage_file = get_test_coverage_file_path()
    ctx = Context()
    # Clear DLC_IMAGES to avoid image names affecting function metadata (due to parametrization)
    # Set CODEBUILD_RESOLVED_SOURCE_VERSION to 'test' for ease of running this test locally
    ctx.run(
        "export DLC_IMAGES='' && export CODEBUILD_RESOLVED_SOURCE_VERSION='test' && export BUILD_CONTEXT='' "
        "&& pytest -s --collect-only --generate-coverage-doc --ignore=container_tests/",
        hide=True,
    )

    # Ensure that the coverage report is created
    assert os.path.exists(test_coverage_file), f"Cannot find test coverage report file {test_coverage_file}"

    # Write test coverage file to S3
    if is_mainline_context():
        client = boto3.client("s3")
        with open(test_coverage_file, "rb") as test_file:
            try:
                client.put_object(Bucket=TEST_COVERAGE_REPORT_BUCKET, Key=os.path.basename(test_coverage_file),
                                  Body=test_file)
            except ClientError as e:
                LOGGER.error(f"Unable to upload report to bucket {TEST_COVERAGE_REPORT_BUCKET}. Error: {e}")
                raise
Code Example #6
def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    account_id = get_account_id_from_image_uri(image_uri)
    region = get_region_from_image_uri(image_uri)

    login_to_ecr_registry(ec2_connection, account_id, region)
    ec2_connection.run(f"docker pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
            f"python {test_script}",
            hide=True,
            timeout=300,
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"docker logs {container_name}")
        debug_stdout = debug_output.stdout
        if "Test SMClarify Bias Metrics succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMClarify test succeeded, but there is an issue with fabric. "
                f"Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMClarifyTestFailure(
            f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
Code Example #7
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    # This sleep has been inserted because all the parametrized training jobs are automatically created
    # by SageMaker with the same name, due to being started around the same time, and with the same image uri.
    time.sleep(
        random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

        ctx.run(
            f"aws s3 cp {log_file} {os.path.join(target_upload_location, log_file)}"
        )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Code Example #8
def start_ecr_image_scan(ecr_client, image_uri):
    """
    Start ECR Scan for an image, and Warn if scan cannot be started
    :param ecr_client: boto3 client for ECR
    :param image_uri: image URI for image to be checked
    """
    repository, tag = get_repository_and_tag_from_image_uri(image_uri)
    try:
        scan_info = ecr_client.start_image_scan(repositoryName=repository, imageId={"imageTag": tag})
    except ecr_client.exceptions.LimitExceededException:
        LOGGER.warning("Scan has already been run on this image in the last 24 hours.")
        return
    if scan_info["imageScanStatus"]["status"] == "FAILED":
        raise ECRScanFailedError(f"ECR Scan failed and returned:\n{json.dumps(scan_info, indent=4)}")
    return
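
A hypothetical call to start_ecr_image_scan; the region, account id, repository, and tag are placeholders, and get_repository_and_tag_from_image_uri is assumed to come from the same test utilities module:

import boto3

ecr_client = boto3.client("ecr", region_name="us-west-2")
start_ecr_image_scan(
    ecr_client,
    "123456789012.dkr.ecr.us-west-2.amazonaws.com/example-repo:example-tag",  # placeholder image URI
)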
Code Example #9
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    container_name = "tf_hvd_cpu_test"
    test_script = TF1_HVD_CMD if is_tf_version("1", tensorflow_training) else TF2_HVD_CMD
    try:
        execute_ec2_training_test(
            ec2_connection, tensorflow_training, test_script, container_name=container_name, timeout=1800
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"docker logs {container_name}")
        debug_stdout = debug_output.stdout
        if "TF HVD tests passed!" in debug_stdout:
            LOGGER.warning(
                f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{debug_stdout}") from e
Code Example #10
def _print_results_of_test(file_path, processor):
    last_100_lines = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""
    if processor == "cpu":
        for line in last_100_lines:
            if "Total img/sec on " in line:
                result = line + "\n"
    elif processor == "gpu":
        result_dict = dict()
        for line in last_100_lines:
            if "images/sec: " in line:
                key = line.split("<stdout>")[0]
                result_dict[key] = line.strip("\n")
        result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(result)
    return result
Code Example #11
def _run_cmd_on_container(container_name, context, cmd, executable="bash"):
    """
    Helper function to run commands on a locally running container

    :param container_name: Name of the docker container
    :param context: Invoke Context object used to run the docker command locally
    :param cmd: Command to run on the container
    :param executable: Executable to run on the container (bash or python)
    :return: invoke output, can be used to parse stdout, etc
    """
    if executable not in ("bash", "python"):
        LOGGER.warning(
            f"Unrecognized executable {executable}. It will be run as {executable} -c '{cmd}'"
        )
    return context.run(
        f"docker exec --user root {container_name} {executable} -c '{cmd}'",
        hide=True,
        timeout=30)
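
A hypothetical usage of the helper above with invoke's Context; the container name and command are placeholders, and the container is assumed to already be running:

from invoke import Context

ctx = Context()
output = _run_cmd_on_container("example-container", ctx, "cat /etc/os-release")
print(output.stdout)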
Code Example #12
def test_dlc_major_version_label(image, region):
    """
    Test to ensure that all DLC images have the LABEL "dlc_major_version"

    :param image: <str> Image URI
    :param region: <str> region where ECR repository holding the image resides
    :return:
    """
    ecr_client = boto3.client("ecr", region_name=region)

    image_repository, image_tag = get_repository_and_tag_from_image_uri(image)
    # Using "acceptedMediaTypes" on the batch_get_image request allows the returned image information to
    # provide the ECR Image Manifest in the specific format that we need, so that the image LABELS can be found
    # on the manifest. The default format does not return the image LABELs.
    response = ecr_client.batch_get_image(
        repositoryName=image_repository,
        imageIds=[{
            "imageTag": image_tag
        }],
        acceptedMediaTypes=[
            "application/vnd.docker.distribution.manifest.v1+json"
        ],
    )
    if not response.get("images"):
        raise KeyError(
            f"Failed to get images through ecr_client.batch_get_image response for image {image_repository}:{image_tag}"
        )
    elif not response["images"][0].get("imageManifest"):
        raise KeyError(
            f"imageManifest not found in ecr_client.batch_get_image response:\n{response['images']}"
        )

    manifest_str = response["images"][0]["imageManifest"]
    # manifest_str is a json-format string
    manifest = json.loads(manifest_str)
    image_metadata = json.loads(manifest["history"][0]["v1Compatibility"])
    major_version = image_metadata["config"]["Labels"].get(
        "dlc_major_version", None)

    assert major_version, f"{image} has no LABEL named 'dlc_major_version'. Please insert label."

    LOGGER.info(f"{image} has 'dlc_major_version' = {major_version}")
Code Example #13
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # Have a default framework to test on
    framework = "pytorch"
    for fw in frameworks:
        if fw in os.getenv("CODEBUILD_INITIATOR", ""):
            framework = fw
            break

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    if not images:
        return
    for image in images.split(" "):
        ctx.run(f"docker pull -q {image}")
        LOGGER.info(f"Canary image {image} is available")
Code Example #14
def _print_results_of_test(file_path):
    last_n_lines = Context().run(f"tail -500 {file_path}").stdout.split("\n")
    result_dict = dict()
    accuracy = 0
    time_cost = 0
    accuracy_key = "Train-accuracy"
    time_cost_key = "Time cost"
    reversed_log = reversed(last_n_lines)
    for line in reversed_log:
        if all(key in result_dict for key in ("Train-accuracy", "Time cost")):
            break
        if accuracy_key in line:
            if accuracy_key in result_dict:
                continue
            accuracy_str = line.split("=")[1]
            result_dict[accuracy_key] = accuracy_str
            accuracy = float(accuracy_str)
        if time_cost_key in line:
            if time_cost_key in result_dict:
                continue
            time_str = line.split("=")[1]
            result_dict[time_cost_key] = time_str
            time_cost = float(time_str)
    result = "\n".join(result_dict.values()) + "\n"
    LOGGER.info(f'Result is {result}')
    LOGGER.info(f'{accuracy_key} is {accuracy}')
    LOGGER.info(f'{time_cost_key} is {time_cost}')
    return result, time_cost, accuracy
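
The parsing above relies on key=value formatted log lines. A small sketch of the same split-on-"=" extraction against hypothetical MXNet training log lines (the real log format may differ):

accuracy_line = "INFO:root:Epoch[39] Train-accuracy=0.9876"  # hypothetical log line
time_line = "INFO:root:Epoch[39] Time cost=123.45"           # hypothetical log line
print(float(accuracy_line.split("=")[1]))  # 0.9876
print(float(time_line.split("=")[1]))      # 123.45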
Code Example #15
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    timeout=2400,
):
    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
    shm_setting = " --shm-size=1g " if ec2_instance_type in large_shm_instance_types else " "
    framework = get_framework_from_image_uri(image_uri)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    ec2_connection.run(
        f"$(aws ecr get-login --no-include-email --region {region})",
        hide=True)
    ec2_connection.run(f"docker pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} "
            f"./{test_script} {framework}",
            hide=True,
            timeout=timeout,
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"docker logs {container_name}")
        debug_stdout = debug_output.stdout
        if "All SMDebug tests succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMDebug tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMDebugTestFailure(
            f"SMDebug test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
Code Example #16
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):

    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 > {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Code Example #17
def test_tensorflow_sagemaker_training_performance(tensorflow_training,
                                                   num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[12](\.\d+){2}",
                                  tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training", processor,
                                          py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}")

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    _print_results_of_test(os.path.join(test_dir, log_file), processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )
Code Example #18
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = ""
    for fw in frameworks:
        if fw in image:
            framework = fw
            break
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}-(cpu|gpu|neuron)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)(-example)?)",
        image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type,
                                              "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec = False
    dockerfile_spec_abs_path = None
    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    for name, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec[
                "tag"] == image_tag:
            cuda_in_buildspec = True
            dockerfile_spec_abs_path = os.path.join(
                os.path.dirname(framework_version_path),
                image_spec["docker_file"].replace("docker/", "", 1))
            break

    try:
        assert cuda_in_buildspec, f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(
        prop in dockerfile_spec_abs_path
        for prop in image_properties_expected_in_dockerfile_path
    ), (f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}")

    assert os.path.exists(
        dockerfile_spec_abs_path
    ), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
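
A quick illustration of the cuda, framework version, and python version regexes used above, applied to a hypothetical image URI (the account id and repository name are placeholders):

import re

image = "123456789012.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:2.7.0-gpu-py38-cu112-ubuntu20.04"
print(re.search(r"-(cu\d+)-", image).group(1))         # cu112
print(re.search(r":(\d+(\.\d+){2})", image).group(1))  # 2.7.0
print(re.search(r"(py\d+)", image).group(1))           # py38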
Code Example #19
    def test_resnet101_at_fp16(self, instance_type, num_gpus, total_n_gpus,
                               instance_count, distribution_strategy, caching,
                               tensorflow_training, sagemaker_session, capsys,
                               framework_version):
        epochs = int(100 * total_n_gpus)
        batches = np.array([224]) * total_n_gpus
        for batch in np.array(batches, dtype=int):
            train_steps = int(10240 * epochs / batch)
            steps_per_loop = train_steps // 10
            overrides = (
                f"runtime.enable_xla=True,"
                f"runtime.num_gpus={num_gpus},"
                f"runtime.distribution_strategy={distribution_strategy},"
                f"runtime.mixed_precision_dtype=float16,"
                f"task.train_data.global_batch_size={batch},"
                f"task.train_data.input_path=/opt/ml/input/data/training/validation*,"
                f"task.train_data.cache={caching},"
                f"trainer.train_steps={train_steps},"
                f"trainer.steps_per_loop={steps_per_loop},"
                f"trainer.summary_interval={steps_per_loop},"
                f"trainer.checkpoint_interval={train_steps},"
                f"task.model.backbone.type=resnet,"
                f"task.model.backbone.resnet.model_id=101")
            estimator = TensorFlow(
                sagemaker_session=sagemaker_session,
                git_config={
                    'repo': 'https://github.com/tensorflow/models.git',
                    'branch': 'v2.9.2',
                },
                source_dir='.',
                entry_point='official/vision/train.py',
                model_dir=False,
                instance_type=instance_type,
                instance_count=instance_count,
                image_uri=tensorflow_training,
                hyperparameters={
                    TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
                    'experiment': 'resnet_imagenet',
                    'config_file':
                    'official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml',
                    'mode': 'train',
                    'model_dir': '/opt/ml/model',
                    'params_override': overrides,
                },
                debugger_hook_config=None,
                disable_profiler=True,
                max_run=60 * 60 * 1,  # Timeout after 1 hour
                base_job_name=
                f"tf{framework_version.replace('.','')}-trcomp-bench-resnet101",
                role="SageMakerRole",
            )
            estimator.fit(
                inputs=
                's3://collection-of-ml-datasets/Imagenet/TFRecords/validation',
                logs=True,
                wait=True)

            captured = capsys.readouterr()
            logs = captured.out + captured.err
            match = re.search('Billable seconds: ([0-9]*)', logs)
            billable = int(match.group(1))

            short_version = '.'.join(framework_version.split('.')[:2])
            threshold = TRCOMP_THRESHOLD['tensorflow'][short_version][
                'resnet101'][instance_type][instance_count][batch]
            result = (
                f"tensorflow-trcomp {framework_version} resnet101 fp16 XLA "
                f"imagenet {instance_type} {instance_count} {batch} Billable: {billable} secs threshold: {threshold} secs "
                f"{estimator.latest_training_job.name}")
            LOGGER.info(result)
            assert billable >= 1000, 'False Positive ' + result
            assert billable <= threshold, result
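
A worked example of the step-count arithmetic at the top of the loop above, assuming a hypothetical single-node run with 8 GPUs:

total_n_gpus = 8                           # assumption: one node with 8 GPUs
epochs = int(100 * total_n_gpus)           # 800
batch = 224 * total_n_gpus                 # 1792
train_steps = int(10240 * epochs / batch)  # 4571
steps_per_loop = train_steps // 10         # 457
print(train_steps, steps_per_loop)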
Code Example #20
def test_oss_compliance(image):
    """
    Run oss compliance check on a container to check if license attribution files exist.
    And upload source of third party packages to S3 bucket.
    """
    THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses"
    THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code"
    file = "THIRD_PARTY_SOURCE_CODE_URLS"
    container_name = get_container_name("oss_compliance", image)
    context = Context()
    local_repo_path = get_repository_local_path()
    start_container(container_name, image, context)

    # Run the compliance test to make sure license attribution files exist. testOSSCompliance is copied in as part of the Dockerfile.
    run_cmd_on_container(container_name, context,
                         "/usr/local/bin/testOSSCompliance /root")

    try:
        context.run(
            f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}"
        )
    finally:
        context.run(f"docker rm -f {container_name}", hide=True)

    s3_resource = boto3.resource("s3")

    with open(os.path.join(local_repo_path, file)) as source_code_file:
        for line in source_code_file:
            name, version, url = line.split(" ")
            file_name = f"{name}_v{version}_source_code"
            s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz"
            local_file_path = os.path.join(local_repo_path, file_name)

            for i in range(3):
                try:
                    if not os.path.isdir(local_file_path):
                        context.run(
                            f"git clone {url.rstrip()} {local_file_path}")
                        context.run(
                            f"tar -czvf {local_file_path}.tar.gz {local_file_path}"
                        )
                except Exception as e:
                    time.sleep(1)
                    if i == 2:
                        LOGGER.error(f"Unable to clone git repo. Error: {e}")
                        raise
                    continue
            try:
                if os.path.exists(f"{local_file_path}.tar.gz"):
                    LOGGER.info(f"Checking whether the source package is already present in the s3 bucket: {line}")
                    s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET,
                                       s3_object_path).load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    try:
                        # Use the aws cli here, since boto3 would require uploading the folder by iterating through each file instead of copying the entire archive in one call.
                        context.run(
                            f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}"
                        )
                        object = s3_resource.Bucket(
                            THIRD_PARTY_SOURCE_CODE_BUCKET).Object(
                                s3_object_path)
                        object.Acl().put(ACL="public-read")
                    except ClientError as e:
                        LOGGER.error(
                            f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                        )
                        raise
                else:
                    LOGGER.error(
                        f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                    )
                    raise
Code Example #21
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has cuda version in it

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = ""
    for fw in frameworks:
        if fw in image:
            framework = fw
            break
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    python_version = re.search(r"(py\d+)", image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type,
                                              "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec = False
    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        for line in bf:
            if cuda_in_buildspec_ref in line:
                cuda_in_buildspec = True
                break

    try:
        assert cuda_in_buildspec, f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version,
                                   cuda_version, "Dockerfile.gpu")

    assert os.path.exists(
        dockerfile_path
    ), f"Cannot find dockerfile for image {image} in {dockerfile_path}"
Code Example #22
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py

    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)

    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"
    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "xla", "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}")

    LOGGER.info(f"Test results can be found at {os.path.join(target_upload_location, log_file)}")

    result_statement, throughput = _print_results_of_test(os.path.join(test_dir, log_file))
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
Code Example #23
def _run_dependency_check_test(image, ec2_connection):
    # Record any whitelisted medium/low severity CVEs, e.g. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }

    processor = get_processor_from_image_uri(image)

    # Whitelist CVE #CVE-2021-3711 for DLCs where openssl is installed using apt-get
    framework, _ = get_framework_and_version_from_tag(image)
    short_fw_version = re.search(r"(\d+\.\d+)", image).group(1)

    # Check that these versions have been matched on https://ubuntu.com/security/CVE-2021-3711 before adding
    allow_openssl_cve_fw_versions = {
        "tensorflow": {
            "1.15": ["cpu", "gpu", "neuron"],
            "2.3": ["cpu", "gpu"],
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu", "neuron"],
            "2.6": ["cpu", "gpu"],
            "2.7": ["cpu", "gpu"],
        },
        "mxnet": {
            "1.8": ["neuron"],
            "1.9": ["cpu", "gpu"]
        },
        "pytorch": {
            "1.10": ["cpu"]
        },
        "huggingface_pytorch": {
            "1.8": ["cpu", "gpu"],
            "1.9": ["cpu", "gpu"]
        },
        "huggingface_tensorflow": {
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu"]
        },
        "autogluon": {
            "0.3": ["cpu"]
        },
    }

    if processor in allow_openssl_cve_fw_versions.get(framework, {}).get(
            short_fw_version, []):
        allowed_vulnerabilities.add("CVE-2021-3711")

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection,
                                  image,
                                  test_script,
                                  container_name=container_name,
                                  bin_bash_entrypoint=True)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(
        f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}",
                                     hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for vulnerability severity to provide this useful info in error message.
        for vulnerability in vulnerabilities:
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(
                        total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    severity = (response.json().get("result", {}).get(
                        "CVE_Items",
                        [{}])[0].get("impact",
                                     {}).get("baseMetricV2",
                                             {}).get("severity", "UNKNOWN"))
                    if vulnerability_severity.get(severity):
                        vulnerability_severity[severity].append(vulnerability)
                    else:
                        vulnerability_severity[severity] = [vulnerability]
            except ConnectionError:
                LOGGER.exception(
                    f"Failed to load NIST data for CVE {vulnerability}")

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL")
                or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported: {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details.")
Code Example #24
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes,
                                              region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
    py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET, "mxnet",
                                          framework_version, "sagemaker",
                                          "training", device_cuda_str,
                                          py_version)
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 90m python mx_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {mxnet_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True)

        if not run_out.ok:
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}",
        warn=True,
        echo=True)

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, time_val, accuracy = _print_results_of_test(
        os.path.join(test_dir, log_file))

    accuracy_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(
        framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
Code Example #25
    def generate_coverage_doc(self, framework=None, job_type=None):
        """
        Generate the test coverage docs based on pytest item objects

        :param framework: str, ML framework
        :param job_type: str, training or inference
        """
        test_cov = {}
        for item in self.items:
            # Define additional csv options
            function_name = item.name.split("[")[0]
            function_key = f"{item.fspath}::{function_name}"
            str_fspath = str(item.fspath)
            str_keywords = str(item.keywords)

            # Construct Category and Github_Link fields based on the filepath
            category = str_fspath.split("/dlc_tests/")[-1].split("/")[0]
            if self.is_sagemaker:
                category = "sagemaker_local" if "local" in str_fspath else "sagemaker"
            repo_url = os.getenv(
                "CODEBUILD_SOURCE_REPO_URL",
                "https://github.com/aws/deep-learning-containers.git").rstrip(
                    ".git")
            github_link = f"{repo_url}/blob/master/test/{str_fspath.split('/test/')[-1]}"

            # Only create a new test coverage item if we have not seen the function before. This is a necessary step,
            # as parametrization can make it appear as if the same test function is a unique test function
            if test_cov.get(function_key):
                continue

            # Based on keywords and filepaths, assign values
            framework_scope = (framework if framework else _infer_field_value(
                "all", ("mxnet", "tensorflow", "pytorch"), str_fspath))
            job_type_scope = (job_type if job_type else _infer_field_value(
                "both", ("training", "inference"), str_fspath, str_keywords))
            integration_scope = _infer_field_value(
                "general integration",
                ("_dgl_", "smdebug", "gluonnlp", "smexperiments", "_mme_",
                 "pipemode", "tensorboard", "_s3_", "nccl"),
                str_keywords,
            )
            processor_scope = _infer_field_value("all", ("cpu", "gpu", "eia"),
                                                 str_keywords)
            if processor_scope == "gpu":
                processor_scope = self.handle_single_gpu_instances_test_report(
                    function_key, str_keywords)

            # Create a new test coverage item if we have not seen the function before. This is a necessary step,
            # as parametrization can make it appear as if the same test function is a unique test function
            test_cov[function_key] = {
                "Category": category,
                "Name": function_name,
                "Scope": framework_scope,
                "Job_Type": job_type_scope,
                "Num_Instances": self.get_marker_arg_value(item, function_key, "multinode", 1),
                "Processor": self.get_marker_arg_value(item, function_key, "processor", processor_scope),
                "Integration": self.get_marker_arg_value(item, function_key, "integration", integration_scope),
                "Model": self.get_marker_arg_value(item, function_key, "model"),
                "GitHub_Link": github_link,
            }
        self.write_test_coverage_file(test_cov)

        if self.failure_conditions:
            message, total_issues, error_file = self.assemble_report_failure_message()
            if total_issues == 0:
                LOGGER.warning(
                    f"Found failure message, but no issues. Message:\n{message}"
                )
            else:
                raise TestReportGenerationFailure(
                    f"{message}\nFollow {error_file} if message is truncated")
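
The snippet above leans on a helper named _infer_field_value that is not shown here. A plausible sketch of such a helper, inferred from how it is called (this is an assumption; the repository's actual implementation may differ): return the first option found in any of the supplied strings, otherwise the default.

def _infer_field_value(default, options, *strings_to_search):
    # Return the first option that appears in any of the given strings, else the default
    for option in options:
        if any(option in text for text in strings_to_search):
            return option
    return default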
Code Example #26
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Testrunner to execute SM profiler tests from DLC repo
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    try:
        ctx.run(
            "pip install -r "
            "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
            "pip install smdebug && "
            "pip uninstall -y pytest-rerunfailures",
            hide=True,
        )
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    enable_sm_data_parallel_tests = "true"
    if framework == "pytorch" and Version(version) < Version("1.6"):
        enable_sm_data_parallel_tests = "false"
    if framework == "tensorflow" and Version(version) < Version("2.3"):
        enable_sm_data_parallel_tests = "false"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release"
    }

    # Command to set all necessary environment variables
    export_cmd = " && ".join(f"export {key}={val}"
                             for key, val in smprof_configs.items())
    export_cmd = f"{export_cmd} && export ENV_CPU_TRAIN_IMAGE=test && export ENV_GPU_TRAIN_IMAGE=test && " \
                 f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}"

    test_results_outfile = os.path.join(
        os.getcwd(), f"{get_container_name('smprof', image)}.txt")
    with ctx.prefix(f"cd {profiler_tests_dir}"):
        with ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
            try:
                ctx.run(
                    f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                    f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                    hide=True,
                )
                with open(test_results_outfile) as outfile:
                    result_data = json.load(outfile)
                    LOGGER.info(
                        f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}"
                    )
            except Exception as e:
                if os.path.exists(test_results_outfile):
                    with open(test_results_outfile) as outfile:
                        result_data = json.load(outfile)
                    raise SMProfilerRCTestFailure(
                        f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
                    ) from e
                raise
Code example #27
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. the buildspec contains an entry that builds the same image as the image URI under test
    b. the directory path of the GPU Dockerfile contains the framework version, Python version, and CUDA version

    :param gpu: GPU image URI
    """
    image = gpu
    if "example" in image:
        pytest.skip(
            "Skipping Example Dockerfiles which are not explicitly tied to a cuda version"
        )

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    framework, framework_version = get_framework_and_version_from_tag(image)

    # Get the CUDA version, framework version, and Python version via regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}(-transformers\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-e3)?-example|-e3|-sagemaker)?)",
        image,
    ).group(1)
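    # For illustration (hypothetical tag): an image ending in
    # ":2.4.1-gpu-py37-cu110-ubuntu18.04-sagemaker" would set image_tag to
    # "2.4.1-gpu-py37-cu110-ubuntu18.04-sagemaker"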

    # Replace '_' with '/' to handle the huggingface_<framework> case (e.g. huggingface_pytorch -> huggingface/pytorch)
    framework_path = framework.replace("_", "/")
    framework_version_path = os.path.join(dlc_path, framework_path, job_type,
                                          "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the major.minor directory, e.g. "2.4.1" -> "2.4"
        framework_short_version = re.match(r"(\d+\.\d+)",
                                           framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework_path,
                                              job_type, "docker",
                                              framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path,
                                       python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]
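        # e.g. "py37"[:3] == "py3"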

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    image_tag_in_buildspec = False
    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework_path, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    for name, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            image_tag_in_buildspec = True
            # Strip the "docker/" prefix with re.sub rather than str.lstrip: lstrip("docker/")
            # strips a character set, not a prefix, and can eat leading characters of the next path segment.
            dockerfile_spec_abs_path = os.path.join(
                os.path.dirname(framework_version_path),
                re.sub(r"^docker/", "", image_spec["docker_file"]))
            break
    try:
        assert image_tag_in_buildspec, f"Image tag {image_tag} not found in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(
        prop in dockerfile_spec_abs_path
        for prop in image_properties_expected_in_dockerfile_path
    ), (f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}")

    assert os.path.exists(
        dockerfile_spec_abs_path
    ), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
Code example #28
def _run_dependency_check_test(image, ec2_connection, processor):
    # Record any whitelisted medium/low severity CVEs, e.g. allowed_vulnerabilities = {"CVE-1000-5555", "CVE-9999-9999"}
    allowed_vulnerabilities = {
        # These vulnerabilities are fixed in the current OpenSSL version (1.1.1g); they are false positives.
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: found in the Apache Velocity package, which is a dependency of the dependency-check tool itself. Hence, ignoring.
        "CVE-2020-13936",
    }

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection, image, test_script, container_name=container_name)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}", hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities
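    # For illustration: an anchor such as '<a href="...">CVE-2021-12345</a>' in the HTML report
    # is captured as "CVE-2021-12345" by the regex above (hypothetical CVE ID).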

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for each vulnerability's severity so it can be included in the error message.
        for vulnerability in vulnerabilities:
            severity = "UNKNOWN"  # default if the NVD lookup fails or returns a non-200 response
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    severity = (
                        response.json()
                        .get("result", {})
                        .get("CVE_Items", [{}])[0]
                        .get("impact", {})
                        .get("baseMetricV2", {})
                        .get("severity", "UNKNOWN")
                    )
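                    # Abbreviated NVD v1.0 response shape navigated above (assumed for illustration):
                    # {"result": {"CVE_Items": [{"impact": {"baseMetricV2": {"severity": "HIGH"}}}]}}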
            except ConnectionError:
                LOGGER.exception(f"Failed to load NIST data for CVE {vulnerability}")

            vulnerability_severity.setdefault(severity, []).append(vulnerability)

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL") or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details."
        )
Code example #29
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(BENCHMARK_RESULTS_S3_BUCKET,
                                          "tensorflow", framework_version,
                                          "sagemaker", "training",
                                          device_cuda_str, py_version)
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )
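    # e.g. (illustrative values) "tf2-tr-bench-gpu-cu110-4-node-py37-abc1234-2021-01-01-00-00-00"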

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)
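    # Seeding Random with the job name makes each job's sleep offset deterministic while still
    # spreading start times across the 0-60 second window.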

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name}"
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location,
                                                  "failure_log")

    ctx.run(
        f"aws s3 cp {os.path.join(test_dir, log_file)} {os.path.join(target_upload_location, log_file)}"
    )

    LOGGER.info(
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    result_statement, throughput = _print_results_of_test(
        os.path.join(test_dir, log_file), processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {os.path.join(target_upload_location, log_file)}"
    )

    if processor == "cpu":
        threshold_table = (TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1
                           else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD)
    else:
        threshold_table = (TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes == 1
                           else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD)
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
Code example #30
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run the ECR Scan Tool on the image under test, and raise an error if vulnerabilities are found.
    1. Start the scan.
    2. Poll DescribeImages for up to 5 minutes. (The scan is expected to complete in about 2 minutes,
       though no analysis has been done on exactly how long a scan takes for a DLC image, so a
       3-minute buffer is added on top of the expected time.)
       2.1. If imageScanStatus == COMPLETE: exit the loop
       2.2. If imageScanStatus == IN_PROGRESS, or imageScanStatus is not yet present: keep polling
       2.3. If imageScanStatus == FAILED: raise RuntimeError
    3. If imageScanStatus != COMPLETE after 5 minutes: raise TimeoutError
    4. Assert that imageScanFindingsSummary.findingSeverityCounts reports no HIGH/CRITICAL findings

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    if image_account_id != test_account_id:
        image_repo_uri, image_tag = image.split(":")
        _, image_repo_name = image_repo_uri.split("/")
        target_image_repo_name = f"beta-{image_repo_name}"
        image = ecr_utils.reupload_image_to_test_ecr(image,
                                                     target_image_repo_name,
                                                     region)

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(
        ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(
        image, scan_results)
    ecr_image_vulnerability_list = ScanVulnerabilityList(
        minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(
        scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    # TODO: Once this feature is enabled, remove "if" condition and second assertion statement
    # TODO: Ensure this works on the canary tags before removing feature flag
    if is_image_covered_by_allowlist_feature(image):
        upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
            image, ecr_client, minimum_sev_threshold)
        s3_bucket_name = ECR_SCAN_HELPER_BUCKET

        # If new vulnerabilities are found, run the failure routine
        newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
        if newly_found_vulnerabilities:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(
                failure_routine_summary, s3_bucket_name)
        assert not newly_found_vulnerabilities, (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilities """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non-fixable vulnerabilities. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )

        # If no new vulnerabilities were found but the allowlist is outdated, run the failure routine
        vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list
        if vulnerabilities_that_can_be_fixed:
            failure_routine_summary = conduct_failure_routine(
                image,
                image_scan_allowlist,
                ecr_image_vulnerability_list,
                upgraded_image_vulnerability_list,
                s3_bucket_name,
            )
            (
                s3_filename_for_fixable_list,
                s3_filename_for_non_fixable_list,
            ) = process_failure_routine_summary_and_store_data_in_s3(
                failure_routine_summary, s3_bucket_name)
        assert not vulnerabilities_that_can_be_fixed, (
            f"""Allowlist is outdated! Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilities """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non-fixable vulnerabilities. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        return

    common_ecr_scan_allowlist = ScanVulnerabilityList(
        minimum_severity=CVESeverity[minimum_sev_threshold])
    common_ecr_scan_allowlist_path = os.path.join(
        os.sep, get_repository_local_path(), "data",
        "common-ecr-scan-allowlist.json")
    if os.path.exists(common_ecr_scan_allowlist_path):
        common_ecr_scan_allowlist.construct_allowlist_from_file(
            common_ecr_scan_allowlist_path)

    remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

    if remaining_vulnerabilities:
        assert not remaining_vulnerabilities.vulnerability_list, (
            f"The following vulnerabilities need to be fixed on {image}:\n"
            f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
        )
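

# A minimal sketch (illustrative only, not the repo's actual helper) of the polling loop that the
# test_ecr_scan docstring describes and that run_scan / ecr_utils.get_ecr_image_scan_results are
# assumed to wrap. The boto3 calls (start_image_scan, describe_images) and the imageScanStatus /
# imageScanFindingsSummary fields are part of the public ECR API; the function name, timeout, and
# poll interval below are assumptions for illustration.
def _poll_ecr_image_scan_sketch(ecr_client, repository_name, image_tag,
                                timeout_seconds=300, poll_interval_seconds=10):
    import time  # stdlib; typically imported at module level in the real test module

    # Kick off the scan (step 1 of the docstring above)
    ecr_client.start_image_scan(repositoryName=repository_name, imageId={"imageTag": image_tag})
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        image_detail = ecr_client.describe_images(
            repositoryName=repository_name, imageIds=[{"imageTag": image_tag}]
        )["imageDetails"][0]
        status = image_detail.get("imageScanStatus", {}).get("status")
        if status == "COMPLETE":
            # e.g. {"HIGH": 0, "MEDIUM": 3, ...}
            return image_detail.get("imageScanFindingsSummary", {}).get("findingSeverityCounts", {})
        if status == "FAILED":
            raise RuntimeError(f"ECR scan failed for {repository_name}:{image_tag}")
        # IN_PROGRESS, or imageScanStatus not yet populated: keep polling
        time.sleep(poll_interval_seconds)
    raise TimeoutError(f"ECR scan did not complete within {timeout_seconds}s for {repository_name}:{image_tag}")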