Example #1
def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    account_id = get_account_id_from_image_uri(image_uri)
    region = get_region_from_image_uri(image_uri)

    login_to_ecr_registry(ec2_connection, account_id, region)
    ec2_connection.run(f"docker pull -q {image_uri}")

    try:
        ec2_connection.run(
            f"{docker_executable} run --name {container_name} -v "
            f"{container_test_local_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
            f"python {test_script}",
            hide=True,
            timeout=300,
        )
    except Exception as e:
        debug_output = ec2_connection.run(f"{docker_executable} logs {container_name}")
        debug_stdout = debug_output.stdout
        if "Test SMClarify Bias Metrics succeeded!" in debug_stdout:
            LOGGER.warning(
                f"SMClarify test succeeded, but there is an issue with fabric. "
                f"Error:\n{e}\nTest output:\n{debug_stdout}"
            )
            return
        raise SMClarifyTestFailure(
            f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{debug_stdout}"
        ) from e
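A minimal usage sketch for the helper above, not taken from the source repo: the test name and the "training" image-URI fixture are assumptions, while ec2_connection and ec2_instance_type mirror the fixtures shown later on this page.

# Hypothetical pytest wrapper around run_smclarify_bias_metrics; "training"
# is assumed to be a fixture yielding the image URI under test.
def test_smclarify_metrics(training, ec2_connection, ec2_instance_type):
    run_smclarify_bias_metrics(training, ec2_connection, ec2_instance_type)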
Example #2
def _run_instance_role_disabled(image_uri, ec2_client, ec2_instance,
                                ec2_connection):
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(
        image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_bad_instance_role-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        # boto3's EC2 client exposes DeleteTags as delete_tags (there is no remove_tags)
        ec2_client.delete_tags(
            Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}]
        )

    # Disable access to EC2 instance metadata
    ec2_connection.run(f"sudo route add -host 169.254.169.254 reject")

    if "tensorflow" in framework and job_type == "inference":
        model_name = "saved_model_half_plus_two"
        model_base_path = test_utils.get_tensorflow_model_base_path(image_uri)
        env_vars_list = test_utils.get_tensorflow_inference_environment_variables(
            model_name, model_base_path)
        env_vars = " ".join([
            f"-e {entry['name']}={entry['value']}" for entry in env_vars_list
        ])
        inference_command = get_tensorflow_inference_command_tf27_above(
            image_uri, model_name)
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri} {inference_command}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True)
        assert output.ok, f"'import {framework_to_import}' fails when credentials not configured"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id,
                                                        ec2_client=ec2_client)
    assert expected_tag_key not in ec2_instance_tags, (
        f"{expected_tag_key} was applied as an instance tag. "
        "EC2 create_tags went through even though it should not have"
    )
Example #3
def _run_tag_success(image_uri, ec2_client, ec2_instance, ec2_connection):
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(
        image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_tag_instance_success-ec2"

    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client)
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.delete_tags(
            Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}]
        )

    if framework == "tensorflow" and job_type == "inference":
        env_vars_list = ecs_utils.get_ecs_tensorflow_environment_variables(
            processor, "saved_model_half_plus_two")
        env_vars = " ".join([
            f"-e {entry['name']}={entry['value']}" for entry in env_vars_list
        ])
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = "torch" if framework_to_import == "pytorch" else framework_to_import
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash")
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True)
        assert output.ok, f"'import {framework_to_import}' failed inside {container_name}"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(ec2_instance_id,
                                                        ec2_client=ec2_client)
    assert expected_tag_key in ec2_instance_tags, f"{expected_tag_key} was not applied as an instance tag"
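Both telemetry tests rely on ec2_utils.get_ec2_instance_tags, whose body is not shown on this page. A plausible sketch, under the assumption that it wraps boto3's DescribeTags API and returns a key-to-value mapping (the call sites above only need membership checks on tag keys):

# Hypothetical sketch of ec2_utils.get_ec2_instance_tags; the signature matches
# the call sites above, but this body is an assumption.
import boto3

def get_ec2_instance_tags(instance_id, region=None, ec2_client=None):
    ec2_client = ec2_client or boto3.client("ec2", region_name=region)
    response = ec2_client.describe_tags(
        Filters=[{"Name": "resource-id", "Values": [instance_id]}]
    )
    # Return {key: value} so "expected_tag_key in tags" works at the call sites.
    return {tag["Key"]: tag["Value"] for tag in response.get("Tags", [])}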
Example #4
def ec2_connection(request, ec2_instance, ec2_key_name, ec2_instance_type,
                   region):
    """
    Fixture to establish connection with EC2 instance if necessary
    :param request: pytest test request
    :param ec2_instance: ec2_instance pytest fixture
    :param ec2_key_name: unique key name
    :param ec2_instance_type: ec2_instance_type pytest fixture
    :param region: Region where ec2 instance is launched
    :return: Fabric connection object
    """
    instance_id, instance_pem_file = ec2_instance
    region = P3DN_REGION if ec2_instance_type == "p3dn.24xlarge" else region
    ip_address = ec2_utils.get_public_ip(instance_id, region=region)
    LOGGER.info(f"Instance ip_address: {ip_address}")
    user = ec2_utils.get_instance_user(instance_id, region=region)
    LOGGER.info(f"Connecting to {user}@{ip_address}")
    conn = Connection(
        user=user,
        host=ip_address,
        connect_kwargs={"key_filename": [instance_pem_file]},
        connect_timeout=18000,
    )

    random.seed(f"{datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')}")
    unique_id = random.randint(1, 100000)

    artifact_folder = f"{ec2_key_name}-{unique_id}-folder"
    s3_test_artifact_location = test_utils.upload_tests_to_s3(artifact_folder)

    def delete_s3_artifact_copy():
        test_utils.delete_uploaded_tests_from_s3(s3_test_artifact_location)

    request.addfinalizer(delete_s3_artifact_copy)

    conn.run(
        f"aws s3 cp --recursive {test_utils.TEST_TRANSFER_S3_BUCKET}/{artifact_folder} $HOME/container_tests"
    )
    conn.run(
        f"mkdir -p $HOME/container_tests/logs && chmod -R +x $HOME/container_tests/*"
    )

    # Log into ECR if we are in canary context
    if test_utils.is_canary_context():
        public_registry = test_utils.PUBLIC_DLC_REGISTRY
        test_utils.login_to_ecr_registry(conn, public_registry, region)

    return conn
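A hedged sketch of a test consuming this fixture. It assumes the function is registered as a pytest fixture (e.g. decorated with @pytest.fixture in conftest.py, which the listing above omits):

# Hypothetical consumer of the ec2_connection fixture; verifies that the
# container tests copied from S3 actually landed on the instance.
def test_container_tests_are_staged(ec2_connection):
    result = ec2_connection.run("ls $HOME/container_tests", hide=True)
    assert result.ok, "container_tests were not copied to the instance"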
Example #5
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # Have a default framework to test on
    framework = "pytorch"
    for fw in frameworks:
        if fw in os.getenv("CODEBUILD_INITIATOR"):
            framework = fw
            break

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    for image in images.split(" "):
        ctx.run(f"docker pull {image}", hide=True)
Example #6
def _reupload_image_to_test_ecr(source_image_uri, test_ecr_client, test_region,
                                test_account_id):
    """
    Helper function to reupload an image owned by a different account to an ECR repo in this account, so that
    this account can freely run ECR Scan without permission issues.

    :param source_image_uri: str Image URI for image to be tested
    :param test_ecr_client: boto3.Client ECR client for account where test is being run
    :param test_region: str Region where test is being run
    :param test_account_id: str Account ID for account where test is being run
    :return: str New image URI for re-uploaded image
    """
    ctx = Context()
    image_account_id = get_account_id_from_image_uri(source_image_uri)
    image_region = get_region_from_image_uri(source_image_uri)
    login_to_ecr_registry(ctx, image_account_id, image_region)
    ctx.run(f"docker pull {source_image_uri}")

    image_repo_uri, image_tag = source_image_uri.split(":")
    _, image_repo_name = image_repo_uri.split("/")
    test_image_repo_name = f"beta-{image_repo_name}"
    if not ecr_utils.ecr_repo_exists(test_ecr_client, test_image_repo_name):
        raise ecr_utils.ECRRepoDoesNotExist(
            f"Repo named {test_image_repo_name} does not exist in {test_region} on the account {test_account_id}"
        )

    test_image_uri = (
        source_image_uri.replace(image_region, test_region)
        .replace(image_repo_name, test_image_repo_name)
        .replace(image_account_id, test_account_id)
    )
    ctx.run(f"docker tag {source_image_uri} {test_image_uri}")

    login_to_ecr_registry(ctx, test_account_id, test_region)
    ctx.run(f"docker push {test_image_uri}")

    return test_image_uri
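A hedged call-site sketch for the helper above; the boto3 client creation and the account IDs, region, and image URI are placeholder assumptions:

# Hypothetical call site: pull an image from another account's registry and
# re-push it into this account's "beta-" repo so ECR Scan can run on it.
import boto3

test_ecr_client = boto3.client("ecr", region_name="us-west-2")
scannable_uri = _reupload_image_to_test_ecr(
    source_image_uri="111111111111.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:example-tag",
    test_ecr_client=test_ecr_client,
    test_region="us-west-2",
    test_account_id="222222222222",
)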
Example #7
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run the ECR Scan Tool on an image being tested, and raise an error if vulnerabilities are found.
    1. Start the scan.
    2. Poll DescribeImages for up to 5 minutes. (We poll for 5 minutes because the scan is expected
       to complete in about 2 minutes, though no analysis has been performed on exactly how long it
       takes for a DLC image, so a 3-minute buffer is added beyond the expected time.)
       2.1. If imageScanStatus == COMPLETE: exit the loop.
       2.2. If imageScanStatus == IN_PROGRESS, or imageScanStatus is not yet present: continue the loop.
       2.3. If imageScanStatus == FAILED: raise RuntimeError.
    3. If imageScanStatus != COMPLETE once the loop ends: raise TimeoutError.
    4. Assert that imageScanFindingsSummary.findingSeverityCounts has no HIGH/CRITICAL entries.

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    test_account_id = sts_client.get_caller_identity().get("Account")
    image_account_id = get_account_id_from_image_uri(image)
    image_region = get_region_from_image_uri(image)
    image_repo_name, original_image_tag = get_repository_and_tag_from_image_uri(image)
    additional_image_tags = get_all_the_tags_of_an_image_from_ecr(ecr_client, image)
    if not is_image_available_locally(image):
        LOGGER.info(f"Image {image} not available locally!! Pulling the image...")
        login_to_ecr_registry(Context(), image_account_id, image_region)
        run(f"docker pull {image}")
        if not is_image_available_locally(image):
            raise RuntimeError("Image shown as not available even after pulling")
    for additional_tag in additional_image_tags:
        image_uri_with_new_tag = image.replace(original_image_tag, additional_tag)
        run(f"docker tag {image} {image_uri_with_new_tag}", hide=True)

    if image_account_id != test_account_id:
        original_image = image
        target_image_repo_name = f"beta-{image_repo_name}"
        for additional_tag in additional_image_tags:
            image_uri_with_new_tag = original_image.replace(original_image_tag, additional_tag)
            new_image_uri = ecr_utils.reupload_image_to_test_ecr(image_uri_with_new_tag, target_image_repo_name, region)
            if image_uri_with_new_tag == original_image:
                image = new_image_uri

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    scan_results = ecr_utils.get_ecr_image_scan_results(ecr_client, image, minimum_vulnerability=minimum_sev_threshold)
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, scan_results)
    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    remaining_vulnerabilities = ecr_image_vulnerability_list

    if not is_image_covered_by_allowlist_feature(image):
        if is_canary_context():
            pytest.skip("Skipping the test on the canary.")
        
        common_ecr_scan_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
        common_ecr_scan_allowlist_path = os.path.join(
            os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
        )
        if os.path.exists(common_ecr_scan_allowlist_path):
            common_ecr_scan_allowlist.construct_allowlist_from_file(common_ecr_scan_allowlist_path)

        remaining_vulnerabilities = remaining_vulnerabilities - common_ecr_scan_allowlist

        if remaining_vulnerabilities:
            assert not remaining_vulnerabilities.vulnerability_list, (
                f"The following vulnerabilities need to be fixed on {image}:\n"
                f"{json.dumps(remaining_vulnerabilities.vulnerability_list, indent=4)}"
            )
        return

    upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
        image, ecr_client, minimum_sev_threshold
    )
    s3_bucket_name = ECR_SCAN_HELPER_BUCKET

    # If new vulnerabilities (fixable or non-fixable) are found, conduct the failure routine
    newly_found_vulnerabilities = ecr_image_vulnerability_list - image_scan_allowlist
    # Also catch the case where there are no new vulnerabilities but the allowlist is outdated
    vulnerabilities_that_can_be_fixed = image_scan_allowlist - upgraded_image_vulnerability_list

    if newly_found_vulnerabilities or vulnerabilities_that_can_be_fixed:
        failure_routine_summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        (
            s3_filename_for_fixable_list,
            s3_filename_for_non_fixable_list,
        ) = process_failure_routine_summary_and_store_data_in_s3(failure_routine_summary, s3_bucket_name)
        prepend_message = "Found new vulnerabilities in image." if newly_found_vulnerabilities else "Allowlist is outdated."
        display_message = prepend_message + " " + (
            f"""Found {len(failure_routine_summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(failure_routine_summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{s3_filename_for_fixable_list}, s3://{s3_bucket_name}/{s3_filename_for_non_fixable_list}, """
            f"""s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{failure_routine_summary["s3_filename_for_allowlist"]}."""
        )
        if is_canary_context():
            LOGGER.error(display_message)
            pytest.skip("Skipping the test failure on the canary.")
        else:
            raise RuntimeError(display_message)
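The run_scan helper invoked above is not shown on this page. A minimal sketch of the polling flow the docstring describes, using boto3's ECR start_image_scan and describe_images APIs; the function name and the 30-second poll interval are assumptions:

# Hypothetical sketch of the scan-polling flow described in the docstring;
# the real run_scan helper may differ. Timings follow the docstring: a
# 5-minute budget, since the scan usually completes in ~2 minutes.
import time

def run_scan_sketch(ecr_client, repository_name, image_tag, timeout_seconds=300):
    image_id = {"imageTag": image_tag}
    ecr_client.start_image_scan(repositoryName=repository_name, imageId=image_id)
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        detail = ecr_client.describe_images(
            repositoryName=repository_name, imageIds=[image_id]
        )["imageDetails"][0]
        status = detail.get("imageScanStatus", {}).get("status")
        if status == "COMPLETE":
            return detail.get("imageScanFindingsSummary", {}).get(
                "findingSeverityCounts", {}
            )
        if status == "FAILED":
            raise RuntimeError(f"ECR scan failed for {repository_name}:{image_tag}")
        time.sleep(30)  # IN_PROGRESS, or status not yet present: keep polling
    raise TimeoutError(f"ECR scan did not complete within {timeout_seconds} seconds")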