def test_sm_profiler_tf(tensorflow_training):
    """Stage and run the SageMaker profiler (smprofiler) suite against a TF training image.

    Downloads the sagemaker-tests bundle from S3 into a per-image working
    directory, unpacks it, appends the tensorflow-datasets dependency the TF
    scripts need, then delegates to ``run_sm_profiler_tests``.
    """
    # smprofiler ships no config files for TF1, so only TF2 images are testable.
    if is_tf_version("1", tensorflow_training):
        pytest.skip(
            "Skipping test on TF1, since there are no smprofiler config files for TF1"
        )

    processor = get_processor_from_image_uri(tensorflow_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    shell = Context()

    # Per-image scratch directory inside the CodeBuild source tree.
    # NOTE(review): assumes CODEBUILD_SRC_DIR is set in this environment — os.path.join
    # would raise on None; confirm the CI env guarantees it.
    tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"),
        get_container_name("smprof", tensorflow_training),
        "smprofiler_tests",
    )
    shell.run(f"mkdir -p {tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    shell.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {tests_dir}/{sm_tests_zip}",
        hide=True,
    )
    shell.run(f"cd {tests_dir} && unzip {sm_tests_zip}", hide=True)

    # Install tf datasets: the bundled TF scripts require tensorflow-datasets,
    # so append it to their requirements file before the suite runs.
    shell.run(
        f"echo 'tensorflow-datasets==4.0.1' >> "
        f"{tests_dir}/sagemaker-tests/tests/scripts/tf_scripts/requirements.txt",
        hide=True,
    )

    run_sm_profiler_tests(
        tensorflow_training, tests_dir, "test_profiler_tensorflow.py", processor
    )
def test_torchvision_nms_inference(pytorch_inference):
    """
    Check that the internally built torchvision binary is used to resolve the missing nms issue.

    :param pytorch_inference: framework fixture for pytorch inference
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_inference)
    # PT 1.5.1 GPU inference images are a known exception — skip them.
    if (
        Version(framework_version) == Version("1.5.1")
        and get_processor_from_image_uri(pytorch_inference) == "gpu"
    ):
        pytest.skip("Skipping this test for PT 1.5.1 GPU Inference DLC images")
    if "eia" in pytorch_inference and Version(framework_version) < Version("1.5.1"):
        pytest.skip(
            "This test does not apply to PT EIA images for PT versions less than 1.5.1"
        )
    if "neuron" in pytorch_inference:
        pytest.skip("Skipping because this is not relevant to PT Neuron images")

    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_inference)
    start_container(container_name, pytorch_inference, ctx)
    # Resolving torch.ops.torchvision.nms inside the container raises if the nms
    # op is missing from the torchvision binary, which fails the test.
    # Fix: the command was an f-string with no placeholders (pyflakes F541);
    # it is now a plain string literal — runtime value unchanged.
    run_cmd_on_container(
        container_name,
        ctx,
        "import torch; import torchvision; print(torch.ops.torchvision.nms)",
        executable="python",
    )
def _run_instance_role_disabled(image_uri, ec2_client, ec2_instance, ec2_connection):
    """Verify DLC telemetry does NOT tag the EC2 instance when instance metadata is unreachable.

    Blocks access to the instance-metadata endpoint (169.254.169.254), runs the
    image so its telemetry hook fires, then asserts that the autogenerated tag
    was not applied and that framework import still succeeds without credentials.

    :param image_uri: ECR image URI under test
    :param ec2_client: boto3 EC2 client used to read/remove instance tags
    :param ec2_instance: (instance_id, key_filename) tuple fixture
    :param ec2_connection: fabric connection to the test instance
    """
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_bad_instance_role-ec2"
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Remove any autogenerated tag left over from a previous run so the final
    # "tag absent" assertion is meaningful.
    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client
    )
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.remove_tags(
            Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}]
        )

    # Disable access to EC2 instance metadata.
    # Fix: this was an f-string with no placeholders (pyflakes F541); now a plain literal.
    ec2_connection.run("sudo route add -host 169.254.169.254 reject")

    if "tensorflow" in framework and job_type == "inference":
        # TF inference images run TF Serving rather than python, so start the
        # serving entrypoint and give telemetry a moment to fire.
        # NOTE(review): uses the tf27+ inference command unconditionally — confirm
        # older TF inference images are routed elsewhere by the caller.
        model_name = "saved_model_half_plus_two"
        model_base_path = test_utils.get_tensorflow_model_base_path(image_uri)
        env_vars_list = test_utils.get_tensorflow_inference_environment_variables(
            model_name, model_base_path
        )
        env_vars = " ".join(
            [f"-e {entry['name']}={entry['value']}" for entry in env_vars_list]
        )
        inference_command = get_tensorflow_inference_command_tf27_above(
            image_uri, model_name
        )
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri} {inference_command}"
        )
        time.sleep(5)
    else:
        # Training/other images: import the framework inside the container, which
        # triggers the telemetry hook. huggingface_* images import the base framework.
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = (
            "torch" if framework_to_import == "pytorch" else framework_to_import
        )
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash"
        )
        output = ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True,
        )
        # The import itself must not break when no credentials/metadata are available.
        assert output.ok, f"'import {framework_to_import}' fails when credentials not configured"

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client
    )
    assert expected_tag_key not in ec2_instance_tags, (
        f"{expected_tag_key} was applied as an instance tag."
        "EC2 create_tags went through even though it should not have"
    )
def test_dlc_standard_labels(image, region):
    """Verify the standard com.amazonaws.ml.engines.* labels are present on the image.

    Builds the expected label set from the image tag (framework, device, python,
    job type, arch, OS, and optional contributor/transformers labels) and compares
    it with the labels actually attached to the ECR image.

    :param image: ECR image URI
    :param region: AWS region of the ECR repository
    """
    customer_type_label_prefix = (
        "ec2" if test_utils.is_ec2_image(image) else "sagemaker"
    )

    framework, fw_version = test_utils.get_framework_and_version_from_tag(image)
    # Label components use '-' in place of '_' and '.'.
    framework = framework.replace('_', '-')
    fw_version = fw_version.replace('.', '-')
    device_type = test_utils.get_processor_from_image_uri(image)
    if device_type == "gpu":
        # Fix: renamed misspelled local "cuda_verison" -> "cuda_version".
        cuda_version = test_utils.get_cuda_version_from_tag(image)
        device_type = f"{device_type}.{cuda_version}"

    python_version = test_utils.get_python_version_from_image_uri(image)
    job_type = test_utils.get_job_type_from_image(image)
    transformers_version = test_utils.get_transformers_version_from_image_uri(
        image
    ).replace('.', '-')
    os_version = test_utils.get_os_version_from_image_uri(image).replace('.', '-')

    # TODO: Add x86 env variable to check explicitly for x86, instead of assuming that everything not graviton is x86
    arch_type = "graviton" if test_utils.is_graviton_architecture() else "x86"

    contributor = test_utils.get_contributor_from_image_uri(image)

    expected_labels = [
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.framework.{framework}.{fw_version}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.device.{device_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.python.{python_version}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.job.{job_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.arch.{arch_type}",
        f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.os.{os_version}",
    ]

    # Contributor and transformers labels only exist on a subset of images.
    if contributor:
        expected_labels.append(
            f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.contributor.{contributor}"
        )
    if transformers_version:
        expected_labels.append(
            f"com.amazonaws.ml.engines.{customer_type_label_prefix}.dlc.lib.transformers.{transformers_version}"
        )

    actual_labels = test_utils.get_labels_from_ecr_image(image, region)

    missing_labels = [
        label for label in expected_labels if label not in actual_labels
    ]

    # TODO: Remove this when ec2 labels are added. For now, ensure they are not added.
    if customer_type_label_prefix == "ec2":
        # EC2 labels are intentionally unsupported: every expected label must be missing.
        assert set(missing_labels) == set(expected_labels), \
            f"EC2 labels are not supported yet, and should not be added to containers. " \
            f"{set(expected_labels) - set(missing_labels)} should not be present."
    else:
        assert not missing_labels, \
            f"Labels {missing_labels} are expected in image {image}, but cannot be found. " \
            f"All labels on image: {actual_labels}"
def _run_tag_success(image_uri, ec2_client, ec2_instance, ec2_connection):
    """Verify DLC telemetry DOES tag the EC2 instance when metadata access works.

    Mirror of ``_run_instance_role_disabled``: runs the image normally (metadata
    reachable) and asserts the autogenerated instance tag was applied.

    :param image_uri: ECR image URI under test
    :param ec2_client: boto3 EC2 client used to read/remove instance tags
    :param ec2_instance: (instance_id, key_filename) tuple fixture
    :param ec2_connection: fabric connection to the test instance
    """
    expected_tag_key = "aws-dlc-autogenerated-tag-do-not-delete"

    ec2_instance_id, _ = ec2_instance
    account_id = test_utils.get_account_id_from_image_uri(image_uri)
    image_region = test_utils.get_region_from_image_uri(image_uri)
    repo_name, image_tag = test_utils.get_repository_and_tag_from_image_uri(image_uri)
    framework, _ = test_utils.get_framework_and_version_from_tag(image_uri)
    job_type = test_utils.get_job_type_from_image(image_uri)
    processor = test_utils.get_processor_from_image_uri(image_uri)

    container_name = f"{repo_name}-telemetry_tag_instance_success-ec2"
    docker_cmd = "nvidia-docker" if processor == "gpu" else "docker"

    test_utils.login_to_ecr_registry(ec2_connection, account_id, image_region)
    ec2_connection.run(f"{docker_cmd} pull -q {image_uri}")

    # Remove any autogenerated tag left over from a previous run so the final
    # "tag present" assertion is meaningful.
    preexisting_ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client
    )
    if expected_tag_key in preexisting_ec2_instance_tags:
        ec2_client.remove_tags(
            Resources=[ec2_instance_id], Tags=[{"Key": expected_tag_key}]
        )

    if framework == "tensorflow" and job_type == "inference":
        # TF inference images run TF Serving rather than python; start the server
        # and give telemetry a moment to fire.
        env_vars_list = ecs_utils.get_ecs_tensorflow_environment_variables(
            processor, "saved_model_half_plus_two"
        )
        env_vars = " ".join(
            [f"-e {entry['name']}={entry['value']}" for entry in env_vars_list]
        )
        ec2_connection.run(
            f"{docker_cmd} run {env_vars} --name {container_name} -id {image_uri}"
        )
        time.sleep(5)
    else:
        framework_to_import = framework.replace("huggingface_", "")
        framework_to_import = (
            "torch" if framework_to_import == "pytorch" else framework_to_import
        )
        ec2_connection.run(
            f"{docker_cmd} run --name {container_name} -id {image_uri} bash"
        )
        # Fix: the result was previously bound to an unused local "output"; this test
        # only cares that the tag appears, so the import's exit status is deliberately
        # ignored here (warn=True prevents a raise on failure).
        ec2_connection.run(
            f"{docker_cmd} exec -i {container_name} python -c 'import {framework_to_import}; import time; time.sleep(5)'",
            warn=True,
        )

    ec2_instance_tags = ec2_utils.get_ec2_instance_tags(
        ec2_instance_id, ec2_client=ec2_client
    )
    assert expected_tag_key in ec2_instance_tags, \
        f"{expected_tag_key} was not applied as an instance tag"
def test_torchvision_nms_training(pytorch_training):
    """
    Check that the internally built torchvision binary is used to resolve the missing nms issue.

    :param pytorch_training: framework fixture for pytorch training
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    # PT 1.5.1 GPU training images are a known exception — skip them.
    if (
        Version(framework_version) == Version("1.5.1")
        and get_processor_from_image_uri(pytorch_training) == "gpu"
    ):
        pytest.skip("Skipping this test for PT 1.5.1 GPU Training DLC images")

    ctx = Context()
    container_name = get_container_name("torchvision-nms", pytorch_training)
    start_container(container_name, pytorch_training, ctx)
    # Resolving torch.ops.torchvision.nms inside the container raises if the nms
    # op is missing from the torchvision binary, which fails the test.
    # Fix: the command was an f-string with no placeholders (pyflakes F541);
    # it is now a plain string literal — runtime value unchanged.
    run_cmd_on_container(
        container_name,
        ctx,
        "import torch; import torchvision; print(torch.ops.torchvision.nms)",
        executable="python",
    )
def test_sm_profiler_pt(pytorch_training):
    """Stage and run the SageMaker profiler (smprofiler) suite against a PT training image.

    Downloads the sagemaker-tests bundle plus the CIFAR-10 and MNIST datasets the
    PyTorch scripts need, then delegates to ``run_sm_profiler_tests``.
    """
    processor = get_processor_from_image_uri(pytorch_training)
    if processor not in ("cpu", "gpu"):
        pytest.skip(f"Processor {processor} not supported. Skipping test.")

    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    # The ZCC-based profiler test only applies below PT 1.12.
    if Version(image_framework_version) in SpecifierSet(">=1.12"):
        pytest.skip("sm profiler ZCC test is not supported in PT 1.12 and above")

    shell = Context()

    # Per-image scratch directory inside the CodeBuild source tree.
    tests_dir = os.path.join(
        os.getenv("CODEBUILD_SRC_DIR"),
        get_container_name("smprof", pytorch_training),
        "smprofiler_tests",
    )
    shell.run(f"mkdir -p {tests_dir}", hide=True)

    # Download sagemaker-tests zip
    sm_tests_zip = "sagemaker-tests.zip"
    shell.run(
        f"aws s3 cp {os.getenv('SMPROFILER_TESTS_BUCKET')}/{sm_tests_zip} {tests_dir}/{sm_tests_zip}",
        hide=True,
    )

    # PT test setup requirements: unpack the bundle and fetch the datasets the
    # pytorch scripts expect under .../pytorch_scripts/data.
    with shell.prefix(f"cd {tests_dir}"):
        shell.run(f"unzip {sm_tests_zip}", hide=True)
        with shell.prefix("cd sagemaker-tests/tests/scripts/pytorch_scripts"):
            shell.run("mkdir -p data", hide=True)
            shell.run(
                "aws s3 cp s3://smdebug-testing/datasets/cifar-10-python.tar.gz data/cifar-10-batches-py.tar.gz",
                hide=True,
            )
            shell.run(
                "aws s3 cp s3://smdebug-testing/datasets/MNIST_pytorch.tar.gz data/MNIST_pytorch.tar.gz",
                hide=True,
            )
            with shell.prefix("cd data"):
                shell.run("tar -zxf MNIST_pytorch.tar.gz", hide=True)
                shell.run("tar -zxf cifar-10-batches-py.tar.gz", hide=True)

    run_sm_profiler_tests(
        pytorch_training, tests_dir, "test_profiler_pytorch.py", processor
    )
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    For each Dockerfile.<processor> matching this image's framework/job-type and
    Major.Minor version, collect the dlc_major_version label (filtered by the
    image's python version) and assert the set of versions is exactly 1..N —
    with explicit carve-outs for TF 2.3 gpu (v1 banned) and PT 1.6 gpu training
    (v2 banned).

    :param image: <str> ECR image URI
    """
    # Repo root: assumes this test file lives under <repo>/test/..., so splitting
    # on the first /test/ path segment yields the checkout root.
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0"
    }
    if test_utils.is_tf_version("1", image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf_version("2", image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    # EIA images are exempt from the cutoff check entirely.
    if processor != "eia" and (
            reference_fw in references
            and Version(fw_version) < Version(references[reference_fw])):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                dockerfile_path = os.path.join(root_dir, root, filename)
                # Exclude example Dockerfiles and require the Major.Minor version
                # to appear as a path component.
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                # .match anchors at line start, so only uncommented, unindented
                # LABEL/ARG lines are picked up.
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    # Normalize "3.7" -> "37" to compare against the tag's -pyXX.
                    python_version = python_version_match.group(1).replace(
                        ".", "")

            # Raise errors if dlc major version label and python version arg are not found in Dockerfile
            if not dlc_version:
                raise DLCMajorVersionLabelNotFound(
                    f"Cannot find dlc_major_version label in {dockerfile}")
            if not python_version:
                raise DLCPythonVersionNotFound(
                    f"Cannot find PYTHON_VERSION arg in {dockerfile}")
            if python_version == python_major_minor_version:
                versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
            "tensorflow",
            "2.3",
            "gpu",
            "37",
            "training",
    ):
        # Shift the expected range up by one (2..N+1) because v1 is retired.
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Test case explicitly for PyTorch 1.6.0 training gpu, since v2.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
            "pytorch",
            "1.6",
            "gpu",
            "36",
            "training",
    ):
        # Expected versions become 1, 3, 4, ... (v2 is retired but v1 remains).
        expected_versions = [v + 1 for v in expected_versions]
        expected_versions[0] = 1
        assert 2 not in actual_versions, (
            f"DLC v2.0 is deprecated in PyTorch 1.6.0 gpu containers, but found major version 2 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )
def _run_dependency_check_test(image, ec2_connection):
    """Run OWASP dependency-check inside the image and fail on unrecognized CRITICAL/HIGH CVEs.

    Executes the testDependencyCheck script on the remote EC2 host, uploads the
    HTML report to S3, scrapes CVE IDs from the report, filters out the
    whitelist below, and looks up severities from NVD to decide pass/fail.
    """
    # Record any whitelisted medium/low severity CVEs; I.E. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }

    processor = get_processor_from_image_uri(image)

    # Whitelist CVE #CVE-2021-3711 for DLCs where openssl is installed using apt-get
    framework, _ = get_framework_and_version_from_tag(image)
    # NOTE(review): this regex runs against the full image URI, not fw_version —
    # assumes the first X.Y in the URI is the framework version; confirm.
    short_fw_version = re.search(r"(\d+\.\d+)", image).group(1)

    # Check that these versions have been matched on https://ubuntu.com/security/CVE-2021-3711 before adding
    allow_openssl_cve_fw_versions = {
        "tensorflow": {
            "1.15": ["cpu", "gpu", "neuron"],
            "2.3": ["cpu", "gpu"],
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu", "neuron"],
            "2.6": ["cpu", "gpu"],
            "2.7": ["cpu", "gpu"],
        },
        "mxnet": {
            "1.8": ["neuron"],
            "1.9": ["cpu", "gpu"]
        },
        "pytorch": {
            "1.10": ["cpu"]
        },
        "huggingface_pytorch": {
            "1.8": ["cpu", "gpu"],
            "1.9": ["cpu", "gpu"]
        },
        "huggingface_tensorflow": {
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu"]
        },
        "autogluon": {
            "0.3": ["cpu"]
        },
    }

    if processor in allow_openssl_cve_fw_versions.get(framework, {}).get(
            short_fw_version, []):
        allowed_vulnerabilities.add("CVE-2021-3711")

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection,
                                  image,
                                  test_script,
                                  container_name=container_name,
                                  bin_bash_entrypoint=True)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(
        f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}",
                                     hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if vulnerabilities:
        vulnerability_severity = {}

        # Check NVD for vulnerability severity to provide this useful info in error message.
        for vulnerability in vulnerabilities:
            try:
                cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"

                # Retry on 404/504/502 since the NVD API is flaky under load.
                session = requests.Session()
                session.mount(
                    "https://",
                    requests.adapters.HTTPAdapter(max_retries=Retry(
                        total=5, status_forcelist=[404, 504, 502])),
                )
                response = session.get(cve_url)

                if response.status_code == 200:
                    # Walk the NVD v1.0 JSON; default to "UNKNOWN" at any missing level.
                    severity = (response.json().get("result", {}).get(
                        "CVE_Items",
                        [{}])[0].get("impact", {}).get("baseMetricV2",
                                                       {}).get("severity",
                                                               "UNKNOWN"))
                    # Group CVE ids by severity for the error message below.
                    if vulnerability_severity.get(severity):
                        vulnerability_severity[severity].append(vulnerability)
                    else:
                        vulnerability_severity[severity] = [vulnerability]
            # NOTE(review): which ConnectionError this names depends on the file's
            # imports — requests.exceptions.ConnectionError vs the builtin; confirm
            # the intended one is in scope, otherwise requests failures propagate.
            except ConnectionError:
                LOGGER.exception(
                    f"Failed to load NIST data for CVE {vulnerability}")

        # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
        if not (vulnerability_severity.get("CRITICAL")
                or vulnerability_severity.get("HIGH")):
            return

        raise DependencyCheckFailure(
            f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
            f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
            f"{dependency_check_report} for more details.")