def _print_results_of_test(file_path, processor):
    """
    Parse a benchmark log file, log the relevant result lines and return the throughput.

    For "cpu", the throughput of every line containing "Total img/sec on " is summed and the
    last such line is reported. For "gpu", the throughput is the mean of the last 100
    "images/sec: " readings and the last 100 such lines are reported.

    :param file_path: path of the benchmark log file to parse
    :param processor: "cpu" or "gpu"; any other value yields an empty result and a throughput of 0
    :return: tuple of (result text that was logged, throughput)
    :raises Exception: if processor is "gpu" and no throughput reading can be parsed
    """
    # The fractional part is optional so that single-digit / integer readings are parsed as well.
    number = r"(?P<throughput>[0-9]+(?:\.[0-9]+)?)"
    result = ""
    throughput = 0
    if processor == "cpu":
        pattern = re.compile(r"(CPU\(s\):[ ]*)" + number)
        with open(file_path, "r") as f:
            for line in f:
                if "Total img/sec on " not in line:
                    continue
                result = line + "\n"
                match = pattern.search(line)
                if match is None:
                    LOGGER.warning(f"Cannot parse throughput from line: {line!r}")
                    continue
                throughput += float(match.group("throughput"))
    elif processor == "gpu":
        pattern = re.compile(r"(images/sec:[ ]*)" + number)
        result_list, throughput_list = [], []
        with open(file_path, "r") as f:
            for line in f:
                if "images/sec: " not in line:
                    continue
                result_list.append(line.strip("\n"))
                match = pattern.search(line)
                if match is None:
                    LOGGER.warning(f"Cannot parse throughput from line: {line!r}")
                    continue
                throughput_list.append(float(match.group("throughput")))
        result = "\n".join(result_list[-100:]) + "\n"
        # Take average of last 100 throughput lines
        recent_throughputs = throughput_list[-100:]
        if not recent_throughputs:
            raise Exception(
                "Cannot find throughput lines. Looks like SageMaker job was not run successfully. Please check"
            )
        throughput = sum(recent_throughputs) / len(recent_throughputs)
    LOGGER.info(result)
    return result, throughput
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    logfile="output.log",
):
    """
    Start a detached container from the image on the remote host and run the SMDebug test script in it.

    :param image_uri: ECR image URI
    :param ec2_connection: connection object whose run() executes commands on the remote host
    :param region: AWS region used for the ECR login
    :param docker_executable: docker binary used to start and exec into the container
    :param container_name: name given to the container
    :param test_script: script executed inside the container, followed by the framework name
    :param logfile: file on the remote host that receives a copy of the test output
    """
    framework = get_framework_from_image_uri(image_uri)
    host_test_dir = os.path.join("$HOME", "container_tests")
    container_test_dir = os.path.join(os.sep, "test")

    login_cmd = f"$(aws ecr get-login --no-include-email --region {region})"
    start_cmd = (
        f"{docker_executable} run --name {container_name} -v "
        f"{host_test_dir}:{container_test_dir} -itd {image_uri}"
    )
    # NOTE(review): the command is piped into tee, so unless pipefail is active in the remote shell
    # the reported exit status is tee's rather than the test script's - confirm this is intended.
    exec_cmd = (
        f"{docker_executable} exec --user root {container_name} "
        f"/bin/bash -c '{test_script} {framework}' | tee {logfile}"
    )

    ec2_connection.run(login_cmd, hide=True)
    ec2_connection.run(start_cmd, hide=True)

    try:
        test_output = ec2_connection.run(exec_cmd, hide=True, warn=True, timeout=3000)
    except Exception:
        # Surface whatever reached the log file before re-raising the original error
        log_dump = ec2_connection.run(f"cat {logfile}")
        LOGGER.error(f"Caught exception while trying to run test via fabric. Output: {log_dump.stdout}")
        raise

    # For a complete log dump, log test_output.stdout here.
    assert test_output.ok, f"SMDebug tests failed. Output:\n{test_output.stdout}"
def _print_results_of_test(file_path, processor):
    """
    Log the result lines found in the last 100 lines of a benchmark log and return a throughput.

    For "cpu", the first line containing "Total img/sec on " is used. For "gpu", the latest
    "images/sec: " line per distinct text preceding "<stdout>" is reported, and the throughput is
    taken from the first such line (as long as the parsed value is non-zero).

    :param file_path: path of the benchmark log file
    :param processor: "cpu" or "gpu"
    :return: tuple of (result text that was logged, throughput)
    """
    tail_output = Context().run(f"tail -100 {file_path}").stdout
    log_tail = tail_output.split("\n")
    result, throughput = "", 0

    if processor == "cpu":
        cpu_pattern = r"(CPU\(s\):[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)"
        for entry in log_tail:
            if "Total img/sec on " not in entry:
                continue
            result = entry + "\n"
            throughput = float(re.search(cpu_pattern, entry).group("throughput"))
            break
    elif processor == "gpu":
        gpu_pattern = r"(images/sec:[ ]*)(?P<throughput>[0-9]+\.?[0-9]+)"
        latest_by_prefix = {}
        for entry in log_tail:
            if "images/sec: " not in entry:
                continue
            prefix = entry.split("<stdout>")[0]
            latest_by_prefix[prefix] = entry.strip("\n")
            # Only the first reading is kept, unless it parsed as zero
            if throughput == 0:
                throughput = float(re.search(gpu_pattern, entry).group("throughput"))
        result = "\n".join(latest_by_prefix.values()) + "\n"

    LOGGER.info(result)
    return result, throughput
def test_dataclasses_check(image):
    """
    Verify that the dataclasses pip package is not installed in images with python 3.7 or above.

    The python version is read from the image URI and is expected in the format
    `py<major_version><minor_version>`; images below py37 are skipped.

    :param image: ECR image URI
    """
    ctx = Context()
    pip_package = "dataclasses"
    container_name = get_container_name("dataclasses-check", image)

    version_tag = get_python_version_from_image_uri(image)
    python_version = int(version_tag.replace("py", ""))

    if python_version < 37:
        pytest.skip(
            f"Skipping test for DLC image {image} that has py36 version as {pip_package} is not included in the python framework"
        )

    start_container(container_name, image, ctx)
    # warn=True so that a missing package (non-zero exit) is reported through the return code
    output = run_cmd_on_container(container_name, ctx, f"pip show {pip_package}", warn=True)

    if output.return_code != 0:
        LOGGER.info(
            f"{pip_package} package does not exists in the DLC image {image}"
        )
        return

    pytest.fail(
        f"{pip_package} package exists in the DLC image {image} that has py{python_version} version which is greater than py36 version"
    )
def test_generate_coverage_doc():
    """
    Generate the test coverage doc, check that it exists and, in mainline context, upload it to S3.
    """
    report_path = get_test_coverage_file_path()
    ctx = Context()

    # DLC_IMAGES and BUILD_CONTEXT are set to empty strings so that image names do not affect
    # function metadata (due to parametrization); CODEBUILD_RESOLVED_SOURCE_VERSION is set to
    # 'test' for ease of running this test locally.
    collect_cmd = (
        "export DLC_IMAGES='' && export CODEBUILD_RESOLVED_SOURCE_VERSION='test' && export BUILD_CONTEXT=''"
        "&& pytest -s --collect-only --generate-coverage-doc --ignore=container_tests/"
    )
    ctx.run(collect_cmd, hide=True)

    # The collection run above is expected to have produced the report
    assert os.path.exists(report_path), f"Cannot find test coverage report file {report_path}"

    if not is_mainline_context():
        return

    # Only mainline runs publish the report to S3
    s3_client = boto3.client("s3")
    with open(report_path, "rb") as report:
        try:
            s3_client.put_object(
                Bucket=TEST_COVERAGE_REPORT_BUCKET,
                Key=os.path.basename(report_path),
                Body=report,
            )
        except ClientError as e:
            LOGGER.error(f"Unable to upload report to bucket {TEST_COVERAGE_REPORT_BUCKET}. Error: {e}")
            raise
def run_smclarify_bias_metrics(
    image_uri,
    ec2_connection,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smclarify",
    test_script=SMCLARIFY_SCRIPT,
):
    """
    Pull the image on the remote host and run the SMClarify test script in a container.

    If running the container raises but the container logs contain the success marker, the
    error is logged as a warning and the function returns normally.

    :param image_uri: ECR image URI
    :param ec2_connection: connection object whose run() executes commands on the remote host
    :param ec2_instance_type: instance type, used in the failure message
    :param docker_executable: docker binary used to run the container
    :param container_name: name given to the container
    :param test_script: python script executed inside the container
    :raises SMClarifyTestFailure: if the run fails and the success marker is not in the logs
    """
    host_test_dir = os.path.join("$HOME", "container_tests")
    account_id = get_account_id_from_image_uri(image_uri)
    region = get_region_from_image_uri(image_uri)

    login_to_ecr_registry(ec2_connection, account_id, region)
    ec2_connection.run(f"docker pull -q {image_uri}")

    run_cmd = (
        f"{docker_executable} run --name {container_name} -v "
        f"{host_test_dir}:{os.path.join(os.sep, 'test')} {image_uri} "
        f"python {test_script}"
    )
    try:
        ec2_connection.run(run_cmd, hide=True, timeout=300)
    except Exception as e:
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "Test SMClarify Bias Metrics succeeded!" not in container_logs:
            raise SMClarifyTestFailure(
                f"SMClarify test failed on {image_uri} on {ec2_instance_type}. Full output:\n{container_logs}"
            ) from e
        # The script reported success, so the exception is attributed to the fabric layer
        LOGGER.warning(
            f"SMClarify test succeeded, but there is an issue with fabric. "
            f"Error:\n{e}\nTest output:\n{container_logs}"
        )
        return
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    """
    Run the TF SageMaker training benchmark script and upload its log file to S3.

    :param tensorflow_training: ECR image URI
    :param num_nodes: number of nodes to run on
    :param region: AWS region
    """
    # This sleep has been inserted because all the parametrized training jobs are automatically created
    # by SageMaker with the same name, due to being started around the same time, and with the same image uri.
    time.sleep(random.Random(x=f"{tensorflow_training}{num_nodes}").random() * 60)

    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    processor = "gpu" if "gpu" in tensorflow_training else "cpu"
    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"> {log_file}",
            warn=True,
            echo=True,
        )

        # Exit code 124 is what `timeout` returns when the time limit is hit; only other failures
        # are filed under failure_log.
        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    # The log is written relative to test_dir, so address it by its absolute path to make the
    # upload independent of the current working directory.
    local_log_path = os.path.join(test_dir, log_file)
    uploaded_log_path = os.path.join(target_upload_location, log_file)
    ctx.run(f"aws s3 cp {local_log_path} {uploaded_log_path}")

    LOGGER.info(f"Test results can be found at {uploaded_log_path}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log_path}"
    )
def start_ecr_image_scan(ecr_client, image_uri):
    """
    Request an ECR scan of an image; only warn when the scan limit has been exceeded.

    :param ecr_client: boto3 client for ECR
    :param image_uri: image URI for image to be checked
    :raises ECRScanFailedError: if the scan is started but immediately reports status FAILED
    """
    repo_name, image_tag = get_repository_and_tag_from_image_uri(image_uri)
    try:
        response = ecr_client.start_image_scan(repositoryName=repo_name, imageId={"imageTag": image_tag})
    except ecr_client.exceptions.LimitExceededException:
        LOGGER.warning("Scan has already been run on this image in the last 24 hours.")
    else:
        scan_status = response["imageScanStatus"]["status"]
        if scan_status == "FAILED":
            raise ECRScanFailedError(f"ECR Scan failed and returned:\n{json.dumps(response, indent=4)}")
def test_tensorflow_with_horovod_cpu(tensorflow_training, ec2_connection, cpu_only, tf2_only):
    """
    Run the TF Horovod test command in a container on the remote host.

    If the run raises but the container logs contain the pass marker, the error is logged as a
    warning and the test returns normally.

    :param tensorflow_training: ECR image URI
    :param ec2_connection: connection object whose run() executes commands on the remote host
    :param cpu_only: not used in the body
    :param tf2_only: not used in the body
    :raises TFTrainingTestFailure: if the run fails and the pass marker is not in the logs
    """
    container_name = "tf_hvd_cpu_test"
    if is_tf_version("1", tensorflow_training):
        test_script = TF1_HVD_CMD
    else:
        test_script = TF2_HVD_CMD

    try:
        execute_ec2_training_test(
            ec2_connection,
            tensorflow_training,
            test_script,
            container_name=container_name,
            timeout=1800,
        )
    except Exception as e:
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "TF HVD tests passed!" not in container_logs:
            raise TFTrainingTestFailure(f"TF HVD test failed. Full output:\n{container_logs}") from e
        # The tests reported success, so the exception is attributed to the fabric layer
        LOGGER.warning(
            f"TF HVD tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{container_logs}"
        )
        return
def _print_results_of_test(file_path, processor):
    """
    Log and return the result lines found in the last 100 lines of a benchmark log.

    For "cpu", the last line containing "Total img/sec on " is reported. For "gpu", the latest
    "images/sec: " line per distinct text preceding "<stdout>" is reported.

    :param file_path: path of the benchmark log file
    :param processor: "cpu" or "gpu"
    :return: the result text that was logged
    """
    log_tail = Context().run(f"tail -100 {file_path}").stdout.split("\n")
    result = ""

    if processor == "cpu":
        totals = [entry for entry in log_tail if "Total img/sec on " in entry]
        if totals:
            result = totals[-1] + "\n"
    elif processor == "gpu":
        # Later lines with the same prefix overwrite earlier ones
        latest_by_prefix = {
            entry.split("<stdout>")[0]: entry.strip("\n")
            for entry in log_tail
            if "images/sec: " in entry
        }
        result = "\n".join(latest_by_prefix.values()) + "\n"

    LOGGER.info(result)
    return result
def _run_cmd_on_container(container_name, context, cmd, executable="bash"):
    """
    Helper function to run commands on a locally running container

    :param container_name: Name of the docker container
    :param context: object whose run() method executes the docker command
    :param cmd: Command to run on the container; it is wrapped in single quotes as given
    :param executable: Executable to run on the container (bash or python)
    :return: output of context.run, can be used to parse stdout, etc
    """
    if executable not in ("bash", "python"):
        # Logger.warn is a deprecated alias; use warning() like the rest of the file
        LOGGER.warning(
            f"Unrecognized executable {executable}. It will be run as {executable} -c '{cmd}'"
        )
    return context.run(
        f"docker exec --user root {container_name} {executable} -c '{cmd}'",
        hide=True,
        timeout=30,
    )
def test_dlc_major_version_label(image, region):
    """
    Test to ensure that all DLC images have the LABEL "dlc_major_version"

    :param image: <str> Image URI
    :param region: <str> region where ECR repository holding the image resides
    :raises KeyError: if the batch_get_image response has no images or no imageManifest
    """
    ecr_client = boto3.client("ecr", region_name=region)

    image_repository, image_tag = get_repository_and_tag_from_image_uri(image)
    # Using "acceptedMediaTypes" on the batch_get_image request allows the returned image information to
    # provide the ECR Image Manifest in the specific format that we need, so that the image LABELS can be found
    # on the manifest. The default format does not return the image LABELs.
    response = ecr_client.batch_get_image(
        repositoryName=image_repository,
        imageIds=[{"imageTag": image_tag}],
        acceptedMediaTypes=["application/vnd.docker.distribution.manifest.v1+json"],
    )
    if not response.get("images"):
        raise KeyError(
            f"Failed to get images through ecr_client.batch_get_image response for image {image_repository}:{image_tag}"
        )
    elif not response["images"][0].get("imageManifest"):
        raise KeyError(
            f"imageManifest not found in ecr_client.batch_get_image response:\n{response['images']}"
        )

    manifest_str = response["images"][0]["imageManifest"]
    # manifest_str is a json-format string
    manifest = json.loads(manifest_str)
    image_metadata = json.loads(manifest["history"][0]["v1Compatibility"])
    # A missing or null "Labels" entry is treated as "no labels" so that the assertion below
    # reports the problem instead of an AttributeError/KeyError.
    labels = image_metadata["config"].get("Labels") or {}
    major_version = labels.get("dlc_major_version")

    assert major_version, f"{image} has no LABEL named 'dlc_major_version'. Please insert label."

    LOGGER.info(f"{image} has 'dlc_major_version' = {major_version}")
def test_canary_images_pullable(region):
    """
    Sanity test to verify canary specific functions

    The framework is picked from the CODEBUILD_INITIATOR environment variable (first of
    tensorflow, mxnet, pytorch found in it) and defaults to pytorch; every canary image parsed
    for that framework is then pulled.

    :param region: AWS region
    """
    ctx = Context()
    frameworks = ("tensorflow", "mxnet", "pytorch")

    # Default to an empty string so an unset variable falls back to the default framework
    # instead of raising TypeError on the membership test.
    initiator = os.getenv("CODEBUILD_INITIATOR", "")

    # Have a default framework to test on
    framework = next((fw for fw in frameworks if fw in initiator), "pytorch")

    images = parse_canary_images(framework, region)
    login_to_ecr_registry(ctx, PUBLIC_DLC_REGISTRY, region)
    if not images:
        return
    for image in images.split(" "):
        ctx.run(f"docker pull -q {image}")
        LOGGER.info(f"Canary image {image} is available")
def _print_results_of_test(file_path):
    """
    Find the most recent "Train-accuracy" and "Time cost" values in the last 500 lines of a log.

    The lines are scanned from the end; for each key the text after the first "=" of the first
    matching line is kept and converted to float.

    :param file_path: path of the log file
    :return: tuple of (result text that was logged, time cost, accuracy)
    """
    accuracy_key = "Train-accuracy"
    time_cost_key = "Time cost"

    tail_lines = Context().run(f"tail -500 {file_path}").stdout.split("\n")
    found = {}
    accuracy = 0
    time_cost = 0

    for entry in reversed(tail_lines):
        # Stop as soon as both values have been collected
        if accuracy_key in found and time_cost_key in found:
            break
        if accuracy_key in entry:
            if accuracy_key in found:
                continue
            raw_accuracy = entry.split("=")[1]
            found[accuracy_key] = raw_accuracy
            accuracy = float(raw_accuracy)
        if time_cost_key in entry:
            if time_cost_key in found:
                continue
            raw_time_cost = entry.split("=")[1]
            found[time_cost_key] = raw_time_cost
            time_cost = float(raw_time_cost)

    result = "\n".join(found.values()) + "\n"
    LOGGER.info(f'Result is {result}')
    LOGGER.info(f'{accuracy_key} is {accuracy}')
    LOGGER.info(f'{time_cost_key} is {time_cost}')
    return result, time_cost, accuracy
def run_smdebug_test(
    image_uri,
    ec2_connection,
    region,
    ec2_instance_type,
    docker_executable="docker",
    container_name="smdebug",
    test_script=SMDEBUG_SCRIPT,
    timeout=2400,
):
    """
    Pull the image on the remote host and run the SMDebug test script in a container.

    If the run raises but the container logs contain the success marker, the error is logged as
    a warning and the function returns normally.

    :param image_uri: ECR image URI
    :param ec2_connection: connection object whose run() executes commands on the remote host
    :param region: AWS region used for the ECR login
    :param ec2_instance_type: instance type; selects the shm setting and appears in the failure message
    :param docker_executable: docker binary used to run the container
    :param container_name: name given to the container
    :param test_script: script executed inside the container, followed by the framework name
    :param timeout: timeout passed to the run of the test container
    :raises SMDebugTestFailure: if the run fails and the success marker is not in the logs
    """
    large_shm_instance_types = ("p2.8xlarge", "m4.16xlarge")
    if ec2_instance_type in large_shm_instance_types:
        shm_setting = " --shm-size=1g "
    else:
        shm_setting = " "

    framework = get_framework_from_image_uri(image_uri)
    host_test_dir = os.path.join("$HOME", "container_tests")

    ec2_connection.run(f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
    ec2_connection.run(f"docker pull -q {image_uri}")

    run_cmd = (
        f"{docker_executable} run --name {container_name} -v "
        f"{host_test_dir}:{os.path.join(os.sep, 'test')}{shm_setting}{image_uri} "
        f"./{test_script} {framework}"
    )
    try:
        ec2_connection.run(run_cmd, hide=True, timeout=timeout)
    except Exception as e:
        container_logs = ec2_connection.run(f"docker logs {container_name}").stdout
        if "All SMDebug tests succeeded!" not in container_logs:
            raise SMDebugTestFailure(
                f"SMDebug test failed on {image_uri} on {ec2_instance_type}. Full output:\n{container_logs}"
            ) from e
        # The tests reported success, so the exception is attributed to the fabric layer
        LOGGER.warning(
            f"SMDebug tests succeeded, but there is an issue with fabric. Error:\n{e}\nTest output:\n{container_logs}"
        )
        return
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    """
    Run the TF SageMaker training benchmark script and upload its log file to S3.

    TF 1.x images are skipped.

    :param tensorflow_training: ECR image URI
    :param num_nodes: number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"
    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    # Fall back to a placeholder so that slicing below does not fail when the variable is unset
    # (e.g. when running outside CodeBuild).
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "local")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            # The trailing space keeps the redirection from being glued onto the job name, and
            # stdout is redirected first so that stderr ends up in the log file as well.
            f"--job-name {training_job_name} "
            f"> {log_file} 2>&1",
            warn=True,
            echo=True,
        )

        # Exit code 124 is what `timeout` returns when the time limit is hit; only other failures
        # are filed under failure_log.
        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    uploaded_log_path = os.path.join(target_upload_location, log_file)
    ctx.run(f"aws s3 cp {os.path.join(test_dir, log_file)} {uploaded_log_path}")

    LOGGER.info(f"Test results can be found at {uploaded_log_path}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log_path}"
    )
def test_tensorflow_sagemaker_training_performance(tensorflow_training, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param tensorflow_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    framework_version = re.search(r"[1,2](\.\d+){2}", tensorflow_training).group()
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in tensorflow_training else "cpu"
    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
    py_version = "py2" if "py2" in tensorflow_training else "py37" if "py37" in tensorflow_training else "py3"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    # Fall back to a placeholder so that slicing below does not fail when the variable is unset
    # (e.g. when running outside CodeBuild).
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "local")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}"
        f"-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
        # NOTE(review): the command is piped into tee, so unless pipefail is active the return
        # code seen below is tee's rather than the benchmark's - confirm this is intended.
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {tensorflow_training} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            # The trailing space keeps "2>&1" from being glued onto the job name
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

        # Exit code 124 is what `timeout` returns when the time limit is hit; only other failures
        # are filed under failure_log.
        if not (run_out.ok or run_out.return_code == 124):
            target_upload_location = os.path.join(target_upload_location, "failure_log")

    local_log_path = os.path.join(test_dir, log_file)
    uploaded_log_path = os.path.join(target_upload_location, log_file)
    ctx.run(f"aws s3 cp {local_log_path} {uploaded_log_path}")

    LOGGER.info(f"Test results can be found at {uploaded_log_path}")

    _print_results_of_test(local_log_path, processor)

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log_path}"
    )
def test_cuda_paths(gpu):
    """
    Test to ensure that:
    a. buildspec contains an entry to create the same image as the image URI
    b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    When no buildspec entry is found outside of the DLC CI/CD context, a warning is logged and
    the remaining checks are not run, because the Dockerfile location is taken from that entry.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}-(cpu|gpu|neuron)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)(-example)?)", image
    ).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the <major>.<minor> directory when no directory exists for the full version
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec = False
    dockerfile_spec_abs_path = None
    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for _, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            cuda_in_buildspec = True
            # Drop only the leading "docker/" prefix; str.lstrip would strip any of those
            # characters and could eat into the following path component.
            docker_file = image_spec["docker_file"]
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(os.path.dirname(framework_version_path), docker_file)
            break

    if not cuda_in_buildspec:
        not_found_message = f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
        if is_dlc_cicd_context():
            raise AssertionError(not_found_message)
        LOGGER.warning(
            f"{not_found_message} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
        )
        # Without a buildspec entry there is no Dockerfile path to verify
        return

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(prop in dockerfile_spec_abs_path for prop in image_properties_expected_in_dockerfile_path), (
        f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}"
    )

    assert os.path.exists(dockerfile_spec_abs_path), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
def test_resnet101_at_fp16(self, instance_type, num_gpus, total_n_gpus, instance_count, distribution_strategy, caching, tensorflow_training, sagemaker_session, capsys, framework_version):
    """
    Train resnet101 with float16 mixed precision and XLA through a TensorFlow estimator and
    compare the billable seconds reported in the captured logs against the configured threshold.

    The number of epochs and the global batch size are both scaled by total_n_gpus.
    """
    epochs = int(100 * total_n_gpus)
    batches = np.array([224]) * total_n_gpus
    for batch in np.array(batches, dtype=int):
        train_steps = int(10240 * epochs / batch)
        steps_per_loop = train_steps // 10

        # Comma-separated list of parameter overrides handed to the training script
        override_items = [
            f"runtime.enable_xla=True",
            f"runtime.num_gpus={num_gpus}",
            f"runtime.distribution_strategy={distribution_strategy}",
            f"runtime.mixed_precision_dtype=float16",
            f"task.train_data.global_batch_size={batch}",
            f"task.train_data.input_path=/opt/ml/input/data/training/validation*",
            f"task.train_data.cache={caching}",
            f"trainer.train_steps={train_steps}",
            f"trainer.steps_per_loop={steps_per_loop}",
            f"trainer.summary_interval={steps_per_loop}",
            f"trainer.checkpoint_interval={train_steps}",
            f"task.model.backbone.type=resnet",
            f"task.model.backbone.resnet.model_id=101",
        ]
        overrides = ",".join(override_items)

        git_config = {
            'repo': 'https://github.com/tensorflow/models.git',
            'branch': 'v2.9.2',
        }
        hyperparameters = {
            TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
            'experiment': 'resnet_imagenet',
            'config_file': 'official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml',
            'mode': 'train',
            'model_dir': '/opt/ml/model',
            'params_override': overrides,
        }

        estimator = TensorFlow(
            sagemaker_session=sagemaker_session,
            git_config=git_config,
            source_dir='.',
            entry_point='official/vision/train.py',
            model_dir=False,
            instance_type=instance_type,
            instance_count=instance_count,
            image_uri=tensorflow_training,
            hyperparameters=hyperparameters,
            debugger_hook_config=None,
            disable_profiler=True,
            max_run=60 * 60 * 1,  # Timeout in 1 hours
            base_job_name=f"tf{framework_version.replace('.','')}-trcomp-bench-resnet101",
            role="SageMakerRole",
        )

        estimator.fit(
            inputs='s3://collection-of-ml-datasets/Imagenet/TFRecords/validation',
            logs=True,
            wait=True,
        )

        # The billable time is read back from whatever the fit call printed
        captured = capsys.readouterr()
        logs = captured.out + captured.err

        match = re.search('Billable seconds: ([0-9]*)', logs)
        billable = int(match.group(1))

        short_version = '.'.join(framework_version.split('.')[:2])
        threshold = TRCOMP_THRESHOLD['tensorflow'][short_version]['resnet101'][instance_type][instance_count][batch]

        result = (
            f"tensorflow-trcomp {framework_version} resnet101 fp16 XLA "
            f"imagenet {instance_type} {instance_count} {batch} Billable: {billable} secs threshold: {threshold} secs "
            f"{estimator.latest_training_job.name}"
        )
        LOGGER.info(result)

        # A run billed for less than 1000 seconds is reported as a false positive
        assert billable >= 1000, 'False Positive ' + result
        assert billable <= threshold, result
def test_oss_compliance(image):
    """
    Run oss compliance check on a container to check if license attribution files exist,
    and upload the source of third party packages to an S3 bucket.

    The list of packages is read from the THIRD_PARTY_SOURCE_CODE_URLS file copied out of the
    container; each line is split on single spaces into name, version and url.

    :param image: image to start the container from
    """
    # Destination bucket and key prefix for the uploaded source archives
    THIRD_PARTY_SOURCE_CODE_BUCKET = "aws-dlinfra-licenses"
    THIRD_PARTY_SOURCE_CODE_BUCKET_PATH = "third_party_source_code"
    # NOTE(review): `file` (and `object` further down) reuse names of Python builtins
    file = "THIRD_PARTY_SOURCE_CODE_URLS"
    container_name = get_container_name("oss_compliance", image)
    context = Context()
    local_repo_path = get_repository_local_path()
    start_container(container_name, image, context)

    # run compliance test to make sure license attribution files exists. testOSSCompliance is copied as part of Dockerfile
    run_cmd_on_container(container_name, context, "/usr/local/bin/testOSSCompliance /root")

    try:
        # Copy the URL list out of the container
        context.run(
            f"docker cp {container_name}:/root/{file} {os.path.join(local_repo_path, file)}"
        )
    finally:
        # The container is removed whether or not the copy succeeded
        context.run(f"docker rm -f {container_name}", hide=True)

    s3_resource = boto3.resource("s3")

    with open(os.path.join(local_repo_path, file)) as source_code_file:
        for line in source_code_file:
            name, version, url = line.split(" ")
            file_name = f"{name}_v{version}_source_code"
            s3_object_path = f"{THIRD_PARTY_SOURCE_CODE_BUCKET_PATH}/{file_name}.tar.gz"
            local_file_path = os.path.join(local_repo_path, file_name)

            # Up to three attempts at cloning and archiving the package source.
            # NOTE(review): there is no break after a successful attempt; later iterations do
            # nothing once the clone directory exists.
            for i in range(3):
                try:
                    if not os.path.isdir(local_file_path):
                        context.run(
                            f"git clone {url.rstrip()} {local_file_path}")
                        context.run(
                            f"tar -czvf {local_file_path}.tar.gz {local_file_path}"
                        )
                except Exception as e:
                    time.sleep(1)
                    # Only the failure of the last attempt is logged and propagated
                    if i == 2:
                        LOGGER.error(f"Unable to clone git repo. Error: {e}")
                        raise
                    continue
            try:
                if os.path.exists(f"{local_file_path}.tar.gz"):
                    LOGGER.info(f"Uploading package to s3 bucket: {line}")
                    # load() is used as an existence check; a ClientError with code 404 leads
                    # to the upload branch below
                    s3_resource.Object(THIRD_PARTY_SOURCE_CODE_BUCKET, s3_object_path).load()
            except botocore.exceptions.ClientError as e:
                if e.response["Error"]["Code"] == "404":
                    try:
                        # using aws cli as using boto3 expects to upload folder by iterating through each file instead of entire folder.
                        context.run(
                            f"aws s3 cp {local_file_path}.tar.gz s3://{THIRD_PARTY_SOURCE_CODE_BUCKET}/{s3_object_path}"
                        )
                        object = s3_resource.Bucket(
                            THIRD_PARTY_SOURCE_CODE_BUCKET).Object(
                                s3_object_path)
                        # The uploaded archive is made publicly readable
                        object.Acl().put(ACL="public-read")
                    except ClientError as e:
                        LOGGER.error(
                            f"Unable to upload source code to bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                        )
                        raise
                else:
                    LOGGER.error(
                        f"Unable to check if source code is present on bucket {THIRD_PARTY_SOURCE_CODE_BUCKET}. Error: {e}"
                    )
                    raise
def test_cuda_paths(gpu):
    """
    Test to ensure directory structure for GPU Dockerfiles has cuda version in it

    The buildspec of the framework is also searched for a "CUDA_VERSION <cuda version>" line; a
    missing line fails the test only in the DLC CI/CD context and is logged as a warning otherwise.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    frameworks = ("tensorflow", "pytorch", "mxnet")
    framework = next((fw for fw in frameworks if fw in image), "")
    assert framework, f"Cannot find any frameworks {frameworks} in image uri {image}"

    # Get cuda, framework version, python version through regex.
    # The dots are escaped so that only literal "." separators are accepted.
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_version = re.search(r":(\d+(\.\d+){2})", image).group(1)
    python_version = re.search(r"(py\d+)", image).group(1)

    framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # Fall back to the <major>.<minor> directory when no directory exists for the full version
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    cuda_in_buildspec_ref = f"CUDA_VERSION {cuda_version}"
    buildspec_path = os.path.join(dlc_path, framework, buildspec)
    with open(buildspec_path, "r") as bf:
        cuda_in_buildspec = any(cuda_in_buildspec_ref in line for line in bf)

    try:
        assert cuda_in_buildspec, f"Can't find {cuda_in_buildspec_ref} in {buildspec_path}"
    except AssertionError as e:
        if not is_dlc_cicd_context():
            # Logger.warn is a deprecated alias of warning()
            LOGGER.warning(
                f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context."
            )
        else:
            raise

    # Check that a Dockerfile exists in the right directory
    dockerfile_path = os.path.join(framework_version_path, python_version, cuda_version, "Dockerfile.gpu")

    assert os.path.exists(dockerfile_path), f"Cannot find dockerfile for image {image} in {dockerfile_path}"
def run_sm_perf_test(image_uri, xla, num_nodes, region, threshold=None):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param xla: [ True | False ] Enable XLA acceleration
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    :param threshold: optional minimum per-node throughput (images/sec); no threshold check is made when falsy
    :return: throughput reported by the benchmark log, divided by num_nodes

    This function was inspired by deep-learning-containers/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)

    processor = "xla" if xla else "gpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}"
    # TODO: Switch to p3.16xlarge when EC2 availability issues are resolved
    ec2_instance_type = "p3.8xlarge"
    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    # Default to "manual" so that commit_info[:7] below does not fail with a TypeError when the test is run
    # outside of CodeBuild, where this variable is not set.
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET,
        "xla",
        "tensorflow",
        framework_version,
        "sagemaker",
        "training",
        device_cuda_str,
        py_version,
    )
    training_job_name = (
        f"opt-tf{framework_version[0]}-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-optimized-tf{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            f"--job-name {training_job_name} "
            f"--xla-{'on' if xla else 'off'} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

    # Return code 124 is treated like a success for the purpose of choosing the upload location;
    # any other failure is uploaded under "failure_log".
    if not (run_out.ok or run_out.return_code == 124):
        target_upload_location = os.path.join(target_upload_location, "failure_log")

    local_log_path = os.path.join(test_dir, log_file)
    uploaded_log_path = os.path.join(target_upload_location, log_file)

    ctx.run(f"aws s3 cp {local_log_path} {uploaded_log_path}")

    LOGGER.info(f"Test results can be found at {uploaded_log_path}")

    _, throughput = _print_results_of_test(local_log_path)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log_path}"
    )

    LOGGER.info(
        f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    if threshold:
        assert throughput > threshold, (
            f"optimized-tensorflow-{framework_version} sagemaker training {ec2_instance_type} {device_cuda_str} {py_version} imagenet {num_nodes} nodes "
            f"Regression Benchmark Result {throughput} does not reach the threshold {threshold}"
        )
    return throughput
def _run_dependency_check_test(image, ec2_connection):
    """
    Run the dependency-check container test for an image and fail on unrecognized HIGH/CRITICAL CVEs

    The HTML report is copied out of the container, uploaded to s3://dlc-dependency-check and scanned for CVE ids.
    CVEs that are not in the allow-list are looked up on NVD to obtain their severity.

    :param image: ECR image URI
    :param ec2_connection: connection object used to run commands on the EC2 instance
    :raises DependencyCheckFailure: if any non-allowed CVE is reported by NVD as CRITICAL or HIGH
    """
    # Record any whitelisted medium/low severity CVEs; I.E. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }

    processor = get_processor_from_image_uri(image)

    # Whitelist CVE #CVE-2021-3711 for DLCs where openssl is installed using apt-get
    framework, _ = get_framework_and_version_from_tag(image)
    short_fw_version = re.search(r"(\d+\.\d+)", image).group(1)

    # Check that these versions have been matched on https://ubuntu.com/security/CVE-2021-3711 before adding
    allow_openssl_cve_fw_versions = {
        "tensorflow": {
            "1.15": ["cpu", "gpu", "neuron"],
            "2.3": ["cpu", "gpu"],
            "2.4": ["cpu", "gpu"],
            "2.5": ["cpu", "gpu", "neuron"],
            "2.6": ["cpu", "gpu"],
            "2.7": ["cpu", "gpu"],
        },
        "mxnet": {"1.8": ["neuron"], "1.9": ["cpu", "gpu"]},
        "pytorch": {"1.10": ["cpu"]},
        "huggingface_pytorch": {"1.8": ["cpu", "gpu"], "1.9": ["cpu", "gpu"]},
        "huggingface_tensorflow": {"2.4": ["cpu", "gpu"], "2.5": ["cpu", "gpu"]},
        "autogluon": {"0.3": ["cpu"]},
    }

    if processor in allow_openssl_cve_fw_versions.get(framework, {}).get(short_fw_version, []):
        allowed_vulnerabilities.add("CVE-2021-3711")

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(
        ec2_connection, image, test_script, container_name=container_name, bin_bash_entrypoint=True
    )
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}", hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if not vulnerabilities:
        return

    vulnerability_severity = {}

    # One session (and retry adapter) is enough for all lookups; it does not depend on the CVE.
    session = requests.Session()
    session.mount(
        "https://",
        requests.adapters.HTTPAdapter(max_retries=Retry(total=5, status_forcelist=[404, 504, 502])),
    )

    # Check NVD for vulnerability severity to provide this useful info in error message.
    for vulnerability in vulnerabilities:
        cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"
        try:
            response = session.get(cve_url)
            if response.status_code == 200:
                severity = (
                    response.json()
                    .get("result", {})
                    .get("CVE_Items", [{}])[0]
                    .get("impact", {})
                    .get("baseMetricV2", {})
                    .get("severity", "UNKNOWN")
                )
                vulnerability_severity.setdefault(severity, []).append(vulnerability)
        # requests raises its own exception hierarchy (e.g. RetryError once the status_forcelist retries are
        # exhausted), which the builtin ConnectionError does not cover.
        except (ConnectionError, requests.exceptions.RequestException):
            LOGGER.exception(f"Failed to load NIST data for CVE {vulnerability}")

    # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
    if not (vulnerability_severity.get("CRITICAL") or vulnerability_severity.get("HIGH")):
        return

    raise DependencyCheckFailure(
        f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
        f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
        f"{dependency_check_report} for more details."
    )
def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region, gpu_only, py3_only):
    """
    Run MX sagemaker training performance test

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_mx_train_env' -- this installs
    some prerequisite packages, pulls required script, and creates a virtualenv called sm_benchmark_venv.

    The training script mxnet_imagenet_resnet50.py is invoked via a shell script smtrain-resnet50-imagenet.sh
    The shell script sets num-epochs to 40. This parameter is configurable.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file
    TODO: Change latency [time/epoch] metric to Throughput metric

    :param mxnet_training: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(mxnet_training)
    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"

    # py37 takes precedence over py2, and py3 is the fallback
    if "py37" in mxnet_training:
        py_version = "py37"
    elif "py2" in mxnet_training:
        py_version = "py2"
    else:
        py_version = "py3"

    ec2_instance_type = "p3.16xlarge"

    time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET,
        "mxnet",
        framework_version,
        "sagemaker",
        "training",
        device_cuda_str,
        py_version,
    )
    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
    log_file = f"results-{commit_info}-{time_str}-{num_nodes}-node.txt"

    benchmark_cmd = (
        f"timeout 90m python mx_sm_benchmark.py "
        f"--framework-version {framework_version} "
        f"--image-uri {mxnet_training} "
        f"--instance-type ml.{ec2_instance_type} "
        f"--node-count {num_nodes} "
        f"--python {py_version} "
        f"--region {region} "
        f"--job-name {training_job_name} "
        f"2>&1 | tee {log_file}"
    )

    ctx = Context()
    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        run_out = ctx.run(benchmark_cmd, warn=True, echo=True)

    # Logs of failed runs are uploaded under a separate "failure_log" prefix
    if not run_out.ok:
        target_upload_location = os.path.join(target_upload_location, "failure_log")

    local_log = os.path.join(test_dir, log_file)
    uploaded_log = os.path.join(target_upload_location, log_file)

    ctx.run(f"aws s3 cp {local_log} {uploaded_log}", warn=True, echo=True)

    LOGGER.info(f"Test results can be found at {uploaded_log}")

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log}"
    )

    # The first returned element (the result statement) is not needed here
    _, time_val, accuracy = _print_results_of_test(local_log)

    accuracy_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_ACCURACY_THRESHOLD)
    assert accuracy > accuracy_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {accuracy} does not reach the threshold accuracy {accuracy_threshold}"
    )

    time_threshold = get_threshold_for_image(framework_version, MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD)
    assert time_val < time_threshold, (
        f"mxnet {framework_version} sagemaker training {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {time_val} does not reach the threshold latency {time_threshold}"
    )
def generate_coverage_doc(self, framework=None, job_type=None):
    """
    Generate the test coverage docs based on pytest item objects

    One coverage entry is created per test function (parametrized variants of the same function share an entry),
    the entries are passed to self.write_test_coverage_file, and collected failure conditions are then reported.

    :param framework: str, ML framework
    :param job_type: str, training or inference
    :raises TestReportGenerationFailure: if failure conditions were recorded and they contain at least one issue
    """
    # The repository URL does not depend on the item, so resolve it once.
    # Only a literal ".git" suffix is removed: str.rstrip(".git") strips any trailing '.', 'g', 'i' or 't'
    # characters and would therefore also eat the end of a repository name such as "pytest".
    repo_url = os.getenv("CODEBUILD_SOURCE_REPO_URL", "https://github.com/aws/deep-learning-containers.git")
    if repo_url.endswith(".git"):
        repo_url = repo_url[: -len(".git")]

    test_cov = {}
    for item in self.items:
        # Define additional csv options
        function_name = item.name.split("[")[0]
        function_key = f"{item.fspath}::{function_name}"

        # Only create a new test coverage item if we have not seen the function before. This is a necessary step,
        # as parametrization can make it appear as if the same test function is a unique test function
        if test_cov.get(function_key):
            continue

        str_fspath = str(item.fspath)
        str_keywords = str(item.keywords)

        # Construct Category and Github_Link fields based on the filepath
        category = str_fspath.split("/dlc_tests/")[-1].split("/")[0]
        if self.is_sagemaker:
            category = "sagemaker_local" if "local" in str_fspath else "sagemaker"
        github_link = f"{repo_url}/blob/master/test/{str_fspath.split('/test/')[-1]}"

        # Based on keywords and filepaths, assign values
        framework_scope = (
            framework if framework else _infer_field_value("all", ("mxnet", "tensorflow", "pytorch"), str_fspath)
        )
        job_type_scope = (
            job_type
            if job_type
            else _infer_field_value("both", ("training", "inference"), str_fspath, str_keywords)
        )
        integration_scope = _infer_field_value(
            "general integration",
            ("_dgl_", "smdebug", "gluonnlp", "smexperiments", "_mme_", "pipemode", "tensorboard", "_s3_", "nccl"),
            str_keywords,
        )
        processor_scope = _infer_field_value("all", ("cpu", "gpu", "eia"), str_keywords)
        if processor_scope == "gpu":
            processor_scope = self.handle_single_gpu_instances_test_report(function_key, str_keywords)

        test_cov[function_key] = {
            "Category": category,
            "Name": function_name,
            "Scope": framework_scope,
            "Job_Type": job_type_scope,
            "Num_Instances": self.get_marker_arg_value(item, function_key, "multinode", 1),
            "Processor": self.get_marker_arg_value(item, function_key, "processor", processor_scope),
            "Integration": self.get_marker_arg_value(item, function_key, "integration", integration_scope),
            "Model": self.get_marker_arg_value(item, function_key, "model"),
            "GitHub_Link": github_link,
        }

    self.write_test_coverage_file(test_cov)

    if self.failure_conditions:
        message, total_issues, error_file = self.assemble_report_failure_message()
        if total_issues == 0:
            LOGGER.warning(f"Found failure message, but no issues. Message:\n{message}")
        else:
            raise TestReportGenerationFailure(f"{message}\nFollow {error_file} if message is truncated")
def run_sm_profiler_tests(image, profiler_tests_dir, test_file, processor):
    """
    Testrunner to execute SM profiler tests from DLC repo

    :param image: image URI under test
    :param profiler_tests_dir: directory that contains the "sagemaker-tests" checkout
    :param test_file: test file under sagemaker-tests/tests to run
    :param processor: processor name, used for the test function name and the ENV_<PROCESSOR>_TRAIN_IMAGE variable
    :raises SMProfilerRCTestFailure: if the run fails and a JSON report file was produced
    """
    ctx = Context()

    # Install profiler requirements only once - pytest-rerunfailures has a known issue
    # with the latest pytest https://github.com/pytest-dev/pytest-rerunfailures/issues/128
    install_cmd = (
        "pip install -r "
        "https://raw.githubusercontent.com/awslabs/sagemaker-debugger/master/config/profiler/requirements.txt && "
        "pip install smdebug && "
        "pip uninstall -y pytest-rerunfailures"
    )
    try:
        ctx.run(install_cmd, hide=True)
    except UnexpectedExit:
        # Wait a minute and a half if we get an invoke failure - since smprofiler test requirements can be flaky
        time.sleep(90)

    framework, version = get_framework_and_version_from_tag(image)

    # Conditionally set sm data parallel tests, based on config file rules from link below:
    # https://github.com/awslabs/sagemaker-debugger/tree/master/config/profiler
    minimum_data_parallel_versions = {"pytorch": "1.6", "tensorflow": "2.3"}
    minimum_version = minimum_data_parallel_versions.get(framework)
    if minimum_version is not None and Version(version) < Version(minimum_version):
        enable_sm_data_parallel_tests = "false"
    else:
        enable_sm_data_parallel_tests = "true"

    # Set SMProfiler specific environment variables
    smprof_configs = {
        "use_current_branch": "false",
        "enable_smdataparallel_tests": enable_sm_data_parallel_tests,
        "force_run_tests": "false",
        "framework": framework,
        "build_type": "release",
    }

    # Command to set all necessary environment variables. Both image variables are first set to the
    # placeholder "test"; the one matching the processor is then overridden with the real image.
    exports = [f"export {key}={val}" for key, val in smprof_configs.items()]
    exports.append("export ENV_CPU_TRAIN_IMAGE=test")
    exports.append("export ENV_GPU_TRAIN_IMAGE=test")
    exports.append(f"export ENV_{processor.upper()}_TRAIN_IMAGE={image}")
    export_cmd = " && ".join(exports)

    test_results_outfile = os.path.join(os.getcwd(), f"{get_container_name('smprof', image)}.txt")

    with ctx.prefix(f"cd {profiler_tests_dir}"), ctx.prefix(f"cd sagemaker-tests && {export_cmd}"):
        try:
            ctx.run(
                f"pytest --json-report --json-report-file={test_results_outfile} -n=auto "
                f"-v -s -W=ignore tests/{test_file}::test_{processor}_jobs",
                hide=True,
            )
            with open(test_results_outfile) as outfile:
                result_data = json.load(outfile)
            LOGGER.info(f"Tests passed on {image}; Results:\n{json.dumps(result_data, indent=4)}")
        except Exception as e:
            # Without a report file there is nothing to attach, so surface the original error untouched
            if not os.path.exists(test_results_outfile):
                raise
            with open(test_results_outfile) as outfile:
                result_data = json.load(outfile)
            raise SMProfilerRCTestFailure(
                f"Failed SM Profiler tests. Results:\n{json.dumps(result_data, indent=4)}"
            ) from e
def test_cuda_paths(gpu):
    """
    Test to ensure that:
        a. buildspec contains an entry to create the same image as the image URI
        b. directory structure for GPU Dockerfiles has framework version, python version, and cuda version in it

    When the buildspec entry is missing and is_dlc_cicd_context() is false, a warning is logged and the
    Dockerfile checks are not performed, because the Dockerfile location is taken from the buildspec entry.

    :param gpu: gpu image uris
    """
    image = gpu
    if "example" in image:
        pytest.skip("Skipping Example Dockerfiles which are not explicitly tied to a cuda version")

    dlc_path = os.getcwd().split("/test/")[0]
    job_type = "training" if "training" in image else "inference"

    # Ensure that image has a supported framework
    framework, framework_version = get_framework_and_version_from_tag(image)

    # Get cuda, framework version, python version through regex
    cuda_version = re.search(r"-(cu\d+)-", image).group(1)
    framework_short_version = None
    python_version = re.search(r"(py\d+)", image).group(1)
    short_python_version = None
    image_tag = re.search(
        r":(\d+(\.\d+){2}(-transformers\d+(\.\d+){2})?-(gpu)-(py\d+)(-cu\d+)-(ubuntu\d+\.\d+)((-e3)?-example|-e3|-sagemaker)?)",
        image,
    ).group(1)

    # replacing '_' by '/' to handle huggingface_<framework> case
    framework_path = framework.replace("_", "/")
    framework_version_path = os.path.join(dlc_path, framework_path, job_type, "docker", framework_version)
    if not os.path.exists(framework_version_path):
        # The dot is escaped so that only a literal "X.Y" prefix is accepted
        framework_short_version = re.match(r"(\d+\.\d+)", framework_version).group(1)
        framework_version_path = os.path.join(dlc_path, framework_path, job_type, "docker", framework_short_version)
    if not os.path.exists(os.path.join(framework_version_path, python_version)):
        # Use the pyX version as opposed to the pyXY version if pyXY path does not exist
        short_python_version = python_version[:3]

    # Check buildspec for cuda version
    buildspec = "buildspec.yml"
    if is_tf_version("1", image):
        buildspec = "buildspec-tf1.yml"

    image_tag_in_buildspec = False
    dockerfile_spec_abs_path = None
    buildspec_path = os.path.join(dlc_path, framework_path, buildspec)
    buildspec_def = Buildspec()
    buildspec_def.load(buildspec_path)

    docker_dir_prefix = "docker/"
    for _name, image_spec in buildspec_def["images"].items():
        if image_spec["device_type"] == "gpu" and image_spec["tag"] == image_tag:
            image_tag_in_buildspec = True
            # Drop only the literal "docker/" prefix. str.lstrip("docker/") strips any leading run of the
            # characters d, o, c, k, e, r and "/", which would also eat the start of the next path component.
            docker_file = image_spec["docker_file"]
            if docker_file.startswith(docker_dir_prefix):
                docker_file = docker_file[len(docker_dir_prefix):]
            dockerfile_spec_abs_path = os.path.join(os.path.dirname(framework_version_path), docker_file)
            break

    try:
        assert image_tag_in_buildspec, f"Image tag {image_tag} not found in {buildspec_path}"
    except AssertionError as e:
        if is_dlc_cicd_context():
            raise
        # Logger.warn is a deprecated alias of Logger.warning
        LOGGER.warning(f"{e} - not failing, as this is a(n) {os.getenv('BUILD_CONTEXT', 'empty')} build context.")
        # No buildspec entry means dockerfile_spec_abs_path is None; the checks below would raise a TypeError
        # instead of honouring the "not failing" decision above.
        return

    image_properties_expected_in_dockerfile_path = [
        framework_short_version or framework_version,
        short_python_version or python_version,
        cuda_version,
    ]
    assert all(prop in dockerfile_spec_abs_path for prop in image_properties_expected_in_dockerfile_path), (
        f"Dockerfile location {dockerfile_spec_abs_path} does not contain all the image properties in "
        f"{image_properties_expected_in_dockerfile_path}"
    )

    assert os.path.exists(dockerfile_spec_abs_path), f"Cannot find dockerfile for {image} in {dockerfile_spec_abs_path}"
def _run_dependency_check_test(image, ec2_connection, processor):
    """
    Run the dependency-check container test for an image and fail on unrecognized HIGH/CRITICAL CVEs

    The HTML report is copied out of the container, uploaded to s3://dlc-dependency-check and scanned for CVE ids.
    CVEs that are not in the allow-list are looked up on NVD to obtain their severity; a CVE whose severity could
    not be retrieved is reported under "UNKNOWN".

    :param image: ECR image URI
    :param ec2_connection: connection object used to run commands on the EC2 instance
    :param processor: processor name, used to build the container name
    :raises DependencyCheckFailure: if any non-allowed CVE is reported by NVD as CRITICAL or HIGH
    """
    # Record any whitelisted medium/low severity CVEs; I.E. allowed_vulnerabilities = {CVE-1000-5555, CVE-9999-9999}
    allowed_vulnerabilities = {
        # Those vulnerabilities are fixed. Current openssl version is 1.1.1g. These are false positive
        "CVE-2016-2109",
        "CVE-2016-2177",
        "CVE-2016-6303",
        "CVE-2016-2182",
        # CVE-2020-13936: vulnerability found in apache velocity package which is a dependency for dependency-check package. Hence, ignoring.
        "CVE-2020-13936",
    }

    container_name = f"dep_check_{processor}"
    report_addon = get_container_name("depcheck-report", image)
    dependency_check_report = f"{report_addon}.html"
    html_file = f"{container_name}:/build/dependency-check-report.html"
    test_script = os.path.join(CONTAINER_TESTS_PREFIX, "testDependencyCheck")

    # Execute test, copy results to s3
    ec2.execute_ec2_training_test(ec2_connection, image, test_script, container_name=container_name)
    ec2_connection.run(f"docker cp {html_file} ~/{dependency_check_report}")
    ec2_connection.run(f"aws s3 cp ~/{dependency_check_report} s3://dlc-dependency-check")

    # Check for any vulnerabilities not mentioned in allowed_vulnerabilities
    html_output = ec2_connection.run(f"cat ~/{dependency_check_report}", hide=True).stdout
    cves = re.findall(r">(CVE-\d+-\d+)</a>", html_output)
    vulnerabilities = set(cves) - allowed_vulnerabilities

    if not vulnerabilities:
        return

    vulnerability_severity = {}

    # One session (and retry adapter) is enough for all lookups; it does not depend on the CVE.
    session = requests.Session()
    session.mount(
        "https://",
        requests.adapters.HTTPAdapter(max_retries=Retry(total=5, status_forcelist=[404, 504, 502])),
    )

    # Check NVD for vulnerability severity to provide this useful info in error message.
    for vulnerability in vulnerabilities:
        # Reset for every CVE: otherwise a failed or non-200 lookup would either hit an unbound name
        # (first iteration) or silently reuse the severity of the previous CVE.
        severity = "UNKNOWN"
        cve_url = f"https://services.nvd.nist.gov/rest/json/cve/1.0/{vulnerability}"
        try:
            response = session.get(cve_url)
            if response.status_code == 200:
                severity = (
                    response.json()
                    .get("result", {})
                    .get("CVE_Items", [{}])[0]
                    .get("impact", {})
                    .get("baseMetricV2", {})
                    .get("severity", "UNKNOWN")
                )
        # requests raises its own exception hierarchy (e.g. RetryError once the status_forcelist retries are
        # exhausted), which the builtin ConnectionError does not cover.
        except (ConnectionError, requests.exceptions.RequestException):
            LOGGER.exception(f"Failed to load NIST data for CVE {vulnerability}")

        vulnerability_severity.setdefault(severity, []).append(vulnerability)

    # TODO: Remove this once we have whitelisted appropriate LOW/MEDIUM vulnerabilities
    if not (vulnerability_severity.get("CRITICAL") or vulnerability_severity.get("HIGH")):
        return

    raise DependencyCheckFailure(
        f"Unrecognized CVEs have been reported : {vulnerability_severity}. "
        f"Allowed vulnerabilities are {allowed_vulnerabilities or None}. Please see "
        f"{dependency_check_report} for more details."
    )
def run_sm_perf_test(image_uri, num_nodes, region):
    """
    Run TF sagemaker training performance tests

    Additional context: Setup for this function is performed by 'setup_sm_benchmark_tf_train_env' -- this installs
    some prerequisite packages, clones some repos, and creates a virtualenv called sm_benchmark_venv.

    TODO: Refactor the above setup function to be more obviously connected to this function,
    TODO: and install requirements via a requirements.txt file

    :param image_uri: ECR image URI
    :param num_nodes: Number of nodes to run on
    :param region: AWS region
    """
    _, framework_version = get_framework_and_version_from_tag(image_uri)
    if framework_version.startswith("1."):
        pytest.skip("Skipping benchmark test on TF 1.x images.")

    processor = "gpu" if "gpu" in image_uri else "cpu"
    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor

    ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

    py_version = "py2" if "py2" in image_uri else "py37" if "py37" in image_uri else "py3"

    time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
    # Default to "manual" so that commit_info[:7] below does not fail with a TypeError when the test is run
    # outside of CodeBuild, where this variable is not set.
    commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
    target_upload_location = os.path.join(
        BENCHMARK_RESULTS_S3_BUCKET,
        "tensorflow",
        framework_version,
        "sagemaker",
        "training",
        device_cuda_str,
        py_version,
    )
    training_job_name = (
        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
    )

    # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
    # a throttling error for SageMaker APIs.
    time.sleep(Random(x=training_job_name).random() * 60)

    test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
    venv_dir = os.path.join(test_dir, "sm_benchmark_venv")

    ctx = Context()

    with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
        log_file = (
            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
        )
        run_out = ctx.run(
            f"timeout 45m python tf_sm_benchmark.py "
            f"--framework-version {framework_version} "
            f"--image-uri {image_uri} "
            f"--instance-type ml.{ec2_instance_type} "
            f"--node-count {num_nodes} "
            f"--python {py_version} "
            f"--region {region} "
            # The trailing space matters: without it the shell sees "<job-name>2>&1", i.e. a job name with a
            # stray "2" appended and a stdout (not stderr) redirection.
            f"--job-name {training_job_name} "
            f"2>&1 | tee {log_file}",
            warn=True,
            echo=True,
        )

    # Return code 124 is treated like a success for the purpose of choosing the upload location;
    # any other failure is uploaded under "failure_log".
    if not (run_out.ok or run_out.return_code == 124):
        target_upload_location = os.path.join(target_upload_location, "failure_log")

    local_log_path = os.path.join(test_dir, log_file)
    uploaded_log_path = os.path.join(target_upload_location, log_file)

    ctx.run(f"aws s3 cp {local_log_path} {uploaded_log_path}")

    LOGGER.info(f"Test results can be found at {uploaded_log_path}")

    _, throughput = _print_results_of_test(local_log_path, processor)
    throughput /= num_nodes

    assert run_out.ok, (
        f"Benchmark Test failed with return code {run_out.return_code}. "
        f"Test results can be found at {uploaded_log_path}"
    )

    # Pick the threshold table by processor first, then by node count (single node vs. the 4-node table)
    if processor == "cpu":
        threshold_table = (
            TENSORFLOW_SM_TRAINING_CPU_1NODE_THRESHOLD if num_nodes == 1 else TENSORFLOW_SM_TRAINING_CPU_4NODE_THRESHOLD
        )
    else:
        threshold_table = (
            TENSORFLOW_SM_TRAINING_GPU_1NODE_THRESHOLD if num_nodes == 1 else TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD
        )
    threshold = get_threshold_for_image(framework_version, threshold_table)
    LOGGER.info(
        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
        f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
    )
    assert throughput > threshold, (
        f"tensorflow {framework_version} sagemaker training {processor} {py_version} imagenet {num_nodes} nodes "
        f"Benchmark Result {throughput} does not reach the threshold {threshold}"
    )
def test_ecr_scan(image, ecr_client, sts_client, region):
    """
    Run ECR Scan Tool on an image being tested, and raise Error if vulnerabilities found

    Flow implemented here:
    1. If the image lives in a different account than the caller, re-upload it to a "beta-<repo>" test repository.
    2. Run the scan and collect the results at or above the minimum severity threshold for the image.
    3. If the image is covered by the allowlist feature, fail when the scan reports vulnerabilities that are
       not in the image allowlist, or when the allowlist contains vulnerabilities that are absent from the
       upgraded image's list. In both cases the failure routine is run first and its data is stored in S3.
    4. Otherwise, subtract the common allowlist (if its file exists) and fail on any remaining vulnerabilities.

    :param image: str Image URI for image to be tested
    :param ecr_client: boto3 Client for ECR
    :param sts_client: boto3 Client for STS
    :param region: str Name of region where test is executed
    """
    caller_account_id = sts_client.get_caller_identity().get("Account")
    if get_account_id_from_image_uri(image) != caller_account_id:
        repo_uri, _tag = image.split(":")
        _registry, repo_name = repo_uri.split("/")
        image = ecr_utils.reupload_image_to_test_ecr(image, f"beta-{repo_name}", region)

    minimum_sev_threshold = get_minimum_sev_threshold_level(image)
    LOGGER.info(f"Severity threshold level is {minimum_sev_threshold}")

    run_scan(ecr_client, image)
    raw_scan_results = ecr_utils.get_ecr_image_scan_results(
        ecr_client, image, minimum_vulnerability=minimum_sev_threshold
    )
    scan_results = ecr_utils.populate_ecr_scan_with_web_scraper_results(image, raw_scan_results)

    ecr_image_vulnerability_list = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
    ecr_image_vulnerability_list.construct_allowlist_from_ecr_scan_result(scan_results)

    # TODO: Once this feature is enabled, remove "if" condition and second assertion statement
    # TODO: Ensure this works on the canary tags before removing feature flag
    if not is_image_covered_by_allowlist_feature(image):
        common_allowlist = ScanVulnerabilityList(minimum_severity=CVESeverity[minimum_sev_threshold])
        common_allowlist_path = os.path.join(
            os.sep, get_repository_local_path(), "data", "common-ecr-scan-allowlist.json"
        )
        if os.path.exists(common_allowlist_path):
            common_allowlist.construct_allowlist_from_file(common_allowlist_path)

        leftover = ecr_image_vulnerability_list - common_allowlist
        if leftover:
            assert not leftover.vulnerability_list, (
                f"The following vulnerabilities need to be fixed on {image}:\n"
                f"{json.dumps(leftover.vulnerability_list, indent=4)}"
            )
        return

    upgraded_image_vulnerability_list, image_scan_allowlist = fetch_other_vulnerability_lists(
        image, ecr_client, minimum_sev_threshold
    )
    s3_bucket_name = ECR_SCAN_HELPER_BUCKET

    ## In case new vulnerabilities are found conduct failure routine
    newly_found = ecr_image_vulnerability_list - image_scan_allowlist
    if newly_found:
        summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        fixable_file, non_fixable_file = process_failure_routine_summary_and_store_data_in_s3(
            summary, s3_bucket_name
        )
        # Always fails here; the assert form is kept so the failure is reported like any other assertion
        assert not newly_found, (
            f"""Found {len(summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{fixable_file}, s3://{s3_bucket_name}/{non_fixable_file}, """
            f"""s3://{s3_bucket_name}/{summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{summary["s3_filename_for_allowlist"]}."""
        )

    ## In case there is no new vulnerability but the allowlist is outdated conduct failure routine
    fixable_from_allowlist = image_scan_allowlist - upgraded_image_vulnerability_list
    if fixable_from_allowlist:
        summary = conduct_failure_routine(
            image,
            image_scan_allowlist,
            ecr_image_vulnerability_list,
            upgraded_image_vulnerability_list,
            s3_bucket_name,
        )
        fixable_file, non_fixable_file = process_failure_routine_summary_and_store_data_in_s3(
            summary, s3_bucket_name
        )
        # Always fails here; the assert form is kept so the failure is reported like any other assertion
        assert not fixable_from_allowlist, (
            f"""Allowlist is Outdated!! Found {len(summary["fixable_vulnerabilities"])} fixable vulnerabilites """
            f"""and {len(summary["non_fixable_vulnerabilities"])} non fixable vulnerabilites. """
            f"""Refer to files s3://{s3_bucket_name}/{fixable_file}, s3://{s3_bucket_name}/{non_fixable_file}, """
            f"""s3://{s3_bucket_name}/{summary["s3_filename_for_current_image_ecr_scan_list"]} and s3://{s3_bucket_name}/{summary["s3_filename_for_allowlist"]}."""
        )