def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    """Run the smdebug sanity check against a GPU training image on EC2.

    :param training: <str> training image URI under test
    :param ec2_connection: fabric connection to the EC2 test instance
    :param region: <str> AWS region of the test instance
    :param gpu_only: fixture restricting this test to GPU images
    :param py3_only: fixture restricting this test to py3 images
    """
    # TODO: Remove this once test timeout has been debugged (failures especially on p2.8xlarge)
    is_tf23_on_p28x = (
        is_tf2(training) and "2.3.0" in training and "p2.8xlarge" in SMDEBUG_EC2_GPU_INSTANCE_TYPE
    )
    if is_tf23_on_p28x:
        pytest.skip("Currently skipping for TF2.3.0 on p2.8xlarge until the issue is fixed")
    if is_tf1(training):
        pytest.skip("Currently skipping for TF1 until the issue is fixed")
    run_smdebug_test(
        training,
        ec2_connection,
        region,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
    )
def get_framework_from_image_uri(image_uri):
    """Identify which DL framework an ECR image URI belongs to.

    :param image_uri: <str> ECR image URI
    :return: <str> "tensorflow", "tensorflow2", "mxnet", or "pytorch"
    :raises RuntimeError: when no known framework name appears in the URI
    """
    frameworks = ("tensorflow", "mxnet", "pytorch")
    for candidate in frameworks:
        if candidate not in image_uri:
            continue
        # TF images are split into TF1/TF2 buckets; other frameworks keep their name as-is.
        return "tensorflow2" if candidate == "tensorflow" and is_tf2(image_uri) else candidate
    raise RuntimeError(f"Could not find any framework {frameworks} in {image_uri}")
def test_performance_tensorflow_gpu_imagenet(tensorflow_training, ec2_connection, gpu_only):
    """Benchmark TF training throughput on GPU with the imagenet data source.

    :param tensorflow_training: <str> TF training image URI under test
    :param ec2_connection: fabric connection to the EC2 test instance
    :param gpu_only: fixture restricting this test to GPU images
    """
    # Select the throughput floor matching the TF major version of the image.
    if is_tf2(tensorflow_training):
        threshold = TENSORFLOW2_TRAINING_GPU_IMAGENET_THRESHOLD
    else:
        threshold = TENSORFLOW1_TRAINING_GPU_IMAGENET_THRESHOLD
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection, cpu_only):
    """Benchmark TF training throughput on CPU with synthetic data.

    :param tensorflow_training: <str> TF training image URI under test
    :param ec2_connection: fabric connection to the EC2 test instance
    :param cpu_only: fixture restricting this test to CPU images
    """
    # Select the throughput floor matching the TF major version of the image.
    if is_tf2(tensorflow_training):
        threshold = TENSORFLOW2_TRAINING_CPU_SYNTHETIC_THRESHOLD
    else:
        threshold = TENSORFLOW1_TRAINING_CPU_SYNTHETIC_THRESHOLD
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    For every Dockerfile.<processor> under the framework/job_type docker tree that matches the
    image's framework Major.Minor version and python version, the dlc_major_version labels must
    form the contiguous sequence 1..N (shifted by one for TF2.3 gpu py37 training, where v1.0
    is deprecated).

    :param image: <str> ECR image URI
    :raises DLCMajorVersionLabelNotFound: a matched Dockerfile has no dlc_major_version label
    :raises DLCPythonVersionNotFound: a matched Dockerfile has no PYTHON_VERSION build arg
    """
    # Repo root: everything before the first "/test/" path component of the CWD.
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0",
    }
    if test_utils.is_tf1(image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf2(image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and packaging.version.parse(fw_version) < packaging.version.parse(references[reference_fw]):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}"
        )

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                # os.walk already yields `root` prefixed with root_dir, so join `root` directly.
                # (Joining root_dir again only worked because root_dir is absolute, which made
                # os.path.join discard the duplicated prefix.)
                dockerfile_path = os.path.join(root, filename)
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                # .match anchors at line start, so only uncommented, unindented LABEL/ARG lines count.
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(".", "")

        # Raise errors if dlc major version label and python version arg are not found in Dockerfile
        if not dlc_version:
            raise DLCMajorVersionLabelNotFound(f"Cannot find dlc_major_version label in {dockerfile}")
        if not python_version:
            raise DLCPythonVersionNotFound(f"Cannot find PYTHON_VERSION arg in {dockerfile}")
        if python_version == python_major_minor_version:
            versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor, python_major_minor_version, job_type) == (
        "tensorflow",
        "2.3",
        "gpu",
        "37",
        "training",
    ):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}"
        )

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )