def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    # TODO: Remove this once test timeout has been debugged (failures especially on p2.8xlarge)
    if is_tf2(training) and "2.3.0" in training and "p2.8xlarge" in SMDEBUG_EC2_GPU_INSTANCE_TYPE:
        pytest.skip("Currently skipping for TF2.3.0 on p2.8xlarge until the issue is fixed")
    if is_tf1(training):
        pytest.skip("Currently skipping for TF1 until the issue is fixed")
    run_smdebug_test(training, ec2_connection, region, docker_executable="nvidia-docker", container_name="smdebug-gpu")
def get_framework_from_image_uri(image_uri):
    frameworks = ("tensorflow", "mxnet", "pytorch")
    for framework in frameworks:
        if framework in image_uri:
            if framework == "tensorflow" and is_tf2(image_uri):
                return "tensorflow2"
            return framework
    raise RuntimeError(f"Could not find any framework {frameworks} in {image_uri}")
Example #3
0
def test_performance_tensorflow_gpu_imagenet(tensorflow_training,
                                             ec2_connection, gpu_only):
    threshold = (TENSORFLOW2_TRAINING_GPU_IMAGENET_THRESHOLD
                 if is_tf2(tensorflow_training) else
                 TENSORFLOW1_TRAINING_GPU_IMAGENET_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_GPU_IMAGENET_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="imagenet",
        threshold={"Throughput": threshold},
    )
Example #4
0
def test_performance_tensorflow_cpu(tensorflow_training, ec2_connection,
                                    cpu_only):
    threshold = (TENSORFLOW2_TRAINING_CPU_SYNTHETIC_THRESHOLD
                 if is_tf2(tensorflow_training) else
                 TENSORFLOW1_TRAINING_CPU_SYNTHETIC_THRESHOLD)
    execute_ec2_training_performance_test(
        ec2_connection,
        tensorflow_training,
        TF_PERFORMANCE_TRAINING_CPU_SYNTHETIC_CMD,
        post_process=post_process_tensorflow_training_performance,
        data_source="synthetic",
        threshold={"Throughput": threshold},
    )
Example #5
0
def test_dlc_major_version_dockerfiles(image):
    """
    Test to make sure semantic versioning scheme in Dockerfiles is correct

    :param image: <str> ECR image URI
    """
    dlc_dir = os.getcwd().split(f"{os.sep}test{os.sep}")[0]
    job_type = test_utils.get_job_type_from_image(image)
    framework, fw_version = test_utils.get_framework_and_version_from_tag(
        image)
    processor = test_utils.get_processor_from_image_uri(image)

    # Assign a string of numbers associated with python version in tag. Python major version is not sufficient to
    # define DLC major version
    python_major_minor_version = re.search(r"-py(\d{2,})", image).group(1)

    root_dir = os.path.join(dlc_dir, framework, job_type, "docker")

    # Skip older FW versions that did not use this versioning scheme
    references = {
        "tensorflow2": "2.2.0",
        "tensorflow1": "1.16.0",
        "mxnet": "1.7.0",
        "pytorch": "1.5.0"
    }
    if test_utils.is_tf1(image):
        reference_fw = "tensorflow1"
    elif test_utils.is_tf2(image):
        reference_fw = "tensorflow2"
    else:
        reference_fw = framework
    if processor != "eia" and packaging.version.parse(
            fw_version) < packaging.version.parse(references[reference_fw]):
        pytest.skip(
            f"Not enforcing new versioning scheme on old image {image}. "
            f"Started enforcing version scheme on the following: {references}")

    # Find all Dockerfile.<processor> for this framework/job_type's Major.Minor version
    dockerfiles = []
    fw_version_major_minor = re.match(r"(\d+\.\d+)", fw_version).group(1)
    for root, dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename == f"Dockerfile.{processor}":
                dockerfile_path = os.path.join(root_dir, root, filename)
                if "example" not in dockerfile_path and f"{os.sep}{fw_version_major_minor}" in dockerfile_path:
                    dockerfiles.append(dockerfile_path)

    # For the collected dockerfiles above, note the DLC major versions in each Dockerfile if python version matches
    # the current image under test
    versions = {}
    dlc_label_regex = re.compile(r'LABEL dlc_major_version="(\d+)"')
    python_version_regex = re.compile(r"ARG PYTHON_VERSION=(\d+\.\d+)")
    for dockerfile in dockerfiles:
        with open(dockerfile, "r") as df:
            dlc_version = None
            python_version = None
            for line in df:
                major_version_match = dlc_label_regex.match(line)
                python_version_match = python_version_regex.match(line)
                if major_version_match:
                    dlc_version = int(major_version_match.group(1))
                elif python_version_match:
                    python_version = python_version_match.group(1).replace(
                        ".", "")

            # Raise errors if dlc major version label and python version arg are not found in Dockerfile
            if not dlc_version:
                raise DLCMajorVersionLabelNotFound(
                    f"Cannot find dlc_major_version label in {dockerfile}")
            if not python_version:
                raise DLCPythonVersionNotFound(
                    f"Cannot find PYTHON_VERSION arg in {dockerfile}")
            if python_version == python_major_minor_version:
                versions[dockerfile] = dlc_version

    expected_versions = list(range(1, len(dockerfiles) + 1))
    actual_versions = sorted(versions.values())

    # Test case explicitly for TF2.3 gpu, since v1.0 is banned
    if (framework, fw_version_major_minor, processor,
            python_major_minor_version, job_type) == (
                "tensorflow",
                "2.3",
                "gpu",
                "37",
                "training",
            ):
        expected_versions = [v + 1 for v in expected_versions]
        assert 1 not in actual_versions, (
            f"DLC v1.0 is deprecated in TF2.3 gpu containers, but found major version 1 "
            f"in one of the Dockerfiles. Please inspect {versions}")

    # Note: If, for example, we find 3 dockerfiles with the same framework major/minor version, same processor,
    # and same python major/minor version, we will expect DLC major versions 1, 2, and 3. If an exception needs to be
    # made to this rule, please see the above handling of TF2.3 as an example.
    assert actual_versions == expected_versions, (
        f"Found DLC major versions {actual_versions} but expected {expected_versions} for "
        f"{framework} {job_type} {processor}. Full version info: {versions}. Py version: {python_major_minor_version}"
    )