Example #1
0
def delete_eks_cluster(eks_cluster_name):
    """Detach the SSM policy from the EKS nodegroup IAM role, then delete the cluster.

    :param eks_cluster_name: name of the EKS cluster to tear down
    """
    # Remove the SSM permissions that were attached to the nodegroup's IAM
    # role before deleting the cluster itself, so no policy attachment is
    # left dangling on the role.
    detach_action = "detach"
    eks_utils.manage_ssm_permissions_nodegroup(eks_cluster_name, detach_action)
    eks_utils.delete_eks_cluster(eks_cluster_name)
def setup_eks_cluster(framework_name, is_neuron):
    """Create an EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "mxnet", "pytorch"
    :param is_neuron: if True, create an inf1 (neuron) cluster instead of GPU
    :return: the generated cluster name
    :raises: re-raises any cluster-creation failure after cleaning up
    """
    short_names = {"tensorflow": "tf", "mxnet": "mx", "pytorch": "pt"}
    short_name = short_names[framework_name]
    # Short commit hash used to make the cluster name traceable to a build
    codebuild_version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[:7]
    if is_pr_context():
        num_nodes = 1
    else:
        num_nodes = 4 if framework_name == "pytorch" else 3
    cluster_name = (f"dlc-{short_name}-cluster-{codebuild_version}-"
                    f"{random.randint(1, 10000)}")
    # default volume size (GB)
    volume_size = 80
    try:
        eks_utils.eks_setup()
        if is_neuron:
            # TODO: the eks AMI used for neuron has a snapshot size of 500GB; if we
            # pass the default 80GB the cluster creation will fail. Once the official
            # EKS AMI for neuron 1.1 is released, revert this change.
            volume_size = 500
            eks_utils.create_eks_cluster(cluster_name, "neuron", num_nodes, volume_size, "inf1.xlarge", "pytest.pem")
        else:
            eks_utils.create_eks_cluster(cluster_name, "gpu", num_nodes, volume_size, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Do not leave a half-created cluster behind on failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def main():
    """Entry point: run the test suite selected by the TEST_TYPE env var.

    Dispatches to the dlc_tests pytest suites (sanity/ecs/ec2/eks, plus their
    "benchmark-" prefixed variants) or to the sagemaker test runner, and
    cleans up EKS clusters / EC2 key pairs created during the run.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker run below
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_terminable_clusters = []
    # "benchmark-<type>" selects the benchmark variant of <type>
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "",
                                test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    if specific_test_type in ("sanity", "ecs", "ec2", "eks"):
        # JUnit XML report path, resolved before the chdir below
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            eks_terminable_clusters = setup_eks_clusters(dlc_images)
        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]
        try:
            # Propagate pytest's exit status as the process exit code
            sys.exit(pytest.main(pytest_cmd))
        finally:
            # Always tear down any EKS clusters created for this run
            if specific_test_type == "eks" and eks_terminable_clusters:
                for cluster in eks_terminable_clusters:
                    eks_utils.delete_eks_cluster(cluster)

            # Delete dangling EC2 KeyPairs
            if specific_test_type == "ec2" and os.path.exists(
                    KEYS_TO_DESTROY_FILE):
                with open(KEYS_TO_DESTROY_FILE) as key_destroy_file:
                    for key_file in key_destroy_file:
                        LOGGER.info(key_file)
                        ec2_client = boto3.client(
                            "ec2", config=Config(retries={'max_attempts': 10}))
                        if ".pem" in key_file:
                            _resp, keyname = destroy_ssh_keypair(
                                ec2_client, key_file)
                            LOGGER.info(f"Deleted {keyname}")
    elif specific_test_type == "sagemaker":
        # Skip tensorflow-inference py2 images for sagemaker tests
        run_sagemaker_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
def setup_eks_cluster(framework_name, is_neuron):
    """Create an EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "mxnet", "pytorch"
    :param is_neuron: if True, use inf1 instances; otherwise GPU (p3) instances
    :return: the generated cluster name
    :raises: re-raises any cluster-creation failure after cleaning up
    """
    short_names = {"tensorflow": "tf", "mxnet": "mx", "pytorch": "pt"}
    short_name = short_names[framework_name]
    # Short commit hash makes the cluster name traceable to a build
    codebuild_version = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")[:7]
    if is_pr_context():
        num_nodes = 1
    elif framework_name == "pytorch":
        num_nodes = 4
    else:
        num_nodes = 3
    cluster_name = (f"dlc-{short_name}-cluster-{codebuild_version}-"
                    f"{random.randint(1, 10000)}")
    # default volume size (GB)
    volume_size = 80
    # The two branches differed only in instance type, so hoist the choice.
    instance_type = "inf1.xlarge" if is_neuron else "p3.16xlarge"
    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(cluster_name, num_nodes, volume_size,
                                     instance_type, "pytest.pem")
    except Exception:
        # Do not leave a half-created cluster behind on failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def delete_eks_cluster(eks_cluster_name, is_neuron):
    """Tear down an EKS cluster and the resources attached to it.

    Steps:
      1. Detach IAM permissions from the EKS nodegroup IAM role.
      2. Delete the OIDC provider created by kubeflow (skipped for neuron
         clusters, where kubeflow is not installed).
      3. Delete the EKS cluster itself.

    :param eks_cluster_name: name of the EKS cluster to delete
    :param is_neuron: whether this is a neuron (inf1) cluster
    """
    eks_utils.manage_iam_permissions_nodegroup(eks_cluster_name, "detach")

    # OIDC provider only exists on non-neuron clusters (kubeflow is not
    # installed on neuron clusters).
    if not is_neuron:
        eks_utils.delete_oidc_provider(eks_cluster_name)

    eks_utils.delete_eks_cluster(eks_cluster_name)
Example #6
0
def setup_eks_cluster(framework_name):
    """Create a GPU EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "pytorch", "mxnet"
    :return: the generated cluster name
    :raises: re-raises any cluster-creation failure after cleaning up
    """
    short_name = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}[framework_name]
    # Short commit hash makes the cluster name traceable to a build
    codebuild_version = os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION')[:7]
    if is_pr_context():
        num_nodes = 1
    else:
        num_nodes = 4 if framework_name == "pytorch" else 3
    cluster_name = (f"dlc-{short_name}-cluster-"
                    f"{codebuild_version}-{random.randint(1, 10000)}")
    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(cluster_name, "gpu", num_nodes, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Do not leave a half-created cluster behind on failure.
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def main():
    """Entry point: run the test suite selected by the TEST_TYPE env var.

    Dispatches to the dlc_tests pytest suites (sanity/ecs/ec2/eks/canary,
    plus "benchmark-" prefixed variants), sagemaker, or sagemaker-local
    runners. EKS runs get their own cluster, which is torn down afterwards.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker runs below
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_cluster_name = None
    # "benchmark-<type>" selects the benchmark variant of <type>
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "",
                                test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary"):
        # JUnit XML report paths, resolved before the chdir below
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_infer.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            # EKS runs are single-framework: one cluster is created per run
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)

            #setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)

            # Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            pytest_cmds = [
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4"
                ],
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4"
                ],
            ]
        else:
            # Execute dlc_tests pytest command
            pytest_cmds = [[
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            #        separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            # Exit 0 only when every pytest invocation succeeded
            sys.exit(0) if all([status == 0 for status in cmd_exit_statuses
                                ]) else sys.exit(1)
        finally:
            # Always tear down the EKS cluster created for this run
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)

            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                with open(KEYS_TO_DESTROY_FILE) as key_destroy_file:
                    for key_file in key_destroy_file:
                        LOGGER.info(key_file)
                        ec2_client = boto3.client(
                            "ec2", config=Config(retries={'max_attempts': 10}))
                        if ".pem" in key_file:
                            _resp, keyname = destroy_ssh_keypair(
                                ec2_client, key_file)
                            LOGGER.info(f"Deleted {keyname}")
    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            # sagemaker benchmarks run via pytest from the dlc_tests directory
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # Skip tensorflow-inference py2 images for sagemaker tests
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
    elif specific_test_type == "sagemaker-local":
        run_sagemaker_local_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
Example #8
0
def main():
    """Entry point: run the test suite selected by the TEST_TYPE env var.

    Dispatches to the dlc_tests pytest suites (sanity/ecs/ec2/eks/canary/bai,
    plus "benchmark-" prefixed variants), sagemaker, or sagemaker-local
    runners. EKS runs get their own cluster, which is torn down afterwards.
    In EXECUTOR_MODE the image list comes from the DLC_IMAGE env var instead
    of being resolved from the build.
    """
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker runs below
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    # Do not create EKS cluster for when EIA Only Images are present
    is_all_images_list_eia = all("eia" in image_uri
                                 for image_uri in all_image_list)
    eks_cluster_name = None
    # "benchmark-<type>" selects the benchmark variant of <type>
    benchmark_mode = "benchmark" in test_type
    specific_test_type = re.sub("benchmark-", "",
                                test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark",
        specific_test_type) if benchmark_mode else specific_test_type

    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary", "bai"):
        # JUnit XML report paths, resolved before the chdir below
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test",
                                    f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test",
                                              f"eks_multinode_train.xml")

        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))

        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            # EKS runs are single-framework: one cluster is created per run
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)

            # setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)

            # Change 1: Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            # Change 2: Separate multi-node EKS tests from single-node tests in execution to prevent resource contention
            pytest_cmds = [
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4", "-m", "not multinode"
                ],
                [
                    "-s", "-rA",
                    os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4", "-m", "not multinode"
                ],
                [
                    "-s", "-rA", test_path,
                    f"--junitxml={report_multinode_train}", "--multinode"
                ],
            ]
            if is_pr_context():
                # Bound PR runs so a hung pod cannot stall the pipeline
                for cmd in pytest_cmds:
                    cmd.append("--timeout=2340")
        else:
            # Execute dlc_tests pytest command
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]
            if test_type == "ec2":
                # EC2 tests can be flaky on capacity; retry once after a delay
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]
            if is_pr_context():
                pytest_cmd.append("--timeout=4860")

            pytest_cmds = [pytest_cmd]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            #        separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            if all([status == 0 for status in cmd_exit_statuses]):
                sys.exit(0)
            else:
                # Surface the failing commands in the raised error
                raise RuntimeError(pytest_cmds)
        finally:
            # Always tear down the EKS cluster created for this run
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)

            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)

    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            # sagemaker benchmarks run via pytest from the dlc_tests directory
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))

            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto",
                "-o", "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))

        else:
            # Skip tensorflow-inference py2 images for sagemaker tests
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
        metrics_utils.send_test_duration_metrics(start_time)

    elif specific_test_type == "sagemaker-local":
        # Exclude TF-inference py2 and EIA images from local sagemaker tests
        testing_image_list = [
            image for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image) or
                    ("eia" in image))
        ]
        run_sagemaker_local_tests(testing_image_list)
        # for EIA Images
        if len(testing_image_list) == 0:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only support ec2, ecs, eks, sagemaker and sanity currently"
        )