def delete_eks_cluster(eks_cluster_name):
    """
    Tear down an EKS cluster created by the test runner.

    First detaches the SSM policy from the IAM role created by the EKS
    nodegroups, then deletes the cluster itself.

    :param eks_cluster_name: name of the EKS cluster to remove
    """
    detach_action = "detach"
    eks_utils.manage_ssm_permissions_nodegroup(eks_cluster_name, detach_action)
    eks_utils.delete_eks_cluster(eks_cluster_name)
def setup_eks_cluster(framework_name, is_neuron):
    """
    Create an EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "mxnet", "pytorch"
    :param is_neuron: when True, provision an inf1 (Neuron) nodegroup with a
                      500GB volume instead of the default GPU nodegroup
    :return: name of the newly created EKS cluster
    :raises KeyError: if framework_name is not a supported framework
    """
    frameworks = {
        "tensorflow": "tf",
        "mxnet": "mx",
        "pytorch": "pt",
    }
    long_name = framework_name
    short_name = frameworks[long_name]
    # Fix: os.getenv returns None when CODEBUILD_RESOLVED_SOURCE_VERSION is unset
    # (e.g. local runs), and None[0:7] raises TypeError. Fall back to a placeholder
    # so the cluster name is still usable.
    codebuild_version = (os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") or "unknown")[0:7]
    # 1 node on PRs; otherwise 4 for pytorch and 3 for the other frameworks
    num_nodes = 1 if is_pr_context() else 3 if long_name != "pytorch" else 4
    cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"
    # default volume size
    volume_size = 80
    try:
        eks_utils.eks_setup()
        if is_neuron:
            # TODO the eks AMI used for neuron has a snapshot size of 500GB, if we pass the default 80GB the cluster
            # creation will fail. Once official EKS AMI for neuron 1.1 is released, revert this change.
            volume_size = 500
            eks_utils.create_eks_cluster(cluster_name, "neuron", num_nodes, volume_size, "inf1.xlarge", "pytest.pem")
        else:
            eks_utils.create_eks_cluster(cluster_name, "gpu", num_nodes, volume_size, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Best-effort cleanup so a failed creation does not leak a half-built cluster
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def main():
    """
    Entry point for the DLC test runner: dispatch to the suite named by the
    TEST_TYPE environment variable.

    Supported values routed here: "sanity", "ecs", "ec2", "eks" (run via
    pytest in test/dlc_tests) and "sagemaker" (run via run_sagemaker_tests).
    A "benchmark-" prefix switches the test path under "benchmark/".
    Exits the process with pytest's return code for pytest-based suites.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker image list
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_terminable_clusters = []
    benchmark_mode = "benchmark" in test_type
    # Strip the "benchmark-" prefix to recover the underlying suite name
    specific_test_type = re.sub("benchmark-", "", test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark", specific_test_type) if benchmark_mode else specific_test_type
    if specific_test_type in ("sanity", "ecs", "ec2", "eks"):
        # JUnit XML report path; computed before chdir so it lands under <cwd>/test
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))
        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            eks_terminable_clusters = setup_eks_clusters(dlc_images)
        # Execute dlc_tests pytest command
        pytest_cmd = [
            "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
        ]
        try:
            sys.exit(pytest.main(pytest_cmd))
        finally:
            # Always tear down any EKS clusters created above, even if pytest failed
            if specific_test_type == "eks" and eks_terminable_clusters:
                for cluster in eks_terminable_clusters:
                    eks_utils.delete_eks_cluster(cluster)
            # Delete dangling EC2 KeyPairs
            if specific_test_type == "ec2" and os.path.exists(
                    KEYS_TO_DESTROY_FILE):
                with open(KEYS_TO_DESTROY_FILE) as key_destroy_file:
                    for key_file in key_destroy_file:
                        LOGGER.info(key_file)
                        ec2_client = boto3.client(
                            "ec2", config=Config(retries={'max_attempts': 10}))
                        if ".pem" in key_file:
                            _resp, keyname = destroy_ssh_keypair(
                                ec2_client, key_file)
                            LOGGER.info(f"Deleted {keyname}")
    elif specific_test_type == "sagemaker":
        # tensorflow-inference py2 images are excluded from sagemaker tests
        run_sagemaker_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
def setup_eks_cluster(framework_name, is_neuron):
    """
    Create an EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "mxnet", "pytorch"
    :param is_neuron: when True, provision an inf1 (Neuron) nodegroup instead
                      of the default p3 GPU nodegroup
    :return: name of the newly created EKS cluster
    :raises KeyError: if framework_name is not a supported framework
    """
    frameworks = {
        "tensorflow": "tf",
        "mxnet": "mx",
        "pytorch": "pt",
    }
    long_name = framework_name
    short_name = frameworks[long_name]
    # Fix: os.getenv returns None when CODEBUILD_RESOLVED_SOURCE_VERSION is unset
    # (e.g. local runs), and None[0:7] raises TypeError. Fall back to a placeholder
    # so the cluster name is still usable.
    codebuild_version = (os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") or "unknown")[0:7]
    # 1 node on PRs; otherwise 4 for pytorch and 3 for the other frameworks
    num_nodes = 1 if is_pr_context() else 3 if long_name != "pytorch" else 4
    cluster_name = f"dlc-{short_name}-cluster-{codebuild_version}-{random.randint(1, 10000)}"
    # default volume size
    volume_size = 80
    try:
        eks_utils.eks_setup()
        if is_neuron:
            eks_utils.create_eks_cluster(cluster_name, num_nodes, volume_size, "inf1.xlarge", "pytest.pem")
        else:
            eks_utils.create_eks_cluster(cluster_name, num_nodes, volume_size, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Best-effort cleanup so a failed creation does not leak a half-built cluster
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def delete_eks_cluster(eks_cluster_name, is_neuron):
    """
    Delete an EKS cluster and the IAM/OIDC resources attached to it.

    Steps:
      1. Detach IAM permissions from the EKS nodegroup IAM role.
      2. Delete the OIDC provider created by kubeflow (skipped for neuron
         clusters, where kubeflow is not installed).
      3. Delete the EKS cluster itself.

    :param eks_cluster_name: name of the cluster to delete
    :param is_neuron: True when the cluster was created for neuron (inf1) tests
    """
    detach_action = "detach"
    eks_utils.manage_iam_permissions_nodegroup(eks_cluster_name, detach_action)
    # kubeflow (and therefore its OIDC provider) only exists on non-neuron clusters
    if not is_neuron:
        eks_utils.delete_oidc_provider(eks_cluster_name)
    eks_utils.delete_eks_cluster(eks_cluster_name)
def setup_eks_cluster(framework_name):
    """
    Create a GPU EKS cluster for the given framework and return its name.

    :param framework_name: one of "tensorflow", "pytorch", "mxnet"
    :return: name of the newly created EKS cluster
    :raises KeyError: if framework_name is not a supported framework
    """
    frameworks = {"tensorflow": "tf", "pytorch": "pt", "mxnet": "mx"}
    long_name = framework_name
    short_name = frameworks[long_name]
    # Fix: os.getenv returns None when CODEBUILD_RESOLVED_SOURCE_VERSION is unset
    # (e.g. local runs), and None[0:7] raises TypeError. Fall back to a placeholder
    # so the cluster name is still usable.
    codebuild_version = (os.getenv('CODEBUILD_RESOLVED_SOURCE_VERSION') or "unknown")[0:7]
    # 1 node on PRs; otherwise 4 for pytorch and 3 for the other frameworks
    num_nodes = 1 if is_pr_context() else 3 if long_name != "pytorch" else 4
    cluster_name = f"dlc-{short_name}-cluster-" \
                   f"{codebuild_version}-{random.randint(1, 10000)}"
    try:
        eks_utils.eks_setup()
        eks_utils.create_eks_cluster(cluster_name, "gpu", num_nodes, "p3.16xlarge", "pytest.pem")
    except Exception:
        # Best-effort cleanup so a failed creation does not leak a half-built cluster
        eks_utils.delete_eks_cluster(cluster_name)
        raise
    return cluster_name
def main():
    """
    Entry point for the DLC test runner: dispatch to the suite named by the
    TEST_TYPE environment variable.

    Supported values routed here: "sanity", "ecs", "ec2", "eks", "canary"
    (run via pytest in test/dlc_tests), "sagemaker" and "sagemaker-local".
    A "benchmark-" prefix switches the test path under "benchmark/".
    EKS runs create a per-run cluster (with kubeflow) and always delete it
    in the finally block. Exits the process with an aggregate pytest status
    for pytest-based suites.
    """
    # Define constants
    test_type = os.getenv("TEST_TYPE")
    dlc_images = get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker image lists
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type
    # Strip the "benchmark-" prefix to recover the underlying suite name
    specific_test_type = re.sub("benchmark-", "", test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark", specific_test_type) if benchmark_mode else specific_test_type
    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test", f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test", f"{test_type}_infer.xml")
        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))
        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "eks":
            # EKS runs require all images under test to share one framework,
            # since a single framework-specific cluster is created per run
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)
            # setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)
            # Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            pytest_cmds = [
                [
                    "-s", "-rA", os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4"
                ],
                [
                    "-s", "-rA", os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4"
                ],
            ]
        else:
            # Execute dlc_tests pytest command
            pytest_cmds = [[
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            # separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            # Exit 0 only if every pytest invocation succeeded
            sys.exit(0) if all([status == 0 for status in cmd_exit_statuses
                                ]) else sys.exit(1)
        finally:
            # Always tear down the per-run EKS cluster, even if pytest failed
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                with open(KEYS_TO_DESTROY_FILE) as key_destroy_file:
                    for key_file in key_destroy_file:
                        LOGGER.info(key_file)
                        ec2_client = boto3.client(
                            "ec2", config=Config(retries={'max_attempts': 10}))
                        if ".pem" in key_file:
                            _resp, keyname = destroy_ssh_keypair(
                                ec2_client, key_file)
                            LOGGER.info(f"Deleted {keyname}")
    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))
            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto", "-o",
                "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # tensorflow-inference py2 images are excluded from sagemaker tests
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
    elif specific_test_type == "sagemaker-local":
        run_sagemaker_local_tests([
            image for image in standard_images_list
            if not ("tensorflow-inference" in image and "py2" in image)
        ])
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. "
            f"Only support ec2, ecs, eks, sagemaker and sanity currently")
def main():
    """
    Entry point for the DLC test runner: dispatch to the suite named by the
    TEST_TYPE environment variable.

    Supported values routed here: "sanity", "ecs", "ec2", "eks", "canary",
    "bai" (run via pytest in test/dlc_tests), "sagemaker" and
    "sagemaker-local". A "benchmark-" prefix switches the test path under
    "benchmark/". When EXECUTOR_MODE is "true", images come from the
    DLC_IMAGE environment variable instead of get_dlc_images(). EKS runs
    create a per-run cluster (with kubeflow) and always delete it in the
    finally block; EKS is skipped entirely when only EIA images are present.
    """
    # Define constants
    start_time = datetime.now()
    test_type = os.getenv("TEST_TYPE")
    executor_mode = os.getenv("EXECUTOR_MODE", "False").lower() == "true"
    dlc_images = os.getenv("DLC_IMAGE") if executor_mode else get_dlc_images()
    LOGGER.info(f"Images tested: {dlc_images}")
    all_image_list = dlc_images.split(" ")
    # "example" images are excluded from the sagemaker image lists
    standard_images_list = [
        image_uri for image_uri in all_image_list if "example" not in image_uri
    ]
    # Do not create EKS cluster for when EIA Only Images are present
    is_all_images_list_eia = all("eia" in image_uri for image_uri in all_image_list)
    eks_cluster_name = None
    benchmark_mode = "benchmark" in test_type
    # Strip the "benchmark-" prefix to recover the underlying suite name
    specific_test_type = re.sub("benchmark-", "", test_type) if benchmark_mode else test_type
    test_path = os.path.join(
        "benchmark", specific_test_type) if benchmark_mode else specific_test_type
    if specific_test_type in ("sanity", "ecs", "ec2", "eks", "canary", "bai"):
        report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
        # The following two report files will only be used by EKS tests, as eks_train.xml and eks_infer.xml.
        # This is to sequence the tests and prevent one set of tests from waiting too long to be scheduled.
        report_train = os.path.join(os.getcwd(), "test", f"{test_type}_train.xml")
        report_infer = os.path.join(os.getcwd(), "test", f"{test_type}_infer.xml")
        report_multinode_train = os.path.join(os.getcwd(), "test", f"eks_multinode_train.xml")
        # PyTest must be run in this directory to avoid conflicting w/ sagemaker_tests conftests
        os.chdir(os.path.join("test", "dlc_tests"))
        # Pull images for necessary tests
        if specific_test_type == "sanity":
            pull_dlc_images(all_image_list)
        if specific_test_type == "bai":
            build_bai_docker_container()
        if specific_test_type == "eks" and not is_all_images_list_eia:
            # EKS runs require all images under test to share one framework,
            # since a single framework-specific cluster is created per run
            frameworks_in_images = [
                framework for framework in ("mxnet", "pytorch", "tensorflow")
                if framework in dlc_images
            ]
            if len(frameworks_in_images) != 1:
                raise ValueError(
                    f"All images in dlc_images must be of a single framework for EKS tests.\n"
                    f"Instead seeing {frameworks_in_images} frameworks.")
            framework = frameworks_in_images[0]
            eks_cluster_name = setup_eks_cluster(framework)
            # setup kubeflow
            eks_utils.setup_kubeflow(eks_cluster_name)
            # Change 1: Split training and inference, and run one after the other, to prevent scheduling issues
            # Set -n=4, instead of -n=auto, because initiating too many pods simultaneously has been resulting in
            # pods timing-out while they were in the Pending state. Scheduling 4 tests (and hence, 4 pods) at once
            # seems to be an optimal configuration.
            # Change 2: Separate multi-node EKS tests from single-node tests in execution to prevent resource contention
            pytest_cmds = [
                [
                    "-s", "-rA", os.path.join(test_path, framework, "training"),
                    f"--junitxml={report_train}", "-n=4", "-m", "not multinode"
                ],
                [
                    "-s", "-rA", os.path.join(test_path, framework, "inference"),
                    f"--junitxml={report_infer}", "-n=4", "-m", "not multinode"
                ],
                [
                    "-s", "-rA", test_path, f"--junitxml={report_multinode_train}",
                    "--multinode"
                ],
            ]
            if is_pr_context():
                # Per-test timeout for PR runs (seconds)
                for cmd in pytest_cmds:
                    cmd.append("--timeout=2340")
        else:
            # Execute dlc_tests pytest command
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto"
            ]
            if test_type == "ec2":
                # ec2 tests get one automatic retry to absorb transient AWS failures
                pytest_cmd += ["--reruns=1", "--reruns-delay=10"]
            if is_pr_context():
                pytest_cmd.append("--timeout=4860")
            pytest_cmds = [pytest_cmd]
        # Execute separate cmd for canaries
        if specific_test_type == "canary":
            pytest_cmds = [[
                "-s", "-rA", f"--junitxml={report}", "-n=auto", "--canary",
                "--ignore=container_tests/"
            ]]
        try:
            # Note:- Running multiple pytest_cmds in a sequence will result in the execution log having two
            # separate pytest reports, both of which must be examined in case of a manual review of results.
            cmd_exit_statuses = [
                pytest.main(pytest_cmd) for pytest_cmd in pytest_cmds
            ]
            # Exit 0 only if every pytest invocation succeeded; otherwise surface
            # the failing command set in the raised error
            if all([status == 0 for status in cmd_exit_statuses]):
                sys.exit(0)
            else:
                raise RuntimeError(pytest_cmds)
        finally:
            # Always tear down the per-run EKS cluster, even if pytest failed
            if specific_test_type == "eks" and eks_cluster_name:
                eks_utils.delete_eks_cluster(eks_cluster_name)
            # Delete dangling EC2 KeyPairs
            if os.path.exists(KEYS_TO_DESTROY_FILE):
                delete_key_pairs(KEYS_TO_DESTROY_FILE)
    elif specific_test_type == "sagemaker":
        if benchmark_mode:
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            os.chdir(os.path.join("test", "dlc_tests"))
            setup_sm_benchmark_env(dlc_images, test_path)
            pytest_cmd = [
                "-s", "-rA", test_path, f"--junitxml={report}", "-n=auto", "-o",
                "norecursedirs=resources"
            ]
            sys.exit(pytest.main(pytest_cmd))
        else:
            # tensorflow-inference py2 images are excluded from sagemaker tests
            run_sagemaker_remote_tests([
                image for image in standard_images_list
                if not ("tensorflow-inference" in image and "py2" in image)
            ])
            metrics_utils.send_test_duration_metrics(start_time)
    elif specific_test_type == "sagemaker-local":
        # Exclude tf-inference py2 and all EIA images from local sagemaker tests
        testing_image_list = [
            image for image in standard_images_list
            if not (("tensorflow-inference" in image and "py2" in image)
                    or ("eia" in image))
        ]
        run_sagemaker_local_tests(testing_image_list)
        # for EIA Images
        if len(testing_image_list) == 0:
            # Emit an empty report so downstream tooling still finds a result file
            report = os.path.join(os.getcwd(), "test", f"{test_type}.xml")
            sm_utils.generate_empty_report(report, test_type, "eia")
    else:
        raise NotImplementedError(
            f"{test_type} test is not supported. Only support ec2, ecs, eks, sagemaker and sanity currently"
        )