# Chunk of an EC2 "smdebug" integration-test module (AWS Deep Learning
# Containers style). NOTE(review): this source arrived whitespace-mangled;
# the line breaks below are reconstructed. The test function is truncated at
# the end of this chunk.
import os
import pytest
from test.test_utils import CONTAINER_TESTS_PREFIX, is_tf2, is_tf1
from test.test_utils.ec2 import get_ec2_instance_type

# Container-side script executed for the smdebug check.
SMDEBUG_SCRIPT = os.path.join(CONTAINER_TESTS_PREFIX, "testSmdebug")
# Instance types used to parametrize the GPU/CPU variants of this test.
SMDEBUG_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.8xlarge", processor="gpu")
SMDEBUG_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")


@pytest.mark.integration("smdebug")
@pytest.mark.parametrize("ec2_instance_type", SMDEBUG_EC2_GPU_INSTANCE_TYPE, indirect=True)
def test_smdebug_gpu(training, ec2_connection, region, gpu_only, py3_only):
    """Run the smdebug container test on a GPU EC2 instance.

    NOTE(review): the body is cut off at the end of this chunk —
    ``test_script``, ``framework`` and ``container_test_local_dir`` are
    assigned but not used in the visible portion.
    """
    # p2.8xlarge and m4.16xlarge TF1 Pipeline Test are failing for unknown reason.
    # TODO: Remove this line and provide the required solution.
    # NOTE(review): SMDEBUG_EC2_GPU_INSTANCE_TYPE is passed to parametrize, so
    # it is presumably a list; comparing it to a single string here looks
    # suspect — confirm against get_ec2_instance_type's return type.
    if is_tf1(training) and SMDEBUG_EC2_GPU_INSTANCE_TYPE == "p2.8xlarge":
        pytest.skip("Currently skipping for TF1 until the issue is fixed")
    test_script = SMDEBUG_SCRIPT
    # NOTE(review): get_framework_from_image_uri is not imported in this
    # chunk — presumably imported elsewhere in the original file; confirm.
    framework = get_framework_from_image_uri(training)
    container_test_local_dir = os.path.join("$HOME", "container_tests")
    # Authenticate the remote instance's docker client against ECR.
    ec2_connection.run(
        f"$(aws ecr get-login --no-include-email --region {region})", hide=True)
# Chunk of an MXNet EC2 inference-test module. NOTE(review): this source
# arrived whitespace-mangled; line breaks below are reconstructed. The chunk
# ends mid-statement.
import os
import pytest
import test.test_utils.ec2 as ec2_utils
from test import test_utils
from test.test_utils import CONTAINER_TESTS_PREFIX, get_framework_and_version_from_tag
from test.test_utils.ec2 import get_ec2_instance_type, execute_ec2_inference_test, get_ec2_accelerator_type
from test.dlc_tests.conftest import LOGGER

# Model names served during the inference tests.
SQUEEZENET_MODEL = "squeezenet"
BERT_MODEL = "bert_sst"
RESNET_EIA_MODEL = "resnet-152-eia"

# Instance / accelerator types used to parametrize the tests.
MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
MX_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
# GPU+EIA combination excludes "heavy" instance types via the filter hook.
MX_EC2_GPU_EIA_INSTANCE_TYPE = get_ec2_instance_type(
    default="g3.8xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_not_heavy_instance_types,
)
# Restricted to instance types exposing exactly one GPU.
MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="p3.2xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_only_single_gpu,
)
# NOTE(review): the next statement is truncated at the chunk boundary.
MX_EC2_NEURON_INSTANCE_TYPE = get_ec2_instance_type(default="inf1.xlarge",
# Chunk of a TensorFlow training EC2 test module (newer revision: includes
# dataservice and HPU entries). NOTE(review): whitespace-mangled source; line
# breaks reconstructed. The test function at the end is truncated.
# Container-side test scripts executed inside the training image.
TF1_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow1Standalone")
TF2_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow2Standalone")
TF_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorFlow")
TF1_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF1HVD")
TF2_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF2HVD")
TF_OPENCV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testOpenCV")
TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_tf_dlc_telemetry_test")
TF_KERAS_HVD_CMD_AMP = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDAMP")
TF_KERAS_HVD_CMD_FP32 = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDFP32")
TF_TENSORBOARD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorBoard")
TF_ADDONS_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTFAddons")
TF_DATASERVICE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataservice")
TF_DATASERVICE_DISTRIBUTE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataserviceDistribute")

# EC2 instance types used to parametrize GPU/CPU/HPU test variants.
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="p3.2xlarge", processor="gpu", filter_function=ec2_utils.filter_only_single_gpu
)
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.16xlarge", processor="gpu")
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.8xlarge", processor="cpu")
TF_EC2_HPU_INSTANCE_TYPE = get_ec2_instance_type(default="dl1.24xlarge", processor="hpu")


class TFTrainingTestFailure(Exception):
    # Module-specific failure marker for TF training container tests.
    pass


@pytest.mark.integration("tensorflow_sanity_test")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Sanity-check the TF training image on a single-GPU EC2 instance.

    NOTE(review): truncated at the chunk boundary — the body of the ``if``
    below is not visible in this chunk.
    """
    if test_utils.is_image_incompatible_with_instance_type(tensorflow_training, ec2_instance_type):
# Older chunk of a TensorFlow training EC2 test module. NOTE(review):
# whitespace-mangled source; line breaks reconstructed. A decorator at the
# end of this chunk is truncated.
# Container-side test scripts executed inside the training image.
TF2_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow2Standalone")
TF_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorFlow")
TF1_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF1HVD")
TF2_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF2HVD")
TF_OPENCV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testOpenCV")
TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_tf_dlc_telemetry_test")
TF_KERAS_HVD_CMD_AMP = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDAMP")
TF_KERAS_HVD_CMD_FP32 = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDFP32")
TF_TENSORBOARD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorBoard")

# TODO: Set enable_p3dn=True when releasing
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p2.xlarge", processor="gpu")
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")


@pytest.mark.parametrize("ec2_instance_type", TF_EC2_GPU_INSTANCE_TYPE, indirect=True)
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only):
    """Run the TF1 or TF2 standalone sanity script on a GPU instance.

    NOTE(review): ``TF1_STANDALONE_CMD``, ``is_tf1`` and
    ``execute_ec2_training_test`` are not defined in this chunk — presumably
    defined/imported earlier in the original file; confirm.
    """
    test_script = TF1_STANDALONE_CMD if is_tf1(
        tensorflow_training) else TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, test_script)


# NOTE(review): the following decorator is truncated at the chunk boundary.
@pytest.mark.parametrize("ec2_instance_type",
# Chunk of a PyTorch training EC2 test module. NOTE(review): whitespace-
# mangled source; line breaks reconstructed. The test function at the end is
# truncated.
from test.test_utils import CONTAINER_TESTS_PREFIX, UBUNTU_18_HPU_DLAMI_US_WEST_2, get_framework_and_version_from_tag, get_cuda_version_from_tag
from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type

# Container-side PyTorch test scripts.
PT_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchStandalone")
PT_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorch")
PT_REGRESSION_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchRegression")
PT_DGL_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "dgl_tests", "testPyTorchDGL")
PT_APEX_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testNVApex")
PT_AMP_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchAMP")
PT_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "test_pt_dlc_telemetry_test")
PT_S3_PLUGIN_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchS3Plugin")
PT_HABANA_TEST_SUITE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testHabanaPTSuite")

# EC2 instance types used to parametrize GPU/CPU/HPU test variants.
# NOTE(review): ``os``, ``pytest``, ``ec2_utils`` and ``test_utils`` are used
# below but not imported in this chunk — presumably imported earlier in the
# original file; confirm.
PT_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
PT_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.9xlarge", processor="cpu")
PT_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="p3.2xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_only_single_gpu,
)
PT_EC2_MULTI_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="g3.8xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_only_multi_gpu,
)
PT_EC2_HPU_INSTANCE_TYPE = get_ec2_instance_type(default="dl1.24xlarge", processor="hpu")


@pytest.mark.integration("pytorch_sanity_test")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", PT_EC2_GPU_INSTANCE_TYPE, indirect=True)
def test_pytorch_standalone_gpu(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Sanity-check the PyTorch training image on a GPU EC2 instance.

    NOTE(review): truncated at the chunk boundary — the body of the ``if``
    below is not visible in this chunk.
    """
    if test_utils.is_image_incompatible_with_instance_type(pytorch_training, ec2_instance_type):
# Chunk of an MXNet training EC2 test module. NOTE(review): whitespace-
# mangled source; line breaks reconstructed. The final function signature is
# truncated.
import test.test_utils as test_utils
import test.test_utils.ec2 as ec2_utils
from test.test_utils import CONTAINER_TESTS_PREFIX
from test.test_utils.ec2 import execute_ec2_training_test, get_ec2_instance_type

# Container-side MXNet test scripts.
MX_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testMXNetStandalone")
MX_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testMXNet")
MX_DGL_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "dgl_tests", "testMXNetDGL")
MX_NLP_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "gluonnlp_tests", "testNLP")
MX_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testMXNetHVD")
MX_KERAS_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testKerasMXNet")
MX_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_mx_dlc_telemetry_test")

# EC2 instance types used to parametrize GPU/CPU test variants.
# NOTE(review): ``os`` and ``pytest`` are used in this chunk but not imported
# here — presumably imported earlier in the original file; confirm.
MX_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
MX_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
MX_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="p3.2xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_only_single_gpu,
)


@pytest.mark.integration("mxnet_sanity_test")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", MX_EC2_GPU_INSTANCE_TYPE, indirect=True)
# NOTE(review): the signature below is truncated at the chunk boundary.
def test_mxnet_standalone_gpu(mxnet_training, ec2_connection, gpu_only,
# Chunk of a TensorFlow training EC2 test module (earlier revision without
# dataservice-distribute / HPU entries). Whitespace-mangled in the original;
# restored to conventional formatting here.
# Paths of the container-side test scripts inside the image under test.
TF1_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow1Standalone")
TF2_STANDALONE_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorflow2Standalone")
TF_MNIST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorFlow")
TF1_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF1HVD")
TF2_HVD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTF2HVD")
TF_OPENCV_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testOpenCV")
TF_TELEMETRY_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "test_tf_dlc_telemetry_test")
TF_KERAS_HVD_CMD_AMP = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDAMP")
TF_KERAS_HVD_CMD_FP32 = os.path.join(CONTAINER_TESTS_PREFIX, "testTFKerasHVDFP32")
TF_TENSORBOARD_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTensorBoard")
TF_ADDONS_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testTFAddons")
TF_DATASERVICE_TEST_CMD = os.path.join(CONTAINER_TESTS_PREFIX, "testDataservice")

# Instance types that parametrize the GPU/CPU variants of the tests below.
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.2xlarge", processor="gpu")
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.16xlarge", processor="gpu")
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.8xlarge", processor="cpu")


class TFTrainingTestFailure(Exception):
    """Marker exception for TensorFlow training container test failures."""


@pytest.mark.integration('tensorflow_sanity_test')
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", TF_EC2_SINGLE_GPU_INSTANCE_TYPE, indirect=True)
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only):
    """Run the framework-appropriate standalone sanity script on a single-GPU instance."""
    # Pick the TF1 or TF2 variant of the standalone script by image version.
    if is_tf_version("1", tensorflow_training):
        standalone_script = TF1_STANDALONE_CMD
    else:
        standalone_script = TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, standalone_script)
# Chunk of a SageMaker Clarify (smclarify) EC2 test module. NOTE(review):
# whitespace-mangled source; line breaks reconstructed. The final function's
# parameter list is truncated.
import os
import pytest
from packaging.version import Version
from test.test_utils import CONTAINER_TESTS_PREFIX, LOGGER
from test.test_utils import (
    get_account_id_from_image_uri,
    get_cuda_version_from_tag,
    get_region_from_image_uri,
    login_to_ecr_registry,
)
from test.test_utils.ec2 import get_ec2_instance_type

# Clarify bias-metrics script executed inside the container under test.
SMCLARIFY_SCRIPT = os.path.join(CONTAINER_TESTS_PREFIX, "test_smclarify_bias_metrics.py")
# Instance types used to parametrize the GPU/CPU variants.
SMCLARIFY_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="p3.2xlarge", processor="gpu")
SMCLARIFY_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c4.2xlarge", processor="cpu")


# Adding separate tests to run on cpu instance for cpu image and gpu instance for gpu image.
# But the test behavior doesn't change for cpu or gpu image type.
@pytest.mark.integration("smclarify_cpu")
@pytest.mark.model("N/A")
@pytest.mark.parametrize("ec2_instance_type", SMCLARIFY_EC2_CPU_INSTANCE_TYPE, indirect=True)
# NOTE(review): the parameter list below is truncated at the chunk boundary.
def test_smclarify_metrics_cpu(
    training,
    ec2_connection,
    ec2_instance_type,
    cpu_only,
    py3_only,
    tf23_and_above_only,
    mx18_and_above_only,
# Chunk of a TensorFlow EC2 inference-test module. NOTE(review): whitespace-
# mangled source; line breaks below are reconstructed. This chunk ends on a
# complete statement.
import os
import re
from time import sleep

import pytest

import test.test_utils.ec2 as ec2_utils
from test import test_utils
from test.test_utils.ec2 import get_ec2_instance_type, get_ec2_accelerator_type
from test.dlc_tests.conftest import LOGGER

# Version prefixes used to distinguish TF1 vs TF2 image tags.
TENSORFLOW1_VERSION = "1."
TENSORFLOW2_VERSION = "2."

# Instance / accelerator types used to parametrize the inference tests.
TF_EC2_GPU_INSTANCE_TYPE = get_ec2_instance_type(default="g3.8xlarge", processor="gpu")
TF_EC2_CPU_INSTANCE_TYPE = get_ec2_instance_type(default="c5.4xlarge", processor="cpu")
TF_EC2_EIA_ACCELERATOR_TYPE = get_ec2_accelerator_type(default="eia1.large", processor="eia")
# NOTE(review): named "ACCELERATOR_TYPE" but built with get_ec2_instance_type
# (inf1.xlarge is an instance type, and the MXNet module names the analogous
# constant ..._NEURON_INSTANCE_TYPE) — confirm the intended helper/name.
TF_EC2_NEURON_ACCELERATOR_TYPE = get_ec2_instance_type(default="inf1.xlarge", processor="neuron")
# Restricted to instance types exposing exactly one GPU.
TF_EC2_SINGLE_GPU_INSTANCE_TYPE = get_ec2_instance_type(
    default="p3.2xlarge",
    processor="gpu",
    filter_function=ec2_utils.filter_only_single_gpu,
)
TF_EC2_GRAVITON_INSTANCE_TYPE = get_ec2_instance_type(default="c6g.4xlarge", processor="cpu", arch_type="graviton")