def test_smdebug_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only):
    """Run the SMDebug test suite on a GPU training image on EC2.

    TF1 images run only in the nightly context because the GPU smdebug suite
    can take up to 2 hours; other images use a 40-minute timeout.
    """
    # FIX: skip when the image is NOT compatible with the instance type.
    # The original condition was inverted, skipping every compatible pairing.
    if not test_utils.is_image_compatible_with_instance_type(training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    smdebug_test_timeout = 2400
    if is_tf_version("1", training):
        if is_nightly_context():
            # TF1 GPU runs need the extended 2-hour timeout
            smdebug_test_timeout = 7200
        else:
            pytest.skip(
                "TF1 gpu smdebug tests can take up to 2 hours, thus we are only running in nightly context"
            )
    run_smdebug_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=smdebug_test_timeout,
    )
def test_nvapex(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the NVIDIA Apex test on a PyTorch GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_APEX_CMD)
def test_mxnet_keras_gpu(mxnet_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the Keras test on an MXNet GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(mxnet_training, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_KERAS_CMD)
def test_ec2_pytorch_inference_gpu(pytorch_inference, ec2_connection, region, gpu_only, ec2_instance_type):
    """Run the PyTorch inference test on a GPU inference image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_inference, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_inference} is incompatible with instance type {ec2_instance_type}"
        )
    ec2_pytorch_inference(pytorch_inference, "gpu", ec2_connection, region)
def test_ec2_tensorflow_inference_gpu(tensorflow_inference, ec2_connection, region, gpu_only, ec2_instance_type):
    """Run the TensorFlow inference test (port 8500) on a GPU image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_inference, ec2_instance_type):
        pytest.skip(
            f"Image {tensorflow_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_tensorflow_inference(tensorflow_inference, ec2_connection, "8500", region)
def test_mxnet_inference_telemetry_gpu(mxnet_inference, ec2_connection, gpu_only, ec2_instance_type):
    """Run the telemetry test on an MXNet GPU inference image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_inference_test(ec2_connection, mxnet_inference, MX_TELEMETRY_CMD)
def test_curand_gpu(training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the cuRAND test on a GPU training image on EC2.

    Not configured for TF1 or MXNet images, which are skipped.
    """
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    if is_tf_version("1", training) or "mxnet" in training:
        pytest.skip("Test is not configured for TF1 and MXNet")
    execute_ec2_training_test(ec2_connection, training, CURAND_CMD)
def test_pytorch_linear_regression_gpu(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the linear-regression test on a PyTorch GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_REGRESSION_CMD)
def test_pytorch_standalone_gpu(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the standalone PyTorch test on a GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_STANDALONE_CMD)
def test_mxnet_train_dgl_gpu(mxnet_training, ec2_connection, gpu_only, py3_only, ec2_instance_type):
    """Run the DGL training test on an MXNet GPU image on EC2.

    CUDA 11.0 images are skipped until DGL support is available for them.
    """
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(mxnet_training, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_training} is incompatible with instance type {ec2_instance_type}"
        )
    if "cu110" in mxnet_training:
        pytest.skip("Skipping dgl tests on cuda 11.0 until available")
    execute_ec2_training_test(ec2_connection, mxnet_training, MX_DGL_CMD)
def test_tensorflow_with_horovod_gpu(tensorflow_training, ec2_instance_type, ec2_connection, gpu_only, tf2_only):
    """Run the Horovod test on a TensorFlow GPU training image on EC2.

    Chooses the TF1 or TF2 Horovod script based on the image version, and
    enlarges /dev/shm on instance types known to need it.
    """
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    test_script = TF1_HVD_CMD if is_tf_version("1", tensorflow_training) else TF2_HVD_CMD
    execute_ec2_training_test(
        connection=ec2_connection,
        ecr_uri=tensorflow_training,
        test_cmd=test_script,
        # These instance types need a larger shared-memory segment for Horovod
        large_shm=bool(re.match(r"(p2\.8xlarge)|(g3\.16xlarge)", ec2_instance_type)),
    )
def test_ec2_mxnet_gluonnlp_inference_gpu(mxnet_inference, ec2_connection, region, gpu_only, py3_only, ec2_instance_type):
    """Run GluonNLP BERT inference on an MXNet GPU inference image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_mxnet_inference(mxnet_inference, BERT_MODEL, "gluonnlp", ec2_connection, "gpu", region, 90, 9091)
def test_ec2_mxnet_squeezenet_inference_gpu(mxnet_inference, ec2_connection, region, gpu_only, ec2_instance_type):
    """Run SqueezeNet inference on an MXNet GPU inference image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(mxnet_inference, ec2_instance_type):
        pytest.skip(
            f"Image {mxnet_inference} is incompatible with instance type {ec2_instance_type}"
        )
    run_ec2_mxnet_inference(mxnet_inference, SQUEEZENET_MODEL, "squeezenet", ec2_connection, "gpu", region, 80, 8081)
def test_pytorch_with_horovod(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the Horovod test on a PyTorch GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    test_cmd = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPTHVD")
    execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
def test_pytorch_amp(pytorch_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the native AMP test on a PyTorch GPU training image on EC2.

    Skipped for images older than PyTorch 1.6, where native AMP does not exist.
    """
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) < Version("1.6"):
        pytest.skip("Native AMP was introduced in PyTorch 1.6")
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    execute_ec2_training_test(ec2_connection, pytorch_training, PT_AMP_CMD)
def test_pytorch_mpi(pytorch_training, ec2_connection, gpu_only, py3_only, ec2_instance_type):
    """Tests mpi backend on a PyTorch GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(pytorch_training, ec2_instance_type):
        pytest.skip(
            f"Image {pytorch_training} is incompatible with instance type {ec2_instance_type}"
        )
    test_cmd = os.path.join(CONTAINER_TESTS_PREFIX, "pytorch_tests", "testPyTorchMpi")
    execute_ec2_training_test(ec2_connection, pytorch_training, test_cmd)
def test_smprofiler_gpu(training, ec2_connection, region, ec2_instance_type, gpu_only, py3_only, tf23_and_above_only, pt17_and_above_only):
    """Run the SageMaker profiler test suite on a GPU training image on EC2.

    Only runs for pytorch and tensorflow2 frameworks.
    This code needs to be modified past reInvent 2020.
    """
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(training, ec2_instance_type):
        pytest.skip(
            f"Image {training} is incompatible with instance type {ec2_instance_type}"
        )
    framework = get_framework_from_image_uri(training)
    if framework not in ["pytorch", "tensorflow2"]:
        return
    smdebug_test_timeout = 2400
    run_smprofiler_test(
        training,
        ec2_connection,
        region,
        ec2_instance_type,
        docker_executable="nvidia-docker",
        container_name="smdebug-gpu",
        timeout=smdebug_test_timeout,
    )
def test_tensorflow_train_mnist_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the MNIST training test on a TensorFlow GPU image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_MNIST_CMD)
def test_tensorflow_standalone_gpu(tensorflow_training, ec2_connection, gpu_only, ec2_instance_type):
    """Run the standalone test on a TensorFlow GPU training image on EC2.

    Chooses the TF1 or TF2 standalone script based on the image version.
    """
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    test_script = TF1_STANDALONE_CMD if is_tf_version("1", tensorflow_training) else TF2_STANDALONE_CMD
    execute_ec2_training_test(ec2_connection, tensorflow_training, test_script)
def test_tensorflow_distribute_dataservice_gpu(
    tensorflow_training, ec2_connection, tf24_and_above_only, gpu_only, ec2_instance_type
):
    """Run the tf.data service distribute test on a TF >= 2.4 GPU image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    run_data_service_test(ec2_connection, tensorflow_training, TF_DATASERVICE_DISTRIBUTE_TEST_CMD)
def test_tensorflow_keras_horovod_fp32(tensorflow_training, ec2_connection, tf2_only, gpu_only, ec2_instance_type):
    """Run the Keras Horovod FP32 test on a TF2 GPU training image on EC2."""
    # FIX: skip when the image is NOT compatible with the instance type
    # (original condition was inverted).
    if not test_utils.is_image_compatible_with_instance_type(tensorflow_training, ec2_instance_type):
        pytest.skip(f"Image {tensorflow_training} is incompatible with instance type {ec2_instance_type}")
    execute_ec2_training_test(ec2_connection, tensorflow_training, TF_KERAS_HVD_CMD_FP32)