def test_ecs_pytorch_s3_plugin_training_gpu(
    gpu_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
    pt17_and_above_only,
):
    """
    Run the resnet18 S3-plugin training job for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    The test is skipped when the image tag reports a PyTorch version older than 1.8.
    Otherwise a task whose family is named after this test is registered and run, and
    the test waits for that task to stop before the instance and cluster are torn down.
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    image_version = Version(framework_version)
    if image_version < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    container_instance_id, ecs_cluster_arn = ecs_container_instance
    # Every GPU reported for the instance is handed to the training task.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_pytorch_training_dgl_gpu(
    gpu_only,
    py3_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the DGL training job for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    DGL is only supported in py3, so the "py3_only" fixture keeps py2 images away from
    this test. The test is also skipped for two framework/CUDA combinations read from
    the image tag: PyTorch 1.6 with cu110, and PyTorch >= 1.10 with cu113.

    For every other image, a task whose family is named after this test is registered
    and run, and the test waits for that task to stop before the instance and cluster
    are torn down.
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    cuda_tag = get_cuda_version_from_tag(pytorch_training)
    pt_version = Version(framework_version)

    if pt_version == Version("1.6") and cuda_tag == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")
    # TODO: Remove when DGL gpu test on ecs get fixed
    elif pt_version >= Version("1.10") and cuda_tag == "cu113":
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    container_instance_id, ecs_cluster_arn = ecs_container_instance
    gpu_count = ec2_utils.get_instance_num_gpus(container_instance_id)
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=gpu_count,
    )
def test_ecs_mxnet_training_gluonnlp_gpu(
    gpu_only,
    py3_only,
    ecs_container_instance,
    mxnet_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the Gluon NLP training job for MXNet on a GPU ECS instance.

    Instance Type - p2.16xlarge

    The "py3_only" fixture is requested so that py2 images don't run this test.

    A task whose family is named after this test is registered and run, and the test
    waits for that task to stop before the instance and cluster are torn down.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance
    # Every GPU reported for the instance is handed to the training task.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_mxnet_training_mnist_cpu(
    cpu_only,
    ecs_container_instance,
    mxnet_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the mnist training job for MXNet on a CPU ECS instance.

    Instance Type - c4.8xlarge

    A task whose family is named after this test is registered and run, and the test
    waits for that task to stop before the instance and cluster are torn down.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance
    executor_args = (
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
    )
    ecs_utils.ecs_training_test_executor(*executor_args)
def test_ecs_pytorch_training_dgl_cpu(
    cpu_only,
    py3_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the DGL training job for PyTorch on a CPU ECS instance.

    Instance Type - c5.12xlarge

    DGL is only supported in py3, so the "py3_only" fixture keeps py2 images away from
    this test.

    A task whose family is named after this test is registered and run, and the test
    waits for that task to stop before the instance and cluster are torn down.

    NOTE(review): another function with this exact name appears further down; if both
    live in the same module the later definition replaces this one - confirm which
    variant is meant to be kept.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance
    executor_args = (
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
    )
    ecs_utils.ecs_training_test_executor(*executor_args)
def test_ecs_pytorch_training_mnist_gpu(
    gpu_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the mnist training job for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    A task whose family is named after this test is registered and run, and the test
    waits for that task to stop before the instance and cluster are torn down.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance
    # Every GPU reported for the instance is handed to the training task.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_pytorch_s3_plugin_training_cpu(
    cpu_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    CPU resnet18 test for PyTorch Training using S3 plugin

    Instance Type - c5.9xlarge

    The test is skipped when the image tag reports a PyTorch version older than 1.8,
    mirroring the version gate used by the GPU S3-plugin test in this file.

    Given above parameters, registers a task with family named after this test, runs the task, and
    waits for the task to be stopped before doing teardown operations of instance and cluster.
    """
    # Same gate as test_ecs_pytorch_s3_plugin_training_gpu: the plugin requirement is on the
    # framework version, so it applies to CPU images as well.
    _, image_framework_version = get_framework_and_version_from_tag(pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    instance_id, cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name, cluster_arn, training_cmd, pytorch_training, instance_id
    )
def test_ecs_mxnet_training_dgl_cpu(
    cpu_only,
    py3_only,
    ecs_container_instance,
    mxnet_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the DGL training job for MXNet on a CPU ECS instance.

    Instance Type - c4.2xlarge

    DGL is only supported in py3, so the "py3_only" fixture keeps py2 images away from
    this test. Images whose tag reports MXNet 1.9.0 or newer are skipped.

    For every other image, a task whose family is named after this test is registered
    and run, and the test waits for that task to stop before the instance and cluster
    are torn down.
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _, mx_version_str = get_framework_and_version_from_tag(mxnet_training)
    mx_version = Version(mx_version_str)
    if mx_version >= Version("1.9.0"):
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    container_instance_id, ecs_cluster_arn = ecs_container_instance
    executor_args = (
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
    )
    ecs_utils.ecs_training_test_executor(*executor_args)
def test_ecs_pytorch_training_dgl_cpu(
    cpu_only,
    py3_only,
    ecs_container_instance,
    pytorch_training,
    training_cmd,
    ecs_cluster_name,
):
    """
    Run the DGL training job for PyTorch on a CPU ECS instance.

    Instance Type - c5.12xlarge

    DGL is only supported in py3, so the "py3_only" fixture keeps py2 images away from
    this test. Images whose tag reports a PyTorch 1.10.x version are skipped.

    For every other image, a task whose family is named after this test is registered
    and run, and the test waits for that task to stop before the instance and cluster
    are torn down.
    """
    _, framework_version = get_framework_and_version_from_tag(pytorch_training)
    pt_version = Version(framework_version)

    # TODO: Remove when the DGL test on ecs gets fixed for pt 1.10.
    # NOTE(review): the skip message says "gpu" although this is the CPU test - confirm wording.
    skipped_releases = SpecifierSet("==1.10.*")
    if pt_version in skipped_releases:
        pytest.skip("ecs test for DGL gpu fails for pt 1.10")

    container_instance_id, ecs_cluster_arn = ecs_container_instance
    executor_args = (
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
    )
    ecs_utils.ecs_training_test_executor(*executor_args)