Example #1
0
def test_ecs_pytorch_s3_plugin_training_gpu(gpu_only, ecs_container_instance,
                                            pytorch_training, training_cmd,
                                            ecs_cluster_name,
                                            pt17_and_above_only):
    """
    Run the S3 plugin resnet18 training test for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    The test is skipped when the framework version parsed from the image tag is below 1.8.

    Otherwise, given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    _framework, framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    s3_plugin_unsupported = Version(framework_version) < Version("1.8")
    if s3_plugin_unsupported:
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    container_instance_id, ecs_cluster_arn = ecs_container_instance

    # The executor is told how many GPUs the container instance exposes.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
Example #2
0
def test_ecs_pytorch_training_dgl_gpu(gpu_only, py3_only,
                                      ecs_container_instance, pytorch_training,
                                      training_cmd, ecs_cluster_name):
    """
    Run the DGL training test for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    DGL is only supported in py3, so the "py3_only" fixture is requested to keep py2 images from running
    this function.

    The test is skipped for two framework/CUDA combinations read from the image tag:
    PyTorch 1.6 with "cu110", and PyTorch 1.10 or newer with "cu113".

    Otherwise, given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    _framework, framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    cuda_version = get_cuda_version_from_tag(pytorch_training)
    parsed_version = Version(framework_version)

    if parsed_version == Version("1.6") and cuda_version == "cu110":
        pytest.skip("DGL does not suport CUDA 11 for PyTorch 1.6")

    # TODO: Remove when DGL gpu test on ecs get fixed
    if parsed_version >= Version("1.10") and cuda_version == "cu113":
        pytest.skip("ecs test for DGL gpu fails since pt 1.10")

    container_instance_id, ecs_cluster_arn = ecs_container_instance

    # The executor is told how many GPUs the container instance exposes.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_mxnet_training_gluonnlp_gpu(gpu_only, py3_only,
                                         ecs_container_instance,
                                         mxnet_training, training_cmd,
                                         ecs_cluster_name):
    """
    Run the Gluon NLP training test for MXNet on a GPU ECS instance.

    Instance Type - p2.16xlarge

    The "py3_only" fixture is requested so that py2 images don't run on this function.

    Given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance

    # The executor is told how many GPUs the container instance exposes.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_mxnet_training_mnist_cpu(cpu_only, ecs_container_instance, mxnet_training, training_cmd, ecs_cluster_name):
    """
    Run the mnist training test for MXNet on a CPU ECS instance.

    Instance Type - c4.8xlarge

    Given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
    )
Example #5
0
def test_ecs_pytorch_training_dgl_cpu(cpu_only, py3_only, ecs_container_instance, pytorch_training, training_cmd,
                                      ecs_cluster_name):
    """
    Run the DGL training test for PyTorch on a CPU ECS instance.

    Instance Type - c5.12xlarge

    DGL is only supported in py3, so the "py3_only" fixture is requested to keep py2 images from running
    this function.

    Given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
    )
Example #6
0
def test_ecs_pytorch_training_mnist_gpu(gpu_only, ecs_container_instance, pytorch_training, training_cmd,
                                        ecs_cluster_name):
    """
    Run the mnist training test for PyTorch on a GPU ECS instance.

    Instance Type - p3.8xlarge

    Given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    container_instance_id, ecs_cluster_arn = ecs_container_instance

    # The executor is told how many GPUs the container instance exposes.
    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
        num_gpus=ec2_utils.get_instance_num_gpus(container_instance_id),
    )
def test_ecs_pytorch_s3_plugin_training_cpu(cpu_only, ecs_container_instance,
                                            pytorch_training, training_cmd,
                                            ecs_cluster_name):
    """
    CPU resnet18 test for PyTorch Training using S3 plugin

    Instance Type - c5.9xlarge

    The test is skipped when the framework version parsed from the image tag is below 1.8, matching the
    guard used by the GPU variant of this test (the S3 plugin is supported on PyTorch version >=1.8).

    Given above parameters, registers a task with family named after this test, runs the task, and waits for
    the task to be stopped before doing teardown operations of instance and cluster.
    """
    # Same version gate as the GPU S3 plugin test: without it, images older
    # than 1.8 would run a plugin test they do not support.
    _, image_framework_version = get_framework_and_version_from_tag(
        pytorch_training)
    if Version(image_framework_version) < Version("1.8"):
        pytest.skip("S3 plugin is supported on PyTorch version >=1.8")

    instance_id, cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(ecs_cluster_name, cluster_arn,
                                         training_cmd, pytorch_training,
                                         instance_id)
def test_ecs_mxnet_training_dgl_cpu(cpu_only, py3_only, ecs_container_instance, mxnet_training, training_cmd,
                                    ecs_cluster_name):
    """
    Run the DGL training test for MXNet on a CPU ECS instance.

    Instance Type - c4.2xlarge

    DGL is only supported in py3, so the "py3_only" fixture is requested to keep py2 images from running
    this function.

    The test is skipped when the framework version parsed from the image tag is 1.9.0 or newer.

    Otherwise, given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    # TODO: remove/update this when DGL supports MXNet 1.9
    _framework, mxnet_version = get_framework_and_version_from_tag(mxnet_training)
    dgl_unsupported = Version(mxnet_version) >= Version("1.9.0")
    if dgl_unsupported:
        pytest.skip("Skipping DGL tests as DGL does not yet support MXNet 1.9")

    container_instance_id, ecs_cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        mxnet_training,
        container_instance_id,
    )
Example #9
0
def test_ecs_pytorch_training_dgl_cpu(
    cpu_only, py3_only, ecs_container_instance, pytorch_training, training_cmd, ecs_cluster_name
):
    """
    Run the DGL training test for PyTorch on a CPU ECS instance.

    Instance Type - c5.12xlarge

    DGL is only supported in py3, so the "py3_only" fixture is requested to keep py2 images from running
    this function.

    The test is skipped when the framework version parsed from the image tag matches "==1.10.*".

    Otherwise, given the above parameters, a task with family named after this test is registered and run,
    and the test waits for the task to be stopped before teardown operations of instance and cluster.
    """
    _framework, framework_version = get_framework_and_version_from_tag(pytorch_training)
    parsed_version = Version(framework_version)

    # TODO: Remove when DGL test on ecs get fixed
    # NOTE(review): the skip message says "gpu" although this is the CPU test - confirm the wording.
    skipped_versions = SpecifierSet("==1.10.*")
    if parsed_version in skipped_versions:
        pytest.skip("ecs test for DGL gpu fails for pt 1.10")

    container_instance_id, ecs_cluster_arn = ecs_container_instance

    ecs_utils.ecs_training_test_executor(
        ecs_cluster_name,
        ecs_cluster_arn,
        training_cmd,
        pytorch_training,
        container_instance_id,
    )