Example #1
def test_hc_mnist_gpu(framework_version, ecr_image, sagemaker_regions,
                      dist_gpu_backend):
    """
    Tests distributed MNIST training on a heterogeneous cluster of multi-GPU instances
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        instance_type = MULTI_GPU_INSTANCE
        instance_count = 2
        training_group_1 = InstanceGroup("train_group_1", instance_type,
                                         instance_count)
        training_group_2 = InstanceGroup("train_group_2", instance_type,
                                         instance_count)
        estimator_parameter = {
            'entry_point': mnist_script,
            'role': 'SageMakerRole',
            'instance_groups': [training_group_1, training_group_2],
            'framework_version': framework_version,
            'hyperparameters': {
                'backend': dist_gpu_backend
            },
        }
        upload_s3_data_args = {
            'path': os.path.join(data_dir, 'training'),
            'key_prefix': 'pytorch/mnist'
        }
        job_name = utils.unique_name_from_base('test-pt-hc-mnist-gpu')
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 upload_s3_data_args=upload_s3_data_args,
                                 job_name=job_name)
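These examples call helpers such as invoke_pytorch_estimator that are defined elsewhere in the test suite. As a rough, hypothetical sketch of what that call amounts to, assuming the standard sagemaker.pytorch.PyTorch estimator and its heterogeneous-cluster instance_groups argument (the instance type, backend value, and function name below are placeholders, not taken from the suite):

# Hypothetical sketch only: the real invoke_pytorch_estimator handles
# multi-region retries and data upload inside the test utilities.
import os

from sagemaker.instance_group import InstanceGroup
from sagemaker.pytorch import PyTorch


def run_hc_mnist_gpu_sketch(ecr_image, framework_version, data_dir, mnist_script):
    # Two multi-GPU instance groups make up the heterogeneous cluster
    # ("ml.p3.8xlarge" stands in for the suite's MULTI_GPU_INSTANCE constant).
    instance_groups = [
        InstanceGroup("train_group_1", "ml.p3.8xlarge", 2),
        InstanceGroup("train_group_2", "ml.p3.8xlarge", 2),
    ]
    estimator = PyTorch(
        entry_point=mnist_script,
        role="SageMakerRole",
        image_uri=ecr_image,
        framework_version=framework_version,
        instance_groups=instance_groups,
        hyperparameters={"backend": "nccl"},  # dist_gpu_backend in the real test
    )
    # Upload the local MNIST data and start the training job.
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(data_dir, "training"), key_prefix="pytorch/mnist")
    estimator.fit({"training": inputs})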
Example #2
def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types,
                                 tmpdir):
    """
    Tests smddprun command via Estimator API distribution parameter
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        validate_or_skip_smdataparallel_efa(ecr_image)
        instance_count = 2
        training_group = InstanceGroup("train_group", instance_types,
                                       instance_count)
        distribution = {
            "smdistributed": {
                "dataparallel": {
                    "enabled": True
                }
            },
            "instance_groups": [training_group]
        }
        estimator_parameter = {
            'entry_point': 'smdataparallel_mnist.py',
            'role': 'SageMakerRole',
            'source_dir': mnist_path,
            'instance_groups': [training_group],
            'distribution': distribution,
        }

        job_name = utils.unique_name_from_base("test-pt-hc-smddp-mnist")
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 job_name=job_name)
Example #3
def test_hc_training_smdebug(framework_version, ecr_image, sagemaker_regions,
                             instance_type):
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/tmp/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir,
    }

    with timeout(minutes=DEFAULT_TIMEOUT):
        instance_count = 1
        training_group = InstanceGroup('train_group', instance_type,
                                       instance_count)
        estimator_parameter = {
            'entry_point': smdebug_mnist_script,
            'role': 'SageMakerRole',
            'instance_groups': [training_group],
            'framework_version': framework_version,
            'hyperparameters': hyperparameters
        }
        upload_s3_data_args = {
            'path': training_dir,
            'key_prefix': 'pytorch/mnist'
        }
        job_name = utils.unique_name_from_base('test-pt-hc-smdebug-training')
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 upload_s3_data_args=upload_s3_data_args,
                                 job_name=job_name)
Example #4
def test_hc_mnist(ecr_image, sagemaker_regions, instance_type,
                  framework_version):
    from sagemaker.instance_group import InstanceGroup
    instance_type = instance_type or 'ml.c5.xlarge'
    training_group = InstanceGroup("train_group", instance_type, 1)
    invoke_sm_helper_function(ecr_image, sagemaker_regions,
                              _test_mnist_hc_function, [training_group],
                              framework_version)
Example #5
def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type,
                                 framework_version):
    from sagemaker.instance_group import InstanceGroup
    validate_or_skip_test(ecr_image=ecr_image)
    print('ecr image used for training', ecr_image)
    instance_type = instance_type or 'ml.p2.xlarge'
    training_group = InstanceGroup("train_group", instance_type, 2)
    invoke_sm_helper_function(ecr_image, sagemaker_regions,
                              _test_hc_distributed_mnist_ps_function,
                              [training_group], framework_version)
Example #6
def test_hc_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type,
                                                     sagemaker_regions,
                                                     test_script,
                                                     num_processes):
    """
    Tests pt mnist command via script mode
    """
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(ecr_image)
    instance_count = 2
    training_group = InstanceGroup("train_group", instance_type,
                                   instance_count)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': test_script,
            'role': 'SageMakerRole',
            'source_dir': mnist_path,
            'instance_groups': [training_group],
            'hyperparameters': {
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5
            },
            'distribution': {
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled":
                    True,
                    "processes_per_host":
                    num_processes,
                    "custom_mpi_options":
                    "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
                "instance_groups": [training_group],
            },
        }
        job_name = utils.unique_name_from_base('test-pt-hc-smdmp-multinode')
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 job_name=job_name)
Example #7
def test_hc_distributed_training_horovod(ecr_image, sagemaker_regions,
                                         instance_type, tmpdir,
                                         framework_version):
    instance_type = instance_type or 'ml.p3.16xlarge'
    training_group = InstanceGroup("train_group_1", instance_type, 2)
    invoke_sm_helper_function(
        ecr_image,
        sagemaker_regions,
        _test_hc_distributed_training_horovod_function,
        [training_group],
        tmpdir,
        framework_version,
    )
Example #8
def test_hc_mnist_distributed_cpu(framework_version, ecr_image,
                                  sagemaker_regions, instance_type,
                                  dist_cpu_backend):
    instance_type = instance_type or 'ml.c4.xlarge'
    training_group = InstanceGroup("train_group", instance_type, 2)
    function_args = {
        'framework_version': framework_version,
        'instance_groups': [training_group],
        'dist_backend': dist_cpu_backend
    }

    invoke_pytorch_helper_function(ecr_image, sagemaker_regions,
                                   _test_hc_mnist_distributed, function_args)
Example #9
def test_hc_pt_s3_plugin_sm_cpu(framework_version, ecr_image,
                                sagemaker_regions):
    validate_or_skip_s3_plugin(ecr_image)
    training_group = InstanceGroup("train_group", CPU_INSTANCE, 1)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': 'main.py',
            'role': 'SageMakerRole',
            'source_dir': resnet18_path,
            'instance_groups': [training_group],
            'framework_version': framework_version
        }
        job_name = utils.unique_name_from_base("test-pytorch-hc-s3-plugin-cpu")
        invoke_pytorch_estimator(ecr_image,
                                 sagemaker_regions,
                                 estimator_parameter,
                                 job_name=job_name)
Example #10
def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types,
                                 py_version, tmpdir):
    training_group = InstanceGroup("train_group", instance_types, 2)
    invoke_sm_helper_function(ecr_image, sagemaker_regions,
                              _test_hc_smdataparallel_mnist_function,
                              [training_group])