def test_hc_mnist_gpu(framework_version, ecr_image, sagemaker_regions, dist_gpu_backend):
    """Run MNIST training on a heterogeneous cluster of two multi-GPU instance groups.

    Both groups use the same multi-GPU instance type with two instances each; the
    distributed backend under test is passed through as a hyperparameter.
    """
    with timeout(minutes=DEFAULT_TIMEOUT):
        # Two identically sized groups exercise the heterogeneous-cluster code path.
        group_one = InstanceGroup("train_group_1", MULTI_GPU_INSTANCE, 2)
        group_two = InstanceGroup("train_group_2", MULTI_GPU_INSTANCE, 2)
        estimator_parameter = {
            'entry_point': mnist_script,
            'role': 'SageMakerRole',
            'instance_groups': [group_one, group_two],
            'framework_version': framework_version,
            'hyperparameters': {'backend': dist_gpu_backend},
        }
        upload_s3_data_args = {
            'path': os.path.join(data_dir, 'training'),
            'key_prefix': 'pytorch/mnist',
        }
        job_name = utils.unique_name_from_base('test-pt-hc-mnist-gpu')
        invoke_pytorch_estimator(
            ecr_image,
            sagemaker_regions,
            estimator_parameter,
            upload_s3_data_args=upload_s3_data_args,
            job_name=job_name,
        )
def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types, tmpdir):
    """Tests smddprun command via Estimator API distribution parameter."""
    # NOTE(review): a later test in this module is also named
    # test_hc_smdataparallel_mnist; pytest keeps only the last definition of a
    # name per module — confirm and rename one of them.
    with timeout(minutes=DEFAULT_TIMEOUT):
        validate_or_skip_smdataparallel_efa(ecr_image)
        train_group = InstanceGroup("train_group", instance_types, 2)
        # smdistributed dataparallel is scoped to the single training group.
        distribution = {
            "smdistributed": {"dataparallel": {"enabled": True}},
            "instance_groups": [train_group],
        }
        estimator_parameter = {
            'entry_point': 'smdataparallel_mnist.py',
            'role': 'SageMakerRole',
            'source_dir': mnist_path,
            'instance_groups': [train_group],
            'distribution': distribution,
        }
        job_name = utils.unique_name_from_base("test-pt-hc-smddp-mnist")
        invoke_pytorch_estimator(ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name)
def test_hc_training_smdebug(framework_version, ecr_image, sagemaker_regions, instance_type):
    """Run a single-group heterogeneous-cluster MNIST job with smdebug tensor output.

    Tensors are written to the path given by the 'smdebug_path' hyperparameter.
    """
    hyperparameters = {
        'random_seed': True,
        'num_steps': 50,
        'smdebug_path': '/tmp/ml/output/tensors',
        'epochs': 1,
        'data_dir': training_dir,
    }
    with timeout(minutes=DEFAULT_TIMEOUT):
        # Single one-instance group: the minimal heterogeneous-cluster setup.
        train_group = InstanceGroup('train_group', instance_type, 1)
        estimator_parameter = {
            'entry_point': smdebug_mnist_script,
            'role': 'SageMakerRole',
            'instance_groups': [train_group],
            'framework_version': framework_version,
            'hyperparameters': hyperparameters,
        }
        upload_s3_data_args = {
            'path': training_dir,
            'key_prefix': 'pytorch/mnist',
        }
        job_name = utils.unique_name_from_base('test-pt-hc-smdebug-training')
        invoke_pytorch_estimator(
            ecr_image,
            sagemaker_regions,
            estimator_parameter,
            upload_s3_data_args=upload_s3_data_args,
            job_name=job_name,
        )
def test_hc_mnist(ecr_image, sagemaker_regions, instance_type, framework_version):
    """Delegate a single-instance heterogeneous-cluster MNIST run to the helper."""
    from sagemaker.instance_group import InstanceGroup

    # Fall back to a CPU instance when no type was supplied by the fixture.
    chosen_type = instance_type or 'ml.c5.xlarge'
    group = InstanceGroup("train_group", chosen_type, 1)
    invoke_sm_helper_function(
        ecr_image, sagemaker_regions, _test_mnist_hc_function, [group], framework_version
    )
def test_hc_distributed_mnist_ps(ecr_image, sagemaker_regions, instance_type, framework_version):
    """Run the parameter-server distributed MNIST helper on a two-instance group."""
    from sagemaker.instance_group import InstanceGroup

    validate_or_skip_test(ecr_image=ecr_image)
    print('ecr image used for training', ecr_image)
    # Default to a GPU instance when the fixture provides none.
    chosen_type = instance_type or 'ml.p2.xlarge'
    group = InstanceGroup("train_group", chosen_type, 2)
    invoke_sm_helper_function(
        ecr_image,
        sagemaker_regions,
        _test_hc_distributed_mnist_ps_function,
        [group],
        framework_version,
    )
def test_hc_smmodelparallel_mnist_multigpu_multinode(ecr_image, instance_type, sagemaker_regions, test_script, num_processes):
    """Tests pt mnist command via script mode."""
    # NOTE(review): the instance_type parameter is unconditionally overridden
    # here — presumably SMP requires p3.16xlarge; confirm the fixture value is
    # intentionally ignored.
    instance_type = "ml.p3.16xlarge"
    validate_or_skip_smmodelparallel(ecr_image)
    train_group = InstanceGroup("train_group", instance_type, 2)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': test_script,
            'role': 'SageMakerRole',
            'source_dir': mnist_path,
            'instance_groups': [train_group],
            'hyperparameters': {
                "assert-losses": 1,
                "amp": 1,
                "ddp": 1,
                "data-dir": "data/training",
                "epochs": 5,
            },
            'distribution': {
                # Model parallelism over 2 partitions with interleaved pipelining.
                "smdistributed": {
                    "modelparallel": {
                        "enabled": True,
                        "parameters": {
                            "partitions": 2,
                            "microbatches": 4,
                            "optimize": "speed",
                            "pipeline": "interleaved",
                            "ddp": True,
                        },
                    }
                },
                "mpi": {
                    "enabled": True,
                    "processes_per_host": num_processes,
                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                },
                "instance_groups": [train_group],
            },
        }
        job_name = utils.unique_name_from_base('test-pt-hc-smdmp-multinode')
        invoke_pytorch_estimator(ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name)
def test_hc_distributed_training_horovod(ecr_image, sagemaker_regions, instance_type, tmpdir, framework_version):
    """Run the Horovod distributed-training helper on one two-instance group."""
    # Default to a multi-GPU instance when the fixture provides none.
    chosen_type = instance_type or 'ml.p3.16xlarge'
    group = InstanceGroup("train_group_1", chosen_type, 2)
    invoke_sm_helper_function(
        ecr_image,
        sagemaker_regions,
        _test_hc_distributed_training_horovod_function,
        [group],
        tmpdir,
        framework_version,
    )
def test_hc_mnist_distributed_cpu(framework_version, ecr_image, sagemaker_regions, instance_type, dist_cpu_backend):
    """Run distributed CPU MNIST training via the helper with the given backend."""
    # Default to a CPU instance when the fixture provides none.
    chosen_type = instance_type or 'ml.c4.xlarge'
    group = InstanceGroup("train_group", chosen_type, 2)
    invoke_pytorch_helper_function(
        ecr_image,
        sagemaker_regions,
        _test_hc_mnist_distributed,
        {
            'framework_version': framework_version,
            'instance_groups': [group],
            'dist_backend': dist_cpu_backend,
        },
    )
def test_hc_pt_s3_plugin_sm_cpu(framework_version, ecr_image, sagemaker_regions):
    """Run the resnet18 S3-plugin training script on a single CPU instance group."""
    validate_or_skip_s3_plugin(ecr_image)
    group = InstanceGroup("train_group", CPU_INSTANCE, 1)
    with timeout(minutes=DEFAULT_TIMEOUT):
        estimator_parameter = {
            'entry_point': 'main.py',
            'role': 'SageMakerRole',
            'source_dir': resnet18_path,
            'instance_groups': [group],
            'framework_version': framework_version,
        }
        job_name = utils.unique_name_from_base("test-pytorch-hc-s3-plugin-cpu")
        invoke_pytorch_estimator(ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name)
def test_hc_smdataparallel_mnist(ecr_image, sagemaker_regions, instance_types, py_version, tmpdir):
    """Run the smdataparallel MNIST helper on a single two-instance group."""
    # NOTE(review): this redefines test_hc_smdataparallel_mnist declared earlier
    # in this module; pytest keeps only the last definition of a name per
    # module — confirm and rename one of them.
    group = InstanceGroup("train_group", instance_types, 2)
    invoke_sm_helper_function(
        ecr_image, sagemaker_regions, _test_hc_smdataparallel_mnist_function, [group]
    )