Example 1
def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           hyperparameters={
                               # Saving a checkpoint after every 10 steps to hammer the S3 plugin
                               'save-checkpoint-steps': 10,
                               # Reducing throttling for checkpoint and model saving
                               'throttle-secs': 1,
                               # Without the patch, training jobs would fail around the
                               # 100th to 150th step
                               'max-steps': 200,
                               # Large batch size would result in a larger checkpoint file
                               'batch-size': 1024,
                               # This makes the training job export the model during training.
                               # Stale model garbage collection will also be performed.
                               'export-model-during-training': True
                           },
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)
    estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region),
                  job_name=unique_name_from_base('test-tf-sm-s3-mnist'))
    _assert_s3_file_exists(region, estimator.model_data)
    _assert_checkpoint_exists(region, estimator.model_dir, 200)
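The two assertion helpers are referenced but not defined in the snippet. A minimal sketch of what they could look like, assuming boto3 and the usual tf.estimator checkpoint file naming (the bodies below are an assumption, not taken from the example):

from urllib.parse import urlparse

import boto3


def _assert_s3_file_exists(region, s3_url):
    # Object.load() issues a HEAD request and raises ClientError if the key is missing.
    parsed = urlparse(s3_url)
    s3 = boto3.resource('s3', region_name=region)
    s3.Object(parsed.netloc, parsed.path.lstrip('/')).load()


def _assert_checkpoint_exists(region, model_dir, checkpoint_number):
    # tf.estimator writes checkpoint files named after the global step, e.g. model.ckpt-200.index.
    _assert_s3_file_exists(region, '{}/model.ckpt-{}.index'.format(model_dir, checkpoint_number))
    _assert_s3_file_exists(region, '{}/model.ckpt-{}.meta'.format(model_dir, checkpoint_number))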
Example 2
def test_distributed_training_horovod(sagemaker_session, instance_type,
                                      ecr_image, tmpdir, framework_version):

    mpi_options = '-verbose -x orte_base_help_aggregate=0'
    estimator = TensorFlow(entry_point=os.path.join(RESOURCE_PATH, 'mnist',
                                                    'horovod_mnist.py'),
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=2,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           script_mode=True,
                           hyperparameters={
                               'sagemaker_mpi_enabled': True,
                               'sagemaker_mpi_custom_mpi_options': mpi_options,
                               'sagemaker_mpi_num_of_processes_per_host': 1
                           },
                           sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    for filename in model_data_source.get_file_list():
        assert os.path.basename(filename) == 'model.tar.gz'
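The horovod_mnist.py entry point is not shown here. As a rough sketch of the Horovod boilerplate such a script typically contains (assuming a Keras-style script; the actual file may differ):

import horovod.tensorflow.keras as hvd
import tensorflow as tf

hvd.init()
# Pin each MPI process to a single GPU, when GPUs are present.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
# Scale the learning rate by the number of workers and average gradients across them.
optimizer = hvd.DistributedOptimizer(tf.keras.optimizers.Adam(0.001 * hvd.size()))
# A hvd.callbacks.BroadcastGlobalVariablesCallback(0) is normally passed to model.fit
# so every worker starts from the same initial weights.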
Example 3
def test_tuning(sagemaker_session, ecr_image, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)

    hyperparameter_ranges = {'epochs': IntegerParameter(1, 2)}
    objective_metric_name = 'accuracy'
    metric_definitions = [{'Name': objective_metric_name, 'Regex': 'accuracy = ([0-9\\.]+)'}]

    tuner = HyperparameterTuner(estimator,
                                objective_metric_name,
                                hyperparameter_ranges,
                                metric_definitions,
                                max_jobs=2,
                                max_parallel_jobs=2)

    with timeout(minutes=20):
        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(resource_path, 'mnist', 'data'),
            key_prefix='scriptmode/mnist')

        tuning_job_name = unique_name_from_base('test-tf-sm-tuning', max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)
        tuner.wait()
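Once the tuning job finishes, its results can be inspected through the same tuner object; for example (a usage sketch, not part of the original test):

best_job = tuner.best_training_job()      # name of the best-performing training job
results = tuner.analytics().dataframe()   # per-job metrics as a pandas DataFrame
print(best_job, len(results))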
Example 4
def run_test(sagemaker_session,
             ecr_image,
             instance_type,
             framework_version,
             test_data,
             record_wrapper_type=None):
    source_path = os.path.join(os.path.dirname(__file__), '..', '..',
                               'resources', 'pipemode')
    script = os.path.join(source_path, 'pipemode.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True,
                           input_mode='Pipe',
                           hyperparameters={'dimension': DIMENSION})
    pipe_input = s3_input(s3_data=test_data,
                          distribution='FullyReplicated',
                          record_wrapping=record_wrapper_type,
                          input_mode='Pipe')
    with timeout(minutes=20):
        estimator.fit(
            {'elizabeth': pipe_input},
            job_name=unique_name_from_base('test-sagemaker-pipemode'))
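The pipemode.py script itself is not included. With the sagemaker_tensorflow extension, a Pipe-mode channel such as 'elizabeth' is typically consumed through PipeModeDataset; a sketch under that assumption (the feature name 'data' is illustrative):

import tensorflow as tf
from sagemaker_tensorflow import PipeModeDataset


def _input_fn(dimension):
    # Stream TFRecords from the named channel instead of reading files from disk.
    dataset = PipeModeDataset(channel='elizabeth', record_format='TFRecord')
    features = {'data': tf.io.FixedLenFeature([dimension], tf.float32)}
    dataset = dataset.map(lambda record: tf.io.parse_single_example(record, features))
    return dataset.batch(32)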
Example 5
def test_distributed_mnist_no_ps(sagemaker_session, ecr_image, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_count=2,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data'),
        key_prefix='scriptmode/mnist')
    estimator.fit(inputs, job_name=unique_name_from_base('test-tf-sm-distributed-mnist'))
    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
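For contrast, this test runs on two instances without any parameter servers; in the same SDK version, parameter-server based distribution is switched on through the distributions argument. A hedged sketch, not part of this test:

ps_estimator = TensorFlow(entry_point=script,
                          role='SageMakerRole',
                          train_instance_count=2,
                          train_instance_type=instance_type,
                          sagemaker_session=sagemaker_session,
                          image_name=ecr_image,
                          framework_version=framework_version,
                          script_mode=True,
                          distributions={'parameter_server': {'enabled': True}})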
Example 6
def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_smdebug.py')
    hyperparameters = {'smdebug_path': '/opt/ml/output/tensors'}
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True,
                           hyperparameters=hyperparameters)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(resource_path, 'mnist', 'data'),
        key_prefix='scriptmode/mnist_smdebug')
    estimator.fit(inputs, job_name=unique_name_from_base('test-sagemaker-mnist-smdebug'))
    _assert_s3_file_exists(sagemaker_session.boto_region_name, estimator.model_data)
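The mnist_smdebug.py script is expected to write debug tensors to the path passed through the smdebug_path hyperparameter. A minimal sketch of such a hook, assuming a Keras training loop (the script's actual contents are not shown):

import smdebug.tensorflow as smd


def _create_hook(out_dir):
    # Persist tensors every 10 steps into the directory the hyperparameter points at.
    return smd.KerasHook(out_dir=out_dir, save_config=smd.SaveConfig(save_interval=10))

# hook = _create_hook('/opt/ml/output/tensors')
# model.fit(train_data, epochs=1, callbacks=[hook])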
Example 7
def test_model_dir_with_training_job_name(sagemaker_session, ecr_image, instance_type, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=1,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           sagemaker_session=sagemaker_session)

    tuner = HyperparameterTuner(estimator=estimator,
                                objective_metric_name='accuracy',
                                hyperparameter_ranges={'arbitrary_value': IntegerParameter(0, 1)},
                                metric_definitions=[{'Name': 'accuracy', 'Regex': 'accuracy=([01])'}],
                                max_jobs=1,
                                max_parallel_jobs=1)

    # User script has logic to check for the correct model_dir
    tuner.fit(job_name=unique_name_from_base('test-tf-model-dir', max_length=32))
    tuner.wait()
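The entry.py check referenced in the comment is not shown. The idea is that, for a tuning job, the SDK injects a model_dir that embeds the training job name; a hypothetical version of that check (names and environment access are assumptions):

import argparse
import json
import os

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_dir', type=str)
    parser.add_argument('--arbitrary_value', type=int, default=0)
    args, _ = parser.parse_known_args()

    # The running job's name is exposed to the container via SM_TRAINING_ENV.
    job_name = json.loads(os.environ['SM_TRAINING_ENV'])['job_name']
    assert job_name in args.model_dir

    # Emit the value matched by the tuner regex 'accuracy=([01])'.
    print('accuracy=1')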