Ejemplo n.º 1
0
def test_distributed_training_horovod(sagemaker_session, instance_type,
                                      image_uri, tmpdir, framework_version):
    """Train the Horovod MNIST script on two instances and check the artifact.

    The estimator enables MPI through hyperparameters, with one process per
    host and custom MPI options. After training, the files reported for
    ``estimator.model_data`` must be non-empty and every one of them must be
    named ``model.tar.gz``.

    Args:
        sagemaker_session: Session handed to the estimator and to the data
            source lookup (presumably a pytest fixture -- confirm in
            conftest).
        instance_type: Instance type used for the training job.
        image_uri: Training image, passed to the estimator as ``image_name``.
        tmpdir: Not used by the test body; kept so the signature is unchanged.
        framework_version: Framework version passed to the estimator.
    """
    mpi_options = '-verbose -x orte_base_help_aggregate=0'
    estimator = TensorFlow(entry_point=os.path.join(RESOURCE_PATH, 'mnist',
                                                    'horovod_mnist.py'),
                           role='SageMakerRole',
                           train_instance_type=instance_type,
                           train_instance_count=2,
                           image_name=image_uri,
                           framework_version=framework_version,
                           py_version='py3',
                           script_mode=True,
                           hyperparameters={
                               'sagemaker_mpi_enabled': True,
                               'sagemaker_mpi_custom_mpi_options': mpi_options,
                               'sagemaker_mpi_num_of_processes_per_host': 1
                           },
                           sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    # Materialize the listing first: with an empty result the loop below
    # would run zero times and the test would pass without checking anything.
    model_files = list(model_data_source.get_file_list())
    assert model_files, \
        'no model artifacts found at {}'.format(estimator.model_data)

    for filename in model_files:
        assert os.path.basename(filename) == 'model.tar.gz'
Ejemplo n.º 2
0
def test_mnist(sagemaker_session, image_uri, instance_type, framework_version):
    """Train the MNIST script on one instance and check the model artifact.

    Uploads the local ``mnist/data`` directory under the ``scriptmode/mnist``
    key prefix, fits a script-mode estimator on the uploaded data, and then
    calls ``_assert_s3_file_exists`` on ``estimator.model_data``.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist.py')
    data_dir = os.path.join(resource_path, 'mnist', 'data')

    estimator_kwargs = {
        'entry_point': entry_script,
        'role': 'SageMakerRole',
        'train_instance_type': instance_type,
        'train_instance_count': 1,
        'sagemaker_session': sagemaker_session,
        'image_name': image_uri,
        'framework_version': framework_version,
        'script_mode': True,
    }
    estimator = TensorFlow(**estimator_kwargs)

    # The upload goes through the estimator's own session attribute.
    session = estimator.sagemaker_session
    inputs = session.upload_data(path=data_dir,
                                 key_prefix='scriptmode/mnist')

    job_name = unique_name_from_base('test-sagemaker-mnist')
    estimator.fit(inputs, job_name=job_name)

    region = sagemaker_session.boto_region_name
    _assert_s3_file_exists(region, estimator.model_data)
Ejemplo n.º 3
0
def test_tuning(sagemaker_session, image_uri, instance_type,
                framework_version):
    """Run a hyperparameter tuning job over ``epochs`` for the MNIST script.

    Builds a single-instance script-mode estimator and wraps it in a tuner
    limited to two jobs, both allowed to run in parallel, with ``accuracy``
    as the objective metric. The data upload, ``fit`` and ``wait`` calls all
    run inside ``timeout(minutes=20)``.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=1,
        sagemaker_session=sagemaker_session,
        image_name=image_uri,
        framework_version=framework_version,
        script_mode=True,
    )

    objective_metric_name = 'accuracy'
    # The regex captures the numeric value following "accuracy = ".
    accuracy_metric = {
        'Name': objective_metric_name,
        'Regex': 'accuracy = ([0-9\\.]+)',
    }

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges={'epochs': IntegerParameter(1, 2)},
        metric_definitions=[accuracy_metric],
        max_jobs=2,
        max_parallel_jobs=2,
    )

    with timeout(minutes=20):
        data_dir = os.path.join(resource_path, 'mnist', 'data')
        inputs = estimator.sagemaker_session.upload_data(
            path=data_dir, key_prefix='scriptmode/mnist')

        # The job name is generated with max_length=32.
        tuning_job_name = unique_name_from_base('test-tf-sm-tuning',
                                                max_length=32)
        tuner.fit(inputs, job_name=tuning_job_name)
        tuner.wait()
def test_model_dir_with_training_job_name(sagemaker_session, image_uri,
                                          instance_type, framework_version):
    """Run a one-job tuning job using the ``tuning_model_dir`` entry script.

    No input data is passed to ``fit``. The test itself makes no assertions
    after the tuner finishes; the model_dir check is done by the entry
    script.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '../..',
                                 'resources')
    entry_script = os.path.join(resource_path, 'tuning_model_dir', 'entry.py')

    estimator = TensorFlow(
        entry_point=entry_script,
        role='SageMakerRole',
        train_instance_type=instance_type,
        train_instance_count=1,
        image_name=image_uri,
        framework_version=framework_version,
        py_version='py3',
        sagemaker_session=sagemaker_session,
    )

    search_space = {'arbitrary_value': IntegerParameter(0, 1)}
    accuracy_metric = {'Name': 'accuracy', 'Regex': 'accuracy=([01])'}

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name='accuracy',
        hyperparameter_ranges=search_space,
        metric_definitions=[accuracy_metric],
        max_jobs=1,
        max_parallel_jobs=1,
    )

    # User script has logic to check for the correct model_dir
    tuning_job_name = unique_name_from_base('test-tf-model-dir',
                                            max_length=32)
    tuner.fit(job_name=tuning_job_name)
    tuner.wait()
Ejemplo n.º 5
0
def test_distributed_mnist_ps(sagemaker_session, image_uri, instance_type,
                              framework_version):
    """Train the MNIST estimator script on two instances with the
    ``sagemaker_parameter_server_enabled`` hyperparameter set.

    Uploads ``mnist/data-distributed`` under the
    ``scriptmode/mnist-distributed`` key prefix and fits on it. It then calls
    ``_assert_checkpoint_exists`` on ``estimator.model_dir`` and
    ``_assert_s3_file_exists`` on ``estimator.model_data``.
    """
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..',
                                 'resources')
    entry_script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    data_dir = os.path.join(resource_path, 'mnist', 'data-distributed')

    estimator_kwargs = {
        'entry_point': entry_script,
        'role': 'SageMakerRole',
        'hyperparameters': {'sagemaker_parameter_server_enabled': True},
        'train_instance_count': 2,
        'train_instance_type': instance_type,
        'sagemaker_session': sagemaker_session,
        'image_name': image_uri,
        'framework_version': framework_version,
        'script_mode': True,
    }
    estimator = TensorFlow(**estimator_kwargs)

    inputs = estimator.sagemaker_session.upload_data(
        path=data_dir, key_prefix='scriptmode/mnist-distributed')

    job_name = unique_name_from_base('test-tf-sm-distributed-mnist')
    estimator.fit(inputs, job_name=job_name)

    region = sagemaker_session.boto_region_name
    # NOTE(review): the trailing 0 is presumably the checkpoint number to
    # look for -- confirm against _assert_checkpoint_exists.
    _assert_checkpoint_exists(region, estimator.model_dir, 0)
    _assert_s3_file_exists(region, estimator.model_data)