Example 1
def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call,
                                             access, socket,
                                             sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE)

    popen().poll.side_effect = [True, False]

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_any_call([
        'tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
        '--port', '6006'
    ],
                          stderr=-1,
                          stdout=-1)

    popen.assert_any_call([
        'tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost',
        '--port', '6007'
    ],
                          stderr=-1,
                          stdout=-1)
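
(In these assertions, stderr=-1 and stdout=-1 are simply the integer value of subprocess.PIPE.) The retry behavior the mocked poll() exercises can be sketched outside the test; this is an illustrative reconstruction, not the SDK's actual implementation:

import subprocess
import time

def start_tensorboard(logdir, host='localhost', base_port=6006, max_tries=100):
    # Probe successive ports; if poll() returns a value the process has
    # already exited (e.g. the port was taken), so try the next port.
    for port in range(base_port, base_port + max_tries):
        process = subprocess.Popen(
            ['tensorboard', '--logdir', logdir, '--host', host,
             '--port', str(port)],
            stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        time.sleep(5)  # give TensorBoard a moment to bind the port
        if process.poll() is None:  # still running, so the port was free
            return port, process
    raise OSError('no free port found for TensorBoard')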
Example 2
def test_horovod_local_mode(sagemaker_local_session, instances, processes,
                            tmpdir):
    output_path = 'file://%s' % tmpdir
    job_name = sagemaker.utils.unique_name_from_base('tf-horovod')
    estimator = TensorFlow(entry_point=os.path.join(horovod_dir,
                                                    'test_hvd_basic.py'),
                           role='SageMakerRole',
                           train_instance_count=2,
                           train_instance_type='local',
                           sagemaker_session=sagemaker_local_session,
                           py_version=integ.PYTHON_VERSION,
                           script_mode=True,
                           output_path=output_path,
                           framework_version='1.12',
                           distributions={
                               'mpi': {
                                   'enabled': True,
                                   'processes_per_host': processes
                               }
                           })

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

        tmp = str(tmpdir)
        extract_files(output_path.replace('file://', ''), tmp)

        size = instances * processes

        for rank in range(size):
            assert read_json('rank-%s' % rank, tmp)['rank'] == rank
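
extract_files and read_json are helpers defined elsewhere in the test module. Plausible reconstructions (assumptions, not the originals): local-mode training writes model.tar.gz under the output path, and the training script writes one JSON file per Horovod rank:

import json
import os
import tarfile

def extract_files(output_dir, target_dir):
    # Assumed helper: unpack the model.tar.gz produced by local-mode training.
    with tarfile.open(os.path.join(output_dir, 'model.tar.gz')) as tar:
        tar.extractall(target_dir)

def read_json(filename, directory):
    # Assumed helper: load one JSON artifact written by the training script.
    with open(os.path.join(directory, filename)) as f:
        return json.load(f)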
Example 3
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
    assert model.enable_cloudwatch_metrics == enable_cloudwatch_metrics
Example 4
def _run_distributed_training_horovod_basic(
    instances, processes, sagemaker_local_session, docker_image, tmpdir, framework_version
):
    output_path = "file://%s" % tmpdir
    estimator = TensorFlow(
        entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_basic.py"),
        role="SageMakerRole",
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        train_instance_count=instances,
        image_name=docker_image,
        output_path=output_path,
        framework_version=framework_version,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_network_interface_name": "eth0",
            "sagemaker_mpi_num_of_processes_per_host": processes,
        },
    )

    estimator.fit("file://{}".format(os.path.join(RESOURCE_PATH, "mnist", "data-distributed")))

    tmp = str(tmpdir)
    extract_files(output_path.replace("file://", ""), tmp)

    size = instances * processes

    for rank in range(size):
        local_rank = rank % processes
        assert read_json("local-rank-%s-rank-%s" % (local_rank, rank), tmp) == {
            "local-rank": local_rank,
            "rank": rank,
            "size": size,
        }
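
The assertion leans on Horovod's rank layout: ranks are assigned host by host, so a process's local rank is its global rank modulo the processes per host. A quick check of that arithmetic:

# With 2 instances and 2 processes per host there are 4 global ranks;
# local ranks cycle within each host.
instances, processes = 2, 2
size = instances * processes
assert [rank % processes for rank in range(size)] == [0, 1, 0, 1]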
Example 5
def test_s3_plugin(sagemaker_session, ecr_image, instance_type, region, framework_version):
    resource_path = os.path.join(os.path.dirname(__file__), '..', '..', 'resources')
    script = os.path.join(resource_path, 'mnist', 'mnist_estimator.py')
    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           hyperparameters={
                               # Save a checkpoint after every 10 steps to hammer the S3 plugin
                               'save-checkpoint-steps': 10,
                               # Disable throttling for checkpoint and model saving
                               'throttle-secs': 0,
                               # Without the patch, training jobs would fail around the
                               # 100th to 150th step
                               'max-steps': 200,
                               # A large batch size produces a larger checkpoint file
                               'batch-size': 1024,
                               # Export the model during training; stale model garbage
                               # collection will also be performed.
                               'export-model-during-training': True
                           },
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           image_name=ecr_image,
                           framework_version=framework_version,
                           script_mode=True)
    estimator.fit('s3://sagemaker-sample-data-{}/tensorflow/mnist'.format(region),
                  job_name=unique_name_from_base('test-tf-sm-s3-mnist'))
    _assert_s3_file_exists(region, estimator.model_data)
    _assert_checkpoint_exists(region, estimator.model_dir, 200)
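
_assert_s3_file_exists and _assert_checkpoint_exists are defined elsewhere in the test module. A minimal sketch of the former, assuming boto3 (names and structure are assumptions):

import boto3

def _assert_s3_file_exists(region, s3_url):
    # Assumed helper: parse s3://bucket/key and issue a HEAD request.
    # Object.load() raises botocore.exceptions.ClientError if the key is absent.
    bucket, _, key = s3_url.replace('s3://', '', 1).partition('/')
    boto3.resource('s3', region_name=region).Object(bucket, key).load()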
Example 6
def test_distributed_training_horovod(
    sagemaker_session, instance_type, image_uri, tmpdir, framework_version
):

    mpi_options = "-verbose -x orte_base_help_aggregate=0"
    estimator = TensorFlow(
        entry_point=os.path.join(RESOURCE_PATH, "mnist", "horovod_mnist.py"),
        role="SageMakerRole",
        train_instance_type=instance_type,
        train_instance_count=2,
        image_name=image_uri,
        framework_version=framework_version,
        py_version="py3",
        script_mode=True,
        hyperparameters={
            "sagemaker_mpi_enabled": True,
            "sagemaker_mpi_custom_mpi_options": mpi_options,
            "sagemaker_mpi_num_of_processes_per_host": 1,
        },
        sagemaker_session=sagemaker_session,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-horovod"))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session
    )

    for filename in model_data_source.get_file_list():
        assert os.path.basename(filename) == "model.tar.gz"
Example 7
def test_tf_script_mode(time, strftime, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    py_version='py3',
                    train_instance_type=INSTANCE_TYPE,
                    train_instance_count=1,
                    framework_version='1.11',
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job('1.11',
                                            script_mode=True,
                                            repo_name=SM_IMAGE_REPO_NAME,
                                            py_version='py3')
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args
Example 8
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    container_log_level=container_log_level,
                    base_job_name='job',
                    source_dir=source_dir,
                    enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    vpc_config = {'Subnets': ['foo'], 'SecurityGroupIds': ['bar']}
    model = tf.create_model(role=new_role,
                            model_server_workers=model_server_workers,
                            vpc_config_override=vpc_config)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
    assert model.vpc_config == vpc_config
Example 9
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = california_housing_estimator.deploy(initial_instance_count=1, instance_type='local')

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint()
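
do_inference_on_local_endpoint is defined elsewhere in the sample. A hypothetical version, assuming the test split downloaded above is saved as CSV and that the TensorFlow Serving predictor returns a 'predictions' field:

import pandas as pd

def do_inference_on_local_endpoint(predictor):
    # Hypothetical sketch: score a few test rows against the local endpoint.
    x_test = pd.read_csv('./data/test/x_test.csv').values[:10]
    response = predictor.predict(x_test)
    print('predictions: {}'.format(response['predictions']))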
Example 10
def test_deploy_with_input_handlers(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point="training.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        train_instance_count=1,
        train_instance_type=instance_type,
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))

    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
            entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
        )

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}

        result = predictor.predict(input_data)
        assert expected_result == result
Example 11
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Running Batch Transform in local mode')
    tensorflow_serving_transformer = california_housing_estimator.transformer(
        instance_count=1,
        instance_type='local',
        output_path='file://./data/output',
    )

    tensorflow_serving_transformer.transform('file://./data/input',
                                             split_type='Line',
                                             content_type='text/csv')

    print('Printing Batch Transform output file content')
    with open('./data/output/x_test.csv.out', 'r') as f:
        print(f.read())
Example 12
def test_tf_local_data_local_script():
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=LocalNoS3Session())

        inputs = 'file://' + DATA_PATH

        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with local_mode_utils.lock():
        try:
            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()
Example 13
def test_keras_training(sagemaker_local_session, docker_image, tmpdir,
                        framework_version):
    entry_point = os.path.join(RESOURCE_PATH, 'keras_inception.py')
    output_path = 'file://{}'.format(tmpdir)

    estimator = TensorFlow(entry_point=entry_point,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type='local',
                           image_name=docker_image,
                           sagemaker_session=sagemaker_local_session,
                           model_dir='/opt/ml/model',
                           output_path=output_path,
                           framework_version=framework_version,
                           py_version='py3')

    estimator.fit()

    model = serving.Model(model_data=output_path,
                          role='SageMakerRole',
                          framework_version=framework_version,
                          sagemaker_session=sagemaker_local_session)

    predictor = model.deploy(initial_instance_count=1, instance_type='local')

    assert predictor.predict(np.random.randn(4, 4, 4, 2) * 255)

    predictor.delete_endpoint()
Example 14
def test_distributed_training_horovod(sagemaker_session, instance_type,
                                      ecr_image, tmpdir, framework_version):

    mpi_options = '-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1'
    estimator = TensorFlow(entry_point=os.path.join(RESOURCE_PATH, 'mnist',
                                                    'horovod_mnist.py'),
                           role='SageMakerRole',
                           instance_type=instance_type,
                           instance_count=2,
                           image_uri=ecr_image,
                           framework_version=framework_version,
                           py_version='py3',
                           hyperparameters={
                               'sagemaker_mpi_enabled': True,
                               'sagemaker_mpi_custom_mpi_options': mpi_options,
                               'sagemaker_mpi_num_of_processes_per_host': 1
                           },
                           sagemaker_session=sagemaker_session)

    estimator.fit(job_name=unique_name_from_base('test-tf-horovod'))

    model_data_source = sagemaker.local.data.get_data_source_instance(
        estimator.model_data, sagemaker_session)

    for filename in model_data_source.get_file_list():
        assert os.path.basename(filename) == 'model.tar.gz'
Example 15
def test_mnist_distributed(sagemaker_session, instance_type, tf_full_version):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=2,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        py_version=tests.integ.PYTHON_VERSION,
        script_mode=True,
        framework_version=tf_full_version,
        distributions=PARAMETER_SERVER_DISTRIBUTION,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"),
        key_prefix="scriptmode/distributed_mnist")

    with tests.integ.timeout.timeout(
            minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs,
                      job_name=unique_name_from_base("test-tf-sm-distributed"))
    assert_s3_files_exist(
        sagemaker_session,
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
    )
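
PARAMETER_SERVER_DISTRIBUTION is a module-level constant; given the shape the SageMaker SDK documents for the script-mode distributions argument, it presumably reads:

PARAMETER_SERVER_DISTRIBUTION = {'parameter_server': {'enabled': True}}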
Example 16
def test_server_side_encryption(sagemaker_session):

    boto_session = sagemaker_session.boto_session
    with kms_utils.bucket_with_encryption(boto_session,
                                          ROLE) as (bucket_with_kms, kms_key):

        output_path = os.path.join(bucket_with_kms,
                                   'test-server-side-encryption',
                                   time.strftime('%y%m%d-%H%M'))

        estimator = TensorFlow(entry_point=SCRIPT,
                               role=ROLE,
                               train_instance_count=1,
                               train_instance_type='ml.c5.xlarge',
                               sagemaker_session=sagemaker_session,
                               py_version='py3',
                               framework_version=TensorFlow.LATEST_VERSION,
                               code_location=output_path,
                               output_path=output_path,
                               model_dir='/opt/ml/model',
                               output_kms_key=kms_key)

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(RESOURCE_PATH, 'data'),
            key_prefix='scriptmode/mnist')

        with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(
                inputs=inputs,
                job_name=unique_name_from_base('test-server-side-encryption'))
Example 17
def test_smdataparallel_tf_mnist(
    sagemaker_session,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
):
    job_name = sagemaker.utils.unique_name_from_base(
        "tf-sm-distributed-dataparallel")
    estimator = TensorFlow(
        entry_point="mnist_tf.py",
        role="SageMakerRole",
        source_dir=smdataparallel_dir,
        instance_count=2,
        instance_type="ml.p3.16xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        distribution={"smdistributed": {
            "dataparallel": {
                "enabled": True
            }
        }},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)
Example 18
def test_mnist(sagemaker_session, instance_type):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role='SageMakerRole',
                           train_instance_count=1,
                           train_instance_type=instance_type,
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           metric_definitions=[{
                               'Name': 'train:global_steps',
                               'Regex': r'global_step\/sec:\s(.*)'
                           }])
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs,
                      job_name=unique_name_from_base('test-tf-sm-mnist'))
    _assert_s3_files_exist(
        estimator.model_dir,
        ['graph.pbtxt', 'model.ckpt-0.index', 'model.ckpt-0.meta'])
    df = estimator.training_job_analytics.dataframe()
    print(df)
    assert df.size > 0
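
The metric definition above surfaces TensorFlow's logged steps-per-second as a CloudWatch training metric. The regex can be sanity-checked against a typical log line:

import re

line = 'INFO:tensorflow:global_step/sec: 123.45'
match = re.search(r'global_step\/sec:\s(.*)', line)
assert match.group(1) == '123.45'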
Example 19
def test_mnist_async(sagemaker_session):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=1,
                           train_instance_type='ml.c5.4xlarge',
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           tags=TAGS)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')
    estimator.fit(inputs=inputs,
                  wait=False,
                  job_name=unique_name_from_base('test-tf-sm-async'))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
                                    estimator.latest_training_job.name, TAGS)
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name,
                                                     sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)

        result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client,
                                    predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client,
                                 estimator.latest_training_job.name, TAGS)
Example 20
def test_estimator_deploy(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "custom:1.0"
    tf = TensorFlow(
        entry_point="script.py",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    predictor = tf.deploy(INSTANCE_COUNT,
                          INSTANCE_TYPE,
                          endpoint_name="endpoint",
                          endpoint_type="tensorflow-serving")
    assert isinstance(predictor, Predictor)
Example 21
def test_tf_script_mode_mpi(time, strftime, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    py_version='py3',
                    train_instance_type=INSTANCE_TYPE,
                    train_instance_count=1,
                    framework_version='1.11',
                    source_dir=DATA_DIR,
                    distributions=DISTRIBUTION_MPI_ENABLED)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job('1.11',
                                            script_mode=True,
                                            horovod=True,
                                            repo_name=SM_IMAGE_REPO_NAME,
                                            py_version='py3')
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs
    expected_train_args['hyperparameters'][
        TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True)
    expected_train_args['hyperparameters'][
        TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2)
    expected_train_args['hyperparameters'][
        TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = json.dumps('options')

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args
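
DISTRIBUTION_MPI_ENABLED is defined elsewhere in the test module. Working backwards from the hyperparameter assertions (MPI enabled, 2 processes per host, custom options 'options'), it presumably looks like:

DISTRIBUTION_MPI_ENABLED = {
    'mpi': {
        'enabled': True,
        'processes_per_host': 2,
        'custom_mpi_options': 'options',
    }
}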
Example 22
def test_mnist(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role="SageMakerRole",
        train_instance_count=1,
        train_instance_type=instance_type,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        py_version=tests.integ.PYTHON_VERSION,
        metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=inputs, job_name=unique_name_from_base("test-tf-sm-mnist"))
    _assert_s3_files_exist(
        estimator.model_dir,
        ["graph.pbtxt", "model.ckpt-0.index", "model.ckpt-0.meta"],
        sagemaker_session.boto_region_name,
    )
    df = estimator.training_job_analytics.dataframe()
    assert df.size > 0
Example 23
def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
        ec2_client = sagemaker_session.boto_session.client('ec2')
        subnet, security_group_id = get_or_create_subnet_and_security_group(ec2_client, VPC_NAME)
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               subnets=[subnet],
                               security_group_ids=[security_group_id])

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)

        job_desc = estimator.sagemaker_session.sagemaker_client.describe_training_job(
            TrainingJobName=estimator.latest_training_job.name)
        assert [subnet] == job_desc['VpcConfig']['Subnets']
        assert [security_group_id] == job_desc['VpcConfig']['SecurityGroupIds']
Example 24
def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
Example 26
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
Example 27
def run_tf_training(script,
                    instance_type,
                    instance_count,
                    sagemaker_local_session,
                    docker_image,
                    framework_version,
                    training_data_path,
                    output_path=None,
                    hyperparameters=None):

    hyperparameters = hyperparameters or {}

    estimator = TensorFlow(entry_point=script,
                           role='SageMakerRole',
                           instance_count=instance_count,
                           instance_type=instance_type,
                           sagemaker_session=sagemaker_local_session,
                           image_uri=docker_image,
                           model_dir='/opt/ml/model',
                           output_path=output_path,
                           hyperparameters=hyperparameters,
                           base_job_name='test-tf',
                           framework_version=framework_version,
                           py_version='py3')

    estimator.fit(training_data_path)
Example 28
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500, evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example 29
def test_horovod_local_mode(sagemaker_local_session, instances, processes, tmpdir):
    output_path = "file://%s" % tmpdir
    job_name = sagemaker.utils.unique_name_from_base("tf-horovod")
    estimator = TensorFlow(
        entry_point=os.path.join(horovod_dir, "hvd_basic.py"),
        role="SageMakerRole",
        train_instance_count=2,
        train_instance_type="local",
        sagemaker_session=sagemaker_local_session,
        py_version=integ.PYTHON_VERSION,
        script_mode=True,
        output_path=output_path,
        framework_version="1.12",
        distributions={"mpi": {"enabled": True, "processes_per_host": processes}},
    )

    with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(job_name=job_name)

        tmp = str(tmpdir)
        extract_files(output_path.replace("file://", ""), tmp)

        size = instances * processes

        for rank in range(size):
            assert read_json("rank-%s" % rank, tmp)["rank"] == rank
Example 30
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500, evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=20 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example 33
def test_fit_mpi(time, strftime, sagemaker_session):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        framework_version="1.11",
        py_version="py2",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        instance_type=INSTANCE_TYPE,
        instance_count=1,
        source_dir=DATA_DIR,
        distribution=DISTRIBUTION_MPI_ENABLED,
    )

    inputs = "s3://mybucket/train"
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job("1.11",
                                            horovod=True,
                                            py_version="py2")
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["hyperparameters"][
        TensorFlow.LAUNCH_MPI_ENV_NAME] = json.dumps(True)
    expected_train_args["hyperparameters"][
        TensorFlow.MPI_NUM_PROCESSES_PER_HOST] = json.dumps(2)
    expected_train_args["hyperparameters"][
        TensorFlow.MPI_CUSTOM_MPI_OPTIONS] = json.dumps("options")

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args
Example 34
def test_server_side_encryption(sagemaker_session):
    boto_session = sagemaker_session.boto_session
    with tests.integ.kms_utils.bucket_with_encryption(boto_session, ROLE) as (
        bucket_with_kms,
        kms_key,
    ):
        output_path = os.path.join(
            bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M")
        )

        estimator = TensorFlow(
            entry_point=SCRIPT,
            role=ROLE,
            train_instance_count=1,
            train_instance_type="ml.c5.xlarge",
            sagemaker_session=sagemaker_session,
            script_mode=True,
            framework_version=TensorFlow.LATEST_VERSION,
            py_version=tests.integ.PYTHON_VERSION,
            code_location=output_path,
            output_path=output_path,
            model_dir="/opt/ml/model",
            output_kms_key=kms_key,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
        )

        with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(
                inputs=inputs, job_name=unique_name_from_base("test-server-side-encryption")
            )
Example 35
def test_run_tensorboard_locally_without_awscli_binary(time, strftime, popen, call, access, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    with pytest.raises(EnvironmentError) as error:
        tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)
    assert str(error.value) == 'The AWS CLI is not installed in the system. Please install the AWS CLI using the ' \
                               'following command: \n pip install awscli'
Example 36
def test_run_tensorboard_locally(sleep, time, strftime, popen, call, access, rmtree, mkdtemp, sync, sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    popen().poll.return_value = None

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_called_with(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6006'],
                             stderr=-1,
                             stdout=-1)
Example 37
def test_run_tensorboard_locally_port_in_use(time, strftime, popen, call, access, socket, rmtree, mkdtemp, sync,
                                             sagemaker_session):
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE)

    popen().poll.side_effect = [-1, None]

    tf.fit(inputs='s3://mybucket/train', run_tensorboard_locally=True)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6006'],
                          stderr=-1, stdout=-1)

    popen.assert_any_call(['tensorboard', '--logdir', '/my/temp/folder', '--host', 'localhost', '--port', '6007'],
                          stderr=-1, stdout=-1)
Example 38
def test_create_model_with_custom_image(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    custom_image = 'tensorflow:1.0'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, image_name=custom_image,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.image == custom_image
Example 39
def test_failed_tf_training(sagemaker_session, tf_full_version):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'iris', 'failure_script.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf-failure')

        with pytest.raises(ValueError) as e:
            estimator.fit(inputs)
        assert 'This failure is expected' in str(e.value)
Example 40
def test_create_model_with_optional_params(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    enable_cloudwatch_metrics = 'true'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir, enable_cloudwatch_metrics=enable_cloudwatch_metrics)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)

    new_role = 'role'
    model_server_workers = 2
    model = tf.create_model(role=new_role, model_server_workers=model_server_workers)

    assert model.role == new_role
    assert model.model_server_workers == model_server_workers
Example 41
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    local_mode_lock_fd = open(LOCK_PATH, 'w')
    local_mode_lock = local_mode_lock_fd.fileno()
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=sagemaker_local_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    try:
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='local',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
    finally:
        estimator.delete_endpoint()
        time.sleep(5)
        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
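
The bare lockf/LOCK_EX ... LOCK_UN pair above is easy to leak if an exception fires before the try block is entered. A context-manager wrapper (a sketch, not part of the original test) expresses the same serialization pattern more safely:

import contextlib
import fcntl

@contextlib.contextmanager
def local_serving_lock(lock_path):
    # Serialize access to the shared local-mode serving port.
    with open(lock_path, 'w') as lock_file:
        fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)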
Example 42
def test_create_model(sagemaker_session, tf_version):
    container_log_level = '"logging.INFO"'
    source_dir = 's3://mybucket/source'
    tf = TensorFlow(entry_point=SCRIPT_PATH, role=ROLE, sagemaker_session=sagemaker_session,
                    training_steps=1000, evaluation_steps=10, train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE, framework_version=tf_version,
                    container_log_level=container_log_level, base_job_name='job',
                    source_dir=source_dir)

    job_name = 'doing something'
    tf.fit(inputs='s3://mybucket/train', job_name=job_name)
    model = tf.create_model()

    assert model.sagemaker_session == sagemaker_session
    assert model.framework_version == tf_version
    assert model.py_version == tf.py_version
    assert model.entry_point == SCRIPT_PATH
    assert model.role == ROLE
    assert model.name == job_name
    assert model.container_log_level == container_log_level
    assert model.source_dir == source_dir
Example 43
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000,
                    evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version, requirements_file=REQUIREMENTS_FILE, source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py', 'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example 44
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole', sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=500, evaluation_steps=5,
                               train_instance_count=1, train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')

        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example 45
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT, source_dir=SOURCE_DIR, role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2, train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment':
         {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
          'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
          'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
          'SAGEMAKER_REQUIREMENTS': '',
          'SAGEMAKER_REGION': REGION,
          'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})