def test_deploy(sagemaker_session):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version="2.3.0",
        py_version="py37",
        instance_count=2,
        instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )

    estimator.fit("s3://mybucket/train")

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, REPOSITORY, "2.3.0",
                                           PROCESSOR)
    sagemaker_session.create_model.assert_called_with(
        ANY,
        ROLE,
        {
            "Image": image,
            "Environment": {
                "SAGEMAKER_TFS_NGINX_LOGLEVEL": "info"
            },
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
        vpc_config=None,
        enable_network_isolation=False,
        tags=None,
    )
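
The mocked sagemaker_session fixture that these unit tests take as an argument is not shown; a minimal sketch of what it could look like, built with unittest.mock (the stubbed return values, such as the s3://m/m.tar.gz artifact path the assertion above expects, are assumptions):

import pytest
from unittest.mock import Mock


@pytest.fixture()
def sagemaker_session():
    # Hypothetical stand-in for sagemaker.Session: a bare Mock auto-creates
    # train/create_model/etc., so tests can assert on their call args.
    session = Mock(name='sagemaker_session', boto_region_name='us-west-2')
    session.default_bucket.return_value = 'mybucket'
    # deploy() looks up the trained model artifacts from the job description.
    session.sagemaker_client.describe_training_job.return_value = {
        'ModelArtifacts': {'S3ModelArtifacts': 's3://m/m.tar.gz'}
    }
    return session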
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version=tf_version,
        train_instance_count=2,
        train_instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )

    estimator.fit("s3://mybucket/train")
    print("job succeeded: {}".format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version,
                                           "cpu", "py2")
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {
            "Environment": {
                "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
                "SAGEMAKER_SUBMIT_DIRECTORY": SOURCE_DIR,
                "SAGEMAKER_REQUIREMENTS": "",
                "SAGEMAKER_REGION": REGION,
                "SAGEMAKER_PROGRAM": SCRIPT,
            },
            "Image": image,
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
    )
Example #3
def test_cifar(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               training_steps=20,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    try:
        with timeout(minutes=15):
            estimator.deploy(initial_instance_count=1,
                             instance_type='ml.c4.xlarge')
    finally:
        try:
            estimator.delete_endpoint()
        except Exception:
            pass
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT,
                           source_dir=SOURCE_DIR,
                           role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2,
                           train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version,
                                           'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name, ROLE, {
            'Environment': {
                'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
                'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
                'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
                'SAGEMAKER_REQUIREMENTS': '',
                'SAGEMAKER_REGION': REGION,
                'SAGEMAKER_PROGRAM': SCRIPT
            },
            'Image': image,
            'ModelDataUrl': 's3://m/m.tar.gz'
        })
Example #5
def main():
    download_training_and_eval_data()

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # For local training a dummy role will be sufficient
    role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

    print('Starting model training')
    mnist_estimator = TensorFlow(
        entry_point='mnist_tf2.py',
        role=role,
        instance_count=1,
        instance_type='local',
        framework_version='2.3.0',
        py_version='py37',
        distribution={'parameter_server': {
            'enabled': True
        }})

    mnist_estimator.fit("file://./data/")

    print('Deploying local mode endpoint')
    predictor = mnist_estimator.deploy(initial_instance_count=1,
                                       instance_type='local')

    do_inference_on_local_endpoint(predictor)

    predictor.delete_endpoint()
    predictor.delete_model()
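
do_inference_on_local_endpoint is not defined in this snippet; a minimal hypothetical sketch, assuming the MNIST model serves 28x28 inputs:

import numpy as np


def do_inference_on_local_endpoint(predictor):
    # Send a small dummy batch to the local endpoint and print the response.
    dummy_images = np.zeros((4, 28, 28), dtype=np.float32)
    predictions = predictor.predict(dummy_images)
    print('predictions: {}'.format(predictions))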
Example #6
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.')
    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = california_housing_estimator.deploy(initial_instance_count=1, instance_type='local')

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint()
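
download_training_and_eval_data is also left undefined here; one plausible implementation, assuming scikit-learn is available and that california_housing_tf2.py reads NumPy arrays from the train and test channels:

import os

import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split


def download_training_and_eval_data():
    # Hypothetical helper: fetch the dataset and write train/test splits to
    # the local directories that fit() references via file:// URIs.
    data = fetch_california_housing()
    x_train, x_test, y_train, y_test = train_test_split(data.data, data.target, test_size=0.2)
    os.makedirs('data/train', exist_ok=True)
    os.makedirs('data/test', exist_ok=True)
    np.save('data/train/x_train.npy', x_train)
    np.save('data/train/y_train.npy', y_train)
    np.save('data/test/x_test.npy', x_test)
    np.save('data/test/y_test.npy', y_test)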
def test_mnist_async(sagemaker_session):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=1,
                           train_instance_type='ml.c5.4xlarge',
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           base_job_name=unique_name_from_base('test-tf-sm-mnist'),
                           tags=TAGS)
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(RESOURCE_PATH, 'data'),
        key_prefix='scriptmode/mnist')
    estimator.fit(inputs, wait=False)
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)

        result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS)
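
The _assert_*_tags_match helpers used above are not shown; a sketch of how they could be written against the real SageMaker API (describe_* to resolve the resource ARN, then list_tags):

def _assert_tags_match(sagemaker_client, resource_arn, tags):
    # list_tags can return extra AWS-managed tags, so check containment
    # rather than strict equality.
    actual = sagemaker_client.list_tags(ResourceArn=resource_arn)['Tags']
    for tag in tags:
        assert tag in actual


def _assert_training_job_tags_match(sagemaker_client, job_name, tags):
    arn = sagemaker_client.describe_training_job(TrainingJobName=job_name)['TrainingJobArn']
    _assert_tags_match(sagemaker_client, arn, tags)


def _assert_endpoint_tags_match(sagemaker_client, endpoint_name, tags):
    arn = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)['EndpointArn']
    _assert_tags_match(sagemaker_client, arn, tags)


def _assert_model_tags_match(sagemaker_client, model_name, tags):
    arn = sagemaker_client.describe_model(ModelName=model_name)['ModelArn']
    _assert_tags_match(sagemaker_client, arn, tags)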
Example #8
def test_keras(sagemaker_session, cpu_instance_type):
    script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")
    dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")

    with timeout(minutes=45):
        estimator = TensorFlow(
            entry_point="keras_cnn_cifar_10.py",
            source_dir=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            sagemaker_session=sagemaker_session,
            hyperparameters={"learning_rate": 1e-4, "decay": 1e-6},
            training_steps=50,
            evaluation_steps=5,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            train_max_run=45 * 60,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix="data/cifar10"
        )
        job_name = unique_name_from_base("test-tf-keras")

        estimator.fit(inputs, job_name=job_name)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type=cpu_instance_type)

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response["outputs"]["probabilities"]["floatVal"]) == 10
Example #9
def test_tf(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge')

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
Example #10
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
def test_tf_local_data_local_script():
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=LocalNoS3Session())

        inputs = 'file://' + DATA_PATH

        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with local_mode_utils.lock():
        try:
            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()
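
local_mode_utils.lock() is referenced but not included; the fcntl-based locking used verbatim in the test_tf_local_mode examples further down suggests a context manager along these lines (the lock path is an assumption):

import fcntl
import tempfile
from contextlib import contextmanager

LOCK_PATH = tempfile.gettempdir() + '/sagemaker_test_local_mode_lock'


@contextmanager
def lock():
    # Local Mode serving binds a fixed host port, so concurrent tests must
    # serialize their deploy/predict phase behind an exclusive file lock.
    with open(LOCK_PATH, 'w') as lock_file:
        fcntl.lockf(lock_file.fileno(), fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.lockf(lock_file.fileno(), fcntl.LOCK_UN)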
Example #12
def test_deploy_with_input_handlers(sagemaker_session, instance_type,
                                    tf_full_version, tf_full_py_version):
    estimator = TensorFlow(
        entry_point="training.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        instance_count=1,
        instance_type=instance_type,
        framework_version=tf_full_version,
        py_version=tf_full_py_version,
        sagemaker_session=sagemaker_session,
        tags=TAGS,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))

    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name,
                                                     sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
            entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
        )

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}

        result = predictor.predict(input_data)
        assert expected_result == result
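
The inference.py passed to deploy() above implements the SageMaker TensorFlow Serving container's pre/post-processing interface; the input_handler/output_handler signatures below follow that documented convention, but the bodies are a minimal pass-through sketch, not this test's actual handlers:

def input_handler(data, context):
    # Forward a JSON request body to TensorFlow Serving unchanged.
    if context.request_content_type == 'application/json':
        return data.read().decode('utf-8')
    raise ValueError('unsupported content type: {}'.format(context.request_content_type))


def output_handler(response, context):
    # Return the TensorFlow Serving response body and its content type.
    return response.content, 'application/json'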
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name, sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path, role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500, evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
Example #15
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(
        BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix,
                                      script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               sagemaker_session=sagemaker_session,
                               hyperparameters={
                                   'learning_rate': 1e-4,
                                   'decay': 1e-6
                               },
                               training_steps=50,
                               evaluation_steps=5,
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix='data/cifar10')

        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(
            predict_response['outputs']['probabilities']['floatVal']) == 10
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'source')

        dataset_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'data')

        estimator = TensorFlow(entry_point='resnet_cifar_10.py', source_dir=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version, training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=2, train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session, train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        job_name = unique_name_from_base('test-tf-cifar')

        estimator.fit(inputs, logs=False, job_name=job_name)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
Example #19
def test_deploy_with_input_handlers(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point="inference.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        train_instance_count=1,
        train_instance_type=instance_type,
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )

    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))

    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name,
                                                     sagemaker_session):

        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type=instance_type,
                                     endpoint_name=endpoint_name)

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}

        result = predictor.predict(input_data)
        assert expected_result == result
def test_estimator_deploy(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "custom:1.0"
    tf = TensorFlow(
        entry_point="script.py",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    predictor = tf.deploy(INSTANCE_COUNT,
                          INSTANCE_TYPE,
                          endpoint_name="endpoint",
                          endpoint_type="tensorflow-serving")
    assert isinstance(predictor, Predictor)
Example #21
def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
def test_server_side_encryption(sagemaker_session, tf_full_version,
                                py_version):
    with kms_utils.bucket_with_encryption(sagemaker_session,
                                          ROLE) as (bucket_with_kms, kms_key):
        output_path = os.path.join(bucket_with_kms,
                                   "test-server-side-encryption",
                                   time.strftime("%y%m%d-%H%M"))

        estimator = TensorFlow(
            entry_point="training.py",
            source_dir=TFS_RESOURCE_PATH,
            role=ROLE,
            train_instance_count=1,
            train_instance_type="ml.c5.xlarge",
            sagemaker_session=sagemaker_session,
            script_mode=True,
            framework_version=tf_full_version,
            py_version=py_version,
            code_location=output_path,
            output_path=output_path,
            model_dir="/opt/ml/model",
            output_kms_key=kms_key,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(MNIST_RESOURCE_PATH, "data"),
            key_prefix="scriptmode/mnist")

        with tests.integ.timeout.timeout(
                minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(
                inputs=inputs,
                job_name=unique_name_from_base("test-server-side-encryption"))

        endpoint_name = unique_name_from_base("test-server-side-encryption")
        with timeout.timeout_and_delete_endpoint_by_name(
                endpoint_name, sagemaker_session):
            estimator.deploy(
                initial_instance_count=1,
                instance_type="ml.c5.xlarge",
                endpoint_name=endpoint_name,
                entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
            )
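
kms_utils.bucket_with_encryption is not shown; a hedged sketch of such a context manager using boto3 (the bucket name, key policy, and region handling are all simplified assumptions, and the real helper presumably also grants the role access to the key):

import boto3
from contextlib import contextmanager


@contextmanager
def bucket_with_encryption(sagemaker_session, role):
    # Create a KMS key and a bucket with SSE-KMS default encryption, then
    # yield the bucket URI together with the key ARN.
    kms = boto3.client('kms')
    key = kms.create_key(Description='integ-test key')['KeyMetadata']
    s3 = boto3.client('s3')
    bucket = 'my-encrypted-test-bucket'  # assumed name; must be globally unique
    s3.create_bucket(Bucket=bucket)  # outside us-east-1 a CreateBucketConfiguration is required
    s3.put_bucket_encryption(
        Bucket=bucket,
        ServerSideEncryptionConfiguration={
            'Rules': [{
                'ApplyServerSideEncryptionByDefault': {
                    'SSEAlgorithm': 'aws:kms',
                    'KMSMasterKeyID': key['KeyId'],
                }
            }]
        })
    yield 's3://' + bucket, key['Arn']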
Example #23
	def create_predictor_from_csv(self):
			Log.i('initiating sagemaker model creation')
			role = AppConfig.setting('AWS_PREDICTOR_ROLE')
			bucket='cryptrade-sagemaker'
			custom_code_upload_location = 's3://{}/customcode/tensorflow_iris'.format(bucket)
			model_artifacts_location = 's3://{}/artifacts'.format(bucket)
			Log.d('custom code will be uploaded to: {}', custom_code_upload_location)
			Log.d('training artifacts will be uploaded to: {}', model_artifacts_location)
			sess = sagemaker.Session()
			def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
				"""From SM examples. Like here: https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb"""
				file = Path(filepath)
				s3 = boto3.resource('s3')
				key = channel + '/' + file.name
				bucket_ref = s3.Bucket(bucket)
				objs = list(bucket_ref.objects.filter(Prefix=key))
				is_file_already_existing = len(objs) > 0 and objs[0].key == key
				if is_file_already_existing is True:
					if skip_if_name_and_size_matches is True:
						s3_client = boto3.client('s3')
						response = s3_client.head_object(Bucket=bucket, Key=key)
						local_size = file.stat().st_size
						remote_size = response['ContentLength']
						if remote_size == local_size:
							Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}', local_size/1000, key)
							return
					Log.w('overwriting existing s3 key: {}', key)
				with open(filepath, "rb") as data:
					s3.Bucket(bucket).put_object(Key=key, Body=data)
			s3_data_folder = 'data'
			upload_to_s3(s3_data_folder, self.train_filepath, True)
			upload_to_s3(s3_data_folder, self.test_filepath, True)
			upload_to_s3(s3_data_folder, self.meta_filepath)
			estimator = TensorFlow(
				entry_point='aws_dnn_predictor_entry.py',
				role=role,
				output_path=model_artifacts_location,
				code_location=custom_code_upload_location,
				train_instance_count=1,
				train_instance_type='ml.c5.xlarge',
				training_steps=1000,
				evaluation_steps=100
				)
			train_data_location = 's3://{}/{}'.format(bucket, s3_data_folder)
			Log.i('fitting train data: {}', train_data_location)
			estimator.fit(train_data_location)
			Log.i('deploying model')
			deploy_start = datetime.now()
			predictor = estimator.deploy(initial_instance_count=1,
			                             instance_type='ml.t2.medium')
			deploy_end = datetime.now()
			Log.i('deployed predictor in {}s, endpoint is:\n{}', deploy_end - deploy_start, predictor.endpoint)
			
			self.predictor = predictor
Example #24
def test_tf(time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version)

    inputs = 's3://mybucket/train'

    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']
    boto_call_names = [
        c[0] for c in sagemaker_session.boto_session.method_calls
    ]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource'][
        'S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", GPU_IMAGE_NAME, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU_IMAGE_NAME)

    assert 'cpu' in model.prepare_container_def(CPU_IMAGE_NAME)['Image']
    predictor = tf.deploy(1, GPU_IMAGE_NAME)
    assert isinstance(predictor, TensorFlowPredictor)
def test_tf(sagemaker_session, tf_version):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    inputs = "s3://mybucket/train"

    tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job(tf_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"][
        "S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY":
                "s3://mybucket/sagemaker-tensorflow-2017-11-06-14:14:15.673/source/sourcedir.tar.gz",  # noqa: E501
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REQUIREMENTS": "dummy_requirements.txt",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": create_image_uri("us-west-2", "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert "cpu" in model.prepare_container_def(INSTANCE_TYPE)["Image"]
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT, source_dir=SOURCE_DIR, role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2, train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')

    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))

    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)
    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment':
         {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
          'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
          'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
          'SAGEMAKER_REQUIREMENTS': '',
          'SAGEMAKER_REGION': REGION,
          'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})
Example #27
def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version,
                     tf_full_py_version):
    if tf_full_version == "2.7.0":
        tf_full_version = "2.7"

    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=MNIST_RESOURCE_PATH,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=tf_full_version,
        py_version=tf_full_py_version,
        tags=TAGS,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"),
        key_prefix="scriptmode/mnist")
    estimator.fit(inputs=inputs,
                  wait=False,
                  job_name=unique_name_from_base("test-tf-sm-async"))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
                                    estimator.latest_training_job.name, TAGS)
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
            endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        model_name = "model-mnist-async"
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=cpu_instance_type,
            endpoint_name=endpoint_name,
            model_name=model_name,
        )

        result = predictor.predict(np.zeros(784))
        print("predict result: {}".format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client,
                                    predictor.endpoint_name, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client,
                                 model_name, TAGS)
        _assert_model_name_match(sagemaker_session.sagemaker_client,
                                 endpoint_name, model_name)
def test_mnist_async(sagemaker_session, cpu_instance_type):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=1,
        train_instance_type="ml.c5.4xlarge",
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        # testing py-sdk functionality, no need to run against all TF versions
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"),
        key_prefix="scriptmode/mnist")
    estimator.fit(inputs=inputs,
                  wait=False,
                  job_name=unique_name_from_base("test-tf-sm-async"))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
                                    estimator.latest_training_job.name, TAGS)
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(
            endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        model_name = "model-mnist-async"
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=cpu_instance_type,
            endpoint_name=endpoint_name,
            model_name=model_name,
        )

        result = predictor.predict(np.zeros(784))
        print("predict result: {}".format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client,
                                    predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client,
                                 model_name, TAGS)
        _assert_model_name_match(sagemaker_session.sagemaker_client,
                                 endpoint_name, model_name)
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    local_mode_lock_fd = open(LOCK_PATH, 'w')
    local_mode_lock = local_mode_lock_fd.fileno()
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')

        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=sagemaker_local_session)

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    try:
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='local',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
    finally:
        estimator.delete_endpoint()
        time.sleep(5)
        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
Example #31
def run():
    # Create the estimator used to drive SageMaker.
    # Which params are required and which are optional hasn't been checked
    # yet, so for now this reuses the values from the official docs.
    tf_estimator = TensorFlow(
        entry_point='tf-keras-train.py',  # script that defines the model
        role='SageMakerRole',  # IAM role; must be granted permission to use SageMaker
        training_steps=20,  # training parameter
        evaluation_steps=10,  # training parameter
        train_instance_count=1,  # number of training instances; more than one enables distributed training
        train_instance_type='ml.m5.large'  # instance type
    )

    # Start training.
    tf_estimator.fit(
        's3://sagemaker-ap-northeast-1-192494425048')  # pass the S3 bucket URI

    # Deploy the trained model.
    tf_predictor = tf_estimator.deploy(initial_instance_count=1,
                                       instance_type='ml.t2.medium')
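
run() never exercises the returned predictor; if extended inside the function, a hypothetical smoke test might read (assumes numpy is imported as np and a 28x28 serving signature in tf-keras-train.py):

    # Hypothetical check against the new endpoint.
    result = tf_predictor.predict(np.zeros((1, 28, 28)))
    print('predict result: {}'.format(result))

    # Delete the endpoint when done to stop incurring charges.
    tf_predictor.delete_endpoint()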
def test_tf_local_mode(sagemaker_local_session):
    with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
        script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")

        estimator = TensorFlow(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={"input_tensor_name": "inputs"},
            train_instance_count=1,
            train_instance_type="local",
            base_job_name="test-tf",
            sagemaker_session=sagemaker_local_session,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix="integ-test-data/tf_iris"
        )
        estimator.fit(inputs)
        print("job succeeded: {}".format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with lock.lock(LOCK_PATH):
        try:
            json_predictor = estimator.deploy(
                initial_instance_count=1, instance_type="local", endpoint_name=endpoint_name
            )

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({"inputs": features})
            print("predict result: {}".format(dict_result))
            list_result = json_predictor.predict(features)
            print("predict result: {}".format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE, role=ROLE, sagemaker_session=sagemaker_session, training_steps=1000,
                    evaluation_steps=10, train_instance_count=INSTANCE_COUNT, train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version, requirements_file=REQUIREMENTS_FILE, source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()

    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py', 'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false', 'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
Example #34
def test_cifar(sagemaker_session):
    with timeout(minutes=45):
        script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")

        dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")

        estimator = TensorFlow(
            entry_point="resnet_cifar_10.py",
            source_dir=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            training_steps=50,
            evaluation_steps=5,
            train_instance_count=2,
            train_instance_type="ml.p2.xlarge",
            sagemaker_session=sagemaker_session,
            train_max_run=45 * 60,
            base_job_name="test-cifar",
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix="data/cifar10")
        job_name = unique_name_from_base("test-tf-cifar")

        estimator.fit(inputs, logs=False, job_name=job_name)
        print("job succeeded: {}".format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type="ml.p2.xlarge")
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(
            predict_response["outputs"]["probabilities"]["floatVal"]) == 10
def main():
    config = get_config(LOCAL_MODE)
    #config = get_config(CLOUD_MODE)

    download_training_and_eval_data()

    if config['mode'] == CLOUD_MODE:
        upload_data_to_s3(config['bucket'], config['s3_data_prefix'])

    print('Starting model training.')
    print(
        'Note: if launching for the first time in local mode, container image download might take a few minutes to complete.'
    )
    mnist_estimator = TensorFlow(
        entry_point='mnist_tf2.py',
        source_dir='code',
        role=config['role'],
        instance_count=1,
        instance_type=config['instance_type'],
        framework_version='2.4.1',
        py_version='py37',
        distribution={'parameter_server': {
            'enabled': True
        }})

    mnist_estimator.fit(config['training_dataset_path'])
    print('Completed model training')

    print('Deploying endpoint in ' + config['mode'])
    predictor = mnist_estimator.deploy(initial_instance_count=1,
                                       instance_type=config['instance_type'])

    do_inference_on_local_endpoint(predictor, config['mode'])

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint()
    predictor.delete_model()
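
get_config, LOCAL_MODE, and CLOUD_MODE are not defined in this snippet; a hypothetical factory that satisfies every key the function reads (all values are assumptions, and DUMMY_IAM_ROLE is borrowed from the earlier local-mode example):

LOCAL_MODE = 'local'
CLOUD_MODE = 'cloud'


def get_config(mode):
    # Hypothetical config factory covering the keys main() looks up.
    if mode == LOCAL_MODE:
        return {
            'mode': LOCAL_MODE,
            'role': DUMMY_IAM_ROLE,
            'instance_type': 'local',
            'training_dataset_path': 'file://./data/train',
        }
    return {
        'mode': CLOUD_MODE,
        'role': 'SageMakerRole',
        'instance_type': 'ml.c5.xlarge',
        'bucket': 'my-sagemaker-bucket',
        's3_data_prefix': 'mnist-data',
        'training_dataset_path': 's3://my-sagemaker-bucket/mnist-data',
    }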
Example #36
def test_serving(self, sagemaker_session, ecr_image, framework_version,
                 instance_type, instance_count, tmpdir, capsys,
                 mnist_dataset):
    script = os.path.join(resource_path, 'mnist', 'mnist.py')
    estimator = TensorFlow(
        entry_point=script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={
            TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
        },
    )
    estimator.fit(mnist_dataset,
                  job_name=unique_name_from_base('test-TF-trcomp-serving'))
    _assert_model_exported_to_s3(estimator)
    captured = capsys.readouterr()
    _assert_training_compiler_invoked(captured)
    predictor = estimator.deploy(initial_instance_count=1,
                                 instance_type=instance_type)
    predictor.delete_predictor()
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole', sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=500, evaluation_steps=5,
                               train_instance_count=1, train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path, key_prefix='data/cifar10')

        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10