def test_deploy(sagemaker_session):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version="2.3.0",
        py_version="py37",
        instance_count=2,
        instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )
    estimator.fit("s3://mybucket/train")
    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    image = IMAGE_URI_FORMAT_STRING.format(REGION, REPOSITORY, "2.3.0", PROCESSOR)
    sagemaker_session.create_model.assert_called_with(
        ANY,
        ROLE,
        {
            "Image": image,
            "Environment": {"SAGEMAKER_TFS_NGINX_LOGLEVEL": "info"},
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
        vpc_config=None,
        enable_network_isolation=False,
        tags=None,
    )
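# test_deploy above (and the variants below) asserts against a mocked SageMaker
# session rather than a live API. A minimal sketch of the kind of pytest fixture
# these unit tests assume; the attribute names here are illustrative, not the
# SDK's exact test fixture:
import pytest
from unittest.mock import Mock

REGION = "us-west-2"  # assumed; matches the region used in the unit tests above

@pytest.fixture()
def sagemaker_session():
    boto_mock = Mock(name="boto_session", region_name=REGION)
    session = Mock(name="sagemaker_session", boto_session=boto_mock,
                   boto_region_name=REGION, config=None, local_mode=False)
    # fit() reads the artifact location from the training job description,
    # which is where the "s3://m/m.tar.gz" in the assertions comes from.
    session.sagemaker_client.describe_training_job.return_value = {
        "ModelArtifacts": {"S3ModelArtifacts": "s3://m/m.tar.gz"}
    }
    return session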
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=SOURCE_DIR,
        role=ROLE,
        framework_version=tf_version,
        train_instance_count=2,
        train_instance_type=INSTANCE_TYPE_CPU,
        sagemaker_session=sagemaker_session,
        base_job_name="test-cifar",
    )
    estimator.fit("s3://mybucket/train")
    print("job succeeded: {}".format(estimator.latest_training_job.name))
    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, "cpu", "py2")
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {
            "Environment": {
                "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
                "SAGEMAKER_SUBMIT_DIRECTORY": SOURCE_DIR,
                "SAGEMAKER_REQUIREMENTS": "",
                "SAGEMAKER_REGION": REGION,
                "SAGEMAKER_PROGRAM": SCRIPT,
            },
            "Image": image,
            "ModelDataUrl": "s3://m/m.tar.gz",
        },
    )
def test_cifar(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')
        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               training_steps=20,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    try:
        with timeout(minutes=15):
            estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
    finally:
        try:
            estimator.delete_endpoint()
        except Exception:
            pass
def test_deploy(sagemaker_session, tf_version):
    estimator = TensorFlow(entry_point=SCRIPT,
                           source_dir=SOURCE_DIR,
                           role=ROLE,
                           framework_version=tf_version,
                           train_instance_count=2,
                           train_instance_type=INSTANCE_TYPE_CPU,
                           sagemaker_session=sagemaker_session,
                           base_job_name='test-cifar')
    estimator.fit('s3://mybucket/train')
    print('job succeeded: {}'.format(estimator.latest_training_job.name))
    estimator.deploy(initial_instance_count=1, instance_type=INSTANCE_TYPE_CPU)

    image = IMAGE_URI_FORMAT_STRING.format(REGION, CPU_IMAGE_NAME, tf_version, 'cpu', 'py2')
    sagemaker_session.create_model.assert_called_with(
        estimator._current_job_name,
        ROLE,
        {'Environment': {'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
                         'SAGEMAKER_CONTAINER_LOG_LEVEL': '20',
                         'SAGEMAKER_SUBMIT_DIRECTORY': SOURCE_DIR,
                         'SAGEMAKER_REQUIREMENTS': '',
                         'SAGEMAKER_REGION': REGION,
                         'SAGEMAKER_PROGRAM': SCRIPT},
         'Image': image,
         'ModelDataUrl': 's3://m/m.tar.gz'})
def main():
    download_training_and_eval_data()

    sagemaker_session = LocalSession()
    sagemaker_session.config = {'local': {'local_code': True}}

    # For local training a dummy role will be sufficient
    role = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'

    print('Starting model training')
    mnist_estimator = TensorFlow(entry_point='mnist_tf2.py',
                                 role=role,
                                 instance_count=1,
                                 instance_type='local',
                                 framework_version='2.3.0',
                                 py_version='py37',
                                 distribution={'parameter_server': {'enabled': True}})
    mnist_estimator.fit('file://./data/')

    print('Deploying local mode endpoint')
    predictor = mnist_estimator.deploy(initial_instance_count=1, instance_type='local')

    do_inference_on_local_endpoint(predictor)

    # delete_endpoint() takes no endpoint name; the predictor already knows its endpoint
    predictor.delete_endpoint()
    predictor.delete_model()
def main():
    download_training_and_eval_data()

    print('Starting model training.')
    print('Note: if launching for the first time in local mode, '
          'container image download might take a few minutes to complete.')

    california_housing_estimator = TensorFlow(entry_point='california_housing_tf2.py',
                                              source_dir='code',
                                              role=DUMMY_IAM_ROLE,
                                              instance_count=1,
                                              instance_type='local',
                                              framework_version='2.4.1',
                                              py_version='py37')

    inputs = {'train': 'file://./data/train', 'test': 'file://./data/test'}
    california_housing_estimator.fit(inputs)
    print('Completed model training')

    print('Deploying endpoint in local mode')
    predictor = california_housing_estimator.deploy(initial_instance_count=1,
                                                    instance_type='local')

    do_inference_on_local_endpoint(predictor)

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint()
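# Both local-mode scripts above rely on helpers defined elsewhere in their
# examples. A minimal sketch of the inference helper, assuming the endpoint
# returns TensorFlow Serving style {'predictions': ...} JSON and that a CSV
# test split was downloaded beforehand (both the path and the payload shape
# are assumptions):
import pandas as pd

def do_inference_on_local_endpoint(predictor):
    x_test = pd.read_csv('./data/test/x_test.csv').values  # hypothetical path
    results = predictor.predict(x_test[:10])['predictions']
    print('predictions: {}'.format(results))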
def test_mnist_async(sagemaker_session):
    estimator = TensorFlow(entry_point=SCRIPT,
                           role=ROLE,
                           train_instance_count=1,
                           train_instance_type='ml.c5.4xlarge',
                           sagemaker_session=sagemaker_session,
                           py_version='py3',
                           framework_version=TensorFlow.LATEST_VERSION,
                           base_job_name=unique_name_from_base('test-tf-sm-mnist'),
                           tags=TAGS)
    inputs = estimator.sagemaker_session.upload_data(path=os.path.join(RESOURCE_PATH, 'data'),
                                                     key_prefix='scriptmode/mnist')
    estimator.fit(inputs, wait=False)
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(sagemaker_session.sagemaker_client,
                                    estimator.latest_training_job.name, TAGS)
    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        predictor = estimator.deploy(initial_instance_count=1,
                                     instance_type='ml.c4.xlarge',
                                     endpoint_name=endpoint_name)

        result = predictor.predict(np.zeros(784))
        print('predict result: {}'.format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client,
                                 estimator.latest_training_job.name, TAGS)
def test_keras(sagemaker_session, cpu_instance_type):
    script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")
    dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")

    with timeout(minutes=45):
        estimator = TensorFlow(
            entry_point="keras_cnn_cifar_10.py",
            source_dir=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            sagemaker_session=sagemaker_session,
            hyperparameters={"learning_rate": 1e-4, "decay": 1e-6},
            training_steps=50,
            evaluation_steps=5,
            train_instance_count=1,
            train_instance_type=cpu_instance_type,
            train_max_run=45 * 60,
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix="data/cifar10"
        )
        job_name = unique_name_from_base("test-tf-keras")
        estimator.fit(inputs, job_name=job_name)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type=cpu_instance_type)

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response["outputs"]["probabilities"]["floatVal"]) == 10
def test_tf(sagemaker_session):
    with timeout(minutes=15):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    with timeout_and_delete_endpoint(estimator=estimator, minutes=20):
        json_predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.c4.xlarge')
        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
def test_tf_async(sagemaker_session):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs, wait=False)
        training_job_name = estimator.latest_training_job.name
        time.sleep(20)

    endpoint_name = training_job_name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(training_job_name=training_job_name,
                                      sagemaker_session=sagemaker_session)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)
        result = json_predictor.predict([6.4, 3.2, 4.5, 1.5])
        print('predict result: {}'.format(result))
def test_tf_local_data_local_script():
    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=LocalNoS3Session())

        inputs = 'file://' + DATA_PATH
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with local_mode_utils.lock():
        try:
            json_predictor = estimator.deploy(initial_instance_count=1,
                                              instance_type='local',
                                              endpoint_name=endpoint_name)

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({'inputs': features})
            print('predict result: {}'.format(dict_result))
            list_result = json_predictor.predict(features)
            print('predict result: {}'.format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()
def test_deploy_with_input_handlers(sagemaker_session, instance_type, tf_full_version, tf_full_py_version):
    estimator = TensorFlow(
        entry_point="training.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        instance_count=1,
        instance_type=instance_type,
        framework_version=tf_full_version,
        py_version=tf_full_py_version,
        sagemaker_session=sagemaker_session,
        tags=TAGS,
    )
    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))
    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=instance_type,
            endpoint_name=endpoint_name,
            entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
        )

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}
        result = predictor.predict(input_data)
        assert expected_result == result
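# The entry_point="inference.py" passed to deploy() above supplies optional
# pre/post-processing hooks to the TensorFlow Serving container. A minimal
# sketch of that handler interface; the pass-through bodies are illustrative,
# not the actual test resource:
import json

def input_handler(data, context):
    # Deserialize the incoming request and re-serialize it as a TFS REST payload.
    if context.request_content_type == 'application/json':
        payload = json.loads(data.read().decode('utf-8'))
        return json.dumps(payload)
    raise ValueError('unsupported content type: {}'.format(context.request_content_type))

def output_handler(response, context):
    # Return the TFS response body along with the content type to send back.
    return response.content, context.accept_header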
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')
        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs, logs=False)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_tf(m_tar, e_tar, time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_FILE,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version,
                    requirements_file=REQUIREMENTS_FILE,
                    source_dir=DATA_DIR)

    inputs = 's3://mybucket/train'
    s3_prefix = 's3://{}/{}/source/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    e_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)
    s3_prefix = 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME)
    m_tar.return_value = UploadedCode(s3_prefix=s3_prefix, script_name=SCRIPT_FILE)

    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()
    environment = {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_REQUIREMENTS': 'dummy_requirements.txt',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', 'tensorflow', INSTANCE_TYPE, tf_version, 'py2'),
        'ModelDataUrl': 's3://m/m.tar.gz'
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert 'cpu' in model.prepare_container_def(INSTANCE_TYPE)['Image']
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=50,
                               evaluation_steps=5,
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_cifar(sagemaker_session, tf_full_version):
    with timeout(minutes=45):
        script_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'source')
        dataset_path = os.path.join(tests.integ.DATA_DIR, 'cifar_10', 'data')
        estimator = TensorFlow(entry_point='resnet_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=2,
                               train_instance_type='ml.p2.xlarge',
                               sagemaker_session=sagemaker_session,
                               train_max_run=45 * 60,
                               base_job_name='test-cifar')

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        job_name = unique_name_from_base('test-tf-cifar')
        estimator.fit(inputs, logs=False, job_name=job_name)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10
def test_tf(sagemaker_session, tf_full_version):
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               sagemaker_session=sagemaker_session,
                               base_job_name='test-tf')

        inputs = sagemaker_session.upload_data(path=DATA_PATH,
                                               key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='ml.c4.xlarge',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
def test_deploy_with_input_handlers(sagemaker_session, instance_type):
    estimator = TensorFlow(
        entry_point="inference.py",
        source_dir=TFS_RESOURCE_PATH,
        role=ROLE,
        train_instance_count=1,
        train_instance_type=instance_type,
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )
    estimator.fit(job_name=unique_name_from_base("test-tf-tfs-deploy"))
    endpoint_name = estimator.latest_training_job.name

    with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(
            initial_instance_count=1, instance_type=instance_type, endpoint_name=endpoint_name
        )

        input_data = {"instances": [1.0, 2.0, 5.0]}
        expected_result = {"predictions": [4.0, 4.5, 6.0]}
        result = predictor.predict(input_data)
        assert expected_result == result
def test_estimator_deploy(sagemaker_session):
    container_log_level = '"logging.INFO"'
    source_dir = "s3://mybucket/source"
    custom_image = "custom:1.0"
    tf = TensorFlow(
        entry_point="script.py",
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        image_name=custom_image,
        container_log_level=container_log_level,
        base_job_name="job",
        source_dir=source_dir,
    )

    job_name = "doing something"
    tf.fit(inputs="s3://mybucket/train", job_name=job_name)
    predictor = tf.deploy(
        INSTANCE_COUNT, INSTANCE_TYPE, endpoint_name="endpoint", endpoint_type="tensorflow-serving"
    )
    assert isinstance(predictor, Predictor)
def test_server_side_encryption(sagemaker_session, tf_full_version, py_version):
    with kms_utils.bucket_with_encryption(sagemaker_session, ROLE) as (bucket_with_kms, kms_key):
        output_path = os.path.join(
            bucket_with_kms, "test-server-side-encryption", time.strftime("%y%m%d-%H%M")
        )

        estimator = TensorFlow(
            entry_point="training.py",
            source_dir=TFS_RESOURCE_PATH,
            role=ROLE,
            train_instance_count=1,
            train_instance_type="ml.c5.xlarge",
            sagemaker_session=sagemaker_session,
            script_mode=True,
            framework_version=tf_full_version,
            py_version=py_version,
            code_location=output_path,
            output_path=output_path,
            model_dir="/opt/ml/model",
            output_kms_key=kms_key,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
        )

        with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
            estimator.fit(
                inputs=inputs, job_name=unique_name_from_base("test-server-side-encryption")
            )

        endpoint_name = unique_name_from_base("test-server-side-encryption")
        with timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
            estimator.deploy(
                initial_instance_count=1,
                instance_type="ml.c5.xlarge",
                endpoint_name=endpoint_name,
                entry_point=os.path.join(TFS_RESOURCE_PATH, "inference.py"),
            )
def create_predictor_from_csv(self):
    Log.i('initiating sagemaker model creation')
    role = AppConfig.setting('AWS_PREDICTOR_ROLE')
    bucket = 'cryptrade-sagemaker'
    custom_code_upload_location = 's3://{}/customcode/tensorflow_iris'.format(bucket)
    model_artifacts_location = 's3://{}/artifacts'.format(bucket)
    Log.d('custom code will be uploaded to: {}', custom_code_upload_location)
    Log.d('training artifacts will be uploaded to: {}', model_artifacts_location)
    sess = sagemaker.Session()

    def upload_to_s3(channel, filepath, skip_if_name_and_size_matches=False):
        """From the SageMaker examples, e.g.:
        https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/imageclassification_caltech/Image-classification-transfer-learning.ipynb
        """
        file = Path(filepath)
        s3 = boto3.resource('s3')
        key = channel + '/' + file.name
        bucket_ref = s3.Bucket(bucket)
        objs = list(bucket_ref.objects.filter(Prefix=key))
        is_file_already_existing = len(objs) > 0 and objs[0].key == key
        if is_file_already_existing:
            if skip_if_name_and_size_matches:
                s3_client = boto3.client('s3')
                response = s3_client.head_object(Bucket=bucket, Key=key)
                local_size = file.stat().st_size
                remote_size = response['ContentLength']
                if remote_size == local_size:
                    Log.w('skipping upload as s3 key of same size ({:.2f}kb) already exists: {}',
                          local_size / 1000, key)
                    return
            Log.w('overwriting existing s3 key: {}', key)
        with open(filepath, 'rb') as data:
            s3.Bucket(bucket).put_object(Key=key, Body=data)

    s3_data_folder = 'data'
    upload_to_s3(s3_data_folder, self.train_filepath, True)
    upload_to_s3(s3_data_folder, self.test_filepath, True)
    upload_to_s3(s3_data_folder, self.meta_filepath)

    estimator = TensorFlow(entry_point='aws_dnn_predictor_entry.py',
                           role=role,
                           output_path=model_artifacts_location,
                           code_location=custom_code_upload_location,
                           train_instance_count=1,
                           train_instance_type='ml.c5.xlarge',
                           training_steps=1000,
                           evaluation_steps=100)

    train_data_location = 's3://{}/{}'.format(bucket, s3_data_folder)
    Log.i('fitting train data: {}', train_data_location)
    estimator.fit(train_data_location)

    Log.i('deploying model')
    deploy_start = datetime.now()
    predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.t2.medium')
    deploy_end = datetime.now()
    Log.i('deployed predictor in {}s, endpoint is:\n{}',
          deploy_end - deploy_start, predictor.endpoint)
    self.predictor = predictor
def test_tf(time, strftime, sagemaker_session, tf_version):
    tf = TensorFlow(entry_point=SCRIPT_PATH,
                    role=ROLE,
                    sagemaker_session=sagemaker_session,
                    training_steps=1000,
                    evaluation_steps=10,
                    train_instance_count=INSTANCE_COUNT,
                    train_instance_type=INSTANCE_TYPE,
                    framework_version=tf_version)

    inputs = 's3://mybucket/train'
    tf.fit(inputs=inputs)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ['train', 'logs_for_job']

    boto_call_names = [c[0] for c in sagemaker_session.boto_session.method_calls]
    assert boto_call_names == ['resource']

    expected_train_args = _create_train_job(tf_version)
    expected_train_args['input_config'][0]['DataSource']['S3DataSource']['S3Uri'] = inputs

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()
    assert {
        'Environment': {
            'SAGEMAKER_SUBMIT_DIRECTORY': 's3://{}/{}/sourcedir.tar.gz'.format(BUCKET_NAME, JOB_NAME),
            'SAGEMAKER_PROGRAM': 'dummy_script.py',
            'SAGEMAKER_ENABLE_CLOUDWATCH_METRICS': 'false',
            'SAGEMAKER_REGION': 'us-west-2',
            'SAGEMAKER_CONTAINER_LOG_LEVEL': '20'
        },
        'Image': create_image_uri('us-west-2', 'tensorflow', GPU_IMAGE_NAME, tf_version, 'py2'),
        'ModelDataUrl': 's3://m/m.tar.gz'
    } == model.prepare_container_def(GPU_IMAGE_NAME)

    assert 'cpu' in model.prepare_container_def(CPU_IMAGE_NAME)['Image']
    predictor = tf.deploy(1, GPU_IMAGE_NAME)
    assert isinstance(predictor, TensorFlowPredictor)
def test_tf(sagemaker_session, tf_version):
    tf = TensorFlow(
        entry_point=SCRIPT_FILE,
        role=ROLE,
        sagemaker_session=sagemaker_session,
        training_steps=1000,
        evaluation_steps=10,
        train_instance_count=INSTANCE_COUNT,
        train_instance_type=INSTANCE_TYPE,
        framework_version=tf_version,
        requirements_file=REQUIREMENTS_FILE,
        source_dir=DATA_DIR,
    )

    inputs = "s3://mybucket/train"
    tf.fit(inputs=inputs, experiment_config=EXPERIMENT_CONFIG)

    call_names = [c[0] for c in sagemaker_session.method_calls]
    assert call_names == ["train", "logs_for_job"]

    expected_train_args = _create_train_job(tf_version)
    expected_train_args["input_config"][0]["DataSource"]["S3DataSource"]["S3Uri"] = inputs
    expected_train_args["experiment_config"] = EXPERIMENT_CONFIG

    actual_train_args = sagemaker_session.method_calls[0][2]
    assert actual_train_args == expected_train_args

    model = tf.create_model()
    environment = {
        "Environment": {
            "SAGEMAKER_SUBMIT_DIRECTORY": "s3://mybucket/sagemaker-tensorflow-2017-11-06-14:14:15.673/source/sourcedir.tar.gz",  # noqa: E501
            "SAGEMAKER_PROGRAM": "dummy_script.py",
            "SAGEMAKER_REQUIREMENTS": "dummy_requirements.txt",
            "SAGEMAKER_ENABLE_CLOUDWATCH_METRICS": "false",
            "SAGEMAKER_REGION": "us-west-2",
            "SAGEMAKER_CONTAINER_LOG_LEVEL": "20",
        },
        "Image": create_image_uri("us-west-2", "tensorflow", INSTANCE_TYPE, tf_version, "py2"),
        "ModelDataUrl": "s3://m/m.tar.gz",
    }
    assert environment == model.prepare_container_def(INSTANCE_TYPE)

    assert "cpu" in model.prepare_container_def(INSTANCE_TYPE)["Image"]
    predictor = tf.deploy(1, INSTANCE_TYPE)
    assert isinstance(predictor, TensorFlowPredictor)
def test_mnist_async(sagemaker_session, cpu_instance_type, tf_full_version, tf_full_py_version):
    if tf_full_version == "2.7.0":
        tf_full_version = "2.7"

    estimator = TensorFlow(
        entry_point=SCRIPT,
        source_dir=MNIST_RESOURCE_PATH,
        role=ROLE,
        instance_count=1,
        instance_type="ml.c5.4xlarge",
        sagemaker_session=sagemaker_session,
        framework_version=tf_full_version,
        py_version=tf_full_py_version,
        tags=TAGS,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )
    estimator.fit(inputs=inputs, wait=False, job_name=unique_name_from_base("test-tf-sm-async"))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(
        sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS
    )
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(
            training_job_name=training_job_name, sagemaker_session=sagemaker_session
        )
        model_name = "model-mnist-async"
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=cpu_instance_type,
            endpoint_name=endpoint_name,
            model_name=model_name,
        )

        result = predictor.predict(np.zeros(784))
        print("predict result: {}".format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint_name, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client, model_name, TAGS)
        _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
def test_mnist_async(sagemaker_session, cpu_instance_type):
    estimator = TensorFlow(
        entry_point=SCRIPT,
        role=ROLE,
        train_instance_count=1,
        train_instance_type="ml.c5.4xlarge",
        py_version=tests.integ.PYTHON_VERSION,
        sagemaker_session=sagemaker_session,
        script_mode=True,
        # testing py-sdk functionality, no need to run against all TF versions
        framework_version=TensorFlow.LATEST_VERSION,
        tags=TAGS,
    )
    inputs = estimator.sagemaker_session.upload_data(
        path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
    )
    estimator.fit(inputs=inputs, wait=False, job_name=unique_name_from_base("test-tf-sm-async"))
    training_job_name = estimator.latest_training_job.name
    time.sleep(20)
    endpoint_name = training_job_name
    _assert_training_job_tags_match(
        sagemaker_session.sagemaker_client, estimator.latest_training_job.name, TAGS
    )
    with tests.integ.timeout.timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        estimator = TensorFlow.attach(
            training_job_name=training_job_name, sagemaker_session=sagemaker_session
        )
        model_name = "model-mnist-async"
        predictor = estimator.deploy(
            initial_instance_count=1,
            instance_type=cpu_instance_type,
            endpoint_name=endpoint_name,
            model_name=model_name,
        )

        result = predictor.predict(np.zeros(784))
        print("predict result: {}".format(result))
        _assert_endpoint_tags_match(sagemaker_session.sagemaker_client, predictor.endpoint, TAGS)
        _assert_model_tags_match(sagemaker_session.sagemaker_client, model_name, TAGS)
        _assert_model_name_match(sagemaker_session.sagemaker_client, endpoint_name, model_name)
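# The _assert_*_tags_match helpers used by the async tests above are defined in
# the surrounding test module. A plausible sketch of one of them, using only
# real boto3 SageMaker client calls (the exact helper body is an assumption):
def _assert_model_tags_match(sagemaker_client, model_name, tags):
    model = sagemaker_client.describe_model(ModelName=model_name)
    actual_tags = sagemaker_client.list_tags(ResourceArn=model["ModelArn"])["Tags"]
    assert actual_tags == tags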
def test_tf_local_mode(tf_full_version, sagemaker_local_session):
    local_mode_lock_fd = open(LOCK_PATH, 'w')
    local_mode_lock = local_mode_lock_fd.fileno()

    with timeout(minutes=5):
        script_path = os.path.join(DATA_DIR, 'iris', 'iris-dnn-classifier.py')
        estimator = TensorFlow(entry_point=script_path,
                               role='SageMakerRole',
                               framework_version=tf_full_version,
                               training_steps=1,
                               evaluation_steps=1,
                               hyperparameters={'input_tensor_name': 'inputs'},
                               train_instance_count=1,
                               train_instance_type='local',
                               base_job_name='test-tf',
                               sagemaker_session=sagemaker_local_session)

        inputs = estimator.sagemaker_session.upload_data(path=DATA_PATH,
                                                         key_prefix='integ-test-data/tf_iris')
        estimator.fit(inputs)
        print('job succeeded: {}'.format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    try:
        # Since Local Mode uses the same port for serving, we need a lock in order
        # to allow concurrent test execution. The serving test is really fast so it still
        # makes sense to allow this behavior.
        fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
        json_predictor = estimator.deploy(initial_instance_count=1,
                                          instance_type='local',
                                          endpoint_name=endpoint_name)

        features = [6.4, 3.2, 4.5, 1.5]
        dict_result = json_predictor.predict({'inputs': features})
        print('predict result: {}'.format(dict_result))
        list_result = json_predictor.predict(features)
        print('predict result: {}'.format(list_result))

        assert dict_result == list_result
    finally:
        estimator.delete_endpoint()
        time.sleep(5)
        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
def run():
    # Create the estimator used to drive SageMaker.
    # Which params are required and which are optional hasn't been checked yet,
    # so this isn't fully understood; for now this uses the ones from the official docs.
    tf_estimator = TensorFlow(
        entry_point='tf-keras-train.py',   # file describing the model and training code
        role='SageMakerRole',              # IAM role; it must be granted SageMaker permissions
        training_steps=20,                 # training parameter
        evaluation_steps=10,               # training parameter
        train_instance_count=1,            # number of training instances; more than one enables distributed training
        train_instance_type='ml.m5.large'  # instance type
    )

    # Start training
    tf_estimator.fit('s3://sagemaker-ap-northeast-1-192494425048')  # pass the S3 bucket name

    # Deploy the trained model
    tf_predictor = tf_estimator.deploy(initial_instance_count=1,
                                       instance_type='ml.t2.medium')
def test_tf_local_mode(sagemaker_local_session):
    with stopit.ThreadingTimeout(5 * 60, swallow_exc=False):
        script_path = os.path.join(DATA_DIR, "iris", "iris-dnn-classifier.py")
        estimator = TensorFlow(
            entry_point=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            training_steps=1,
            evaluation_steps=1,
            hyperparameters={"input_tensor_name": "inputs"},
            train_instance_count=1,
            train_instance_type="local",
            base_job_name="test-tf",
            sagemaker_session=sagemaker_local_session,
        )

        inputs = estimator.sagemaker_session.upload_data(
            path=DATA_PATH, key_prefix="integ-test-data/tf_iris"
        )
        estimator.fit(inputs)
        print("job succeeded: {}".format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with lock.lock(LOCK_PATH):
        try:
            json_predictor = estimator.deploy(
                initial_instance_count=1, instance_type="local", endpoint_name=endpoint_name
            )

            features = [6.4, 3.2, 4.5, 1.5]
            dict_result = json_predictor.predict({"inputs": features})
            print("predict result: {}".format(dict_result))
            list_result = json_predictor.predict(features)
            print("predict result: {}".format(list_result))

            assert dict_result == list_result
        finally:
            estimator.delete_endpoint()
def test_cifar(sagemaker_session):
    with timeout(minutes=45):
        script_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "source")
        dataset_path = os.path.join(tests.integ.DATA_DIR, "cifar_10", "data")
        estimator = TensorFlow(
            entry_point="resnet_cifar_10.py",
            source_dir=script_path,
            role="SageMakerRole",
            framework_version="1.12",
            training_steps=50,
            evaluation_steps=5,
            train_instance_count=2,
            train_instance_type="ml.p2.xlarge",
            sagemaker_session=sagemaker_session,
            train_max_run=45 * 60,
            base_job_name="test-cifar",
        )
        inputs = estimator.sagemaker_session.upload_data(
            path=dataset_path, key_prefix="data/cifar10"
        )
        job_name = unique_name_from_base("test-tf-cifar")
        estimator.fit(inputs, logs=False, job_name=job_name)
        print("job succeeded: {}".format(estimator.latest_training_job.name))

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type="ml.p2.xlarge")
        predictor.serializer = PickleSerializer()
        predictor.content_type = PICKLE_CONTENT_TYPE

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response["outputs"]["probabilities"]["floatVal"]) == 10
def main():
    config = get_config(LOCAL_MODE)
    # config = get_config(CLOUD_MODE)

    download_training_and_eval_data()
    if config['mode'] == CLOUD_MODE:
        upload_data_to_s3(config['bucket'], config['s3_data_prefix'])

    print('Starting model training.')
    print('Note: if launching for the first time in local mode, '
          'container image download might take a few minutes to complete.')

    mnist_estimator = TensorFlow(entry_point='mnist_tf2.py',
                                 source_dir='code',
                                 role=config['role'],
                                 instance_count=1,
                                 instance_type=config['instance_type'],
                                 framework_version='2.4.1',
                                 py_version='py37',
                                 distribution={'parameter_server': {'enabled': True}})
    mnist_estimator.fit(config['training_dataset_path'])
    print('Completed model training')

    print('Deploying endpoint in ' + config['mode'])
    predictor = mnist_estimator.deploy(initial_instance_count=1,
                                       instance_type=config['instance_type'])

    do_inference_on_local_endpoint(predictor, config['mode'])

    print('About to delete the endpoint to stop paying (if in cloud mode).')
    predictor.delete_endpoint()
    predictor.delete_model()
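# A sketch of the get_config helper the script above depends on. The keys match
# the ones the script reads; the concrete values (role, bucket, prefix, dataset
# path) are assumptions for illustration:
LOCAL_MODE, CLOUD_MODE = 'local', 'cloud'

def get_config(mode):
    if mode == LOCAL_MODE:
        return {'mode': LOCAL_MODE,
                'role': DUMMY_IAM_ROLE,  # any dummy ARN works for local mode
                'instance_type': 'local',
                'training_dataset_path': 'file://./data/',
                'bucket': None,
                's3_data_prefix': None}
    return {'mode': CLOUD_MODE,
            'role': 'SageMakerRole',  # hypothetical; use a real execution role ARN
            'instance_type': 'ml.c5.xlarge',  # hypothetical
            'training_dataset_path': 's3://my-bucket/mnist/data/',  # hypothetical
            'bucket': 'my-bucket',
            's3_data_prefix': 'mnist/data'}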
def test_serving(self, sagemaker_session, ecr_image, framework_version, instance_type,
                 instance_count, tmpdir, capsys, mnist_dataset):
    script = os.path.join(resource_path, 'mnist', 'mnist.py')
    estimator = TensorFlow(
        entry_point=script,
        role='SageMakerRole',
        instance_type=instance_type,
        instance_count=instance_count,
        sagemaker_session=sagemaker_session,
        image_uri=ecr_image,
        framework_version=framework_version,
        hyperparameters={
            TrainingCompilerConfig.HP_ENABLE_COMPILER: True,
        },
    )
    estimator.fit(mnist_dataset, job_name=unique_name_from_base('test-TF-trcomp-serving'))
    _assert_model_exported_to_s3(estimator)

    captured = capsys.readouterr()
    _assert_training_compiler_invoked(captured)

    predictor = estimator.deploy(initial_instance_count=1, instance_type=instance_type)
    predictor.delete_predictor()
def test_keras(sagemaker_session, tf_full_version):
    script_path = os.path.join(DATA_DIR, 'cifar_10', 'source')
    dataset_path = os.path.join(DATA_DIR, 'cifar_10', 'data')

    with timeout(minutes=45):
        estimator = TensorFlow(entry_point='keras_cnn_cifar_10.py',
                               source_dir=script_path,
                               role='SageMakerRole',
                               sagemaker_session=sagemaker_session,
                               hyperparameters={'learning_rate': 1e-4, 'decay': 1e-6},
                               training_steps=500,
                               evaluation_steps=5,
                               train_instance_count=1,
                               train_instance_type='ml.c4.xlarge',
                               train_max_run=45 * 60)

        inputs = estimator.sagemaker_session.upload_data(path=dataset_path,
                                                         key_prefix='data/cifar10')
        estimator.fit(inputs)

    endpoint_name = estimator.latest_training_job.name
    with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
        predictor = estimator.deploy(initial_instance_count=1, instance_type='ml.p2.xlarge')

        data = np.random.randn(32, 32, 3)
        predict_response = predictor.predict(data)
        assert len(predict_response['outputs']['probabilities']['floatVal']) == 10