def test_deserializer():
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    s = numpy_to_record_serializer()
    buf = s(np.array(array_data))
    d = record_deserializer()
    for record, expected in zip(d(buf, 'who cares'), array_data):
        assert record.features["values"].float64_tensor.values == expected
def __init__(self, endpoint, sagemaker_session=None):
    super(LDAPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
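A minimal usage sketch for a predictor wired up this way, assuming an already-deployed endpoint (the name below is a placeholder) and an input row whose width matches the trained model's feature_dim; per the SageMaker docs, LDA returns each row's topic mixture in the record's "topic_mixture" label field:

import numpy as np

predictor = LDAPredictor('lda-endpoint')  # placeholder endpoint name
results = predictor.predict(np.array([[1, 0, 2, 3]]))  # one bag-of-words row
for record in results:
    print(record.label["topic_mixture"].float32_tensor.values)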
def test_serializer_accepts_one_dimensional_array():
    s = numpy_to_record_serializer()
    array_data = [1.0, 2.0, 3.0]
    buf = s(np.array(array_data))
    record_data = next(_read_recordio(buf))
    record = Record()
    record.ParseFromString(record_data)
    assert record.features["values"].float64_tensor.values == array_data
def test_serializer():
    s = numpy_to_record_serializer()
    array_data = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
    buf = s(np.array(array_data))
    for record_data, expected in zip(_read_recordio(buf), array_data):
        record = Record()
        record.ParseFromString(record_data)
        assert record.features["values"].float64_tensor.values == expected
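The `_read_recordio` helper these tests lean on is not shown above; here is a minimal sketch of a compatible reader, assuming SageMaker's RecordIO framing (a uint32 magic number 0xced7230a, a uint32 payload length, then the payload padded to a four-byte boundary):

import struct

_kmagic = 0xced7230a  # RecordIO magic number used by the SageMaker framing


def _read_recordio(f):
    """Yield each record payload from a RecordIO-framed stream."""
    while True:
        try:
            read_kmagic, = struct.unpack('I', f.read(4))
        except struct.error:
            return  # end of stream
        assert read_kmagic == _kmagic
        len_record, = struct.unpack('I', f.read(4))  # payload length
        # payloads are padded up to the next 4-byte boundary
        padding = (((len_record + 3) >> 2) << 2) - len_record
        yield f.read(len_record)
        if padding:
            f.read(padding)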
def __init__(self, endpoint, sagemaker_session=None):
    """
    Args:
        endpoint: Name of the Amazon SageMaker endpoint to send requests to.
        sagemaker_session: A SageMaker Session object; if None, one is
            created using the default AWS configuration chain.
    """
    super(LinearLearnerPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
def __init__(self, endpoint, sagemaker_session=None):
    """
    Args:
        endpoint: Name of the Amazon SageMaker endpoint to send requests to.
        sagemaker_session: A SageMaker Session object; if None, one is
            created using the default AWS configuration chain.
    """
    super(FactorizationMachinesPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
def predict():
    """
    Provide this endpoint an image in JPEG format. The image should be
    equal in size to the training images (28x28).
    """
    # read the raw request body as a grayscale ("L") image
    img = Image.open(BytesIO(app.current_request.raw_body)).convert("L")
    img_arr = np.array(img, dtype=np.float32)
    runtime = boto3.Session().client(service_name="sagemaker-runtime",
                                     region_name="eu-west-1")
    # serialize the flattened pixel array to RecordIO-protobuf and
    # invoke the SageMaker endpoint
    response = runtime.invoke_endpoint(
        EndpointName="mnistclassifier",
        ContentType="application/x-recordio-protobuf",
        Body=numpy_to_record_serializer()(img_arr.flatten()),
    )
    result = json.loads(response["Body"].read().decode("utf-8"))
    return Response(result, status_code=200,
                    headers={"Content-Type": "application/json"})
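A client-side sketch for exercising this Chalice route, assuming a hypothetical deployed URL and a local 28x28 JPEG; the URL and file name are placeholders:

import requests

url = "https://example.execute-api.eu-west-1.amazonaws.com/api/predict"  # placeholder
with open("digit_28x28.jpg", "rb") as f:  # placeholder input image
    resp = requests.post(url, data=f.read(),
                         headers={"Content-Type": "image/jpeg"})
print(resp.json())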
def convert_and_upload_training_data(ndarray, bucket, prefix, filename='data.pbr'):
    import boto3
    import os
    from sagemaker.amazon.common import numpy_to_record_serializer

    # convert Numpy array to Protobuf RecordIO format
    serializer = numpy_to_record_serializer()
    data = shingle(ndarray)
    buffer = serializer(data.astype(np.int32))

    # upload to S3
    s3_object = os.path.join(prefix, 'train', filename)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        s3_object).upload_fileobj(buffer)
    s3_path = 's3://{}/{}'.format(bucket, s3_object)
    return s3_path
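A hedged usage sketch; `shingle` is assumed to be the project's own windowing helper (it must be defined for the call to work), and the bucket and prefix names are placeholders:

import numpy as np

values = np.random.randint(0, 100, size=500)  # synthetic example series
s3_train = convert_and_upload_training_data(values, 'my-example-bucket', 'rcf-demo')
print(s3_train)  # e.g. s3://my-example-bucket/rcf-demo/train/data.pbr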
def __init__(self, endpoint, sagemaker_session=None):
    super(LinearLearnerPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
def __init__(self, endpoint, sagemaker_session=None):
    super(FactorizationMachinesPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
import boto3
import pandas
import os
import json
import logging
import sys

from sagemaker.amazon.common import numpy_to_record_serializer

logger = logging.getLogger()
logger.setLevel(logging.INFO)

s3 = boto3.resource('s3')
client = boto3.client('s3')
serializer = numpy_to_record_serializer()


def toProto(event, context):
    try:
        logger.info(event)
        s3Record = event['Records'][0]['s3']
        bucket = s3Record['bucket']['name']
        key = s3Record['object']['key']
        logger.info('Getting original object')
        origin = s3.Object(bucket, key)
        logger.info('Object retrieved')
        logger.info(origin)
        logger.info('Getting tags')
        tagging = client.get_object_tagging(Bucket=bucket, Key=key)
        logger.info('Tags retrieved')
        logger.info(tagging['TagSet'])
    except Exception:
        # assumed minimal handler; the original snippet truncates inside the try block
        logger.exception('toProto failed')
        raise
import argparse
import os
import pickle
import tarfile

import boto3
import mxnet as mx
import numpy as np
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.amazon.common import numpy_to_record_serializer


def main():
    """
    example call
    python3 train_lda.py \
        --pageInputDir pages \
        --vocabFile vocab.pkl \
        --s3Bucket alex9311-sagemaker \
        --s3Prefix LDA-testing \
        --awsRole aws-sagemaker-execution-role
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--vocabFile', action='store', required=True)
    parser.add_argument('--pageInputDir', action='store', required=True)
    parser.add_argument('--s3Bucket', action='store', required=True)
    parser.add_argument('--s3Prefix', action='store', required=True)
    parser.add_argument('--awsRole', action='store', required=True)
    args = parser.parse_args()

    bucket = args.s3Bucket
    prefix = args.s3Prefix
    role = args.awsRole
    vocab = pickle.load(open(args.vocabFile, 'rb'))
    documents = import_documents_on_disk(args.pageInputDir)
    for d in documents:
        d['term_counts'] = document_to_term_counts(d['tokens'], vocab)
    print('length of vocab: ', len(vocab))
    print('number of documents: ', len(documents))
    training_docs = np.array([d['term_counts'] for d in documents])

    # convert training_docs to Protobuf RecordIO format
    recordio_protobuf_serializer = numpy_to_record_serializer()
    training_docs_recordio = recordio_protobuf_serializer(training_docs)

    # upload to S3 in bucket/prefix/train
    fname = 'lda_training.data'
    s3_object = os.path.join(prefix, 'train', fname)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        s3_object).upload_fileobj(training_docs_recordio)
    s3_train_data = 's3://{}/{}'.format(bucket, s3_object)
    print('Uploaded training data to S3: {}'.format(s3_train_data))

    region_name = boto3.Session().region_name
    container = get_image_uri(region_name, 'lda')
    print('Using SageMaker LDA container: {} ({})'.format(
        container, region_name))

    session = sagemaker.Session()
    print('Training input/output will be stored in {}/{}'.format(
        bucket, prefix))
    print('\nIAM Role: {}'.format(role))

    lda = sagemaker.estimator.Estimator(
        container,
        role,
        output_path='s3://{}/{}/output'.format(bucket, prefix),
        train_instance_count=1,
        train_instance_type='ml.m5.large',
        sagemaker_session=session,
    )

    # set algorithm-specific hyperparameters
    lda.set_hyperparameters(
        num_topics=10,
        feature_dim=len(vocab),
        mini_batch_size=len(documents),
        alpha0=1.0,
    )

    # run the training job on input data stored in S3
    lda.fit({'train': s3_train_data})
    training_job_name = lda.latest_training_job.job_name
    print('Training job name: {}'.format(training_job_name))

    # download and extract the trained model artifact
    model_fname = 'model.tar.gz'
    model_object = os.path.join(prefix, 'output', training_job_name,
                                'output', model_fname)
    boto3.Session().resource('s3').Bucket(bucket).Object(
        model_object).download_file(model_fname)
    with tarfile.open(model_fname) as tar:
        tar.extractall()
    print('Downloaded and extracted model tarball: {}'.format(model_object))

    # obtain the model file
    model_list = [
        fname for fname in os.listdir('.') if fname.startswith('model_')
    ]
    model_fname = model_list[0]
    print('Found model file: {}'.format(model_fname))

    # get the model from the model file and store in Numpy arrays
    alpha, beta = mx.ndarray.load(model_fname)
    learned_alpha_permuted = alpha.asnumpy()
    learned_beta_permuted = beta.asnumpy()

    # map each topic's word weights back onto vocabulary terms
    topic_distributions = learned_beta_permuted.tolist()
    topic_word_weights_list = []
    for topic_distribution in topic_distributions:
        this_topic_word_weights = {}
        for word_index, weight in enumerate(topic_distribution):
            this_topic_word_weights[vocab[word_index]] = weight
        topic_word_weights_list.append(this_topic_word_weights)

    top_words_in_topics = []
    for topic_word_weights in topic_word_weights_list:
        top_words_in_topics.append(
            sorted(topic_word_weights,
                   key=topic_word_weights.get, reverse=True)[:10])

    for index, top_words_in_topic in enumerate(top_words_in_topics):
        print('topic', index)
        for word in top_words_in_topic:
            print('\t', word, ':', topic_word_weights_list[index][word])
def __init__(self, endpoint, sagemaker_session=None):
    super(RandomCutForestPredictor, self).__init__(
        endpoint,
        sagemaker_session,
        serializer=numpy_to_record_serializer(),
        deserializer=record_deserializer(),
    )
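A hedged end-to-end sketch of calling such a predictor, with a placeholder endpoint name; per the SageMaker docs, Random Cut Forest returns each point's anomaly score in the record's "score" label field:

import numpy as np

predictor = RandomCutForestPredictor('rcf-endpoint')  # placeholder endpoint name
results = predictor.predict(np.array([[1.0], [2.0], [100.0]]))
for record in results:
    # each Record carries the anomaly score in its label map
    print(record.label["score"].float32_tensor.values)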