def transform_dataset_for_download(model_id, dataset_id):
    model = Model.query.get(model_id)
    dataset = DataSet.query.get(dataset_id)

    init_logger('transform_for_download_log', obj=int(dataset_id))
    logging.info('Starting Transform For Download Task')

    try:
        transformed = model.transform_dataset(dataset)

        logging.info('Saving transformed data to disk')
        temp_file = tempfile.NamedTemporaryFile()
        numpy.savez_compressed(temp_file, **transformed)

        s3_filename = "dataset_{0}_vectorized_for_model_{1}.npz".format(
            dataset.id, model.id)

        from api.amazon_utils import AmazonS3Helper
        s3 = AmazonS3Helper()

        logging.info('Uploading file {0} to s3 with name {1}...'.format(
            temp_file.name, s3_filename))
        s3.save_key(s3_filename, temp_file.name, {
            'model_id': model.id,
            'dataset_id': dataset.id}, compressed=False)
        s3.close()

        return s3.get_download_url(s3_filename, 60 * 60 * 24 * 7)
    except Exception as e:
        logging.error("Got exception when transforming dataset: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def test_put_file(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'put'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    with patch("boto3.s3.transfer.S3Transfer._multipart_upload") as mu:
        app.config['MULTIPART_UPLOAD_CHUNK_SIZE'] = 128
        helper.save_gz_file(
            'name',
            os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
            {'model_id': 234})
        mu.assert_called_with(
            os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
            'bucket_name', 'name', ANY, ANY)

    # PutObject_1
    self.assertTrue(
        helper.save_key(
            'name',
            os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
            {'model_id': 234}, compressed=False))
    # PutObject_2
    self.assertTrue(
        helper.save_key(
            'name',
            os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
            {'model_id': 234}))
    # PutObject_3
    self.assertTrue(
        helper.save_key_string('name', 'data', {'model_id': 234}))
def test_delete_key(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'delete_key'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    self.assertTrue(helper.delete_key('name'))
def upload_import_handler_to_server(server_id, handler_type, handler_id,
                                    user_id):
    """ Upload import handler to S3 for cloudml-predict. """
    init_logger('importdata_log', obj=int(handler_id))
    logging.info('Starting uploading to cloudml_predict')
    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        handler = XmlImportHandler.query.get(handler_id)

        handler_files = server.list_keys(FOLDER_IMPORT_HANDLERS)
        for file_ in handler_files:
            if file_['name'] == handler.name:
                raise ValueError('Import Handler with name "{0}" already '
                                 'exists on the server {1}'.format(
                                     handler.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.{3}'.format(
            server.folder.strip('/'),
            FOLDER_IMPORT_HANDLERS,
            uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
        meta = {
            'id': handler.id,
            'name': handler.name,
            'object_name': handler.name,
            'type': handler.TYPE,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now()),
            'crc32': handler.crc32
        }

        handler_data = handler.get_plan_config()
        handler.locked = True
        s_ids = list(handler.servers_ids) \
            if isinstance(handler.servers_ids, list) else []
        s_ids.append(server.id)
        handler.servers_ids = list(s_ids)
        handler.save()

        s3.save_key_string(path, handler_data, meta)
        s3.close()

        logging.info('Import Handler has been uploaded: %s' % handler.name)

        return '{0}/{1}.{2}'.format(
            FOLDER_IMPORT_HANDLERS,
            uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
    except Exception as e:
        logging.error("Got exception on uploading import handler to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def test_get_download_url(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'download_url'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    self.assertTrue(helper.get_download_url('test', 3600))
    self.assertRaises(ValueError, helper.get_download_url, 'test', 'time')
def process_bind_param(self, value, dialect):
    if value is not None:
        helper = AmazonS3Helper()
        uid = str(uuid.uuid1().hex)
        helper.save_key_string(uid, value)
        value = uid
        filename = self._get_file_path(value)
        if os.path.exists(filename):
            os.remove(filename)
    return value
def test_key_exists(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'key_exists'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    # HeadObject_1
    self.assertFalse(helper.key_exists('name'))
    # HeadObject_2
    self.assertTrue(helper.key_exists('name'))
def save_to_s3(self):
    meta = {
        'handler': self.import_handler_id,
        'dataset': self.name,
        'params': str(self.import_params)
    }
    self.set_uid()
    helper = AmazonS3Helper()
    helper.save_gz_file(self.uid, self.filename, meta)
    helper.close()
    self.on_s3 = True
    self.save()
def run(self, **kwargs):
    import zlib
    from api.amazon_utils import AmazonS3Helper

    s3 = AmazonS3Helper(
        bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
    for key in s3.list_keys('staging/importhandlers/'):
        data = s3.load_key(key.name)
        crc32 = '0x%08X' % (zlib.crc32(data) & 0xffffffff)
        try:
            s3.set_key_metadata(key.name, {'crc32': crc32}, False)
        except Exception as e:
            print 'Error setting crc32 metadata for %s: %s' % (key.name, e)
def process_result_value(self, value, dialect):
    if value is not None:
        filename = self._get_file_path(value)
        if os.path.exists(filename):
            with open(filename, 'rb') as f:
                return f.read()
        helper = AmazonS3Helper()
        value = helper.load_key(value)
        with open(filename, 'w') as f:
            f.write(str(value))
    return value
def test_list_keys(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'list_keys'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    # ListObjects_1
    res = helper.list_keys('prefix')
    self.assertEqual(set(['a', 'b', 'c']), set([k['Key'] for k in res]))
    # ListObjects_2
    self.assertEqual([], helper.list_keys('another_prefix'))
    self.assertRaises(ParamValidationError, helper.list_keys, None)
def set_key_metadata(self, uid, folder, key, value):
    if self.check_edit_metadata(folder, key, value):
        key_name = '{0}/{1}/{2}'.format(self.folder, folder, uid)
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        s3.set_key_metadata(key_name, {key: value}, True)
        # this means the key is deleted, need to update model/import handler
        if key == 'hide' and value == 'True':
            obj = s3.load_key(key_name, with_metadata=True)
            cl = Model if folder == FOLDER_MODELS else XmlImportHandler
            model = cl.query.get(obj['Metadata']['id'])
            server_list = [s for s in model.servers_ids if s != self.id]
            model.servers_ids = server_list
            model.save()
def test_check_or_create_bucket(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'bucket'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    # HeadBucket_1
    self.assertRaises(S3ResponseError, helper._check_or_create_bucket)
    # HeadBucket_2
    self.assertTrue(helper._check_or_create_bucket())
    # HeadBucket_3
    self.assertTrue(helper._check_or_create_bucket())
def test_set_key_metadata(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'metadata'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    # Key not found (HeadObject_1)
    self.assertRaises(AmazonS3ObjectNotFound,
                      helper.set_key_metadata, 'name', {})
    # Key exists, empty metadata
    self.assertTrue(
        helper.set_key_metadata('name', meta={
            'Name': 'new_name',
            'Other': 'value',
            'Third': '3value'
        }, store_previous=True))
def test_load_key(self):
    # Amazon mock
    self.pill.attach(self.session,
                     os.path.join(self.PILL_RESPONSES_DIR, 'load_key'))
    self.pill.playback()

    helper = AmazonS3Helper(**self.credentials)
    # GetObject_1
    res = helper.load_key('name')
    self.assertTrue(isinstance(res, basestring))
    # GetObject_2
    res = helper.load_key('name', with_metadata=True)
    self.assertTrue(isinstance(res, dict))
    self.assertTrue(isinstance(res['Body'], StreamingBody))
    self.assertEqual(res['Metadata']['Name'], 'name')
    # GetObject_3-6
    self.assertRaises(AmazonS3ObjectNotFound, helper.load_key, 'any')
def to_s3(data, import_handler_id):
    from api.amazon_utils import AmazonS3Helper
    from datetime import datetime
    import api

    try:
        handler = XmlImportHandler.query.get(import_handler_id)
        if not handler:
            raise ValueError("Import handler {0} not found".format(
                import_handler_id))
        key = "{0}/{1}_python_script_{2}.py".format(
            api.app.config['IMPORT_HANDLER_SCRIPTS_FOLDER'],
            handler.name,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        s3helper = AmazonS3Helper()
        s3helper.save_key_string(key, data)
    except Exception as e:
        raise ValueError("Error when uploading file to Amazon S3: "
                         "{0}".format(e))
    return key
def delete(self):
    # Stop task
    # self.terminate_task()  # TODO
    filename = self.filename
    on_s3 = self.on_s3
    uid = self.uid

    super(DataSet, self).delete()
    LogMessage.delete_related_logs(self.id, type_=LogMessage.IMPORT_DATA)

    # TODO: check import handler type
    try:
        os.remove(filename)
    except OSError:
        pass

    if on_s3:
        from botocore.exceptions import ClientError
        helper = AmazonS3Helper()
        try:
            helper.delete_key(uid)
        except ClientError as e:
            logging.exception(str(e))
def get_s3_download_url(self, expires_in=3600):
    helper = AmazonS3Helper()
    return helper.get_download_url(self.uid, expires_in)
def load_from_s3(self):
    helper = AmazonS3Helper()
    return helper.load_key(self.uid)
def get_trainer_s3url(self, expires_in=3600):
    trainer_filename = self.get_trainer_filename()
    if self.status != self.STATUS_TRAINED or not trainer_filename:
        return None
    helper = AmazonS3Helper()
    return helper.get_download_url(trainer_filename, expires_in)
def upload_segment_features_transformers(model_id, segment_id, fformat):
    model = Model.query.get(model_id)
    segment = Segment.query.get(segment_id)

    log_id = segment_id
    from api.async_tasks.models import AsyncTask
    if upload_segment_features_transformers.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(
                task_id=upload_segment_features_transformers.request.id
            ).limit(1)
        log_id = tasks[0].id
    init_logger('prepare_transformer_for_download_log', obj=int(log_id))
    logging.info('Start preparing segment features transformers for download')

    try:
        from zipfile import ZipFile, ZIP_DEFLATED
        from api.amazon_utils import AmazonS3Helper
        import os
        from tempfile import NamedTemporaryFile

        files = []
        arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name, fformat)

        def _save_content(content, feature_name, transformer_type):
            filename = "{0}-{1}-{2}-data.{3}".format(
                segment.name, feature_name, transformer_type, fformat)
            logging.info("Creating %s" % filename)
            if fformat == 'csv':
                import csv
                import StringIO
                si = StringIO.StringIO()
                if len(content):
                    fieldnames = content[0].keys()
                    writer = csv.DictWriter(si, fieldnames=fieldnames)
                    writer.writeheader()
                    for c in content:
                        writer.writerow(c)
                response = si.getvalue()
            else:
                import json
                response = json.dumps(content, indent=2)

            with open(filename, 'w') as fh:
                fh.write(response)
            return filename

        trainer = model.get_trainer()
        if segment.name not in trainer.features:
            raise TaskException("Segment %s doesn't exist in the trained "
                                "model" % segment.name)
        for name, feature in trainer.features[segment.name].iteritems():
            if "transformer" in feature and feature["transformer"] is not None:
                try:
                    data = feature["transformer"].load_vocabulary()
                    files.append(_save_content(
                        data, name, feature["transformer-type"]))
                except AttributeError:
                    logging.warning(
                        "Can't load transformer data for segment {0} feature "
                        "{1} transformer {2}. Transformer doesn't have a "
                        "vocabulary to return or the feature hasn't been "
                        "transformed on model training"
                        .format(segment.name, name,
                                feature["transformer-type"]))
                    continue

        logging.info("Add files to archive")
        with ZipFile(arc_name, "w") as z:
            for f in files:
                z.write(f, compress_type=ZIP_DEFLATED)

        s3 = AmazonS3Helper()
        logging.info('Uploading archive to s3 with name {0}'.format(arc_name))
        s3.save_key(arc_name, arc_name, {
            'model_id': model.id,
            'segment_id': segment_id}, compressed=False)
        s3.close()
        return s3.get_download_url(arc_name, 60 * 60 * 24 * 7)
    except Exception as e:
        logging.error("Got exception when preparing features transformers "
                      "of segment {0} for download: {1} \n {2}"
                      .format(segment.name, e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def upload_model_to_server(server_id, model_id, user_id):
    """ Upload model to S3 for cloudml-predict. """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting uploading to cloudml_predict')
    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Check the name, whether there is enough memory, etc.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exists on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        # from cloudml.trainer.store import load_trainer
        # trainer = load_trainer(trainer_data)
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        import cPickle as pickle
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info('Trainer data size: %s', len(trainer_data))
        # trainer.visualization = None
        # trainer_data = store_trainer(trainer)
        # trainer_data = model.trainer
        s3.save_key_string(path, trainer_data, meta)
        s3.close()

        model.locked = True
        s_ids = list(model.servers_ids) \
            if isinstance(model.servers_ids, list) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()

        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()

        logging.info('Creating grafana dashboard for model')
        update_grafana_dashboard(server, model)

        logging.info('Model has been uploaded: %s' % model.name)
        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def get_csv_results(model_id, test_id, fields):
    """
    Gets test classification results in csv format and saves the file
    to Amazon S3.

    model_id: int
        ID of the model.
    test_id: int
        ID of the test whose examples are to be exported.
    fields: list of string
        List of field names from TestExample to export to the csv file.
    """
    from api.amazon_utils import AmazonS3Helper

    def generate(test, name):
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        filename = os.path.join(path, name)

        header = list(fields)
        if 'prob' in header:
            prob_index = header.index('prob')
            for label in reversed(test.classes_set):
                header.insert(prob_index, 'prob_%s' % label)
            header.remove('prob')

        with open(filename, 'w') as fp:
            writer = csv.writer(fp, delimiter=',', quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            for example in TestExample.get_data(test_id, fields):
                rows = []
                for field in fields:
                    if field == '_id':
                        field = 'id'
                    if field == 'id':
                        field = 'example_id'
                    val = example[field] if field in example else ''
                    if field == 'prob':
                        rows += val
                    else:
                        rows.append(val)
                writer.writerow(rows)
        return filename

    init_logger('runtest_log', obj=int(test_id))
    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if test is None:
            logging.error('Test not found')
            return

        name = 'Examples-{0!s}.csv'.format(uuid.uuid1())
        expires = 60 * 60 * 24 * 7  # 7 days

        logging.info('Creating file {0}...'.format(name))
        s3 = AmazonS3Helper()
        filename = generate(test, name)
        logging.info('Uploading file {0} to s3...'.format(filename))
        s3.save_key(name, filename, {
            'model_id': model_id,
            'test_id': test_id
        }, compressed=False)
        s3.close()
        os.remove(filename)
        url = s3.get_download_url(name, expires)
        return url
    except Exception as e:
        logging.error("Got exception on getting test classification results: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
def list_keys(self, folder=None, params={}):
    path = self.folder.strip('/')
    if folder and folder in self.ALLOWED_FOLDERS:
        path += '/{0!s}'.format(folder)

    objects = []
    s3 = AmazonS3Helper(
        bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
    for key in s3.list_keys(path):
        uid = key['Key'].split('/')[-1]
        key = s3.load_key(key['Key'], with_metadata=True)
        if key['Metadata']['hide'] == 'True':
            continue
        objects.append({
            'id': uid,
            'object_name': key['Metadata'].get('object_name', None),
            'size': key['ContentLength'],
            'uploaded_on': key['Metadata'].get('uploaded_on', None),
            'last_modified': str(key['LastModified']),
            'name': key['Metadata'].get('name', None),
            'object_id': key['Metadata'].get('id', None),
            'object_type': key['Metadata'].get('type', None),
            'user_id': key['Metadata'].get('user_id', None),
            'user_name': key['Metadata'].get('user_name', None),
            'crc32': key['Metadata'].get('crc32', None),
            'server_id': self.id,
            'loading_error': key['Metadata'].get('loading_error', None),
            'count_400': key['Metadata'].get('count_400', None),
            'count_500': key['Metadata'].get('count_500', None),
            'count_of_max_response':
                key['Metadata'].get('count_of_max_response', None),
            'longest_resp_count':
                key['Metadata'].get('longest_resp_count', None),
            'longest_resp_time':
                key['Metadata'].get('longest_resp_time', None),
            'max_response_time':
                key['Metadata'].get('max_response_time', None),
            'requests': key['Metadata'].get('requests', None)
        })

    sort_by = params.get('sort_by', None)
    order = params.get('order', 'asc')
    if objects and sort_by:
        obj = objects[0]
        if sort_by in obj.keys():
            return sorted(objects, key=lambda x: x[sort_by],
                          reverse=order != 'asc')
        else:
            raise ValueError(
                'Unable to sort by %s. Property does not exist.' % sort_by)
    return objects
def get_key_metadata(self, uid, folder, key):
    key_name = '{0}/{1}/{2}'.format(self.folder, folder, uid)
    s3 = AmazonS3Helper(
        bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
    s3key = s3.load_key(key_name, with_metadata=True)
    return s3key['Metadata'][key]