Example #1
def transform_dataset_for_download(model_id, dataset_id):
    model = Model.query.get(model_id)
    dataset = DataSet.query.get(dataset_id)

    init_logger('transform_for_download_log', obj=int(dataset_id))
    logging.info('Starting Transform For Download Task')
    try:
        transformed = model.transform_dataset(dataset)

        logging.info('Saving transformed data to disk')
        temp_file = tempfile.NamedTemporaryFile()
        numpy.savez_compressed(temp_file, **transformed)

        s3_filename = "dataset_{0}_vectorized_for_model_{1}.npz".format(
            dataset.id, model.id)

        from api.amazon_utils import AmazonS3Helper
        s3 = AmazonS3Helper()
        logging.info('Uploading file {0} to s3 with name {1}...'.format(
            temp_file.name, s3_filename))
        s3.save_key(s3_filename, temp_file.name, {
            'model_id': model.id,
            'dataset_id': dataset.id}, compressed=False)
        s3.close()
        return s3.get_download_url(s3_filename, 60 * 60 * 24 * 7)
    except Exception as e:
        logging.error("Got exception when transforming dataset: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
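The task above finishes by returning a presigned URL that stays valid for seven days (60 * 60 * 24 * 7 seconds). The helper's get_download_url is not shown on this page; a minimal sketch of how such a method could be built with boto3 (bucket name and client handling are assumptions, not taken from the examples):

import boto3

def get_download_url(bucket_name, key_name, expires_in):
    # Sketch only: build a time-limited presigned GET URL for an S3 key.
    # 'bucket_name' would normally come from the helper's configuration.
    client = boto3.client('s3')
    return client.generate_presigned_url(
        'get_object',
        Params={'Bucket': bucket_name, 'Key': key_name},
        ExpiresIn=expires_in)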
Example #2
    def test_put_file(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'put'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        with patch("boto3.s3.transfer.S3Transfer._multipart_upload") as mu:
            app.config['MULTIPART_UPLOAD_CHUNK_SIZE'] = 128
            helper.save_gz_file(
                'name',
                os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
                {'model_id': 234})
            mu.assert_called_with(
                os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
                'bucket_name', 'name', ANY, ANY)

        # PutObject_1
        self.assertTrue(
            helper.save_key('name',
                            os.path.join(self.PILL_RESPONSES_DIR,
                                         'put/test_file.py'),
                            {'model_id': 234},
                            compressed=False))
        # PutObject_2
        self.assertTrue(
            helper.save_key(
                'name',
                os.path.join(self.PILL_RESPONSES_DIR, 'put/test_file.py'),
                {'model_id': 234}))
        # PutObject_3
        self.assertTrue(
            helper.save_key_string('name', 'data', {'model_id': 234}))
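The test above patches the multipart upload path and sets MULTIPART_UPLOAD_CHUNK_SIZE, which suggests save_gz_file/save_key hand the transfer off to boto3's managed uploader. A minimal sketch of that idea, assuming boto3's upload_file and TransferConfig (the parameter names are illustrative, not the helper's real signature):

import boto3
from boto3.s3.transfer import TransferConfig

def save_key(bucket_name, key_name, filename, meta, chunk_size):
    # Sketch only: upload a local file with user metadata attached;
    # files above the multipart threshold are split into 'chunk_size' parts.
    config = TransferConfig(multipart_chunksize=chunk_size)
    client = boto3.client('s3')
    client.upload_file(
        filename, bucket_name, key_name,
        ExtraArgs={'Metadata': {k: str(v) for k, v in meta.items()}},
        Config=config)

S3 metadata values must be strings, which is why the sketch stringifies the {'model_id': 234} style dictionaries used in the tests.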
Example #3
    def test_delete_key(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'delete_key'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        self.assertTrue(helper.delete_key('name'))
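delete_key is typically a thin wrapper. A possible one-call implementation with boto3 (purely illustrative, not the project's actual code):

import boto3

def delete_key(bucket_name, key_name):
    # Sketch only: remove a single object and report success,
    # matching the truthy return the test asserts on.
    boto3.client('s3').delete_object(Bucket=bucket_name, Key=key_name)
    return True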
Example #4
def upload_import_handler_to_server(server_id, handler_type, handler_id,
                                    user_id):
    """
    Upload import handler to S3 for cloudml-predict.
    """
    init_logger('importdata_log', obj=int(handler_id))
    logging.info('Starting uploading to cloudml_predict')

    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        handler = XmlImportHandler.query.get(handler_id)

        handler_files = server.list_keys(FOLDER_IMPORT_HANDLERS)
        for file_ in handler_files:
            if file_['name'] == handler.name:
                raise ValueError('Import Handler with name "{0}" already '
                                 'exists on the server {1}'.format(
                                     handler.name, server.name))

        uid = get_a_Uuid()
        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.{3}'.format(
            server.folder.strip('/'), FOLDER_IMPORT_HANDLERS, uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
        meta = {
            'id': handler.id,
            'name': handler.name,
            'object_name': handler.name,
            'type': handler.TYPE,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now()),
            'crc32': handler.crc32
        }

        handler_data = handler.get_plan_config()
        handler.locked = True
        s_ids = list(handler.servers_ids) if (isinstance(
            handler.servers_ids, list)) else []
        s_ids.append(server.id)
        handler.servers_ids = list(s_ids)
        handler.save()
        s3.save_key_string(path, handler_data, meta)
        s3.close()

        logging.info('Import Handler has been uploaded: %s' % handler.name)

        return '{0}/{1}.{2}'.format(
            FOLDER_IMPORT_HANDLERS, uid,
            'xml' if handler_type == XmlImportHandler.TYPE else 'json')
    except Exception as e:
        logging.error("Got exception on uploading import handler to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
Example #5
    def test_get_download_url(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'download_url'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        self.assertTrue(helper.get_download_url('test', 3600))
        self.assertRaises(ValueError, helper.get_download_url, 'test', 'time')
Example #6
    def process_bind_param(self, value, dialect):
        if value is not None:
            helper = AmazonS3Helper()
            uid = str(uuid.uuid1().hex)
            helper.save_key_string(uid, value)
            value = uid
            filename = self._get_file_path(value)
            if os.path.exists(filename):
                os.remove(filename)
        return value
Example #7
    def test_key_exists(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'key_exists'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        # HeadObject_1
        self.assertFalse(helper.key_exists('name'))
        # HeadObject_2
        self.assertTrue(helper.key_exists('name'))
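The two HeadObject responses in the mock suggest key_exists is a HEAD request with a 404 translated into False. A minimal boto3 sketch of that pattern (not the project's actual implementation):

import boto3
from botocore.exceptions import ClientError

def key_exists(bucket_name, key_name):
    # Sketch only: HEAD the object; a 404 means the key is absent.
    client = boto3.client('s3')
    try:
        client.head_object(Bucket=bucket_name, Key=key_name)
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            return False
        raise
    return True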
Example #8
    def save_to_s3(self):
        meta = {
            'handler': self.import_handler_id,
            'dataset': self.name,
            'params': str(self.import_params)
        }
        self.set_uid()
        helper = AmazonS3Helper()
        helper.save_gz_file(self.uid, self.filename, meta)
        helper.close()
        self.on_s3 = True
        self.save()
Example #9
    def run(self, **kwargs):
        import zlib
        from api.amazon_utils import AmazonS3Helper
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        for key in s3.list_keys('staging/importhandlers/'):
            data = s3.load_key(key.name)
            crc32 = '0x%08X' % (zlib.crc32(data) & 0xffffffff)
            try:
                s3.set_key_metadata(key.name, {'crc32': crc32}, False)
            except Exception as e:
                print('Error setting crc32 metadata: {0}'.format(e))
Example #10
    def process_result_value(self, value, dialect):
        if value is not None:
            filename = self._get_file_path(value)
            if os.path.exists(filename):
                with open(filename, 'rb') as f:
                    return f.read()

            helper = AmazonS3Helper()
            value = helper.load_key(value)

            with open(filename, 'w') as f:
                f.write(str(value))
        return value
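Examples #6 and #10 look like the two halves of a SQLAlchemy TypeDecorator that offloads large column values to S3 and keeps only a generated UID (plus a local file cache) in the database. A stripped-down sketch of that pattern, with the class name and column type assumed:

import uuid

from sqlalchemy.types import TypeDecorator, String

from api.amazon_utils import AmazonS3Helper  # assumed import path


class S3BackedValue(TypeDecorator):
    # Hypothetical name: stores the payload in S3, persists only the UID.
    impl = String

    def process_bind_param(self, value, dialect):
        if value is not None:
            uid = uuid.uuid1().hex
            AmazonS3Helper().save_key_string(uid, value)  # upload payload
            value = uid                                   # persist only the UID
        return value

    def process_result_value(self, value, dialect):
        if value is not None:
            value = AmazonS3Helper().load_key(value)      # fetch payload by UID
        return value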
Example #11
    def test_list_keys(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'list_keys'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        # ListObjects_1
        res = helper.list_keys('prefix')
        self.assertEqual(set(['a', 'b', 'c']), set([k['Key'] for k in res]))
        # ListObjects_2
        self.assertEqual([], helper.list_keys('another_prefix'))
        self.assertRaises(ParamValidationError, helper.list_keys, None)
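The k['Key'] access and the ParamValidationError for a None prefix both match boto3's list_objects_v2 response shape and parameter validation. A minimal sketch of list_keys along those lines (illustrative only; pagination is ignored):

import boto3

def list_keys(bucket_name, prefix):
    # Sketch only: return the raw object summaries under a prefix;
    # each entry is a dict carrying at least a 'Key' field.
    client = boto3.client('s3')
    response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix)
    return response.get('Contents', [])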
Example #12
    def set_key_metadata(self, uid, folder, key, value):
        if self.check_edit_metadata(folder, key, value):
            key_name = '{0}/{1}/{2}'.format(self.folder, folder, uid)
            s3 = AmazonS3Helper(
                bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
            s3.set_key_metadata(key_name, {key: value}, True)
            # this means key is deleted, need to update model/import handler
            if key == 'hide' and value == 'True':
                obj = s3.load_key(key_name, with_metadata=True)
                cl = Model if folder == FOLDER_MODELS else XmlImportHandler
                model = cl.query.get(obj['Metadata']['id'])
                server_list = [s for s in model.servers_ids if s != self.id]
                model.servers_ids = server_list
                model.save()
Example #13
    def test_check_or_create_bucket(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'bucket'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        # HeadBucket_1
        self.assertRaises(S3ResponseError, helper._check_or_create_bucket)

        # HeadBucket_2
        self.assertTrue(helper._check_or_create_bucket())

        # HeadBucket_3
        self.assertTrue(helper._check_or_create_bucket())
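The three HeadBucket responses in the mock suggest _check_or_create_bucket probes the bucket and creates it when it is missing. A hedged boto3 sketch of that flow (the translation of other errors into S3ResponseError is left out):

import boto3
from botocore.exceptions import ClientError

def check_or_create_bucket(bucket_name):
    # Sketch only: HEAD the bucket, create it on a 404, re-raise otherwise.
    client = boto3.client('s3')
    try:
        client.head_bucket(Bucket=bucket_name)
    except ClientError as e:
        if e.response['Error']['Code'] == '404':
            client.create_bucket(Bucket=bucket_name)
        else:
            raise
    return True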
Example #14
    def test_set_key_metadata(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'metadata'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        # Key not found (HeadObject_1)
        self.assertRaises(AmazonS3ObjectNotFound, helper.set_key_metadata,
                          'name', {})
        # Key exists, empty metadata
        self.assertTrue(
            helper.set_key_metadata('name',
                                    meta={
                                        'Name': 'new_name',
                                        'Other': 'value',
                                        'Third': '3value'
                                    },
                                    store_previous=True))
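S3 does not allow editing object metadata in place, so set_key_metadata most likely copies the object onto itself with replaced metadata. A minimal sketch of that standard trick (the store_previous behaviour exercised by the test is not modelled here):

import boto3

def set_key_metadata(bucket_name, key_name, meta):
    # Sketch only: a self-copy with MetadataDirective='REPLACE'
    # attaches the new metadata to the existing object.
    client = boto3.client('s3')
    client.copy_object(
        Bucket=bucket_name, Key=key_name,
        CopySource={'Bucket': bucket_name, 'Key': key_name},
        Metadata=meta, MetadataDirective='REPLACE')
    return True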
Example #15
    def test_load_key(self):
        # Amazon mock
        self.pill.attach(self.session,
                         os.path.join(self.PILL_RESPONSES_DIR, 'load_key'))
        self.pill.playback()

        helper = AmazonS3Helper(**self.credentials)
        # GetObject_1
        res = helper.load_key('name')
        self.assertTrue(isinstance(res, basestring))

        # GetObject_2
        res = helper.load_key('name', with_metadata=True)
        self.assertTrue(isinstance(res, dict))
        self.assertTrue(isinstance(res['Body'], StreamingBody))
        self.assertEqual(res['Metadata']['Name'], 'name')

        # GetObject_3-6
        self.assertRaises(AmazonS3ObjectNotFound, helper.load_key, 'any')
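The assertions above show load_key returning either the object body or, with with_metadata=True, a dict holding the StreamingBody and its metadata, and raising AmazonS3ObjectNotFound for missing keys. A sketch of that behaviour on top of boto3's get_object (AmazonS3ObjectNotFound is the project's own exception, assumed importable from api.amazon_utils):

import boto3
from botocore.exceptions import ClientError

def load_key(bucket_name, key_name, with_metadata=False):
    # Sketch only: fetch the object; return the raw contents, or the
    # full response dict ('Body', 'Metadata', ...) when metadata is wanted.
    client = boto3.client('s3')
    try:
        obj = client.get_object(Bucket=bucket_name, Key=key_name)
    except ClientError as e:
        if e.response['Error']['Code'] in ('NoSuchKey', '404'):
            raise AmazonS3ObjectNotFound(key_name)  # project-specific exception
        raise
    return obj if with_metadata else obj['Body'].read()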
Example #16
def to_s3(data, import_handler_id):
    from api.amazon_utils import AmazonS3Helper
    from datetime import datetime
    import api
    try:
        handler = XmlImportHandler.query.get(import_handler_id)
        if not handler:
            raise ValueError("Import handler {0} not found".format(
                import_handler_id))
        key = "{0}/{1}_python_script_{2}.py".format(
            api.app.config['IMPORT_HANDLER_SCRIPTS_FOLDER'],
            handler.name,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        s3helper = AmazonS3Helper()
        s3helper.save_key_string(key, data)
    except Exception as e:
        raise ValueError("Error when uploading file to Amazon S3: "
                         "{0}".format(e))
    return key
Example #17
    def delete(self):
        # Stop task
        # self.terminate_task()  # TODO
        filename = self.filename
        on_s3 = self.on_s3
        uid = self.uid

        super(DataSet, self).delete()
        LogMessage.delete_related_logs(self.id, type_=LogMessage.IMPORT_DATA)

        # TODO: check import handler type
        try:
            os.remove(filename)
        except OSError:
            pass
        if on_s3:
            from botocore.exceptions import ClientError
            helper = AmazonS3Helper()
            try:
                helper.delete_key(uid)
            except ClientError as e:
                logging.exception(str(e))
Example #18
    def get_s3_download_url(self, expires_in=3600):
        helper = AmazonS3Helper()
        return helper.get_download_url(self.uid, expires_in)
Example #19
    def load_from_s3(self):
        helper = AmazonS3Helper()
        return helper.load_key(self.uid)
Example #20
    def get_trainer_s3url(self, expires_in=3600):
        trainer_filename = self.get_trainer_filename()
        if self.status != self.STATUS_TRAINED or not trainer_filename:
            return None
        helper = AmazonS3Helper()
        return helper.get_download_url(trainer_filename, expires_in)
Example #21
def upload_segment_features_transformers(model_id, segment_id, fformat):
    model = Model.query.get(model_id)
    segment = Segment.query.get(segment_id)
    log_id = segment_id
    from api.async_tasks.models import AsyncTask
    if upload_segment_features_transformers.request.id is not None:
        tasks = AsyncTask.query\
            .filter_by(
                task_id=upload_segment_features_transformers.request.id
            ).limit(1)
        log_id = tasks[0].id

    init_logger('prepare_transformer_for_download_log',
                obj=int(log_id))
    logging.info('Start preparing segment features transformers for download')

    try:
        from zipfile import ZipFile, ZIP_DEFLATED
        from api.amazon_utils import AmazonS3Helper
        import os
        from tempfile import NamedTemporaryFile
        files = []
        arc_name = "{0}-{1}-{2}.zip".format(model.name, segment.name, fformat)

        def _save_content(content, feature_name, transformer_type):
            filename = "{0}-{1}-{2}-data.{3}".format(segment.name,
                                                     feature_name,
                                                     transformer_type,
                                                     fformat)
            logging.info("Creating %s" % filename)
            if fformat == 'csv':
                import csv
                import StringIO
                si = StringIO.StringIO()
                if len(content):
                    fieldnames = content[0].keys()
                    writer = csv.DictWriter(si, fieldnames=fieldnames)
                    writer.writeheader()
                    for c in content:
                        writer.writerow(c)
                response = si.getvalue()
            else:
                import json
                response = json.dumps(content, indent=2)

            with open(filename, 'w') as fh:
                fh.write(response)
            return filename

        trainer = model.get_trainer()
        if segment.name not in trainer.features:
            raise TaskException("Segment %s doesn't exist in the trained "
                                "model" % segment.name)
        for name, feature in trainer.features[segment.name].iteritems():
            if "transformer" in feature and feature["transformer"] is not None:
                try:
                    data = feature["transformer"].load_vocabulary()
                    files.append(_save_content(data, name,
                                               feature["transformer-type"]))
                except AttributeError:
                    logging.warning(
                        "Can't load transformer data for segment {0} feature "
                        "{1} transformer {2}. Transformer doesn't have a "
                        "vocabulary to return or the feature hasn't been "
                        "transformed during model training"
                        .format(segment.name, name,
                                feature["transformer-type"]))
                    continue

        logging.info("Add files to archive")
        with ZipFile(arc_name, "w") as z:
            for f in files:
                z.write(f, compress_type=ZIP_DEFLATED)

        s3 = AmazonS3Helper()
        logging.info('Uploading archive to s3 with name {0}'.format(arc_name))
        s3.save_key(arc_name, arc_name, {
            'model_id': model.id,
            'segment_id': segment_id}, compressed=False)
        s3.close()
        return s3.get_download_url(arc_name, 60 * 60 * 24 * 7)

    except Exception as e:
        logging.error("Got exception when preparing features transformers "
                      "of segment {0} for download: {1} \n {2}"
                      .format(segment.name, e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
Example #22
def upload_model_to_server(server_id, model_id, user_id):
    """
    Upload model to S3 for cloudml-predict.
    """
    init_logger('trainmodel_log', obj=int(model_id))
    logging.info('Starting uploading to cloudml_predict')

    try:
        server = Server.query.get(server_id)
        user = User.query.get(user_id)
        model = Model.query.get(model_id)

        # TODO: Checking name, whether it's enough of the memory, etc.
        model_files = server.list_keys(FOLDER_MODELS)
        for file_ in model_files:
            if file_['name'] == model.name:
                raise ValueError('Model with name "{0}" already exists on '
                                 'the server {1}'.format(
                                     model.name, server.name))

        uid = get_a_Uuid()

        # TODO: Shall we use another account?
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        path = '{0}/{1}/{2}.model'.format(server.folder.strip('/'),
                                          FOLDER_MODELS, uid)
        meta = {
            'id': model.id,
            'object_name': model.name,
            'name': model.name,
            'user_id': user.id,
            'user_name': user.name,
            'hide': "False",
            'uploaded_on': str(datetime.now())
        }

        trainer = model.get_trainer()
        #from cloudml.trainer.store import load_trainer
        #trainer = load_trainer(trainer_data)
        from cloudml.trainer.store import TrainerStorage
        from bson import Binary
        import cPickle as pickle
        trainer_data = Binary(TrainerStorage(trainer).dumps())
        logging.info(len(trainer_data))
        #trainer.visualization = None
        #trainer_data = store_trainer(trainer)
        #trainer_data = model.trainer
        s3.save_key_string(path, trainer_data, meta)
        s3.close()
        model.locked = True
        s_ids = list(model.servers_ids) if (isinstance(model.servers_ids,
                                                       list)) else []
        s_ids.append(server.id)
        model.servers_ids = list(s_ids)
        model.save()
        feature_set = model.features_set
        feature_set.locked = True
        feature_set.save()
        logging.info('Creating Grafana dashboard for model')
        update_grafana_dashboard(server, model)
        logging.info('Model has been uploaded: %s' % model.name)

        return '{0}/{1}.model'.format(FOLDER_MODELS, uid)
    except Exception as e:
        logging.error("Got exception on uploading model to predict: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
Example #23
def get_csv_results(model_id, test_id, fields):
    """
    Gets test classification results in CSV format and saves the file
    to Amazon S3.

    model_id: int
        ID of the model.
    test_id: int
        ID of the test whose examples are to be exported.
    fields: list of string
        List of TestExample field names to export to the CSV file.
    """
    from api.amazon_utils import AmazonS3Helper

    def generate(test, name):
        from api.base.io_utils import get_or_create_data_folder
        path = get_or_create_data_folder()
        filename = os.path.join(path, name)
        header = list(fields)
        if 'prob' in header:
            prob_index = header.index('prob')
            for label in reversed(test.classes_set):
                header.insert(prob_index, 'prob_%s' % label)
            header.remove('prob')

        with open(filename, 'w') as fp:
            writer = csv.writer(fp, delimiter=',', quoting=csv.QUOTE_ALL)
            writer.writerow(header)
            for example in TestExample.get_data(test_id, fields):
                rows = []
                for field in fields:
                    if field == '_id':
                        field = 'id'
                    if field == 'id':
                        field = 'example_id'
                    val = example[field] if field in example else ''
                    if field == 'prob':
                        rows += val
                    else:
                        rows.append(val)
                writer.writerow(rows)
        return filename

    init_logger('runtest_log', obj=int(test_id))

    try:
        test = TestResult.query.filter_by(model_id=model_id,
                                          id=test_id).first()
        if test is None:
            logging.error('Test not found')
            return

        name = 'Examples-{0!s}.csv'.format(uuid.uuid1())
        expires = 60 * 60 * 24 * 7  # 7 days

        logging.info('Creating file {0}...'.format(name))

        s3 = AmazonS3Helper()
        filename = generate(test, name)
        logging.info('Uploading file {0} to s3...'.format(filename))
        s3.save_key(name,
                    filename, {
                        'model_id': model_id,
                        'test_id': test_id
                    },
                    compressed=False)
        s3.close()
        os.remove(filename)
        url = s3.get_download_url(name, expires)

        return url
    except Exception as e:
        logging.error("Got exception on getting test classification results: "
                      " {0} \n {1}".format(e.message, get_task_traceback(e)))
        raise TaskException(e.message, e)
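Examples #21 to #23 follow the same shape: a long-running job builds a file, pushes it to S3 via save_key, and returns a presigned URL. The init_logger/TaskException/.request.id plumbing suggests they run as Celery tasks; if so, a caller would trigger them asynchronously, roughly like this (hypothetical invocation, field names and task registration assumed, not shown on this page):

# Assuming get_csv_results is registered as a Celery task.
async_result = get_csv_results.delay(model_id=1, test_id=2,
                                     fields=['id', 'label', 'pred_label'])
download_url = async_result.get(timeout=600)  # presigned S3 URL, or None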
Example #24
    def list_keys(self, folder=None, params={}):
        path = self.folder.strip('/')
        if folder and folder in self.ALLOWED_FOLDERS:
            path += '/{0!s}'.format(folder)

        objects = []
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        for key in s3.list_keys(path):
            uid = key['Key'].split('/')[-1]
            key = s3.load_key(key['Key'], with_metadata=True)

            if key['Metadata']['hide'] == 'True':
                continue

            objects.append({
                'id': uid,
                'object_name': key['Metadata'].get('object_name', None),
                'size': key['ContentLength'],
                'uploaded_on': key['Metadata'].get('uploaded_on', None),
                'last_modified': str(key['LastModified']),
                'name': key['Metadata'].get('name', None),
                'object_id': key['Metadata'].get('id', None),
                'object_type': key['Metadata'].get('type', None),
                'user_id': key['Metadata'].get('user_id', None),
                'user_name': key['Metadata'].get('user_name', None),
                'crc32': key['Metadata'].get('crc32', None),
                'server_id': self.id,
                'loading_error': key['Metadata'].get('loading_error', None),
                'count_400': key['Metadata'].get('count_400', None),
                'count_500': key['Metadata'].get('count_500', None),
                'count_of_max_response':
                    key['Metadata'].get('count_of_max_response', None),
                'longest_resp_count':
                    key['Metadata'].get('longest_resp_count', None),
                'longest_resp_time':
                    key['Metadata'].get('longest_resp_time', None),
                'max_response_time':
                    key['Metadata'].get('max_response_time', None),
                'requests': key['Metadata'].get('requests', None)
            })

        sort_by = params.get('sort_by', None)
        order = params.get('order', 'asc')
        if objects and sort_by:
            obj = objects[0]
            if sort_by in obj.keys():
                return sorted(objects,
                              key=lambda x: x[sort_by],
                              reverse=order != 'asc')
            else:
                raise ValueError('Unable to sort by %s. '
                                 'Property does not exist.' % sort_by)
        return objects
Example #25
    def get_key_metadata(self, uid, folder, key):
        key_name = '{0}/{1}/{2}'.format(self.folder, folder, uid)
        s3 = AmazonS3Helper(
            bucket_name=app.config['CLOUDML_PREDICT_BUCKET_NAME'])
        s3key = s3.load_key(key_name, with_metadata=True)
        return s3key['Metadata'][key]