def __init__(self, *args, **kwargs):
        """Keep the constructor arguments and mirror keywords as attributes.

        A logger obtained from ``util.get_logger()`` is stored on
        ``self.logger``; the positional and keyword arguments are stored on
        ``self.args`` and ``self.kwargs``.  Each keyword argument is then also
        set as an instance attribute of the same name, so a keyword called
        ``logger``, ``args`` or ``kwargs`` replaces the value assigned just
        before it.
        """
        self.logger = util.get_logger()
        self.args, self.kwargs = args, kwargs

        # Expose every keyword argument directly on the instance.
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])
Esempio n. 2
0
class Database:
    """Asynchronous helper that inserts event dictionaries via a connection pool.

    The pool is built by ``create_pool`` (presumably aiomysql's -- confirm
    against the file's imports) from the keyword arguments in ``config``.
    Use :meth:`connect` to obtain an instance whose pool is already created.
    """

    logger = util.get_logger('app.storage.mysql.Database')

    # Convert 24hrTicker to avoid leading number in table name
    table_conv = {
        '24hrTicker': 'ticker_24hr',
    }

    @classmethod
    async def connect(cls, config):
        """Create an instance for ``config`` and initialise its pool.

        ``cls`` is instantiated (rather than ``Database`` directly) so that
        subclasses get an instance of their own type.
        """
        db = cls(config)
        await db.init_pool()
        return db

    def __init__(self, config: Dict):
        """Remember ``config``; the pool stays ``None`` until ``init_pool``."""
        self.config = config
        self.pool = None

    async def init_pool(self):
        """Create the connection pool from ``self.config``."""
        self.pool = await create_pool(use_unicode=True,
                                      charset="utf8",
                                      **self.config)

    async def save(self, entry: Dict):
        """Insert ``entry`` as one row into the table named by its event type.

        ``entry['eventType']`` selects the table (translated through
        ``table_conv`` when listed there).  Every other key except
        ``'_ignore'`` becomes a column and its value the bound parameter.
        ``entry`` itself is not modified.

        Raises:
            KeyError: if ``entry`` has no ``'eventType'`` key.

        Errors raised while talking to the database are logged and
        swallowed, so a failed insert does not propagate to the caller.
        """
        event_type = entry['eventType']
        table = self.table_conv.get(event_type) or event_type

        # Work on a filtered copy so the caller's dict is left untouched.
        row = {
            column: value
            for column, value in entry.items()
            if column not in ('_ignore', 'eventType')
        }

        # NOTE(review): table and column names are interpolated into the
        # statement (identifiers cannot be bound as parameters), so the keys
        # of ``entry`` and its event type must come from a trusted source.
        sql = "INSERT INTO {} ({}) VALUES ({})".format(
            table, ', '.join(row), ', '.join(['%s'] * len(row)))
        args = tuple(row.values())

        async with self.pool.acquire() as conn:
            try:
                # The context manager closes the cursor even when execute()
                # fails, so cursors are not leaked on pooled connections.
                async with conn.cursor() as cursor:
                    await cursor.execute(sql, args)
                await conn.commit()
            except Exception:
                # Best-effort insert: record the failure and carry on.
                self.logger.error("MySQL Database Error:%s",
                                  traceback.format_exc())

    async def close(self):
        """Close the pool and wait for it; a no-op if no pool was created."""
        if self.pool is None:
            return
        self.pool.close()
        await self.pool.wait_closed()
def upload_file():
    """Accept a POST containing files and pass each allowed one to the image model.

    A GET request is redirected to the ``admin`` endpoint straight away.
    Otherwise every file submitted under the ``file[]`` field whose name has
    an extension listed in ``app.config['ALLOWED_EXTENSIONS']`` is handed to
    ``models.images.Image`` (which presumably parks it in uploads -- confirm
    against the model).  The number of accepted files is logged and the
    client is redirected to ``admin``.
    """

    # Redirect to admin if this is NOT a POST.
    if flask.request.method == 'GET':
        return flask.redirect(flask.url_for('admin'))

    logger = util.get_logger(log_name='upload')

    def allowed_file(filename):
        """Return True when ``filename`` carries an allowed extension."""
        # A part submitted without a file name has nothing to check.
        if not filename or '.' not in filename:
            return False
        extension = filename.rsplit('.', 1)[1].lower()
        return extension in app.config['ALLOWED_EXTENSIONS']

    processed = 0
    for in_file in flask.request.files.getlist('file[]'):
        if allowed_file(in_file.filename):
            # Only the constructor call matters here; the resulting object
            # is not used afterwards.
            models.images.Image(raw_upload=in_file)
            processed += 1

    # NOTE(review): assumes util.get_logger returns a standard logger.
    logger.info("Uploaded %s files successfully!", processed)
    return flask.redirect(flask.url_for('admin'))
Esempio n. 4
0
class NER:
    """Wrapper around a spaCy pipeline for pulling place-like entities from text."""

    logger = util.get_logger("nlp.ner.NER")

    def __init__(self, model='en_core_web_lg'):
        """Load the spaCy pipeline called ``model`` and keep it on ``self.nlp``."""
        self._model = model
        self.logger.debug("Loading spaCy %s", model)
        self.nlp = spacy.load(model)

    def doc(self, expr):
        """Run the loaded pipeline over ``expr`` and return its result."""
        parsed = self.nlp(expr)
        return parsed

    @staticmethod
    def _ent_to_text(ent):
        """Return the text for ``ent``, translating NORP entities via ``norp_gpe_map``.

        Entities with any other label, and NORP texts missing from the map,
        are returned unchanged.
        """
        if ent.label_ != 'NORP':
            return ent.text
        return norp_gpe_map.get(ent.text, ent.text)

    def entities(self, doc):
        """Return the set of texts for the NORP, LOC and GPE entities of ``doc``."""
        wanted_labels = frozenset(('NORP', 'LOC', 'GPE'))
        return {
            self._ent_to_text(entity)
            for entity in doc.ents
            if entity.label_ in wanted_labels
        }
 def __init__(self):
     """Attach the logger returned by ``util.get_logger()`` to the instance."""
     self.logger = util.get_logger()
Esempio n. 6
0
class Deduplicator:
    """Cluster similar headlines under the id of the first headline of each group.

    Each accepted headline is compared with the head of every existing
    group.  The score is the sequence-matcher ratio of the tokenized
    headlines plus ``boost`` for every named entity the two headlines share;
    a score of at least ``threshold`` makes the group a candidate, and the
    headline joins the best-scoring candidate or starts a group of its own.
    """

    logger = util.get_logger("deduplicator.Deduplicator")
    # Minimum boosted score for a headline to be attached to a group.
    threshold = 0.50
    # Amount added to the score per named entity two headlines have in common.
    boost = 0.10

    def __init__(self):
        """Create the helper objects and the empty bookkeeping dictionaries."""
        self.sm = SequenceMatcher()
        self.tokenizer = Tokenizer()
        self.ner = NER()
        self.headlines = dict()    # id -> headline as received
        self._headlines = dict()   # id -> tokens of the headline, space-joined
        self.parents = dict()      # id -> id of the group it belongs to
        self.groups = dict()       # group id -> ids attached to that group
        # id -> frozenset of lower-cased entities, filled on demand so the
        # NER pipeline runs at most once per stored headline.
        self._entities = dict()

    def _start_group(self, _id):
        """Register ``_id`` as the head of a new, empty group and return it."""
        self.parents[_id] = _id
        self.groups[_id] = []
        return _id

    def _entities_for(self, _id):
        """Return the cached lower-cased entity set of the headline ``_id``."""
        cached = self._entities.get(_id)
        if cached is None:
            doc = self.ner.doc(self.headlines[_id])
            cached = frozenset(util.lowercase(self.ner.entities(doc)))
            self._entities[_id] = cached
        return cached

    def accept(self, _id: str, headline: str) -> str:
        """Record ``headline`` under ``_id`` and return the id of its group.

        The returned id equals ``_id`` when the headline starts a new group
        (first headline ever, or no group reaches ``threshold``); otherwise
        it is the id of the best-scoring existing group, to which ``_id`` is
        appended.  When several groups share the highest score, the one
        iterated last wins.
        """
        self.headlines[_id] = headline
        _headline = ' '.join(self.tokenizer.tokenize(headline))
        self._headlines[_id] = _headline
        # The text stored for this id may just have changed, so any entity
        # set cached for it is stale.
        self._entities.pop(_id, None)

        if not self.groups:
            self.logger.debug("[%s] %s - first item", _id, headline)
            return self._start_group(_id)

        a = _headline
        ents1 = self._entities_for(_id)
        best = None  # (score, group_id) of the best candidate so far
        for group_id in self.groups:
            b = self._headlines[group_id]
            self.sm.set_seqs(a, b)
            ratio = self.sm.ratio()
            # Check if there are any named entities in common
            ncommon = len(ents1 & self._entities_for(group_id))
            boost = ncommon * self.boost
            ratio += boost
            self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f (+%.2f)", _id, a,
                              group_id, b, ratio, boost)
            # ``>=`` against the running best keeps the later group on a tie.
            if ratio >= self.threshold and (best is None or ratio >= best[0]):
                best = (ratio, group_id)

        if best is None:
            self.logger.debug("[%s] %s - no matches found", _id, headline)
            return self._start_group(_id)

        highest_ratio, group_id = best
        b = self._headlines[group_id]
        self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f was the high score",
                          _id, a, group_id, b, highest_ratio)
        self.parents[_id] = group_id
        self.groups[group_id].append(_id)
        return group_id

    def print_tree(self, original=True):
        """Print every group head followed by its members as a small tree.

        ``original`` selects the headlines as received (True) or their
        tokenized form (False).
        """
        headlines = self.headlines if original else self._headlines
        print("")
        for group_id, members in self.groups.items():
            print("[%s] %s" % (group_id, headlines[group_id]))
            if members:
                print(" |")
                for _id in members:
                    print(" |-- [%s] %s" % (_id, headlines[_id]))
                print("")
        print("")

    def export(self):
        """Return the bookkeeping dictionaries (not copies) keyed by name."""
        return {
            'headlines': self.headlines,
            '_headlines': self._headlines,
            'parents': self.parents,
            'groups': self.groups
        }