def __init__(self, *args, **kwargs):
    """Default initialiser shared by all objects.

    Attaches a logger obtained from ``util.get_logger()``, keeps the raw
    positional and keyword arguments on ``self.args`` / ``self.kwargs``,
    and then mirrors every keyword argument onto the instance as an
    attribute of the same name.
    """
    self.logger = util.get_logger()
    self.args, self.kwargs = args, kwargs

    # Expose each keyword argument directly, e.g. Foo(x=1).x == 1.
    for name in kwargs:
        setattr(self, name, kwargs[name])
class Database:
    """Async wrapper around a connection pool that stores event dicts as rows.

    The pool is built by ``create_pool`` from the ``config`` mapping given
    at construction time. NOTE(review): the call pattern (``pool.acquire()``,
    awaited ``conn.cursor()``, ``pool.wait_closed()``) looks like aiomysql -
    confirm against the file's imports.
    """

    logger = util.get_logger('app.storage.mysql.Database')

    # Convert 24hrTicker to avoid leading number in table name
    table_conv = {
        '24hrTicker': 'ticker_24hr',
    }

    @classmethod
    async def connect(cls, config):
        """Build an instance for ``config``, open its pool and return it.

        Uses ``cls`` rather than a hard-coded class so that subclasses
        calling ``Subclass.connect(...)`` get an instance of themselves.
        """
        db = cls(config)
        await db.init_pool()
        return db

    def __init__(self, config: Dict):
        """Remember ``config``; the pool stays ``None`` until ``init_pool``."""
        self.config = config
        self.pool = None

    async def init_pool(self):
        """Create the connection pool from the stored configuration."""
        self.pool = await create_pool(use_unicode=True, charset="utf8", **self.config)

    async def save(self, entry: Dict):
        """Insert ``entry`` as one row into the table named by its event type.

        ``entry['eventType']`` selects the table (after translation through
        ``table_conv``); the ``'_ignore'`` and ``'eventType'`` keys are left
        out of the row and every remaining key becomes a column. The
        caller's dict is not modified.

        Database errors are logged and swallowed (best effort).

        Raises:
            KeyError: if ``entry`` has no ``'eventType'`` key.
        """
        event_type = entry['eventType']
        table = self.table_conv.get(event_type) or event_type

        # Work on a filtered copy so the caller's dict stays intact and can
        # be saved (or inspected) again afterwards.
        row = {
            column: value
            for column, value in entry.items()
            if column not in ('_ignore', 'eventType')
        }

        # NOTE(review): values are passed as query parameters, but the table
        # and column names are interpolated into the SQL text. Keys and
        # event types must come from a trusted source.
        sql = "INSERT INTO {} ({}) VALUES ({})".format(
            table,
            ', '.join(row.keys()),
            ', '.join(['%s'] * len(row)))
        args = tuple(row.values())

        async with self.pool.acquire() as conn:
            try:
                cursor = await conn.cursor()
                try:
                    await cursor.execute(sql, args)
                    await conn.commit()
                finally:
                    # Always release the cursor; a failure while closing is
                    # caught and logged by the handler below.
                    await cursor.close()
            except Exception:
                self.logger.error(
                    "MySQL Database Error:%s",
                    traceback.format_exception(*sys.exc_info()))

    async def close(self):
        """Close the pool and wait for it to finish; no-op if never opened."""
        if self.pool is None:
            return
        self.pool.close()
        await self.pool.wait_closed()
def upload_file():
    """Accept a post containing files and hand each allowed one to the image model.

    GET requests are redirected straight to the ``admin`` view. For other
    requests, every file in the ``file[]`` form field whose extension is
    listed in ``app.config['ALLOWED_EXTENSIONS']`` is passed to
    ``models.images.Image(raw_upload=...)``; the number of files handled is
    logged and the client is redirected to ``admin``.
    """
    # A plain GET carries no upload; send it back to the admin page.
    if flask.request.method == 'GET':
        return flask.redirect(flask.url_for('admin'))

    logger = util.get_logger(log_name='upload')

    def allowed_file(filename):
        """Return True when ``filename`` has an allowed extension."""
        # An upload part may arrive without a filename; treat as not allowed
        # instead of failing on the membership test.
        if not filename or '.' not in filename:
            return False
        extension = filename.rsplit('.', 1)[1].lower()
        return extension in app.config['ALLOWED_EXTENSIONS']

    processed = 0
    for in_file in flask.request.files.getlist('file[]'):
        if allowed_file(in_file.filename):
            # Constructed for its effect; the instance itself is not needed
            # here. (presumably the model stores the upload - verify)
            models.images.Image(raw_upload=in_file)
            processed += 1

    # The summary used to be built and then discarded; record it.
    logger.info("Uploaded %s files successfully!", processed)
    return flask.redirect(flask.url_for('admin'))
class NER:
    """Named-entity helper built on a spaCy pipeline.

    Loads the requested spaCy model once and offers helpers to parse text
    and pull out the NORP / LOC / GPE entities of a parsed document.
    """

    logger = util.get_logger("nlp.ner.NER")

    def __init__(self, model='en_core_web_lg'):
        """Load the spaCy model called ``model``."""
        self._model = model
        self.logger.debug("Loading spaCy %s", self._model)
        self.nlp = spacy.load(self._model)

    def doc(self, expr):
        """Run the loaded pipeline over ``expr`` and return the result."""
        return self.nlp(expr)

    @staticmethod
    def _ent_to_text(ent):
        """Return the text for ``ent``, translating NORP entities.

        NORP entities are looked up in ``norp_gpe_map`` and fall back to
        their own text when absent; all other labels return their text.
        """
        if ent.label_ != 'NORP':
            return ent.text
        text = ent.text
        return norp_gpe_map.get(text, text)

    def entities(self, doc):
        """Return the set of NORP / LOC / GPE entity texts found in ``doc``."""
        wanted_labels = {'NORP', 'LOC', 'GPE'}
        return {
            self._ent_to_text(ent)
            for ent in doc.ents
            if ent.label_ in wanted_labels
        }
def __init__(self):
    """Attach a logger obtained from ``util.get_logger()`` to the instance."""
    instance_logger = util.get_logger()
    self.logger = instance_logger
class Deduplicator:
    """Incrementally cluster headlines that look like duplicates.

    Each accepted headline is compared with the leader of every existing
    group. The score is the ``SequenceMatcher`` ratio of the tokenised
    headlines plus ``boost`` for every named entity the two share. If the
    best score reaches ``threshold`` the headline joins that group;
    otherwise it starts a new group of its own.
    """

    logger = util.get_logger("deduplicator.Deduplicator")

    # Minimum (boosted) similarity for a headline to join an existing group.
    threshold = 0.50
    # Score added per named entity shared with a group leader.
    boost = 0.10

    def __init__(self):
        self.sm = SequenceMatcher()
        self.tokenizer = Tokenizer()
        self.ner = NER()
        self.headlines = dict()    # id -> original headline
        self._headlines = dict()   # id -> tokenised, re-joined headline
        self.parents = dict()      # id -> id of the group leader
        self.groups = dict()       # leader id -> list of member ids
        # leader id -> lower-cased entity set, so each leader headline goes
        # through the NER pipeline once instead of once per accept() call.
        self._group_entities = dict()

    def _entities_of(self, headline):
        """Return the lower-cased named entities of ``headline`` as a set."""
        doc = self.ner.doc(headline)
        return set(util.lowercase(self.ner.entities(doc)))

    def _group_entities_for(self, group_id):
        """Return the cached entity set of a group leader, computing it on a miss."""
        entities = self._group_entities.get(group_id)
        if entities is None:
            entities = self._entities_of(self.headlines[group_id])
            self._group_entities[group_id] = entities
        return entities

    def accept(self, _id: str, headline: str) -> str:
        """Register ``headline`` under ``_id`` and return its group id.

        Returns ``_id`` itself when the headline starts a new group,
        otherwise the id of the best-scoring existing group. When several
        groups tie for the best score, the one added most recently wins.
        """
        self.headlines[_id] = headline
        tokens = self.tokenizer.tokenize(headline)
        _headline = ' '.join(tokens)
        self._headlines[_id] = _headline
        # The headline stored for this id may just have changed, so any
        # entity set cached for it is no longer trustworthy.
        self._group_entities.pop(_id, None)

        if not self.groups:
            self.logger.debug("[%s] %s - first item", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            return _id

        a = _headline
        ents1 = self._entities_of(headline)

        matches = []
        for group_id in self.groups:
            b = self._headlines[group_id]
            self.sm.set_seqs(a, b)
            ratio = self.sm.ratio()

            # Check if there are any named entities in common
            ents2 = self._group_entities_for(group_id)
            ncommon = len(ents1 & ents2)
            boost = ncommon * self.boost
            ratio += boost

            self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f (+%.2f)",
                              _id, a, group_id, b, ratio, boost)
            if ratio >= self.threshold:
                matches.append((ratio, group_id))

        if not matches:
            self.logger.debug("[%s] %s - no matches found", _id, headline)
            self.parents[_id] = _id
            self.groups[_id] = []
            # This id is now a leader; its entities are already known.
            self._group_entities[_id] = ents1
            return _id

        # max() keeps the first maximal item it sees; scanning in reverse
        # therefore picks the last-added group among equal scores, which is
        # what a stable ascending sort followed by pop() would select.
        highest_ratio, group_id = max(reversed(matches), key=lambda m: m[0])
        b = self._headlines[group_id]
        self.logger.debug("[%s] %s <-> [%s] %s ==> %.2f was the high score",
                          _id, a, group_id, b, highest_ratio)
        self.parents[_id] = group_id
        self.groups[group_id].append(_id)
        return group_id

    def print_tree(self, original=True):
        """Print every group with its members.

        Shows the original headlines when ``original`` is true, otherwise
        the tokenised ones.
        """
        headlines = self.headlines if original else self._headlines
        print("")
        for group_id, members in self.groups.items():
            print("[%s] %s" % (group_id, headlines[group_id]))
            if members:
                print(" |")
            for _id in members:
                print(" |-- [%s] %s" % (_id, headlines[_id]))
            if members:
                print("")
        print("")

    def export(self):
        """Return the internal state dicts (the live objects, not copies)."""
        return {
            'headlines': self.headlines,
            '_headlines': self._headlines,
            'parents': self.parents,
            'groups': self.groups
        }