class AhoCorasickEntityAnalyzer(Analyzer):
    ORIGIN = 'regex'
    MIN_LENGTH = 100
    cache = AutomatonCache()

    def prepare(self):
        # Building the automaton is expensive, so skip it entirely when
        # regex-based entity extraction is disabled via configuration.
        self.disabled = not get_config('REGEX_ENTITIES', True)
        if not self.disabled:
            self.cache.generate()
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)

    def on_text(self, text):
        if self.cache.automaton.kind == EMPTY:
            return
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        text = match_form(text)
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        text = text.encode('utf-8')
        for match in self.cache.automaton.iter(text):
            for (value, tag) in match[1]:
                self.collector.emit(value, tag)

    def finalize(self):
        log.info('Aho-Corasick extracted %s entities.', len(self.collector))
        self.collector.save()

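# For context, a minimal sketch of what AutomatonCache.generate() might do
# with the pyahocorasick library. The AutomatonCache internals, the `terms`
# mapping and the payload shape are assumptions; only the `ahocorasick`
# calls are the library's real API.
import ahocorasick

def build_automaton(terms):
    # terms: mapping of normalized surface form -> list of (label, tag)
    # pairs, mirroring the payload the analyzer unpacks from match[1].
    automaton = ahocorasick.Automaton()
    for form, payload in terms.items():
        # Byte-string keys match the text.encode('utf-8') call above; this
        # assumes a pyahocorasick build compiled in bytes mode.
        automaton.add_word(form.encode('utf-8'), payload)
    if len(automaton):
        # Convert the trie into a searchable automaton; until this call,
        # automaton.kind remains TRIE (or EMPTY when no words were added).
        automaton.make_automaton()
    return automaton
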
def analyze(self, document):
    if not document.supports_nlp:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    try:
        self.extract(collector, document)
    finally:
        collector.save()

def analyze(self, document):
    if document.schema in self.IGNORED:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    try:
        self.extract(collector, document)
    finally:
        collector.save()

def update(self):
    """Apply the outcome of the result to the document."""
    if self.status == self.STATUS_SUCCESS:
        self.document.status = Document.STATUS_SUCCESS
        self.document.error_message = None
    else:
        self.document.status = Document.STATUS_FAIL
        self.document.type = Document.TYPE_OTHER
        self.document.error_message = self.error_message
    self.document.foreign_id = stringify(self.id)
    if self.checksum:
        self.document.content_hash = self.checksum
    self.document.uploader_id = self.role_id or self.document.uploader_id
    self.document.file_size = self.size
    self.document.file_name = self.file_name
    self.document.title = stringify(self.title)
    self.document.summary = stringify(self.summary)
    self.document.author = stringify(self.author)
    self.document.keywords = self.keywords
    self.document.mime_type = stringify(self.mime_type)
    self.document.encoding = self.encoding
    self.document.languages = self.languages
    self.document.headers = self.headers
    self.document.pdf_version = self.pdf_hash
    # Materialize the dict view so the column value serializes cleanly.
    self.document.columns = list(self.columns.keys())
    collector = DocumentTagCollector(self.document, 'ingestors')
    for entity in self.entities:
        collector.emit(entity, DocumentTag.TYPE_PERSON)
    for email in self.emails:
        collector.emit(email, DocumentTag.TYPE_EMAIL)
    collector.save()

def extract(self, collector, document):
    # Saving an empty collector clears any tags previously written by the
    # legacy in-process NER analyzers under these origins.
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    try:
        service = EntityExtractStub(self.channel)
        texts = self.text_iterator(document)
        entities = service.Extract(texts)
        for entity in entities.entities:
            type_ = self.TYPES.get(entity.type)
            if type_ is None:
                continue
            collector.emit(entity.label, type_, weight=entity.weight)
        log.info('Extracted %s entities.', len(collector))
    except self.Error as e:
        log.warning("gRPC [%s]: %s", e.code(), e.details())

def extract(self, collector, document):
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    try:
        service = EntityExtractStub(self.channel)
        texts = self.text_iterator(document)
        entities = service.Extract(texts)
        for entity in entities.entities:
            type_ = self.TYPES.get(entity.type)
            if type_ is None:
                continue
            collector.emit(entity.label, type_, weight=entity.weight)
        log.info('Extracted %s entities.', len(collector))
    except self.Error:
        log.exception("gRPC Error: %s", self.SERVICE)
        self.reset_channel()

class RegexAnalyzer(Analyzer):
    REGEX = None
    FLAG = None

    def prepare(self):
        # TODO: re-think this.
        self.disabled = self.document.type == self.document.TYPE_TABULAR
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)
        self.regex = re.compile(self.REGEX, self.FLAG)

    def on_text(self, text):
        if not self.disabled:
            for mobj in self.regex.finditer(text):
                self.on_match(mobj)

    def finalize(self):
        self.collector.save()

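# A minimal subclass sketch to show the RegexAnalyzer contract: REGEX and
# FLAG configure the compiled pattern, and on_match() receives each match
# object. The class name, pattern and TYPE wiring here are illustrative
# assumptions, not the project's actual implementation.
import re

class EMailRegexAnalyzer(RegexAnalyzer):
    ORIGIN = 'regex'
    # Deliberately simple pattern, for illustration only.
    REGEX = r'[\w.+-]+@[\w-]+\.[\w.-]+'
    FLAG = re.UNICODE
    TYPE = DocumentTag.TYPE_EMAIL

    def on_match(self, mobj):
        # Every regex hit becomes a document tag of the configured type.
        self.collector.emit(mobj.group(0), self.TYPE)
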
def extract(self, collector, document):
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    try:
        service = EntityExtractStub(self.channel)
        texts = self.text_iterator(document)
        entities = service.Extract(texts)
        for entity in entities.entities:
            if entity.type == ExtractedEntity.COUNTRY:
                document.add_country(entity.label)
            if entity.type == ExtractedEntity.LANGUAGE:
                document.add_language(entity.label)
            type_ = self.TYPES.get(entity.type)
            # log.info('%s: %s', entity.label, type_)
            if type_ is not None:
                collector.emit(entity.label, type_, weight=entity.weight)
        log.info('Extracted %s entities.', len(collector))
    except self.Error as e:
        log.warning("gRPC [%s]: %s", e.code(), e.details())

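# For orientation, a sketch of how the channel behind these extract()
# variants might be wired up. The service address, the generated module
# names and the Text message type are assumptions; only the grpc calls
# themselves are the real library API.
import grpc

from entityextract_pb2 import Text  # hypothetical generated message
from entityextract_pb2_grpc import EntityExtractStub  # hypothetical stub

def extract_entities(address, texts):
    # A single channel can be reused across many calls; stubs are cheap.
    channel = grpc.insecure_channel(address)
    service = EntityExtractStub(channel)
    # Request-streaming style: the stub consumes an iterator of messages,
    # mirroring text_iterator(document) in the analyzers above.
    requests = (Text(text=t) for t in texts)
    return service.Extract(requests)
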
def update(self):
    """Apply the outcome of the result to the document."""
    doc = self.document
    if self.status == self.STATUS_SUCCESS:
        doc.status = Document.STATUS_SUCCESS
        doc.error_message = None
    else:
        doc.status = Document.STATUS_FAIL
        doc.error_message = stringify(self.error_message)
    schema = model['Document']
    for flag, name in self.SCHEMATA:
        if flag in self.flags:
            schema = model[name]
    doc.schema = schema.name
    doc.foreign_id = safe_string(self.id)
    doc.content_hash = self.checksum or doc.content_hash
    doc.pdf_version = self.pdf_checksum
    doc.title = self.title or doc.meta.get('title')
    doc.file_name = self.file_name or doc.meta.get('file_name')
    doc.file_size = self.size or doc.meta.get('file_size')
    doc.summary = self.summary or doc.meta.get('summary')
    doc.author = self.author or doc.meta.get('author')
    doc.generator = self.generator or doc.meta.get('generator')
    doc.mime_type = self.mime_type or doc.meta.get('mime_type')
    doc.encoding = self.encoding or doc.meta.get('encoding')
    doc.date = self.date or doc.meta.get('date')
    doc.authored_at = self.created_at or doc.meta.get('authored_at')
    doc.modified_at = self.modified_at or doc.meta.get('modified_at')
    doc.published_at = self.published_at or doc.meta.get('published_at')
    doc.message_id = self.message_id or doc.meta.get('message_id')
    doc.in_reply_to = ensure_list(self.in_reply_to)
    doc.columns = list(self.columns.keys())
    doc.body_raw = self.body_html
    doc.body_text = self.body_text
    doc.headers = self.headers
    for kw in self.keywords:
        doc.add_keyword(safe_string(kw))
    for lang in self.languages:
        doc.add_language(safe_string(lang))
    db.session.flush()
    collector = DocumentTagCollector(doc, 'ingestors')
    for entity in self.entities:
        collector.emit(entity, DocumentTag.TYPE_PERSON)
    for email in self.emails:
        collector.emit(email, DocumentTag.TYPE_EMAIL)
    collector.save()

def update(self):
    """Apply the outcome of the result to the document."""
    doc = self.document
    if self.status == self.STATUS_SUCCESS:
        doc.status = Document.STATUS_SUCCESS
        doc.error_message = None
    else:
        doc.status = Document.STATUS_FAIL
        doc.error_message = stringify(self.error_message)
    schema = model['Document']
    for flag, name in self.SCHEMATA:
        if flag in self.flags:
            schema = model[name]
    doc.schema = schema.name
    doc.foreign_id = stringify(self.id)
    doc.content_hash = self.checksum or doc.content_hash
    doc.uploader_id = self.role_id or doc.uploader_id
    doc.title = stringify(self.title) or doc.meta.get('title')
    doc.file_name = stringify(self.file_name) or doc.meta.get('file_name')
    doc.file_size = self.size or doc.meta.get('file_size')
    doc.summary = stringify(self.summary) or doc.meta.get('summary')
    doc.author = stringify(self.author) or doc.meta.get('author')
    doc.generator = stringify(self.generator) or doc.meta.get('generator')
    doc.mime_type = stringify(self.mime_type) or doc.meta.get('mime_type')
    doc.encoding = stringify(self.encoding) or doc.meta.get('encoding')
    doc.date = self.date or doc.meta.get('date')
    doc.authored_at = self.created_at or doc.meta.get('authored_at')
    doc.modified_at = self.modified_at or doc.meta.get('modified_at')
    doc.published_at = self.published_at or doc.meta.get('published_at')
    for kw in self.keywords:
        doc.add_keyword(kw)
    for lang in self.languages:
        doc.add_language(lang)
    doc.headers = self.headers or doc.meta.get('headers')
    doc.columns = list(self.columns.keys())
    if len(self.pages):
        doc.body_text = '\n\n'.join(self.pages)
    collector = DocumentTagCollector(doc, 'ingestors')
    for entity in self.entities:
        collector.emit(entity, DocumentTag.TYPE_PERSON)
    for email in self.emails:
        collector.emit(email, DocumentTag.TYPE_EMAIL)
    collector.save()

def analyze(self, document):
    if document.type in [document.TYPE_TABULAR, document.TYPE_OTHER]:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    text = document.text
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    try:
        hint_language_code = None
        if len(document.languages) == 1:
            hint_language_code = document.languages[0]
        text = Text(text, hint_language_code=hint_language_code)
        for entity in text.entities:
            if entity.tag == 'I-LOC' or len(entity) == 1:
                continue
            label = ' '.join(entity)
            if len(label) < 4 or len(label) > 200:
                continue
            collector.emit(label, self.TYPES[entity.tag])
    except ValueError as ve:
        log.info('NER value error: %r', ve)
    except Exception as ex:
        log.warning('NER failed: %r', ex)
    finally:
        log.info('Polyglot extracted %s entities.', len(collector))
        collector.save()

def analyze(self, document):
    if document.schema in self.IGNORED:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    text = document.text
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    try:
        hint_language_code = None
        if len(document.languages) == 1:
            hint_language_code = document.languages[0]
        text = Text(text, hint_language_code=hint_language_code)
        for entity in text.entities:
            if entity.tag == 'I-LOC':
                continue
            label = ' '.join(entity)
            label = self.CLEAN.sub(' ', label)
            label = collapse_spaces(label)
            if ' ' not in label or len(label) < 4 or len(label) > 200:
                continue
            # log.info("Entity [Doc %s]: %s [%s]",
            #          document.id, label, entity.tag)
            collector.emit(label, self.TYPES[entity.tag])
    except ValueError as ve:
        log.warning('NER value error: %r', ve)
    except Exception as ex:
        log.warning('NER failed: %r', ex)
    finally:
        collector.save()
        log.info('Polyglot extracted %s entities.', len(collector))

def analyze(self, document):
    collector = DocumentTagCollector(document, self.ORIGIN)
    for match in self.RE.finditer(document.text):
        text = self.extract_match(document, match)
        if text is not None:
            collector.emit(text, self.TYPE)
    collector.save()

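# To show how extract_match() can double as a validator and normalizer, a
# sketch of a phone-number analyzer. The document.countries attribute and
# the class wiring are assumptions; the phonenumbers calls are that
# library's real API.
import re
import phonenumbers

class PhoneNumberAnalyzer(Analyzer):
    ORIGIN = 'phones'
    RE = re.compile(r'(\+?[\d\-\(\)\/\s]{6,16})', re.MULTILINE)
    TYPE = DocumentTag.TYPE_PHONE

    def extract_match(self, document, match):
        candidate = match.group(0)
        # Try parsing against each country hint on the document; a
        # candidate that never validates is dropped by returning None.
        regions = [None] + [c.upper() for c in document.countries]
        for region in regions:
            try:
                num = phonenumbers.parse(candidate, region)
            except phonenumbers.NumberParseException:
                continue
            if phonenumbers.is_valid_number(num):
                return phonenumbers.format_number(
                    num, phonenumbers.PhoneNumberFormat.E164)
        return None
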
class PolyglotEntityAnalyzer(Analyzer):
    ORIGIN = 'polyglot'
    MIN_LENGTH = 100
    TYPES = {
        'I-PER': DocumentTag.TYPE_PERSON,
        'I-ORG': DocumentTag.TYPE_ORGANIZATION,
        'I-LOC': DocumentTag.TYPE_LOCATION
    }

    def prepare(self):
        self.disabled = self.document.type != self.document.TYPE_TEXT
        self.collector = DocumentTagCollector(self.document, self.ORIGIN)

    def on_text(self, text):
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        try:
            hint_language_code = None
            if len(self.document.languages) == 1:
                hint_language_code = self.document.languages[0]
            text = Text(text, hint_language_code=hint_language_code)
            for entity in text.entities:
                if entity.tag == 'I-LOC' or len(entity) == 1:
                    continue
                label = ' '.join(entity)
                if len(label) < 4 or len(label) > 200:
                    continue
                self.collector.emit(label, self.TYPES.get(entity.tag))
        except ValueError as ve:
            log.info('NER value error: %r', ve)
        except Exception as ex:
            log.warning('NER failed: %r', ex)

    def finalize(self):
        log.info('Polyglot extracted %s entities.', len(self.collector))
        self.collector.save()

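# For reference, the polyglot API these analyzers depend on can be
# exercised standalone. The sample text and language hint are arbitrary;
# the Text/entities behavior shown is polyglot's documented API.
from polyglot.text import Text

# NER models must be downloaded per language first, e.g.:
#   polyglot download embeddings2.en ner2.en
text = Text("Barack Obama visited Siemens in Berlin.",
            hint_language_code='en')
for entity in text.entities:
    # Each entity is a chunk of tokens carrying an IOB-style tag such as
    # 'I-PER', 'I-ORG' or 'I-LOC'; joining the tokens yields the label.
    print(entity.tag, ' '.join(entity))
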
def analyze(self, document):
    text = match_form(document.text)
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    self.cache.generate()
    if self.cache.automaton.kind != EMPTY:
        text = text.encode('utf-8')
        for match in self.cache.automaton.iter(text):
            for (value, tag) in match[1]:
                collector.emit(value, tag)
    log.info('Aho-Corasick extracted %s entities.', len(collector))
    collector.save()

def extract_document_tags(document):
    if document.status != Document.STATUS_SUCCESS:
        return
    load_places()
    log.info("Tagging [%s]: %s", document.id, document.name)
    languages = list(document.languages)
    if not len(languages):
        languages = [settings.DEFAULT_LANGUAGE]
    aggregator = EntityAggregator()
    for text in document.texts:
        aggregator.extract(text, languages)
    DocumentTagCollector(document, 'polyglot').save()
    DocumentTagCollector(document, 'spacy').save()
    collector = DocumentTagCollector(document, 'ner')
    for (label, category, weight) in aggregator.entities:
        collector.emit(label, category, weight=weight)
    log.info("Extracted tags: %s", len(collector))
    collector.save()
    db.session.add(document)
    db.session.commit()

def analyze(self, document):
    collector = DocumentTagCollector(document, 'corasick')
    if self.automaton is None:
        return
    for text in document.texts:
        if len(text) <= self.MIN_LENGTH:
            continue
        text = self.match_form(text)
        if text is None:
            continue
        for match in self.automaton.iter(text):
            for (match_text, tag) in match[1]:
                collector.emit(match_text, tag)
    if len(collector):
        log.info('Aho-Corasick extracted %s entities.', len(collector))
    collector.save()

def analyze(self, document):
    if document.schema in self.IGNORED:
        return
    collector = DocumentTagCollector(document, self.ORIGIN)
    try:
        languages = set(document.languages)
        if len(self.languages):
            languages = languages.intersection(self.languages)
        if not len(languages):
            languages = [settings.DEFAULT_LANGUAGE]
        for text in document.texts:
            if len(text) <= self.MIN_LENGTH:
                continue
            for label, tag in self.tag_text(text, languages):
                # log.info("Entity [%s]: %s", document.id, label)
                collector.emit(label, self.TYPES[tag])
    except ValueError as ve:
        log.warning('NER value error: %r', ve)
    collector.save()
    if len(collector):
        log.info('Polyglot extracted %s entities.', len(collector))

def prepare(self):
    self.disabled = not get_config('REGEX_ENTITIES', True)
    if not self.disabled:
        self.cache.generate()
    self.collector = DocumentTagCollector(self.document, self.ORIGIN)

def prepare(self):
    # TODO: re-think this.
    self.disabled = self.document.type == self.document.TYPE_TABULAR
    self.collector = DocumentTagCollector(self.document, self.ORIGIN)
    self.regex = re.compile(self.REGEX, self.FLAG)

def prepare(self):
    self.disabled = self.document.type != self.document.TYPE_TEXT
    self.collector = DocumentTagCollector(self.document, self.ORIGIN)