def on_text(self, text):
    # An empty automaton has no terms to match against.
    if self.cache.automaton.kind == EMPTY:
        return
    text = match_form(text)
    if text is None or len(text) <= 2:
        return
    # The automaton was built over UTF-8 bytes, so match on bytes.
    text = text.encode('utf-8')
    for match in self.cache.automaton.iter(text):
        # match is (end_offset, value); the value holds the entity IDs
        # associated with the matched term.
        for entity_id in match[1]:
            self.entities[entity_id] += 1
def on_text(self, text):
    if self.cache.automaton.kind == EMPTY:
        return
    # Check the raw text first to avoid normalising trivially short input.
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    text = match_form(text)
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    text = text.encode('utf-8')
    for match in self.cache.automaton.iter(text):
        # Avoid shadowing the outer `text` and the `type` builtin.
        for (match_text, match_type) in match[1]:
            self.collector.emit(match_text, match_type)
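# A minimal sketch of how the cached automaton above is presumably built,
# using the pyahocorasick library that the .kind / EMPTY / .iter() calls
# suggest. The value stored per term is a list of (text, type) pairs, which
# is what the loops above unpack from match[1]. The `entity_terms` mapping
# is a hypothetical stand-in for the project's real term cache. Note: a
# default Python 3 build of pyahocorasick stores unicode keys, so this
# sketch matches on str rather than UTF-8 encoded bytes.
import ahocorasick

entity_terms = {
    'al qaeda': [('Al Qaeda', 'organization')],
    'vladimir putin': [('Vladimir Putin', 'person')],
}

automaton = ahocorasick.Automaton()
for term, values in entity_terms.items():
    automaton.add_word(term, values)
automaton.make_automaton()

# kind is ahocorasick.EMPTY until words are added, then TRIE, and
# AHOCORASICK once make_automaton() has been called.
assert automaton.kind == ahocorasick.AHOCORASICK

# iter() yields an (end_offset, value) tuple for every occurrence:
for end, values in automaton.iter('reports on al qaeda financing'):
    for text, type_ in values:
        print(end, text, type_)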
def regex_terms(self):
    # Find the shortest possible matching term for each entity. If, for
    # example, an entity matches both "Al Qaeda" and "Al Qaeda in Iraq,
    # Syria and the Levant", it is useless to search for the latter:
    # every match of the longer term is already a match of the shorter.
    terms = set([match_form(t) for t in self.terms])
    regex_terms = set()
    for term in terms:
        if term is None or len(term) < 4 or len(term) > 120:
            continue
        contained = False
        for other in terms:
            if other is None or other == term:
                continue
            if other in term:
                contained = True
                break
        if not contained:
            regex_terms.add(term)
    return regex_terms
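# A small self-contained demonstration of the containment filter above.
# match_form is stubbed here as simple lowercasing; the real normaliser in
# the project does more. With these inputs the long form is dropped because
# "al qaeda" is contained in it, and "aqi" fails the len < 4 check.
def _demo_regex_terms(raw_terms, match_form=lambda t: t.lower()):
    terms = set(match_form(t) for t in raw_terms)
    keep = set()
    for term in terms:
        if term is None or len(term) < 4 or len(term) > 120:
            continue
        if not any(o and o != term and o in term for o in terms):
            keep.add(term)
    return keep

assert _demo_regex_terms([
    'Al Qaeda',
    'Al Qaeda in Iraq, Syria and the Levant',
    'AQI',
]) == {'al qaeda'}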
def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in practice.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return
    log.info("Updating document references: %r", entity)
    # Escape each term so that regex metacharacters in entity names
    # cannot break the combined pattern.
    rex = '|'.join([re.escape(t) for t in entity.regex_terms])
    rex = re.compile('(%s)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)
    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    # Drop the old references before re-creating them with fresh weights.
    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()

    # Propagate the changes to the search index.
    delete_entity_references(entity.id)
    update_entity_references(entity)
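# Why the re.escape above matters: a term containing regex metacharacters
# would otherwise corrupt the alternation pattern or raise a compile error.
# The terms below are made-up examples.
import re

terms = ['smith & wesson (holdings)', 'acme inc.']
pattern = re.compile('(%s)' % '|'.join(re.escape(t) for t in terms))
assert pattern.search('filed by acme inc. yesterday')
assert pattern.search('smith & wesson (holdings) ltd')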
def analyze(self, document):
    text = document.text
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    text = match_form(text)
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    # Make sure the automaton has been built before matching.
    self.cache.generate()
    if self.cache.automaton.kind == EMPTY:
        return
    text = text.encode('utf-8')
    collector = DocumentTagCollector(document, self.ORIGIN)
    for match in self.cache.automaton.iter(text):
        for (match_text, match_type) in match[1]:
            collector.emit(match_text, match_type)
    log.info('Aho-Corasick extracted %s entities.', len(collector))
    collector.save()