def analyze(self, document, meta):
    begin_time = time()
    self.cache.generate()
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        if text is None or len(text) <= 2:
            continue
        for rex in self.cache.regexes:
            for match in rex.finditer(text):
                match = match.group(2)
                # match = match.group(1)
                for entity_id in self.cache.matches.get(match, []):
                    entities[entity_id] += 1

    Reference.delete_document(document.id, origin=self.origin)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)

    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Regex tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Regex found no entities on %r (%sms)",
                 document, duration_time)
def _generate(self):
    latest = Entity.latest()
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest

    self.matches = defaultdict(set)
    q = Entity.all()
    q = q.options(joinedload('other_names'))
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            self.matches[normalize_strong(term)].add(entity.id)

    self.regexes = []
    terms = self.matches.keys()
    terms = [t for t in terms if len(t) > 2]
    for i in count(0):
        terms_slice = terms[i * BATCH_SIZE:(i + 1) * BATCH_SIZE]
        if not len(terms_slice):
            break
        body = '|'.join(terms_slice)
        rex = re.compile('( |^)(%s)( |$)' % body)
        # rex = re.compile('(%s)' % body)
        self.regexes.append(rex)

    log.info('Generating entity tagger: %r (%s terms)',
             latest, len(terms))
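A hedged aside, not part of the cache code itself: the batched patterns wrap the alternation in ( |^) and ( |$) so that only whole, space-delimited terms can match, which is why the matching code reads match.group(2) rather than group(1) or group(0). The snippet below is a standalone illustration with made-up terms:

import re

terms_slice = ['al qaeda', 'osama bin laden']
rex = re.compile('( |^)(%s)( |$)' % '|'.join(terms_slice))

for match in rex.finditer('report about al qaeda financing'):
    # group(1) is the leading space or start anchor, group(2) the matched
    # term; group(2) is the key used to look up entity ids in the matches dict.
    print(match.group(2))  # -> 'al qaeda'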
def on_text(self, text):
    text = normalize_strong(text)
    if text is None or len(text) <= 2:
        return
    for rex in self.cache.regexes:
        for match in rex.finditer(text):
            match = match.group(2)
            for entity_id in self.cache.matches.get(match, []):
                self.entities[entity_id] += 1
def on_text(self, text):
    if self.cache.automaton is None:
        return
    text = normalize_strong(text)
    if text is None or len(text) <= 2:
        return
    text = text.encode('utf-8')
    for match in self.cache.automaton.iter(text):
        for entity_id in match[1]:
            self.entities[entity_id] += 1
def on_text(self, text):
    if self.cache.automaton is None:
        return
    text = normalize_strong(text)
    if text is None or len(text) <= 2:
        return
    text = ' %s ' % text.encode('utf-8')
    for match in self.cache.automaton.iter(text):
        for entity_id in match[1]:
            self.entities[entity_id] += 1
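The construction of cache.automaton is not shown here. Below is a minimal sketch of how such a cache could be built with the pyahocorasick library, assuming it keeps the same term-to-entity-ids mapping that the regex cache's _generate builds; the padding mirrors the ' %s ' framing in on_text above, and the Python 2 byte-string handling is left out. This is an assumption about the construction, not the actual cache code.

import ahocorasick

def build_automaton(matches):
    # matches: dict mapping a normalized term to a set of entity ids
    # (an assumed stand-in for the cache's matches attribute).
    automaton = ahocorasick.Automaton()
    for term, entity_ids in matches.items():
        # Pad each term the same way on_text() pads the text, so only
        # whole, space-delimited terms can produce a hit.
        automaton.add_word(' %s ' % term, entity_ids)
    automaton.make_automaton()
    return automaton

# automaton.iter(padded_text) then yields (end_index, entity_ids) tuples,
# which matches the match[1] access in on_text().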
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
def regex_terms(self):
    # This is to find the shortest possible regex for each entity.
    # If, for example, an entity matches both "Al Qaeda" and
    # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
    # search for the latter.
    terms = set([normalize_strong(t) for t in self.terms])
    regex_terms = set()
    for term in terms:
        if term is None or len(term) < 4 or len(term) > 120:
            continue
        contained = False
        for other in terms:
            if other is None or other == term:
                continue
            if other in term:
                contained = True
        if not contained:
            regex_terms.add(term)
    return regex_terms
def regex_terms(self):
    # This is to find the shortest possible regex for each entity.
    # If, for example, an entity matches both "Al Qaeda" and
    # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
    # search for the latter.
    terms = [' %s ' % normalize_strong(t) for t in self.terms]
    regex_terms = set()
    for term in terms:
        if len(term) < 5:
            continue
        contained = False
        for other in terms:
            if other == term:
                continue
            if other in term:
                contained = True
        if not contained:
            regex_terms.add(term.strip())
    return regex_terms
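As a quick standalone check of the pruning rule (hypothetical terms, with lower-casing standing in for normalize_strong): the longer term contains the shorter, padded one, so it is marked as contained and dropped, and only the shorter, more general term survives. The comprehension below reproduces the containment logic purely for illustration:

terms = [' al qaeda ', ' al qaeda in iraq, syria and the levant ']
kept = [t.strip() for t in terms
        if not any(o != t and o in t for o in terms)]
print(kept)  # -> ['al qaeda']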
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)

    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)