Example #1
def on_text(self, text):
    # Nothing to scan for: no terms have been loaded into the automaton.
    if self.cache.automaton.kind == EMPTY:
        return
    # Normalize the text the same way the search terms were normalized.
    text = match_form(text)
    if text is None or len(text) <= 2:
        return
    text = text.encode('utf-8')
    # Aho-Corasick scan: each match payload holds the ids of all
    # entities that share the matched term.
    for match in self.cache.automaton.iter(text):
        for entity_id in match[1]:
            self.entities[entity_id] += 1
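
The EMPTY kind constant and the iter() scan correspond to the pyahocorasick API, where Automaton.iter() yields (end_index, value) tuples, so match[1] is the payload stored for the matched term. Below is a minimal sketch of how such an automaton cache might be built, assuming current pyahocorasick with str keys (the encode('utf-8') above points to a bytes-based, Python 2-era build); the term_to_entity_ids mapping is made up for illustration:

    import ahocorasick  # pip install pyahocorasick

    def build_automaton(term_to_entity_ids):
        # term_to_entity_ids: e.g. {"al qaeda": {"entity-1", "entity-2"}}
        automaton = ahocorasick.Automaton()
        for term, entity_ids in term_to_entity_ids.items():
            # The payload becomes match[1] in on_text() above.
            automaton.add_word(term, entity_ids)
        if len(automaton):
            automaton.make_automaton()  # kind goes from TRIE to AHOCORASICK
        return automaton

    automaton = build_automaton({"al qaeda": {"entity-1"}})
    if automaton.kind != ahocorasick.EMPTY:
        for end_index, entity_ids in automaton.iter("report on al qaeda funding"):
            print(end_index, entity_ids)  # 17 {'entity-1'}
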
Example #2
def on_text(self, text):
    if self.cache.automaton.kind == EMPTY:
        return
    # Check the length both before and after normalization, since
    # match_form() may shorten or reject the text entirely.
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    text = match_form(text)
    if text is None or len(text) <= self.MIN_LENGTH:
        return
    text = text.encode('utf-8')
    for match in self.cache.automaton.iter(text):
        # Each payload is a set of (normalized term, entity type) pairs;
        # renamed to avoid shadowing text and the type() builtin.
        for (match_text, match_type) in match[1]:
            self.collector.emit(match_text, match_type)
Example #3
def regex_terms(self):
    # This is to find the shortest possible regex for each entity.
    # If, for example, an entity matches both "Al Qaeda" and
    # "Al Qaeda in Iraq, Syria and the Levant", it is useless to
    # search for the latter: every match of the longer term is also
    # a match of the shorter one.
    terms = set([match_form(t) for t in self.terms])
    regex_terms = set()
    for term in terms:
        # Skip terms too short to be distinctive or too long to be
        # worth putting into a regex alternation.
        if term is None or len(term) < 4 or len(term) > 120:
            continue
        contained = False
        for other in terms:
            if other is None or other == term:
                continue
            if other in term:
                # Another term is a substring of this one, making
                # this one redundant.
                contained = True
                break
        if not contained:
            regex_terms.add(term)
    return regex_terms
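
A quick standalone run of that pruning logic, using a trivial lower-casing stand-in for match_form (an assumption; aleph's real normalizer does considerably more):

    def match_form(text):
        # Hypothetical stand-in for aleph's normalizer.
        return text.lower() if text else None

    terms = {"Al Qaeda", "Al Qaeda in Iraq, Syria and the Levant", "AQ"}
    normalized = {match_form(t) for t in terms}
    kept = set()
    for term in normalized:
        if term is None or len(term) < 4 or len(term) > 120:
            continue  # "aq" falls out here as too short
        if not any(o and o != term and o in term for o in normalized):
            kept.add(term)
    print(kept)  # {'al qaeda'} -- the longer variant is pruned as redundant
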
Example #4
File: entities.py Project: wcyn/aleph
def generate_entity_references(entity):
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    # Escape each term so that regex metacharacters in entity names
    # cannot break or widen the pattern.
    rex = '|'.join(re.escape(t) for t in entity.regex_terms)
    rex = re.compile('(%s)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = match_form(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception as ex:
        log.exception(ex)

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    entity.delete_references(origin='regex')
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = doc.id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    # Propagate the regenerated references to the search index as well.
    delete_entity_references(entity.id)
    update_entity_references(entity)
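
The counting core of that function, reduced to a standalone sketch (the mentions list here stands in for whatever scan_entity_mentions(entity) yields):

    import re
    from collections import defaultdict

    terms = ["al qaeda", "aq"]
    rex = re.compile('(%s)' % '|'.join(re.escape(t) for t in terms))

    # (document_id, text) pairs; a stand-in for scan_entity_mentions(entity).
    mentions = [(1, "al qaeda operations"), (1, "more on al qaeda"), (2, "unrelated")]

    documents = defaultdict(int)
    for document_id, text in mentions:
        for _ in rex.finditer(text):
            documents[document_id] += 1
    print(dict(documents))  # {1: 2} -- document 2 never enters the dict
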
Example #5
    def analyze(self, document):
        text = document.text
        if text is None or len(text) <= self.MIN_LENGTH:
            return
        # Check the length both before and after normalization.
        text = match_form(text)
        if text is None or len(text) <= self.MIN_LENGTH:
            return

        # Build (or refresh) the Aho-Corasick automaton for the
        # current set of terms.
        self.cache.generate()
        if self.cache.automaton.kind == EMPTY:
            return

        text = text.encode('utf-8')
        collector = DocumentTagCollector(document, self.ORIGIN)
        for match in self.cache.automaton.iter(text):
            for (match_text, match_type) in match[1]:
                collector.emit(match_text, match_type)

        log.info('Aho Corasick extracted %s entities.', len(collector))
        collector.save()