예제 #1
0
    def analyze(self, document, meta):
        """Tag ``document`` with entities matched by the cached regexes.

        Each text part of the document is passed through
        ``normalize_strong`` and scanned with every pattern in
        ``self.cache.regexes``. The second capture group of each hit is
        looked up in ``self.cache.matches`` and every entity ID listed there
        has its counter incremented. Afterwards
        ``Reference.delete_document`` is called for this document and
        origin, one ``Reference`` per counted entity is added to the
        session (weight = hit count) and ``self.save`` is invoked.

        :param document: object providing ``text_parts()`` and ``id``.
        :param meta: handed unchanged to ``self.save``.
        """
        started_at = time()
        self.cache.generate()
        hit_counts = defaultdict(int)
        for raw_text, _record in document.text_parts():
            normalized = normalize_strong(raw_text)
            # Ignore parts that normalize to None or to two characters
            # or fewer.
            if normalized is None or len(normalized) <= 2:
                continue
            for pattern in self.cache.regexes:
                for hit in pattern.finditer(normalized):
                    # Group 2 is the lookup key into the cache's match
                    # table (presumably groups 1/3 are the surrounding
                    # delimiters -- confirm against the cache's patterns).
                    term = hit.group(2)
                    for entity_id in self.cache.matches.get(term, []):
                        hit_counts[entity_id] += 1

        # Called before adding new rows; presumably clears this origin's
        # earlier references for the document.
        Reference.delete_document(document.id, origin=self.origin)
        for entity_id, hits in hit_counts.items():
            reference = Reference()
            reference.document_id = document.id
            reference.entity_id = entity_id
            reference.origin = self.origin
            reference.weight = hits
            db.session.add(reference)
        self.save(document, meta)

        elapsed_ms = int((time() - started_at) * 1000)
        if not hit_counts:
            log.info("Regex found no entities on %r (%sms)",
                     document, elapsed_ms)
            return
        log.info("Regex tagged %r with %d entities (%sms)",
                 document, len(hit_counts), elapsed_ms)
예제 #2
0
    def analyze(self, document, meta):
        """Run the cached entity regexes over ``document`` and store hits.

        For every text part yielded by ``document.text_parts()`` the text
        is normalized with ``normalize_strong``; parts that come back as
        ``None`` or with a length of two or less are skipped. Each pattern
        in ``self.cache.regexes`` is applied, and the second capture group
        of each match is used as a key into ``self.cache.matches`` to find
        the entity IDs to count.

        ``Reference.delete_document`` is then called for this document and
        origin, a ``Reference`` weighted by the hit count is added to the
        session for each entity, and ``self.save(document, meta)`` is
        called. The elapsed time in milliseconds is logged.

        :param document: object providing ``text_parts()`` and ``id``.
        :param meta: handed unchanged to ``self.save``.
        """
        t0 = time()
        self.cache.generate()
        weights = defaultdict(int)
        for part, _rec in document.text_parts():
            part = normalize_strong(part)
            if part is None or len(part) <= 2:
                continue
            for regex in self.cache.regexes:
                for found in regex.finditer(part):
                    key = found.group(2)
                    for entity_id in self.cache.matches.get(key, []):
                        weights[entity_id] += 1

        Reference.delete_document(document.id, origin=self.origin)
        for entity_id, weight in weights.items():
            entry = Reference()
            entry.document_id = document.id
            entry.entity_id = entity_id
            entry.origin = self.origin
            entry.weight = weight
            db.session.add(entry)
        self.save(document, meta)

        took_ms = int((time() - t0) * 1000)
        tagged = len(weights)
        if tagged:
            log.info("Regex tagged %r with %d entities (%sms)", document,
                     tagged, took_ms)
        else:
            log.info("Regex found no entities on %r (%sms)", document,
                     took_ms)
예제 #3
0
 def finalize(self):
     """Write the collected entity counts out as references.

     Calls ``Reference.delete_document`` for the current document and this
     analyzer's origin, then adds one ``Reference`` to the session for each
     ``entity_id -> weight`` pair in ``self.entities`` and logs how many
     entities there were.
     """
     Reference.delete_document(self.document.id, origin=self.origin)
     for ent_id, score in self.entities.items():
         link = Reference()
         link.document_id = self.document.id
         link.entity_id = ent_id
         link.origin = self.origin
         link.weight = score
         db.session.add(link)
     total = len(self.entities)
     # NOTE(review): 'extraced' is a typo in the emitted log text; it is
     # kept as-is because the message is runtime output.
     log.info('Regex extraced %s entities.', total)
예제 #4
0
 def finalize(self):
     """Store one reference per collected entity for the document.

     Calls ``self.document.delete_references`` for this analyzer's origin
     first, then adds a ``Reference`` to the session for every
     ``entity_id -> weight`` pair in ``self.entities``. The entity count
     is logged at the end.
     """
     self.document.delete_references(origin=self.origin)
     for ent_id, score in self.entities.items():
         link = Reference()
         link.document_id = self.document.id
         link.entity_id = ent_id
         link.origin = self.origin
         link.weight = score
         db.session.add(link)
     found = len(self.entities)
     # NOTE(review): 'extraced' is a typo in the emitted log text; it is
     # kept as-is because the message is runtime output.
     log.info('Aho Corasick extraced %s entities.', found)
예제 #5
0
    def finalize(self):
        """Link every collected company entity to the document.

        Returns immediately when ``self.disabled`` is truthy. Otherwise
        ``self.document.delete_references`` is called for this origin, and
        for each ``(regno, name, full)`` triple in ``self.entities`` the
        entity ID is obtained from ``self.load_entity`` and a ``Reference``
        with a fixed weight of 1 is added to the session.
        """
        if self.disabled:
            return

        self.document.delete_references(origin=self.origin)
        for reg_number, label, full in self.entities:
            # Resolve the entity before building the reference row.
            resolved_id = self.load_entity(reg_number, label, full)
            link = Reference()
            link.document_id = self.document.id
            link.entity_id = resolved_id
            link.origin = self.origin
            link.weight = 1
            db.session.add(link)
        found = len(self.entities)
        # NOTE(review): 'extraced' is a typo in the emitted log text; it is
        # kept as-is because the message is runtime output.
        log.info('za_companies extraced %s entities.', found)
예제 #6
0
 def finalize(self):
     """Turn the collected schema votes into document references.

     Calls ``self.document.delete_references`` for this origin. Then, for
     each key in ``self.entity_schemata``, the most frequently listed
     schema is chosen, the entity is loaded via ``self.load_entity`` using
     the name stored in ``self.entity_names`` (``None`` if absent), and,
     unless the entity has a ``deleted_at`` value, a ``Reference`` is
     added to the session weighted by the number of schema votes.
     """
     self.document.delete_references(origin=self.origin)
     for key, votes in self.entity_schemata.items():
         # Most common schema among the votes; ties are resolved by the
         # iteration order of the temporary set.
         winner = max(set(votes), key=votes.count)
         loaded = self.load_entity(key, self.entity_names.get(key), winner)
         if loaded.deleted_at is None:
             link = Reference()
             link.document_id = self.document.id
             link.entity_id = loaded.id
             link.origin = self.origin
             link.weight = len(votes)
             db.session.add(link)
     total = len(self.entity_schemata)
     log.info('Polyglot extracted %s entities.', total)
예제 #7
0
파일: __init__.py 프로젝트: andkamau/aleph
def generate_entity_references(entity):
    """Rebuild the 'regex' references of ``entity`` from scratch.

    This is admittedly hacky: it effectively re-implements the
    RegexEntityAnalyzer for a single entity. The alternative (searching
    for potentially matching documents, re-analyzing and re-indexing them)
    proved to be too slow in reality.

    Nothing happens unless ``entity.state`` equals
    ``Entity.STATE_ACTIVE``. Otherwise the texts yielded by
    ``scan_entity_mentions`` are normalized and matched against a pattern
    built from ``entity.regex_terms``; matches are counted per document
    ID. Existing ``Reference`` rows with origin ``'regex'`` for the entity
    are deleted, new ones are added for each counted document that
    ``Document.by_id`` can find, the session is committed, and
    ``delete_entity_references`` / ``update_entity_references`` are called
    with the entity ID.
    """
    if entity.state != Entity.STATE_ACTIVE:
        return

    log.info("Updating document references: %r", entity)
    alternation = '|'.join(entity.regex_terms)
    pattern = re.compile('( |^)(%s)( |$)' % alternation)

    hits_by_document = defaultdict(int)
    try:
        for document_id, raw in scan_entity_mentions(entity):
            cleaned = normalize_strong(raw)
            if cleaned is None or len(cleaned) <= 2:
                continue
            for _hit in pattern.finditer(cleaned):
                hits_by_document[document_id] += 1
    except Exception:
        # Best effort: the failure is logged and whatever was counted up
        # to this point is still written below.
        log.exception('Failed to fully scan documents for entity refresh.')

    stale = db.session.query(Reference).filter(
        Reference.entity_id == entity.id
    ).filter(
        Reference.origin == 'regex'
    )
    stale.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(hits_by_document))

    for document_id, hits in hits_by_document.items():
        # Skip IDs for which no Document can be loaded.
        if Document.by_id(document_id) is None:
            continue
        link = Reference()
        link.document_id = document_id
        link.entity_id = entity.id
        link.origin = 'regex'
        link.weight = hits
        db.session.add(link)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
예제 #8
0
 def finalize(self):
     """Link every collected person entity to the document.

     Returns immediately when ``self.disabled`` is truthy. Otherwise
     ``self.document.delete_references`` is called for this origin, and
     for each ``(id, name, full)`` triple in ``self.entities`` the entity
     ID is obtained from ``self.load_entity`` and a ``Reference`` with a
     fixed weight of 1 is added to the session. Progress is logged at
     debug level, the final count at info level.
     """
     if self.disabled:
         return
     log.debug("%s deleting old refs for document %d", self,
               self.document.id)
     self.document.delete_references(origin=self.origin)
     for ident, label, full in self.entities:
         log.debug("%s Linking %s to document %d", self, ident,
                   self.document.id)
         resolved_id = self.load_entity(ident, label, full)
         link = Reference()
         link.document_id = self.document.id
         link.entity_id = resolved_id
         link.origin = self.origin
         link.weight = 1
         db.session.add(link)
     found = len(self.entities)
     # NOTE(review): 'extraced' is a typo in the emitted log text; it is
     # kept as-is because the message is runtime output.
     log.info('za_persons extraced %s entities.', found)
예제 #9
0
def generate_entity_references(entity):
    """Regenerate all 'regex' references pointing at ``entity``.

    This is a bit of a hack: the work of the RegexEntityAnalyzer is
    effectively re-implemented here for one entity, because searching for
    candidate documents, re-analyzing and re-indexing them proved to be
    too slow in reality.

    Entities whose state is not ``Entity.STATE_ACTIVE`` are ignored. For
    active ones, each text from ``scan_entity_mentions`` is normalized and
    the number of pattern matches is accumulated per document ID. The
    entity's existing ``'regex'`` references are deleted, fresh ones are
    added for documents that ``Document.by_id`` returns, the session is
    committed and ``delete_entity_references`` /
    ``update_entity_references`` are called with the entity ID.
    """
    if entity.state != Entity.STATE_ACTIVE:
        return

    log.info("Updating document references: %r", entity)
    joined_terms = '|'.join(entity.regex_terms)
    matcher = re.compile('( |^)(%s)( |$)' % joined_terms)

    match_counts = defaultdict(int)
    try:
        for document_id, body in scan_entity_mentions(entity):
            body = normalize_strong(body)
            if body is None or len(body) <= 2:
                continue
            occurrences = sum(1 for _ in matcher.finditer(body))
            # Only touch the mapping when there was a hit, so documents
            # without matches never appear as keys.
            if occurrences:
                match_counts[document_id] += occurrences
    except Exception:
        # Deliberately non-fatal: log and continue with partial counts.
        log.exception('Failed to fully scan documents for entity refresh.')

    old_refs = db.session.query(Reference)
    old_refs = old_refs.filter(Reference.entity_id == entity.id)
    old_refs = old_refs.filter(Reference.origin == 'regex')
    old_refs.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(match_counts))

    for document_id, count in match_counts.items():
        document = Document.by_id(document_id)
        if document is None:
            continue
        new_ref = Reference()
        new_ref.document_id = document_id
        new_ref.entity_id = entity.id
        new_ref.origin = 'regex'
        new_ref.weight = count
        db.session.add(new_ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)
예제 #10
0
    def finalize(self):
        """Persist the collected entities as document references.

        For every name in ``self.entities`` the most frequently listed
        schema is picked and the number of listed schemas becomes the
        weight. ``Reference.delete_document`` is then called for the
        document and this origin, and for each entry ``self.load_entity``
        is asked for an entity ID; entries for which it returns ``None``
        are skipped, the others get a ``Reference`` added to the session.
        """
        ranked = [
            (label, len(votes), max(set(votes), key=votes.count))
            for label, votes in self.entities.items()
        ]

        Reference.delete_document(self.document.id, origin=self.origin)
        for label, score, winner in ranked:
            resolved_id = self.load_entity(label, winner)
            if resolved_id is not None:
                link = Reference()
                link.document_id = self.document.id
                link.entity_id = resolved_id
                link.origin = self.origin
                link.weight = score
                db.session.add(link)
        # NOTE(review): 'extraced' is a typo in the emitted log text; it is
        # kept as-is because the message is runtime output.
        log.info('Polyglot extraced %s entities.', len(ranked))
예제 #11
0
    def finalize(self):
        """Persist the collected entities as document references.

        Each name in ``self.entities`` is reduced to a
        ``(name, weight, schema)`` triple, where the schema is the one
        listed most often and the weight is the number of listed schemas.
        After ``self.document.delete_references`` has been called for this
        origin, ``self.load_entity`` is asked for an entity ID per triple;
        a ``None`` result skips the triple, otherwise a ``Reference`` is
        added to the session.
        """
        candidates = []
        for label, votes in self.entities.items():
            top_schema = max(set(votes), key=votes.count)
            candidates.append((label, len(votes), top_schema))

        self.document.delete_references(origin=self.origin)
        for label, score, top_schema in candidates:
            resolved_id = self.load_entity(label, top_schema)
            if resolved_id is None:
                # No entity ID was returned for this name/schema pair.
                continue
            link = Reference()
            link.document_id = self.document.id
            link.entity_id = resolved_id
            link.origin = self.origin
            link.weight = score
            db.session.add(link)
        stored = len(candidates)
        # NOTE(review): 'extraced' is a typo in the emitted log text; it is
        # kept as-is because the message is runtime output.
        log.info('Polyglot extraced %s entities.', stored)
예제 #12
0
def generate_entity_references(entity):
    """Rebuild the 'regex' references of ``entity`` and re-index documents.

    Returns immediately unless ``entity.state`` equals
    ``Entity.STATE_ACTIVE``. Otherwise texts yielded by
    ``scan_entity_mentions`` are normalized and matched against a pattern
    built from ``entity.regex_terms``, counting matches per document ID.
    The entity's existing ``'regex'`` references are deleted, new ones are
    added for every counted document that ``Document.by_id`` finds, and
    the session is committed. Finally ``delete_entity_references`` is
    called and each distinct document ID referenced by the entity is
    passed to ``index_document`` with ``index_records=False``.
    """
    if entity.state != Entity.STATE_ACTIVE:
        return

    pattern = re.compile('( |^)(%s)( |$)' % '|'.join(entity.regex_terms))

    hits_by_document = defaultdict(int)
    try:
        for document_id, raw in scan_entity_mentions(entity):
            cleaned = normalize_strong(raw)
            if cleaned is None or len(cleaned) <= 2:
                continue
            for _hit in pattern.finditer(cleaned):
                hits_by_document[document_id] += 1
    except Exception:
        # Best effort: the failure is logged and the partial counts are
        # still written below.
        log.exception('Failed to fully scan documents for entity refresh.')

    stale = db.session.query(Reference).filter(
        Reference.entity_id == entity.id
    ).filter(
        Reference.origin == 'regex'
    )
    stale.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(hits_by_document))

    for document_id, hits in hits_by_document.items():
        # Skip IDs for which no Document can be loaded.
        if Document.by_id(document_id) is None:
            continue
        link = Reference()
        link.document_id = document_id
        link.entity_id = entity.id
        link.origin = 'regex'
        link.weight = hits
        db.session.add(link)

    db.session.commit()
    delete_entity_references(entity.id)

    referenced = db.session.query(
        func.distinct(Reference.document_id)
    ).filter(Reference.entity_id == entity.id)
    # Each result row is a one-element tuple holding the document ID.
    for (document_id,) in referenced:
        index_document(document_id, index_records=False)
예제 #13
0
파일: entities.py 프로젝트: wcyn/aleph
def generate_entity_references(entity):
    """Regenerate all 'regex' references pointing at ``entity``.

    This is a bit of a hack: the work of the RegexEntityAnalyzer is
    effectively re-implemented here for one entity, because searching for
    candidate documents, re-analyzing and re-indexing them proved to be
    too slow in reality.

    If ``entity.state`` is not ``Entity.STATE_ACTIVE`` the entity's
    ``'regex'`` references are deleted and nothing else happens. Otherwise
    each text from ``scan_entity_mentions`` is passed through
    ``match_form`` and matched against a pattern built from
    ``entity.regex_terms``; matches are counted per document ID. The old
    ``'regex'`` references are deleted, new ones are added for documents
    that ``Document.by_id`` returns, the session is committed, and
    ``delete_entity_references`` / ``update_entity_references`` are
    called.
    """
    if entity.state != Entity.STATE_ACTIVE:
        entity.delete_references(origin='regex')
        return

    log.info("Updating document references: %r", entity)
    matcher = re.compile('(%s)' % '|'.join(entity.regex_terms))

    match_counts = defaultdict(int)
    try:
        for document_id, body in scan_entity_mentions(entity):
            body = match_form(body)
            if body is None or len(body) <= 2:
                continue
            for _hit in matcher.finditer(body):
                match_counts[document_id] += 1
    except Exception as failure:
        # Deliberately non-fatal: log and continue with partial counts.
        log.exception(failure)

    log.info("Re-matching %r gave %r documents.", entity, len(match_counts))

    entity.delete_references(origin='regex')
    for document_id, count in match_counts.items():
        document = Document.by_id(document_id)
        if document is None:
            continue
        new_ref = Reference()
        # The ID is taken from the loaded document, not from the scan key.
        new_ref.document_id = document.id
        new_ref.entity_id = entity.id
        new_ref.origin = 'regex'
        new_ref.weight = count
        db.session.add(new_ref)

    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity)
예제 #14
0
def generate_entity_references(entity):
    """Recompute the 'regex' references of ``entity``, then re-index.

    Entities whose state is not ``Entity.STATE_ACTIVE`` are ignored. For
    active ones, each text from ``scan_entity_mentions`` is normalized and
    the number of pattern matches is accumulated per document ID. The
    entity's existing ``'regex'`` references are deleted, fresh ones are
    added for documents that ``Document.by_id`` returns, and the session
    is committed. After ``delete_entity_references`` has run, every
    distinct document ID referenced by the entity is handed to
    ``index_document`` with ``index_records=False``.
    """
    if entity.state != Entity.STATE_ACTIVE:
        return

    joined_terms = '|'.join(entity.regex_terms)
    matcher = re.compile('( |^)(%s)( |$)' % joined_terms)

    match_counts = defaultdict(int)
    try:
        for document_id, body in scan_entity_mentions(entity):
            body = normalize_strong(body)
            if body is None or len(body) <= 2:
                continue
            occurrences = sum(1 for _ in matcher.finditer(body))
            # Only touch the mapping when there was a hit, so documents
            # without matches never appear as keys.
            if occurrences:
                match_counts[document_id] += occurrences
    except Exception:
        # Deliberately non-fatal: log and continue with partial counts.
        log.exception('Failed to fully scan documents for entity refresh.')

    old_refs = db.session.query(Reference)
    old_refs = old_refs.filter(Reference.entity_id == entity.id)
    old_refs = old_refs.filter(Reference.origin == 'regex')
    old_refs.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(match_counts))

    for document_id, count in match_counts.items():
        document = Document.by_id(document_id)
        if document is None:
            continue
        new_ref = Reference()
        new_ref.document_id = document_id
        new_ref.entity_id = entity.id
        new_ref.origin = 'regex'
        new_ref.weight = count
        db.session.add(new_ref)

    db.session.commit()
    delete_entity_references(entity.id)

    to_index = db.session.query(func.distinct(Reference.document_id))
    to_index = to_index.filter(Reference.entity_id == entity.id)
    # Each result row is a one-element tuple holding the document ID.
    for (document_id,) in to_index:
        index_document(document_id, index_records=False)