Example #1
def analyze_document(document):
    """Run analyzers (such as NER) on a given document."""
    log.info("Analyze document: %r", document)
    start = timer()

    # initialise the analyzers
    analyzers = []
    for cls in get_analyzers():
        analyzer = cls(document)
        analyzer.prepare()
        analyzers.append(analyzer)

    # run the analyzers on each fragment of text in the given
    # document (row cells or text pages).
    for text in document.text_parts():
        for analyzer in analyzers:
            if not analyzer.disabled:
                analyzer.on_text(text)

    # collect outputs.
    for analyzer in analyzers:
        if not analyzer.disabled:
            analyzer.finalize()
    db.session.add(document)
    db.session.commit()

    end = timer()
    log.info("Completed analysis: %r (elapsed: %.2fms)", document, end - start)

    # next: update the search index.
    index_document(document)
    index_records(document)
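
Note: these examples assume an analyzer interface roughly like the sketch
below. Only the hook names (prepare, on_text, on_page, on_record, finalize)
and the disabled flag appear in the calling code; everything else here is
illustrative.

class Analyzer(object):
    """Illustrative base class; hook names are taken from the calls in
    the surrounding examples, the rest is assumed."""

    disabled = False

    def __init__(self, document, meta=None):
        self.document = document
        self.meta = meta

    def prepare(self):
        # set up per-document state before text is streamed in
        pass

    def on_page(self, page):
        # one call per page of a text document (used by later variants)
        pass

    def on_record(self, record):
        # one call per row of a tabular document (used by later variants)
        pass

    def on_text(self, text):
        # one call per fragment of text (row cell or page text)
        pass

    def finalize(self):
        # persist whatever the analyzer has collected
        pass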
Example #2
def handle_exception(cls, meta, collection_id, exception):
    if isinstance(exception, SQLAlchemyError):
        log.exception(exception)
        return
    (error_type, error_message, error_details) = sys.exc_info()
    if error_type is not None:
        error_message = unicode(error_message)
        error_details = traceback.format_exc()
        log.info(error_details)
    else:
        error_message = unicode(exception)
    error_type = exception.__class__.__name__
    log.warn('Error [%s]: %s', error_type, error_message)
    try:
        db.session.rollback()
        db.session.close()
        document = cls.document_by_meta(collection_id, meta)
        document.type = Document.TYPE_OTHER
        document.status = Document.STATUS_FAIL
        document.error_type = error_type
        document.error_message = error_message
        document.error_details = error_details
        db.session.add(document)
        db.session.commit()
        index_document(document)
    except Exception as ex:
        log.exception(ex)
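
Note: handle_exception reads sys.exc_info(), so it only recovers a traceback
when called while an exception is still being handled. A usage sketch; the
Ingestor class name and the ingest() call are made up for illustration:

try:
    ingest(collection_id, meta)
except Exception as exception:
    # call inside the except block so sys.exc_info() still holds the
    # active exception and its traceback
    Ingestor.handle_exception(meta, collection_id, exception)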
Example #3
def delete_collection(collection_id=None):
    # Deleting a collection affects many associated objects and requires
    # checks, so this is done manually and in detail here.
    q = db.session.query(Collection).filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return

    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    deleted_at = datetime.utcnow()
    for entity in collection.entities:
        entity.collections = [
            c for c in entity.collections if c.id != collection.id
        ]
        db.session.add(entity)
        if not len(entity.collections):
            entity.delete(deleted_at=deleted_at)
        reindex_entity(entity)

    for document in collection.documents:
        document.collections = [
            c for c in document.collections if c.id != collection.id
        ]
        if not len(document.collections):
            document.delete(deleted_at=deleted_at)
            delete_document(document.id)
        else:
            if collection_id == document.source_collection_id:
                document.source_collection_id = None
            db.session.add(document)
            index_document(document)

    collection.delete(deleted_at=deleted_at)
    db.session.commit()
    index_delete(collection_id)
Example #4
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.apply_async([document.id],
                                    queue=USER_QUEUE,
                                    routing_key=USER_ROUTING_KEY)
    index_document(document, index_records=False)
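
Note: apply_async with an explicit queue and routing_key, as used here,
enqueues the same task as the plain delay call seen in the next examples;
the longer form only adds routing options, so that user-triggered work can
land on a dedicated queue. In Celery the two are equivalent apart from
routing:

# delay(*args) is shorthand for apply_async(args) without routing options
analyze_document_id.delay(document.id)
analyze_document_id.apply_async([document.id],
                                queue=USER_QUEUE,
                                routing_key=USER_ROUTING_KEY)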
Example #5
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.delay(document.id)
    index_document(document, index_records=False)
    with graph.transaction() as tx:
        graph.load_document(tx, document)
Example #6
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.apply_async([document.id], queue=USER_QUEUE,
                                    routing_key=USER_ROUTING_KEY)
    index_document(document, index_records=False)
    with graph.transaction() as tx:
        graph.load_document(tx, document)
Example #7
def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)
Example #8
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)
Example #9
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
Example #10
def index(foreign_id=None, immediate=False):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        print('indexing %s' % doc_id)
        if immediate:  # bypass the queue
            index_document(doc_id)
        else:
            index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
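
Note: immediate=True bypasses the task queue and indexes synchronously,
which is handy when debugging a single source. A usage sketch, with a
made-up foreign_id:

index(foreign_id='hr_companies', immediate=True)  # one source, synchronous
index()  # queue every document, then reindex entities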
Example #11
def analyze_document(document):
    log.info("Analyze document: %r", document)
    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        analyzer = cls(document, meta)
        analyzer.prepare()
        analyzers.append(analyzer)

    for text in document.text_parts():
        for analyzer in analyzers:
            analyzer.on_text(text)

    for analyzer in analyzers:
        analyzer.finalize()
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document)
Example #12
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity,
             len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
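
Note: regex_terms are interpolated into the pattern unescaped, so a term
containing regex metacharacters could distort the match or fail to compile.
A hedged variant of the pattern construction that escapes each term first:

# escape each term so metacharacters are matched literally; the
# surrounding groups keep the crude word-boundary logic of the original
rex = '|'.join(re.escape(term) for term in entity.regex_terms)
rex = re.compile('( |^)(%s)( |$)' % rex)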
Example #13
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        try:
            analyzer = cls(document, meta)
            analyzer.prepare()
            analyzers.append(analyzer)
        except Exception as ex:
            log.exception(ex)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as ex:
            log.exception(ex)
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)
Example #14
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.delay(document.id)
    index_document(document, index_records=False)