def analyze_document(document): """Run analyzers (such as NER) on a given document.""" log.info("Analyze document: %r", document) start = timer() # initialise the analyzers analyzers = [] for cls in get_analyzers(): analyzer = cls(document) analyzer.prepare() analyzers.append(analyzer) # run the analyzers on each fragment of text in the given # document (row cells or text pages). for text in document.text_parts(): for analyzer in analyzers: if not analyzer.disabled: analyzer.on_text(text) # collect outputs. for analyzer in analyzers: if not analyzer.disabled: analyzer.finalize() db.session.add(document) db.session.commit() end = timer() log.info("Completed analysis: %r (elapsed: %.2fms)", document, end - start) # next: update the search index. index_document(document) index_records(document)
def handle_exception(cls, meta, collection_id, exception):
    if isinstance(exception, SQLAlchemyError):
        log.exception(exception)
        return

    (error_type, error_message, error_details) = sys.exc_info()
    if error_type is not None:
        error_message = unicode(error_message)
        error_details = traceback.format_exc()
        log.info(error_details)
    else:
        error_message = unicode(exception)
        error_type = exception.__class__.__name__

    log.warn('Error [%s]: %s', error_type, error_message)
    try:
        db.session.rollback()
        db.session.close()
        document = cls.document_by_meta(collection_id, meta)
        document.type = Document.TYPE_OTHER
        document.status = Document.STATUS_FAIL
        document.error_type = error_type
        document.error_message = error_message
        document.error_details = error_details
        db.session.add(document)
        db.session.commit()
        index_document(document)
    except Exception as ex:
        log.exception(ex)
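# Illustrative call site for handle_exception() (the ingest step and class
# name below are assumptions, not from the source): a worker catches any
# failure and records it against the document identified by its metadata.
try:
    ingestor.ingest(meta, collection_id)      # hypothetical ingest step
except Exception as exc:
    Ingestor.handle_exception(meta, collection_id, exc)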
def delete_collection(collection_id=None):
    # Deleting a collection affects many associated objects and requires
    # checks, so this is done manually and in detail here.
    q = db.session.query(Collection).filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return

    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    deleted_at = datetime.utcnow()

    for entity in collection.entities:
        entity.collections = [c for c in entity.collections
                              if c.id != collection.id]
        db.session.add(entity)
        if not len(entity.collections):
            entity.delete(deleted_at=deleted_at)
        reindex_entity(entity)

    for document in collection.documents:
        document.collections = [c for c in document.collections
                                if c.id != collection.id]
        if not len(document.collections):
            document.delete(deleted_at=deleted_at)
            delete_document(document.id)
        else:
            if collection_id == document.source_collection_id:
                document.source_collection_id = None
            db.session.add(document)
            index_document(document)

    collection.delete(deleted_at=deleted_at)
    db.session.commit()
    index_delete(collection_id)
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.apply_async([document.id], queue=USER_QUEUE,
                                    routing_key=USER_ROUTING_KEY)
    index_document(document, index_records=False)
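# A minimal sketch of the Celery task and queue constants that the
# apply_async() call above assumes. The app name, queue name and routing
# key values here are illustrative assumptions, not taken from the source.
from celery import Celery

celery = Celery('aleph')

USER_QUEUE = 'user'                 # assumed queue for user-triggered work
USER_ROUTING_KEY = 'user.analyze'   # assumed routing key

@celery.task()
def analyze_document_id(document_id):
    # Load the document by its ID and run the analyzers over it.
    document = Document.by_id(document_id)
    if document is not None:
        analyze_document(document)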
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.delay(document.id)
    index_document(document, index_records=False)
    with graph.transaction() as tx:
        graph.load_document(tx, document)
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.apply_async([document.id], queue=USER_QUEUE,
                                    routing_key=USER_ROUTING_KEY)
    index_document(document, index_records=False)
    with graph.transaction() as tx:
        graph.load_document(tx, document)
def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)
def index(foreign_id=None, immediate=False):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        log.info("Indexing document: %s", doc_id)
        if immediate:
            # execute synchronously, bypassing the task queue
            index_document(doc_id)
        else:
            index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()
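# Illustrative invocations of index(); the foreign_id value is made up.
index()                                              # queue re-indexing of all documents
index(foreign_id='example_source', immediate=True)   # one source, bypassing the queue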
def analyze_document(document):
    log.info("Analyze document: %r", document)
    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        analyzer = cls(document, meta)
        analyzer.prepare()
        analyzers.append(analyzer)
    for text in document.text_parts():
        for analyzer in analyzers:
            analyzer.on_text(text)
    for analyzer in analyzers:
        analyzer.finalize()
    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document)
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return

    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)

    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')

    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')

    log.info("Re-matching %r gave %r documents.", entity, len(documents))

    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)

    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)
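# Example of the pattern shape built above (the terms are made up). Each
# term must be bounded by spaces or the string boundaries, matching against
# text that has already been run through normalize_strong().
import re

terms = ['acme corp', 'acme holdings']
pattern = re.compile('( |^)(%s)( |$)' % '|'.join(terms))
assert pattern.search('filed by acme corp in 2015') is not None
assert pattern.search('acme corporation') is None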
def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)

    analyzers = []
    meta = document.meta
    for cls in get_analyzers():
        try:
            analyzer = cls(document, meta)
            analyzer.prepare()
            analyzers.append(analyzer)
        except Exception as ex:
            log.exception(ex)

    if document.type == Document.TYPE_TEXT:
        for page in document.pages:
            for analyzer in analyzers:
                analyzer.on_page(page)
            for text in page.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    if document.type == Document.TYPE_TABULAR:
        for record in document.records:
            for analyzer in analyzers:
                analyzer.on_record(record)
            for text in record.text_parts():
                for analyzer in analyzers:
                    analyzer.on_text(text)

    for analyzer in analyzers:
        try:
            analyzer.finalize()
        except Exception as ex:
            log.exception(ex)

    document.meta = meta
    db.session.add(document)
    db.session.commit()
    index_document(document_id)
def update_document(document):
    # These are operations that should be executed after each
    # write to a document or its metadata.
    analyze_document_id.delay(document.id)
    index_document(document, index_records=False)