def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def foreign_id_exists(self, source, foreign_id):
    q = Document.all_ids().filter(Document.source_id == source.id)
    q = q.filter(Document.foreign_id == foreign_id)
    exists = q.first() is not None
    if exists:
        log.info("Foreign ID exists (%s): %s", source, foreign_id)
    return exists

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    try:
        log.info("Index document: %r", document)
        data = document.to_index_dict()
        data['entities'] = generate_entities(document)
        data['title_latin'] = latinize_text(data.get('title'))
        data['summary_latin'] = latinize_text(data.get('summary'))
        get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                       body=data, id=document.id)
        # Clear out previously indexed children, then bulk-index the
        # document's pages or records, depending on its type.
        clear_children(document)
        if document.type == Document.TYPE_TEXT:
            bulk(get_es(), generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(get_es(), generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
        process.exception(process.INDEX, component=__name__,
                          document_id=document.id, meta=document.meta,
                          source_id=document.source_id, exception=ex)

def ingest_upload(collection_id):
    require(request.authz.can(collection_id, request.authz.WRITE))
    sync = get_flag('sync')
    meta, foreign_id = _load_metadata()
    parent_id = _load_parent(collection_id, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        path = None
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
        document = Document.by_keys(collection_id=collection_id,
                                    parent_id=parent_id,
                                    foreign_id=foreign_id,
                                    content_hash=content_hash)
        document.update(meta)
        document.schema = Document.SCHEMA
        if content_hash is None:
            document.schema = Document.SCHEMA_FOLDER
        ingest_document(document, path, role_id=request.authz.id,
                        content_hash=content_hash)
    finally:
        shutil.rmtree(upload_dir)
    if document.collection.casefile:
        # Make sure collection counts are always accurate.
        update_document(document, sync=sync)
    return jsonify({
        'status': 'ok',
        'id': stringify(document.id)
    }, status=201)

def get_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        raise NotFound()
    readable = [c for c in document.collection_ids
                if authz.collection_read(c)]
    authz.require(len(readable))
    return document

def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data,
             id=document.id)
    clear_children(document)
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)

def index():
    sources_ids = match_ids('sources', authz.sources(authz.READ))
    q = Document.all().filter(Document.source_id.in_(sources_ids))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def create_document(self, meta, type=None):
    document = None  # stays None when there is no content hash to match on
    if meta.content_hash:
        q = Document.all()
        if meta.foreign_id:
            q = q.filter(Document.foreign_id == meta.foreign_id)
        else:
            q = q.filter(Document.content_hash == meta.content_hash)
        q = q.filter(Document.source_id == self.source_id)
        document = q.first()
    if document is None:
        document = Document()
        document.source_id = self.source_id
    document.meta = meta
    document.type = type or self.DOCUMENT_TYPE
    db.session.add(document)
    db.session.flush()
    return document

def load_fixtures(self, file_name, process_documents=True):
    filepath = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    if process_documents:
        for doc_id, in Document.all_ids():
            analyze_document(doc_id)
        optimize_search()

def load_fixtures(self, file_name, process_documents=True):
    filepath = os.path.abspath(os.path.join(FIXTURES, file_name))
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    if process_documents:
        for doc_id, in Document.all_ids():
            analyze_document(doc_id)
        optimize_search()

def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        load_document(tx, document)
        # Commit in batches of 1000 documents to keep transactions small.
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()

def create_document(self, meta, type=None):
    document = None  # stays None when there is no content hash to match on
    if meta.content_hash:
        q = Document.all()
        if meta.foreign_id:
            q = q.filter(Document.foreign_id == meta.foreign_id)
        else:
            q = q.filter(Document.content_hash == meta.content_hash)
        clause = Collection.id == self.collection_id
        q = q.filter(Document.collections.any(clause))
        document = q.first()
    if document is None:
        document = Document()
        document.collections = [Collection.by_id(self.collection_id)]
    document.meta = meta
    document.type = type or self.DOCUMENT_TYPE
    db.session.add(document)
    db.session.flush()
    return document

def index():
    collection_ids = match_ids('collection', authz.collections(authz.READ))
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def analyze_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        cls().analyze(document, document.meta)
    index_document(document_id)

def process_documents(collection_id=None, failed_only=False):
    """Re-ingest or re-index all documents. Can be filtered to cover only
    documents which failed to properly import last time, or those which
    are part of a particular collection."""
    q = Document.find_ids(collection_id=collection_id,
                          failed_only=failed_only)
    for idx, (doc_id,) in enumerate(q.yield_per(5000), 1):
        ingest.apply_async([doc_id], {'refresh': True}, priority=1)
        if idx % 10000 == 0:
            log.info("Process: %s documents...", idx)

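The docstring above names two independent filters; as a quick usage sketch (the collection id is invented, and calling the task function directly is an assumption — in production it would typically be dispatched through the task queue):

process_documents(collection_id=23)   # re-queue everything in one collection
process_documents(failed_only=True)   # re-queue only failed imports
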
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        log.info("Load doc [%s]: %r", document.id, document.meta)
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    crawler = DirectoryCrawler()
    crawler.execute(directory=csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'experts.csv' in rec0.document.meta.file_name, \
        rec0.document.meta
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    assert 'experts' in repr(doc)
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='experts.csv')
    document.file_name = 'experts.csv'
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    try:
        for cls in get_analyzers():
            cls().analyze(document, document.meta)
    except Exception as ex:
        log.exception(ex)
    index_document(document_id)

def index(foreign_id=None):
    """Index documents in the given source (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        source = Source.by_foreign_id(foreign_id)
        if source is None:
            raise ValueError("No such source: %r" % foreign_id)
        q = q.filter(Document.source_id == source.id)
    for doc_id, in q:
        index_document.delay(doc_id)
    if foreign_id is None:
        reindex_entities()

def generate_collection_docs(collection):
    q = Document.by_collection(collection.id)
    q = q.order_by(Document.id.asc())
    for idx, document in enumerate(q.yield_per(BULK_PAGE)):
        try:
            log.info("Index [%s]: %s", document.id, document.name)
            yield from generate_document(document)
        except Exception:
            log.exception("Cannot index [%s]: %s",
                          document.id, document.name)
        if idx % 1000 == 0:
            db.session.expunge_all()

def _load_parent(collection_id, meta):
    """Determine the parent document for the document that is to be
    ingested."""
    parent_id = meta.get('parent_id')
    if parent_id is None:
        return
    parent = Document.by_id(parent_id, collection_id=collection_id)
    if parent is None:
        raise BadRequest(response=jsonify({
            'status': 'error',
            'message': 'Cannot load parent document'
        }, status=400))
    return parent_id

def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    q = Document.all_ids()
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        clause = Collection.id == collection.id
        q = q.filter(Document.collections.any(clause))
    for doc_id, in q:
        index_document_id.delay(doc_id)
    if foreign_id is None:
        reindex_entities()

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document)
    bulk_op(generate_records(document))

def analyze_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Analyze document: %r", document)
    for cls in get_analyzers():
        try:
            cls().analyze(document, document.meta)
        except Exception as ex:
            log.exception(ex)
            process.exception(process.ANALYZE, component=cls.__name__,
                              document_id=document.id, meta=document.meta,
                              source_id=document.source_id, exception=ex)
    index_document(document_id)

def index():
    try:
        authorized = authz.collections(authz.READ)
        collection_ids = [int(f) for f in request.args.getlist('collection')]
        collection_ids = collection_ids or authorized
        collection_ids = [c for c in collection_ids if c in authorized]
    except ValueError:
        raise BadRequest()
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document)
    bulk(get_es(), generate_records(document), stats_only=True,
         chunk_size=2000, request_timeout=60.0)

def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    # This is all a bit hacky: we're re-generating all the entity
    # references for the given entity by effectively re-implementing
    # the RegexEntityAnalyzer. The alternative was to conduct a
    # search for potential matching documents, re-analyze them and
    # re-index them. This proved to be too slow in reality.
    log.info("Updating document references: %r", entity)
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')
    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')
    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()
    delete_entity_references(entity.id)
    update_entity_references(entity.id)

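A minimal, self-contained sketch of the word-boundary matching that generate_entity_references performs. The terms, mention texts, and normalize_stub helper are invented for illustration; aleph's real normalize_strong does far more aggressive text cleanup:

import re
from collections import defaultdict

def normalize_stub(text):
    # Hypothetical stand-in for normalize_strong.
    return text.lower() if text else None

regex_terms = ['vladimir putin', 'putin']
# Longer alternatives first, so the full name wins over the short one.
rex = re.compile('( |^)(%s)( |$)' % '|'.join(regex_terms))

mentions = [(1, 'Vladimir Putin met the press.'),
            (2, 'No matching names here.')]
documents = defaultdict(int)
for document_id, text in mentions:
    text = normalize_stub(text)
    if text is None or len(text) <= 2:
        continue
    for match in rex.finditer(text):
        documents[document_id] += 1

assert dict(documents) == {1: 1}  # one hit in doc 1, none in doc 2
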
def generate_entity_references(entity):
    if entity.state != Entity.STATE_ACTIVE:
        return
    rex = '|'.join(entity.regex_terms)
    rex = re.compile('( |^)(%s)( |$)' % rex)
    documents = defaultdict(int)
    try:
        for document_id, text in scan_entity_mentions(entity):
            text = normalize_strong(text)
            if text is None or len(text) <= 2:
                continue
            for match in rex.finditer(text):
                documents[document_id] += 1
    except Exception:
        log.exception('Failed to fully scan documents for entity refresh.')
    q = db.session.query(Reference)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Reference.origin == 'regex')
    q.delete(synchronize_session='fetch')
    log.info("Re-matching %r gave %r documents.", entity, len(documents))
    for document_id, weight in documents.items():
        doc = Document.by_id(document_id)
        if doc is None:
            continue
        ref = Reference()
        ref.document_id = document_id
        ref.entity_id = entity.id
        ref.origin = 'regex'
        ref.weight = weight
        db.session.add(ref)
    db.session.commit()
    delete_entity_references(entity.id)
    q = db.session.query(func.distinct(Reference.document_id))
    q = q.filter(Reference.entity_id == entity.id)
    for document_id, in q:
        index_document(document_id, index_records=False)

def ingest_upload(collection_id):
    """
    ---
    post:
      summary: Upload a document to a collection
      description: Upload a document to a collection with id `collection_id`
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      requestBody:
        content:
          multipart/form-data:
            schema:
              type: object
              properties:
                file:
                  type: string
                  format: binary
                  description: The document to upload
                meta:
                  $ref: '#/components/schemas/DocumentIngest'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                properties:
                  id:
                    description: id of the uploaded document
                    type: integer
                  status:
                    type: string
                type: object
      tags:
      - Ingest
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    sync = get_flag('sync', default=False)
    index = get_flag('index', default=True)
    meta, foreign_id = _load_metadata()
    parent = _load_parent(collection, meta)
    upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.'))
    try:
        content_hash = None
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = upload_dir.joinpath(path)
            storage.save(str(path))
            content_hash = archive.archive_file(path)
        document = Document.save(collection=collection,
                                 parent=parent,
                                 foreign_id=foreign_id,
                                 content_hash=content_hash,
                                 meta=meta,
                                 role_id=request.authz.id)
        collection.touch()
        db.session.commit()
        proxy = document.to_proxy(ns=collection.ns)
        if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync and index:
            index_proxy(collection, proxy, sync=sync)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
        _notify(collection, proxy.id)
        return jsonify({'status': 'ok', 'id': proxy.id}, status=201)
    finally:
        shutil.rmtree(upload_dir)
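
For orientation, a hedged sketch of how a client might call this endpoint with the requests library. The host, API key value, and exact route are assumptions (recent Aleph versions mount it at /api/2/collections/<id>/ingest), and the meta fields are illustrative. Note that the view returns HTTP 201, even though the spec block above declares '200':

import json
import requests

url = 'https://aleph.example.org/api/2/collections/23/ingest'  # assumed route
headers = {'Authorization': 'ApiKey 1234'}  # assumed credentials
meta = {'file_name': 'report.pdf', 'languages': ['en']}

with open('report.pdf', 'rb') as fh:
    res = requests.post(url, headers=headers,
                        data={'meta': json.dumps(meta)},
                        files={'file': fh})
assert res.status_code == 201, res.text
print(res.json())  # e.g. {'status': 'ok', 'id': '...'}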