def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def test_load_pdf_file(self):
    pdf_path = self.get_fixture_path('demo.pdf')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='demo.pdf')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, pdf_path)
    assert Document.all().count() == 1, Document.all().count()

def test_load_sample_directory(self):
    samples_path = self.get_fixture_path('samples')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='samples')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, samples_path)
    assert Document.all().count() == 5, Document.all().count()

def test_crawler_execute(self):
    tdc = TDocumentCrawler()
    ccnt = Document.all().count()
    assert ccnt == 0, ccnt
    tdc.execute()
    states = Document.all().all()
    assert len(states) == 1, len(states)
    demo = states[0]
    assert 'kitty' in demo.title, demo.meta
    coll = Collection.by_foreign_id('test')
    assert coll is not None, coll
    assert len(list(coll.documents)) == 1, list(coll.documents)

def load_fixtures(self, file_name):
    filepath = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    for document in Document.all():
        index_document(document)
    self.update_index()

def index():
    sources_ids = match_ids('sources', authz.sources(authz.READ))
    q = Document.all().filter(Document.source_id.in_(sources_ids))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def load_fixtures(self, file_name):
    filepath = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    update_collections()
    for doc in Document.all():
        process_document(doc)
    self.flush_index()

def index():
    collection_ids = match_ids('collection', authz.collections(authz.READ))
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        load_document(tx, document)
        # Commit in batches of 1,000 documents to keep each graph
        # transaction small, then open a fresh one.
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()

def load_fixtures(self, file_name, process_documents=True):
    filepath = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    reindex_entities()
    if process_documents:
        for doc in Document.all():
            analyze_document(doc)
    optimize_search()

def load_fixtures(self, file_name, process_documents=True):
    filepath = self.get_fixture_path(file_name)
    load_fixtures(db, loaders.load(filepath))
    db.session.commit()
    reindex_entities()
    if process_documents:
        for doc in Document.all():
            process_document(doc)
    self.flush_index()

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection=self.collection,
                                foreign_id='experts.csv')
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def index():
    authz = request.authz
    collections = request.args.getlist('collection')
    collections = authz.collections_intersect(authz.READ, collections)
    q = Document.all()
    q = q.filter(Document.collection_id.in_(collections))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        log.info("Load doc [%s]: %r", document.id, document.meta)
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    crawler = DirectoryCrawler()
    crawler.execute(directory=csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'experts.csv' in rec0.document.meta.file_name, \
        rec0.document.meta
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    assert 'experts' in repr(doc)
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def test_load_csv_file(self):
    csv_path = self.get_fixture_path('experts.csv')
    document = Document.by_keys(collection_id=self.collection.id,
                                foreign_id='experts.csv')
    document.file_name = 'experts.csv'
    db.session.commit()
    db.session.refresh(document)
    ingest_document(document, csv_path)
    assert Document.all().count() == 1, Document.all().count()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 14, len(records)
    rec0 = records[0]
    assert str(rec0.id) in repr(rec0), repr(rec0)
    assert 'nationality' in rec0.data, rec0.data
    assert 'name' in rec0.data, rec0.data
    doc = rec0.document
    doc.delete_records()
    records = db.session.query(DocumentRecord).all()
    assert len(records) == 0, len(records)

def index():
    try:
        authorized = authz.collections(authz.READ)
        collection_ids = [int(f) for f in request.args.getlist('collection')]
        collection_ids = collection_ids or authorized
        collection_ids = [c for c in collection_ids if c in authorized]
    except ValueError:
        raise BadRequest()
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))

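# A hedged usage sketch for the index() views above. The route path is an
# assumption for illustration; only the 'collection' and 'content_hash'
# query parameters appear in the code itself:
#
#   GET /documents?collection=3&content_hash=5eb63bbb...
#
# Requested collection ids outside authz.collections(authz.READ) are
# dropped by the intersection step, so the paginated query only ever
# covers documents the caller is allowed to read.
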
def create_document(self, meta, type=None):
    # Try to find an existing document before creating a new one; bind
    # "document" up front so the lookup can be skipped safely when the
    # metadata carries no content hash.
    document = None
    if meta.content_hash:
        q = Document.all()
        if meta.foreign_id:
            q = q.filter(Document.foreign_id == meta.foreign_id)
        else:
            q = q.filter(Document.content_hash == meta.content_hash)
        q = q.filter(Document.source_id == self.source_id)
        document = q.first()
    if document is None:
        document = Document()
        document.source_id = self.source_id
    document.meta = meta
    document.type = type or self.DOCUMENT_TYPE
    db.session.add(document)
    db.session.flush()
    return document

@classmethod
def document_by_meta(cls, collection_id, meta):
    # Upsert: prefer the stable foreign_id, fall back to the content
    # hash, and refuse metadata that offers no unique criterion at all.
    q = Document.all()
    if meta.foreign_id:
        q = q.filter(Document.foreign_id == meta.foreign_id)
    elif meta.content_hash:
        q = q.filter(Document.content_hash == meta.content_hash)
    else:
        raise ValueError("No unique criterion for document: %s" % meta)
    q = q.filter(Document.collection_id == collection_id)
    document = q.first()
    if document is None:
        document = Document()
        document.collection_id = collection_id
        document.foreign_id = meta.foreign_id
        document.content_hash = meta.content_hash
    document.meta = meta
    return document

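# A minimal calling sketch for document_by_meta, assuming the method lives
# on Document and that a metadata object exposes foreign_id and
# content_hash attributes. The Metadata name and the checksum() helper are
# hypothetical, not defined in this code:
#
#   meta = Metadata()
#   meta.foreign_id = 'reports/annual.pdf'
#   meta.content_hash = checksum(file_path)
#   document = Document.document_by_meta(collection.id, meta)
#   db.session.add(document)
#   db.session.commit()
#
# Because the lookup prefers foreign_id over content_hash, re-crawling the
# same path updates the existing row instead of creating a duplicate.
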
def create_document(self, meta, type=None):
    document = None
    if meta.content_hash:
        q = Document.all()
        if meta.foreign_id:
            q = q.filter(Document.foreign_id == meta.foreign_id)
        else:
            q = q.filter(Document.content_hash == meta.content_hash)
        clause = Collection.id == self.collection_id
        q = q.filter(Document.collections.any(clause))
        document = q.first()
    if document is None:
        document = Document()
        document.collections = [Collection.by_id(self.collection_id)]
    document.meta = meta
    document.type = type or self.DOCUMENT_TYPE
    db.session.add(document)
    db.session.flush()
    return document

def test_incremental(self):
    tdc = TDocumentCrawler()
    tdc.execute()
    tdc.execute(incremental=True)
    states = Document.all().all()
    assert len(states) == 1, len(states)

def test_crawl_sample_directory(self):
    samples_path = self.get_fixture_path("samples")
    crawl_directory(self.collection, samples_path)
    assert Document.all().count() == 4, Document.all().count()