def test_load_sample_directory(self):
     samples_path = self.get_fixture_path('samples')
     document = Document.by_keys(collection=self.collection,
                                 foreign_id='samples')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, samples_path)
     assert Document.all().count() == 5, Document.all().count()
 def test_load_pdf_file(self):
     pdf_path = self.get_fixture_path('demo.pdf')
     document = Document.by_keys(collection=self.collection,
                                 foreign_id='demo.pdf')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, pdf_path)
     assert Document.all().count() == 1, Document.all().count()
Example #3
0
 def test_load_pdf_file(self):
     pdf_path = self.get_fixture_path('demo.pdf')
     document = Document.by_keys(collection_id=self.collection.id,
                                 foreign_id='demo.pdf')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, pdf_path)
     assert Document.all().count() == 1, Document.all().count()
Example #4
0
 def test_load_sample_directory(self):
     samples_path = self.get_fixture_path('samples')
     document = Document.by_keys(collection_id=self.collection.id,
                                 foreign_id='samples')
     db.session.commit()
     db.session.refresh(document)
     ingest_document(document, samples_path)
     assert Document.all().count() == 5, Document.all().count()
Example #5
0
    def test_crawler_execute(self):
        tdc = TDocumentCrawler()
        ccnt = Document.all().count()
        assert ccnt == 0, ccnt
        tdc.execute()
        states = Document.all().all()
        assert len(states) == 1, len(states)
        demo = states[0]
        assert 'kitty' in demo.title, demo.meta

        coll = Collection.by_foreign_id('test')
        assert coll is not None, coll
        assert len(list(coll.documents)) == 1, list(coll.documents)
Example #6
0
 def load_fixtures(self, file_name):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     for document in Document.all():
         index_document(document)
     self.update_index()
Example #7
0
def index():
    sources_ids = match_ids('sources', authz.sources(authz.READ))
    q = Document.all().filter(Document.source_id.in_(sources_ids))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #8
0
def index():
    sources_ids = match_ids('sources', authz.sources(authz.READ))
    q = Document.all().filter(Document.source_id.in_(sources_ids))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #9
0
 def load_fixtures(self, file_name):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     update_collections()
     for doc in Document.all():
         process_document(doc)
     self.flush_index()
Example #10
0
def index():
    collection_ids = match_ids('collection', authz.collections(authz.READ))
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #11
0
def index():
    collection_ids = match_ids('collection', authz.collections(authz.READ))
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #12
0
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
Example #13
0
File: util.py Project: tomjie/aleph
 def load_fixtures(self, file_name, process_documents=True):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     reindex_entities()
     if process_documents:
         for doc in Document.all():
             analyze_document(doc)
         optimize_search()
Example #14
0
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
Example #15
0
 def load_fixtures(self, file_name, process_documents=True):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     reindex_entities()
     if process_documents:
         for doc in Document.all():
             analyze_document(doc)
         optimize_search()
Example #16
0
 def load_fixtures(self, file_name, process_documents=True):
     filepath = self.get_fixture_path(file_name)
     load_fixtures(db, loaders.load(filepath))
     db.session.commit()
     reindex_entities()
     if process_documents:
         for doc in Document.all():
             process_document(doc)
     self.flush_index()
Example #17
0
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        document = Document.by_keys(collection=self.collection,
                                    foreign_id='experts.csv')
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #18
0
def index():
    authz = request.authz
    collections = request.args.getlist('collection')
    collections = authz.collections_intersect(authz.READ, collections)
    q = Document.all()
    q = q.filter(Document.collection_id.in_(collections))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #19
0
def load_documents():
    graph = get_graph()
    tx = graph.begin()
    for i, document in enumerate(Document.all()):
        log.info("Load doc [%s]: %r", document.id, document.meta)
        load_document(tx, document)
        if i > 0 and i % 1000 == 0:
            tx.commit()
            tx = graph.begin()
    tx.commit()
Example #20
0
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        crawler = DirectoryCrawler()
        crawler.execute(directory=csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'experts.csv' in rec0.document.meta.file_name, \
            rec0.document.meta
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        assert 'experts' in repr(doc)

        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #21
0
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        crawler = DirectoryCrawler()
        crawler.execute(directory=csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'experts.csv' in rec0.document.meta.file_name, \
            rec0.document.meta
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        assert 'experts' in repr(doc)

        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #22
0
    def test_load_csv_file(self):
        csv_path = self.get_fixture_path('experts.csv')
        document = Document.by_keys(collection_id=self.collection.id,
                                    foreign_id='experts.csv')
        document.file_name = 'experts.csv'
        db.session.commit()
        db.session.refresh(document)
        ingest_document(document, csv_path)
        assert Document.all().count() == 1, Document.all().count()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 14, len(records)
        rec0 = records[0]
        assert str(rec0.id) in repr(rec0), repr(rec0)
        assert 'nationality' in rec0.data, rec0.data
        assert 'name' in rec0.data, rec0.data

        doc = rec0.document
        doc.delete_records()
        records = db.session.query(DocumentRecord).all()
        assert len(records) == 0, len(records)
Example #23
0
def index():
    try:
        authorized = authz.collections(authz.READ)
        collection_ids = [int(f) for f in request.args.getlist('collection')]
        collection_ids = collection_ids or authorized
        collection_ids = [c for c in collection_ids if c in authorized]
    except ValueError:
        raise BadRequest()
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #24
0
def index():
    try:
        authorized = authz.collections(authz.READ)
        collection_ids = [int(f) for f in request.args.getlist('collection')]
        collection_ids = collection_ids or authorized
        collection_ids = [c for c in collection_ids if c in authorized]
    except ValueError:
        raise BadRequest()
    q = Document.all()
    clause = Collection.id.in_(collection_ids)
    q = q.filter(Document.collections.any(clause))
    hashes = request.args.getlist('content_hash')
    if len(hashes):
        q = q.filter(Document.content_hash.in_(hashes))
    return jsonify(Pager(q))
Example #25
0
 def create_document(self, meta, type=None):
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         q = q.filter(Document.source_id == self.source_id)
         document = q.first()
     if document is None:
         document = Document()
         document.source_id = self.source_id
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #26
0
 def create_document(self, meta, type=None):
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         q = q.filter(Document.source_id == self.source_id)
         document = q.first()
     if document is None:
         document = Document()
         document.source_id = self.source_id
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #27
0
 def document_by_meta(cls, collection_id, meta):
     q = Document.all()
     if meta.foreign_id:
         q = q.filter(Document.foreign_id == meta.foreign_id)
     elif meta.content_hash:
         q = q.filter(Document.content_hash == meta.content_hash)
     else:
         raise ValueError("No unique criterion for document: %s" % meta)
     q = q.filter(Document.collection_id == collection_id)
     document = q.first()
     if document is None:
         document = Document()
         document.collection_id = collection_id
         document.foreign_id = meta.foreign_id
         document.content_hash = meta.content_hash
     document.meta = meta
     return document
Example #28
0
 def create_document(self, meta, type=None):
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         clause = Collection.id == self.collection_id
         q = q.filter(Document.collections.any(clause))
         document = q.first()
     if document is None:
         document = Document()
         document.collections = [Collection.by_id(self.collection_id)]
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #29
0
 def create_document(self, meta, type=None):
     if meta.content_hash:
         q = Document.all()
         if meta.foreign_id:
             q = q.filter(Document.foreign_id == meta.foreign_id)
         else:
             q = q.filter(Document.content_hash == meta.content_hash)
         clause = Collection.id == self.collection_id
         q = q.filter(Document.collections.any(clause))
         document = q.first()
     if document is None:
         document = Document()
         document.collections = [Collection.by_id(self.collection_id)]
     document.meta = meta
     document.type = type or self.DOCUMENT_TYPE
     db.session.add(document)
     db.session.flush()
     return document
Example #30
0
 def test_incremental(self):
     tdc = TDocumentCrawler()
     tdc.execute()
     tdc.execute(incremental=True)
     states = Document.all().all()
     assert len(states) == 1, len(states)
Example #31
0
 def test_crawl_sample_directory(self):
     samples_path = self.get_fixture_path("samples")
     crawl_directory(self.collection, samples_path)
     assert Document.all().count() == 4, Document.all().count()