Example #1
def ingest_upload(id):
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        if not request.files:
            # If no files were uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the simplest way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
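
The view above is a Flask endpoint, so it is exercised over HTTP. Below is a minimal client sketch; the URL path, the "ApiKey" auth scheme, and the 'meta' form field are assumptions about the deployment, not confirmed by the snippet itself.

# Hypothetical client for the upload endpoint above. The route,
# auth header, and 'meta' field name are assumptions.
import json
import requests

API = "http://localhost:5000/api/2/collections/42/ingest"  # assumed route

with open("report.pdf", "rb") as fh:
    resp = requests.post(
        API,
        headers={"Authorization": "ApiKey 1234"},        # assumed scheme
        data={"meta": json.dumps({"title": "Report"})},  # read by _load_metadata
        files={"file": ("report.pdf", fh)},              # lands in request.files
    )
resp.raise_for_status()
print(resp.json()["documents"])
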
Example #2
def process_documents(collection_id=None, failed_only=False, index_only=False):
    """Re-ingest or re-index all documents. Can be filtered to cover only
    documents which failed to properly import last time, or those which
    are part of a particular collection."""
    q = Document.find_ids(collection_id=collection_id, failed_only=failed_only)
    q = q.all() if settings.EAGER else q.yield_per(5000)
    for idx, (doc_id, ) in enumerate(q, 1):
        if index_only:
            index_document_id.apply_async([doc_id], priority=1)
        else:
            ingest.apply_async([doc_id], {'refresh': True}, priority=1)
        if idx % 10000 == 0:
            log.info("Process: %s documents...", idx)
Example #3
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    q = Document.all_ids()
    # Re-index the newest documents first.
    q = q.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        q = q.filter(Document.collection_id == collection.id)
    for idx, (doc_id, ) in enumerate(q.yield_per(5000), 1):
        index_document_id.apply_async([doc_id], priority=1)
        if idx % 1000 == 0:
            log.info("Index: %s documents...", idx)
    if foreign_id is None:
        update_collections()
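
All three examples fan work out through Celery's apply_async, with positional arguments passed as a list and a priority option alongside. A hedged sketch of how a task like index_document_id might be declared; the broker URL and the task body are illustrative assumptions, not the project's actual implementation:

# Hypothetical stand-in for index_document_id; broker URL and
# task body are assumptions for illustration only.
from celery import Celery

app = Celery("aleph", broker="redis://localhost:6379/0")  # assumed broker

@app.task
def index_document_id(document_id):
    # The real task would load the document and write it to the
    # search index; this stub just records the dispatch.
    print("indexing document", document_id)

# Callers enqueue by ID: args as a list, options like priority
# as keywords (priority ordering depends on broker support).
index_document_id.apply_async([123], priority=1)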