def ingest_upload(id):
    """Upload one or more files into the collection identified by ``id``.

    Requires write access on the collection. Each uploaded file is saved
    to a temporary directory, checksummed and ingested as a ``Document``;
    when no files are attached, an empty folder document is created
    instead. Returns a JSON response listing the ingested documents.
    """
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for storage in request.files.values():
            # Sanitise the client-supplied file name before writing to disk.
            path = safe_filename(storage.filename, default='upload')
            path = os.path.join(upload_dir, path)
            storage.save(path)
            content_hash = checksum(path)
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id,
                                        content_hash=content_hash)
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, path)
            documents.append(document)

        # Idiomatic truthiness check (was: `if not len(request.files):`).
        if not request.files:
            # If there is no files uploaded, try to create an empty
            # directory instead. Maybe this should be more explicit,
            # but it seemed like the most simple way of fitting it
            # into the API.
            document = Document.by_keys(collection=collection,
                                        parent_id=parent_id,
                                        foreign_id=foreign_id)
            document.schema = Document.SCHEMA_FOLDER
            document.update(meta)
            document.uploader_id = request.authz.id
            ingest_document(document, None)
            documents.append(document)
    finally:
        # Always remove the scratch directory, even if ingest fails.
        shutil.rmtree(upload_dir)

    if collection.casefile:
        # Notify interested users about newly ingested case documents.
        for document in documents:
            params = {'document': document, 'collection': collection}
            publish(Events.INGEST_DOCUMENT,
                    actor_id=document.uploader_id,
                    params=params)

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)
    refresh_index(index=entities_index())
    return jsonify({
        'status': 'ok',
        'documents': [CombinedSchema().dump(d).data for d in documents]
    })
def process_documents(collection_id=None, failed_only=False, index_only=False):
    """Re-ingest or re-index all documents.

    The scope can be narrowed to documents which failed to import
    properly last time (``failed_only``) or to those belonging to a
    single collection (``collection_id``). With ``index_only`` set,
    documents are re-indexed without being re-ingested.
    """
    query = Document.find_ids(collection_id=collection_id,
                              failed_only=failed_only)
    # Fetch everything up front in eager mode; otherwise stream in batches.
    query = query.all() if settings.EAGER else query.yield_per(5000)
    for count, (document_id,) in enumerate(query, 1):
        if index_only:
            index_document_id.apply_async([document_id], priority=1)
        else:
            ingest.apply_async([document_id], {'refresh': True}, priority=1)
        # Periodic progress logging for long-running runs.
        if count % 10000 == 0:
            log.info("Process: %s documents...", count)
def index(foreign_id=None):
    """Index documents in the given collection (or throughout)."""
    query = Document.all_ids()
    # Newest documents get indexed first.
    query = query.order_by(Document.id.desc())
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        query = query.filter(Document.collection_id == collection.id)
    for count, (document_id,) in enumerate(query.yield_per(5000), 1):
        index_document_id.apply_async([document_id], priority=1)
        # Periodic progress logging for long-running runs.
        if count % 1000 == 0:
            log.info("Index: %s documents...", count)
    # Collection metadata is only refreshed on a full (unfiltered) run.
    if foreign_id is None:
        update_collections()