def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    queue_task(collection, OP_FLUSH_MAPPING,
               job_id=get_session_id(),
               payload={'mapping_id': mapping.id})
    return ('', 202)
def export(collection_id):
    """
    ---
    post:
      summary: Download cross-reference results
      description: Download results of cross-referencing as an Excel file
      parameters:
      - in: path
        name: collection_id
        required: true
        schema:
          type: integer
      responses:
        '202':
          description: Accepted
      tags:
      - Xref
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.READ)
    label = "%s - Cross-reference results" % collection.label
    export = create_export(
        operation=OP_EXPORT_XREF_RESULTS,
        role_id=request.authz.id,
        label=label,
        collection=collection,
        mime_type=XLSX,
    )
    job_id = get_session_id()
    payload = {
        "collection_id": collection_id,
        "export_id": export.id,
    }
    queue_task(None, OP_EXPORT_XREF_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)
def delete(entity_id):
    """
    ---
    delete:
      summary: Delete an entity
      description: Delete the entity with id `entity_id`
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
      responses:
        '204':
          description: No Content
      tags:
      - Entity
    """
    entity = get_index_entity(entity_id, request.authz.WRITE)
    collection = get_db_collection(entity.get("collection_id"), request.authz.WRITE)
    tag_request(collection_id=collection.id)
    sync = get_flag("sync", default=True)
    job_id = get_session_id()
    delete_entity(collection, entity, sync=sync, job_id=job_id)
    return ("", 204)
def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive; up to a
        maximum of 10,000 results. The returned file will contain an Excel
        document with structured data as well as the binary files from all
        matching documents. Supports the same query parameters as the
        search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    tag_request(query=parser.text, prefix=parser.prefix)
    query = EntitiesQuery(parser)
    label = gettext("Search: %s") % query.to_text()
    export = create_export(
        operation=OP_EXPORT_SEARCH,
        role_id=request.authz.id,
        label=label,
        mime_type=ZIP,
        meta={"query": query.get_full_query()},
    )
    job_id = get_session_id()
    queue_task(None, OP_EXPORT_SEARCH, job_id=job_id, export_id=export.id)
    return ("", 202)
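# Hedged usage sketch for the search export endpoint above. The route
# ("/api/2/entities/export"), host, and "ApiKey" authorization scheme are
# assumptions about the surrounding Aleph app, not defined in this module.
import requests


def request_search_export(base_url, api_key, query_text):
    # Same query parameters as the search API; the server answers 202 and
    # builds the archive asynchronously via queue_task().
    resp = requests.post(
        "%s/api/2/entities/export" % base_url,
        params={"q": query_text},
        headers={"Authorization": "ApiKey %s" % api_key},
    )
    resp.raise_for_status()
    return resp.status_code  # expected: 202 (Accepted)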
def reingest(collection_id):
    """
    ---
    post:
      summary: Re-ingest a collection
      description: >
        Trigger a process to re-parse the content of all documents stored
        in the collection with id `collection_id`.
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: index
        description: Index documents while they're being processed.
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"index": get_flag("index", False)}
    queue_task(collection, OP_REINGEST, job_id=job_id, payload=data)
    return ("", 202)
def reindex(collection_id):
    """
    ---
    post:
      summary: Re-index a collection
      description: >
        Re-index the entities in the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        description: Delete the index before re-generating it.
        name: flush
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    job_id = get_session_id()
    data = {"flush": get_flag("flush", False)}
    queue_task(collection, OP_REINDEX, job_id=job_id, payload=data)
    return ("", 202)
def update(entity_id):
    """
    ---
    post:
      summary: Update an entity
      description: >
        Update the entity with id `entity_id`. This only applies to
        entities which are backed by a database row, i.e. not any entities
        resulting from a mapping or bulk load.
      parameters:
      - in: path
        name: entity_id
        required: true
        schema:
          type: string
          format: entity_id
      - in: query
        name: sign
        description: Sign entity IDs referenced in nested properties.
        required: false
        schema:
          type: boolean
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntityUpdate'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    data = parse_request("EntityUpdate")
    try:
        entity = get_index_entity(entity_id, request.authz.WRITE)
        require(check_write_entity(entity, request.authz))
        collection = get_db_collection(entity.get("collection_id"), request.authz.WRITE)
    except NotFound:
        collection = get_nested_collection(data, request.authz.WRITE)
    tag_request(collection_id=collection.id)
    data["id"] = entity_id
    if get_flag("validate", default=False):
        validate_entity(data)
    entity_id = upsert_entity(
        data,
        collection,
        authz=request.authz,
        sync=get_flag("sync", default=True),
        sign=get_flag("sign", default=False),
        job_id=get_session_id(),
    )
    db.session.commit()
    return view(entity_id)
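# Hedged client sketch for the update endpoint above. The route pattern
# ("/api/2/entities/<entity_id>") and the auth scheme are assumptions; the
# body shape follows the EntityUpdate schema referenced in the docstring.
import requests


def rename_entity(base_url, api_key, entity_id, name):
    body = {"schema": "Person", "properties": {"name": [name]}}
    resp = requests.post(
        "%s/api/2/entities/%s" % (base_url, entity_id),
        json=body,
        params={"sync": "true"},
        headers={"Authorization": "ApiKey %s" % api_key},
    )
    resp.raise_for_status()
    return resp.json()  # the serialized Entity (HTTP 200)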
def item_update(entityset_id):
    """Add an item to the entity set with id `entityset_id`, or change
    the item's judgement.

    To delete an item from the entity set, apply the judgement:
    `no_judgement`.
    ---
    post:
      summary: Add item to an entityset
      parameters:
      - description: The entityset id.
        in: path
        name: entityset_id
        required: true
        schema:
          type: string
        example: 3a0d91ece2dce88ad3259594c7b642485235a048
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntitySetItemUpdate'
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/EntitySetItem'
          description: OK
        '204':
          description: Item removed
      tags:
      - EntitySetItem
    """
    entityset = get_entityset(entityset_id, request.authz.WRITE)
    data = parse_request("EntitySetItemUpdate")
    entity = data.pop("entity", {})
    entity_id = data.pop("entity_id", entity.get("id"))
    entity = get_index_entity(entity_id, request.authz.READ)
    collection = get_db_collection(entity["collection_id"])
    data["added_by_id"] = request.authz.id
    data.pop("collection", None)
    item = save_entityset_item(entityset, collection, entity_id, **data)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=entity_id)
    if item is not None:
        # The entityset is needed to check if the item is writeable in the
        # serializer:
        item = item.to_dict(entityset=entityset)
    else:
        item = {
            "id": "$".join((entityset_id, entity_id)),
            "entityset_id": entityset_id,
            "entityset_collection_id": entityset.collection_id,
            "entity_id": entity_id,
            "collection_id": entity["collection_id"],
            "judgement": Judgement.NO_JUDGEMENT,
        }
    return EntitySetItemSerializer.jsonify(item)
def pairwise():
    """
    ---
    post:
      summary: Make a pairwise judgement between an entity and a match.
      description: >
        This lets a user decide if they think a given xref match is a true
        or false match. Implicitly, this might create or alter a profile
        in the collection that the entity belongs to.
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/Pairwise'
      responses:
        '200':
          content:
            application/json:
              schema:
                properties:
                  status:
                    description: accepted
                    type: string
                  profile_id:
                    description: profile_id for `entity`.
                    type: string
                type: object
          description: Accepted
      tags:
      - Profile
    """
    data = parse_request("Pairwise")
    entity = get_index_entity(data.get("entity_id"))
    collection = get_db_collection(entity["collection_id"], request.authz.WRITE)
    match = get_index_entity(data.get("match_id"))
    match_collection = get_db_collection(match["collection_id"])
    profile = decide_pairwise(
        collection,
        entity,
        match_collection,
        match,
        judgement=data.get("judgement"),
        authz=request.authz,
    )
    job_id = get_session_id()
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=entity.get("id"))
    profile_id = profile.id if profile is not None else None
    return jsonify({"status": "ok", "profile_id": profile_id}, status=200)
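# Sketch of the request body the pairwise endpoint above consumes. The route
# and the exact judgement strings (mirroring the Judgement enum: positive,
# negative, unsure, no_judgement) are assumptions of this example.
import requests


def judge_xref_match(base_url, api_key, entity_id, match_id, judgement="positive"):
    body = {"entity_id": entity_id, "match_id": match_id, "judgement": judgement}
    resp = requests.post(
        "%s/api/2/profiles/_pairwise" % base_url,
        json=body,
        headers={"Authorization": "ApiKey %s" % api_key},
    )
    resp.raise_for_status()
    return resp.json().get("profile_id")  # may be None if no profile exists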
def create():
    """
    ---
    post:
      summary: Create an entity in a collection
      description: >-
        Create an entity in a collection with a given schema and a set of
        given properties in the database. This is not the API you want to
        be using to load bulk data, but only for interactive entity
        manipulation in the UI. Always use the `bulk` API for loading
        source datasets, no exceptions.
      parameters:
      - in: query
        name: sign
        description: Sign entity IDs referenced in nested properties.
        required: false
        schema:
          type: boolean
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntityCreate'
      responses:
        '200':
          description: Returns the created entity
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    data = parse_request("EntityCreate")
    collection = get_nested_collection(data, request.authz.WRITE)
    data.pop("id", None)
    if get_flag("validate", default=False):
        validate_entity(data)
    entity_id = upsert_entity(
        data,
        collection,
        authz=request.authz,
        sync=True,
        sign=get_flag("sign", default=False),
        job_id=get_session_id(),
    )
    db.session.commit()
    tag_request(entity_id=entity_id, collection_id=collection.id)
    entity = get_index_entity(entity_id, request.authz.READ)
    return EntitySerializer.jsonify(entity)
def bulk(collection_id):
    """
    ---
    post:
      summary: Load entities into a collection
      description: >
        Bulk load entities into the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - description: >-
          This will disable checksum security measures in order to allow
          bulk loading of document data.
        in: query
        name: unsafe
        schema:
          type: boolean
      requestBody:
        description: Entities to be loaded.
        content:
          application/json:
            schema:
              type: array
              items:
                $ref: '#/components/schemas/EntityUpdate'
      responses:
        '204':
          description: No Content
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    job_id = get_session_id()
    # This will disable checksum security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, job_id=job_id, unsafe=unsafe)
    collection.touch()
    db.session.commit()
    return ('', 204)
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    # The request body is keyed by the collection's foreign_id; each query
    # must be a valid followthemoney mapping before it is queued:
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
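# For orientation, a sketch of the payload shape the mapping endpoint above
# validates: the request JSON is keyed by the collection's foreign_id, and
# each entry under `queries` is a followthemoney mapping query. The foreign
# id, CSV URL, and column names below are illustrative assumptions.
EXAMPLE_MAPPING_BODY = {
    "example_dataset": {  # collection.foreign_id
        "queries": [
            {
                "csv_url": "https://example.org/people.csv",
                "entities": {
                    "person": {
                        "schema": "Person",
                        "keys": ["id"],
                        "properties": {
                            "name": {"column": "name"},
                            "nationality": {"column": "country"},
                        },
                    }
                },
            }
        ]
    }
}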
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    job_id = get_session_id()
    # This will disable checksum security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, job_id=job_id, unsafe=unsafe)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 204)
def flush(collection_id, mapping_id):
    """Flush all entities loaded by mapping with id `mapping_id`.
    ---
    post:
      summary: Flush entities loaded by a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = True
    mapping.last_run_status = None
    mapping.last_run_err_msg = None
    db.session.add(mapping)
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 202)
def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.disabled = False
    mapping.set_status(Status.PENDING)
    db.session.commit()
    job_id = get_session_id()
    queue_task(collection, OP_LOAD_MAPPING, job_id=job_id, mapping_id=mapping.id)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    return MappingSerializer.jsonify(mapping, status=202)
def delete(collection_id, mapping_id):
    """Delete a mapping.
    ---
    delete:
      summary: Delete a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '204':
          description: No Content
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    mapping.delete()
    db.session.commit()
    queue_task(
        collection,
        OP_FLUSH_MAPPING,
        job_id=get_session_id(),
        mapping_id=mapping_id,
    )
    return ("", 204)
def export():
    """
    ---
    post:
      summary: Download the results of a search
      description: >-
        Downloads all the results of a search as a zip archive; up to a
        maximum of 10,000 results. The returned file will contain an Excel
        document with structured data as well as the binary files from all
        matching documents. Supports the same query parameters as the
        search API.
      responses:
        '202':
          description: Accepted
      tags:
      - Entity
    """
    require(request.authz.logged_in)
    parser = SearchQueryParser(request.args, request.authz)
    parser.limit = MAX_PAGE
    tag_request(query=parser.text, prefix=parser.prefix)
    result = EntitiesQuery.handle(request, parser=parser)
    label = "Search results for query: %s" % parser.text
    export = create_export(
        operation=OP_EXPORT_SEARCH_RESULTS,
        role_id=request.authz.id,
        label=label,
        file_path=None,
        expires_after=Export.DEFAULT_EXPIRATION,
        collection=None,
        mime_type=ZIP,
    )
    job_id = get_session_id()
    payload = {
        "export_id": export.id,
        "result": result.to_dict(),
    }
    queue_task(None, OP_EXPORT_SEARCH_RESULTS, job_id=job_id, payload=payload)
    return ("", 202)
def trigger(collection_id, mapping_id):
    """Load entities by running the mapping with id `mapping_id`. Flushes
    previously loaded entities before loading new entities.
    ---
    post:
      summary: Load entities from a mapping
      parameters:
      - description: The collection id.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      - description: The mapping id.
        in: path
        name: mapping_id
        required: true
        schema:
          minimum: 1
          type: integer
        example: 2
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
      - Mapping
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    mapping = obj_or_404(Mapping.by_id(mapping_id))
    job_id = get_session_id()
    payload = {'mapping_id': mapping.id}
    queue_task(collection, OP_LOAD_MAPPING, job_id=job_id, payload=payload)
    collection.touch()
    db.session.commit()
    return ('', 202)
def process(collection_id): """ --- post: summary: Process a collection description: Start processing the collection with id `collection_id` parameters: - description: The collection ID. in: path name: collection_id required: true schema: minimum: 1 type: integer - in: query name: ingest schema: type: boolean - in: query name: reset schema: type: boolean responses: '202': description: Accepted tags: - Collection """ collection = get_db_collection(collection_id, request.authz.WRITE) # re-process the documents data = {'reset': get_flag('reset', True)} queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data) collection.touch() db.session.commit() refresh_collection(collection_id) return ('', 202)
def entities_update(entityset_id):
    """
    ---
    post:
      summary: Update an entity and add it to the entity set.
      description: >
        Update the entity with id `entity_id`. If it does not exist it will
        be created. If the user cannot edit the given entity, it is merely
        added to the entity set. New entities are always created in the
        collection of the entity set. Aside from these idiosyncrasies, this
        is the same as `/api/2/entities/<id>`, but handles entity set
        membership transparently.
      parameters:
      - description: The entityset id.
        in: path
        name: entityset_id
        required: true
        schema:
          type: string
        example: 3a0d91ece2dce88ad3259594c7b642485235a048
      - in: query
        name: sign
        description: Sign entity IDs referenced in nested properties.
        required: false
        schema:
          type: boolean
      requestBody:
        content:
          application/json:
            schema:
              $ref: '#/components/schemas/EntityUpdate'
      responses:
        '200':
          description: OK
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/Entity'
      tags:
      - Entity
    """
    entityset = get_entityset(entityset_id, request.authz.WRITE)
    data = parse_request("EntityUpdate")
    entity_id = data.get("id", make_textid())
    try:
        entity = get_index_entity(entity_id, request.authz.READ)
        collection = get_db_collection(entity.get("collection_id"), request.authz.READ)
    except NotFound:
        entity = None
        collection = entityset.collection
    tag_request(collection_id=entityset.collection_id)
    if entity is None or check_write_entity(entity, request.authz):
        if get_flag("validate", default=False):
            validate_entity(data)
        entity_id = upsert_entity(
            data,
            collection,
            authz=request.authz,
            sync=get_flag("sync", default=True),
            sign=get_flag("sign", default=False),
            job_id=get_session_id(),
        )
    save_entityset_item(
        entityset,
        collection,
        entity_id,
        added_by_id=request.authz.id,
    )
    db.session.commit()
    return entity_view(entity_id)
def ingest_upload(collection_id): """ --- post: summary: Upload a document to a collection description: Upload a document to a collection with id `collection_id` parameters: - in: path name: collection_id required: true schema: type: integer requestBody: content: multipart/form-data: schema: type: object properties: file: type: string format: binary description: The document to upload meta: $ref: '#/components/schemas/DocumentIngest' responses: '200': description: OK content: application/json: schema: properties: id: description: id of the uploaded document type: integer status: type: string type: object tags: - Ingest - Collection """ collection = get_db_collection(collection_id, request.authz.WRITE) job_id = get_session_id() sync = get_flag('sync', default=False) meta, foreign_id = _load_metadata() parent = _load_parent(collection, meta) upload_dir = ensure_path(mkdtemp(prefix='aleph.upload.')) try: content_hash = None for storage in request.files.values(): path = safe_filename(storage.filename, default='upload') path = upload_dir.joinpath(path) storage.save(str(path)) content_hash = archive.archive_file(path) document = Document.save(collection=collection, parent=parent, foreign_id=foreign_id, content_hash=content_hash, meta=meta, uploader_id=request.authz.id) collection.touch() db.session.commit() proxy = document.to_proxy() if proxy.schema.is_a(Document.SCHEMA_FOLDER) and sync: index_proxy(collection, proxy, sync=sync) ingest_entity(collection, proxy, job_id=job_id, sync=sync) document_id = collection.ns.sign(document.id) _notify(collection, document_id) finally: shutil.rmtree(upload_dir) return jsonify({'status': 'ok', 'id': document_id}, status=201)
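# Hedged client sketch of the multipart upload this endpoint consumes: one
# file part plus a JSON `meta` form field (as read by _load_metadata()).
# The route and auth scheme are assumptions about the surrounding app.
import json
import os
import requests


def upload_document(base_url, api_key, collection_id, file_path):
    meta = {"file_name": os.path.basename(file_path)}
    with open(file_path, "rb") as fh:
        resp = requests.post(
            "%s/api/2/collections/%s/ingest" % (base_url, collection_id),
            files={"file": fh},
            data={"meta": json.dumps(meta)},
            headers={"Authorization": "ApiKey %s" % api_key},
        )
    resp.raise_for_status()
    return resp.json()["id"]  # signed id of the new document (HTTP 201)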
def bulk(collection_id): """ --- post: summary: Load entities into a collection description: > Bulk load entities into the collection with id `collection_id` parameters: - description: The collection ID. in: path name: collection_id required: true schema: minimum: 1 type: integer - description: >- This will disable checksum security measures in order to allow bulk loading of document data. in: query name: unsafe schema: type: boolean requestBody: description: Entities to be loaded. content: application/json: schema: type: array items: $ref: '#/components/schemas/EntityUpdate' responses: '204': description: No Content tags: - Collection """ collection = get_db_collection(collection_id, request.authz.WRITE) require(request.authz.can_bulk_import()) job_id = get_session_id() entityset = request.args.get("entityset_id") if entityset is not None: entityset = get_entityset(entityset, request.authz.WRITE) # This will disable checksum security measures in order to allow bulk # loading of document data: safe = get_flag("safe", default=True) # Flag is only available for admins: if not request.authz.is_admin: safe = True # Let UI tools change the entities created by this: mutable = get_flag("mutable", default=False) role_id = request.authz.id entities = ensure_list(request.get_json(force=True)) entity_ids = list() for entity_id in bulk_write(collection, entities, safe=safe, mutable=mutable, role_id=role_id): entity_ids.append(entity_id) if entityset is not None: EntitySetItem.save( entityset, entity_id, collection_id=collection.id, added_by_id=request.authz.id, ) collection.touch() db.session.commit() data = {"entity_ids": entity_ids} queue_task(collection, OP_INDEX, job_id=job_id, payload=data) return ("", 204)