def create():
    """Create a collection for the requesting role and return its view.

    Requires ``request.authz.logged_in``. The request body is parsed with
    ``CollectionSchema`` and the collection is created with the role looked
    up from ``request.authz.id``. The collections index is refreshed before
    the result of ``view`` for the new collection id is returned.
    """
    require(request.authz.logged_in)
    payload = parse_request(CollectionSchema)
    owner = Role.by_id(request.authz.id)
    created = create_collection(payload, role=owner)
    refresh_index(collections_index())
    return view(created.id)
def delete(id):
    """Delete the entity with the given id and answer with an empty 204.

    The entity is fetched with ``request.authz.WRITE``, deleted, and the
    database session is committed. Afterwards the entity's collection is
    updated and the entities index is refreshed.
    """
    target = get_db_entity(id, request.authz.WRITE)
    delete_entity(target)
    db.session.commit()
    update_collection(target.collection)
    refresh_index(entities_index())
    empty_response = ('', 204)
    return empty_response
def create():
    """Create an entity in the collection named by the request payload.

    The request is parsed with ``EntityCreateSchema``; the target collection
    is fetched by ``collection_id`` with ``request.authz.WRITE``. After the
    commit, the entity and its collection are updated, the entities index is
    refreshed, and the result of ``update_entity`` is serialized with
    ``CombinedSchema``.
    """
    payload = parse_request(EntityCreateSchema)
    collection = get_db_collection(
        payload['collection_id'],
        request.authz.WRITE,
    )
    entity = Entity.create(payload, collection)
    db.session.commit()
    # Keep the value returned by update_entity separate from the request
    # payload; it is what gets serialized.
    entity_data = update_entity(entity)
    update_collection(collection)
    refresh_index(entities_index())
    return serialize_data(entity_data, CombinedSchema)
def ingest_upload(id):
    """Ingest the files attached to the request into a collection.

    Each uploaded file is saved into a temporary directory, matched to a
    ``Document`` via ``Document.by_keys`` (including its content hash) and
    passed to ``ingest_document``. When the request carries no files, a
    single document with ``Document.SCHEMA_FOLDER`` is ingested instead.
    The temporary directory is always removed.

    Returns a JSON response with ``status`` and the dumped ``documents``.
    """
    collection = get_db_collection(id, request.authz.WRITE)
    meta, foreign_id = _load_metadata(collection)
    parent_id = _load_parent(collection, meta)
    upload_dir = mkdtemp(prefix='aleph.upload.')
    try:
        documents = []
        for upload in request.files.values():
            file_name = safe_filename(upload.filename, default='upload')
            file_path = os.path.join(upload_dir, file_name)
            upload.save(file_path)
            file_hash = checksum(file_path)
            file_doc = Document.by_keys(
                collection=collection,
                parent_id=parent_id,
                foreign_id=foreign_id,
                content_hash=file_hash,
            )
            file_doc.update(meta)
            file_doc.uploader_id = request.authz.id
            ingest_document(file_doc, file_path)
            documents.append(file_doc)

        if not len(request.files):
            # If no files were uploaded, try to create an empty directory
            # instead. Maybe this should be more explicit, but it seemed
            # like the simplest way of fitting it into the API.
            folder_doc = Document.by_keys(
                collection=collection,
                parent_id=parent_id,
                foreign_id=foreign_id,
            )
            folder_doc.schema = Document.SCHEMA_FOLDER
            folder_doc.update(meta)
            folder_doc.uploader_id = request.authz.id
            ingest_document(folder_doc, None)
            documents.append(folder_doc)
    finally:
        # Clean up the temporary files whether or not ingest succeeded.
        shutil.rmtree(upload_dir)

    if collection.casefile:
        for ingested in documents:
            event_params = {'document': ingested, 'collection': collection}
            publish(
                Events.INGEST_DOCUMENT,
                actor_id=ingested.uploader_id,
                params=event_params,
            )

    # Update child counts in index.
    if parent_id is not None:
        index_document_id.apply_async([parent_id], priority=1)

    refresh_index(index=entities_index())
    dumped = [CombinedSchema().dump(d).data for d in documents]
    return jsonify({
        'status': 'ok',
        'documents': dumped
    })
def index_records(document):
    """Re-index the records of a document, retrying until it succeeds.

    Does nothing when ``document.supports_records`` is false. Otherwise the
    existing records are cleared and the generated records are bulk-indexed,
    followed by a refresh of the records index. Any exception is logged as a
    warning and followed by ``backoff_cluster`` with the number of the
    failed attempt before the next try.
    """
    if not document.supports_records:
        return

    clear_records(document.id)
    for attempt in count():
        try:
            bulk_op(generate_records(document))
            refresh_index(records_index())
        except Exception as exc:
            log.warning('Failed to index records: %s', exc)
        else:
            # Both the bulk operation and the refresh went through.
            return
        backoff_cluster(failures=attempt)
def index_records(document):
    """Re-index the records of a document, retrying on bulk index errors.

    Does nothing when ``document.supports_records`` is false. Otherwise the
    existing records are cleared (without a refresh) and the generated
    records are bulk-indexed, followed by a refresh of the records index.
    On ``BulkIndexError`` the error is logged and the attempt is repeated
    after sleeping ``RETRY_DELAY``; there is no upper bound on retries.
    """
    if not document.supports_records:
        return

    clear_records(document.id, refresh=False)
    done = False
    while not done:
        try:
            bulk_op(generate_records(document))
            refresh_index(index=records_index())
            done = True
        except BulkIndexError as exc:
            log.exception(exc)
            time.sleep(RETRY_DELAY)
def delete_documents(collection_id):
    """Delete a collection's records and its ``Document`` entities.

    First runs ``query_delete`` on the records index for everything with the
    given ``collection_id`` and refreshes that index. Then runs
    ``query_delete`` on the entities index for entries that match both the
    ``Document`` schemata term and the ``collection_id``.
    """
    query_delete(
        records_index(),
        {'term': {'collection_id': collection_id}},
    )
    refresh_index(index=records_index())

    must_clauses = [
        {'term': {'schemata': 'Document'}},
        {'term': {'collection_id': collection_id}},
    ]
    documents_query = {'bool': {'must': must_clauses}}
    query_delete(entities_index(), documents_query)
def flush_index(self):
    """Call ``refresh_index`` with its default arguments."""
    refresh_index()
def delete(document_id):
    """Delete a document and answer with an empty 204.

    The document is fetched with ``request.authz.WRITE`` and deleted; its
    collection is then updated and the entities index refreshed.
    """
    doc = get_db_document(document_id, request.authz.WRITE)
    delete_document(doc)
    update_collection(doc.collection)
    refresh_index(entities_index())
    empty_response = ('', 204)
    return empty_response
def index_collection(collection):
    """Index a collection.

    A collection with ``deleted_at`` set is removed via
    ``delete_collection`` and that call's result is returned. Otherwise the
    collection's metadata, creator, team and per-schema entity counts are
    assembled, written with ``index_safe`` to the collections index, the
    index is refreshed, and the value returned by ``index_safe`` is returned.
    """
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'kind': collection.kind,
        'summary': collection.summary,
        'category': Collection.DEFAULT,
        'publisher': collection.publisher,
        'publisher_url': collection.publisher_url,
        'info_url': collection.info_url,
        'data_url': collection.data_url,
        'casefile': collection.casefile,
        'roles': collection.roles,
        'schemata': {},
        'team': []
    }
    # Every string-valued field gathered so far feeds the text field.
    texts = [value for value in data.values() if isinstance(value, str)]

    if collection.category in Collection.CATEGORIES:
        data['category'] = collection.category

    creator = collection.creator
    if creator is not None:
        data['creator'] = {
            'id': creator.id,
            'type': creator.type,
            'name': creator.name
        }
        texts.append(creator.name)

    for member in collection.team:
        data['team'].append({
            'id': member.id,
            'type': member.type,
            'name': member.name
        })
        texts.append(member.name)

    # Compute some statistics on the content of a collection.
    filters = [
        {'term': {'collection_id': collection.id}},
        {'term': {'schemata': Entity.THING}},
    ]
    aggs = {
        'schema': {'terms': {'field': 'schema', 'size': 1000}},
        'countries': {'terms': {'field': 'countries', 'size': 500}},
        'languages': {'terms': {'field': 'languages', 'size': 100}},
    }
    query = {
        'size': 0,
        'query': {'bool': {'filter': filters}},
        'aggs': aggs
    }
    result = search_safe(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']

    # Expose entities by schema count.
    for bucket in aggregations['schema']['buckets']:
        data['schemata'][bucket['key']] = bucket['doc_count']

    # If no countries or languages are given on the collection, fall back
    # to the keys of the corresponding aggregation buckets.
    countries = collection.countries
    if countries is None or len(countries) == 0:
        country_buckets = aggregations['countries']['buckets']
        countries = [bucket['key'] for bucket in country_buckets]
    data['countries'] = exactitude.countries.normalize_set(countries)

    languages = collection.languages
    if languages is None or len(languages) == 0:
        language_buckets = aggregations['languages']['buckets']
        languages = [bucket['key'] for bucket in language_buckets]
    data['languages'] = exactitude.languages.normalize_set(languages)

    # Build the normalized copies as a list first: extending a list from a
    # lazy iterator over itself would never terminate.
    normalized = [normalize(text, ascii=True) for text in texts]
    texts.extend(normalized)
    data['text'] = index_form(texts)
    indexed = index_safe(collections_index(), collection.id, data)
    refresh_index(index=collections_index())
    return indexed
def delete_collection(collection_id):
    """Delete the entry for ``collection_id`` from the collections index.

    Runs ``query_delete`` with an ``ids`` query holding the stringified id,
    then refreshes the collections index.
    """
    ids_query = {'ids': {'values': str(collection_id)}}
    query_delete(collections_index(), ids_query)
    refresh_index(index=collections_index())
def delete(id):
    """Delete a collection and answer with an empty 204.

    The collection is fetched with ``request.authz.WRITE`` and passed to
    ``delete_collection``; the collections index is refreshed afterwards.
    """
    target = get_db_collection(id, request.authz.WRITE)
    delete_collection(target)
    refresh_index(collections_index())
    empty_response = ('', 204)
    return empty_response
def delete_entity(entity_id):
    """Delete an entity from the index.

    Runs ``query_delete`` on the entities index with an ``ids`` query
    holding the stringified id, then refreshes the entities index.
    """
    ids_query = {'ids': {'values': str(entity_id)}}
    query_delete(entities_index(), ids_query)
    refresh_index(index=entities_index())
def delete_document(document_id):
    """Remove a document's records and its entity entry from the index.

    Calls ``clear_records`` and ``delete_entity`` for the id, in that
    order, and finishes with a refresh of the records index.
    """
    for remove in (clear_records, delete_entity):
        remove(document_id)
    refresh_index(index=records_index())
def clear_records(document_id, refresh=True):
    """Delete all records associated with the given document.

    Runs ``query_delete`` on the records index for the ``document_id``
    term. The records index is refreshed afterwards unless ``refresh`` is
    false.
    """
    by_document = {'term': {'document_id': document_id}}
    query_delete(records_index(), by_document)
    if not refresh:
        return
    refresh_index(index=records_index())
def flush_index(self):
    """Call ``refresh_index`` with the value returned by ``all_indexes``."""
    indexes = all_indexes()
    refresh_index(indexes)