def bulk_write(collection, items):
    """Write a set of entities - given as raw dicts - to the index in bulk
    mode. This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other aspects
    of building the entity."""
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data")
        entity = model.get_proxy(item)
        if entity.id is None:
            raise InvalidData("No ID for entity")
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities)
    refresh_collection(collection)
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1
        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    now = datetime.utcnow().isoformat()
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        if not is_mapping(data):
            raise InvalidData("Failed to read input data", errors=data)
        entity = model.get_proxy(data)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if not unsafe:
            entity = remove_checksums(entity)
        entity.context = {
            'role_id': role_id,
            'created_at': now,
            'updated_at': now,
        }
        writer.put(entity, origin='bulk')
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, index=True):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entity_ids = set()
    for data in entities:
        entity = model.get_proxy(data, cleaned=False)
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        entity = collection.ns.apply(entity)
        if safe:
            entity = remove_checksums(entity)
        entity.context = {"role_id": role_id, "mutable": mutable}
        for field in ("created_at", "updated_at"):
            timestamp = data.get(field)
            if timestamp is not None:
                dt = registry.date.to_datetime(timestamp)
                if dt is not None:
                    entity.context[field] = dt.isoformat()
        writer.put(entity, origin="bulk")
        if index and len(entity_ids) < MAX_PAGE:
            entity_ids.add(entity.id)
    writer.flush()
    if index:
        if len(entity_ids) >= MAX_PAGE:
            entity_ids = None
        index_aggregator(collection, aggregator, entity_ids=entity_ids)
        refresh_collection(collection.id)
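# Usage sketch (not from the source): bulk_write expects followthemoney-style
# entity dicts carrying an "id", a "schema" and a "properties" mapping. The
# collection object and role_id below are assumed to be supplied by the caller.
sample_entities = [
    {
        "id": "fd6e1d6b2f3a",
        "schema": "Person",
        "properties": {"name": ["Jane Doe"], "nationality": ["de"]},
    },
]
bulk_write(collection, sample_entities, role_id=role_id, index=True)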
def cancel(collection_id):
    """
    ---
    delete:
      summary: Cancel processing of a collection
      description: >
        Cancel all queued tasks for the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      responses:
        '200':
          content:
            application/json:
              schema:
                $ref: '#/components/schemas/CollectionStatus'
          description: OK
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    cancel_queue(collection)
    refresh_collection(collection_id)
    return ("", 204)
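# Hedged usage sketch for the handler above, assuming it is routed as
# DELETE /api/2/collections/<collection_id>/cancel and that the API key is
# passed via the Authorization header (both are assumptions, not source facts).
import requests

resp = requests.delete(
    "https://aleph.example.org/api/2/collections/150/cancel",
    headers={"Authorization": "ApiKey 1234567890abcdef"},
)
resp.raise_for_status()  # a 2xx response means the queued tasks were cancelled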
def bulk_write(collection, items, merge=True):
    """Write a set of entities - given as dicts - to the index in bulk mode.

    This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        entity = namespace.apply(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
def bulk(id):
    collection = get_db_collection(id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge)
    refresh_collection(id)
    return ('', 204)
def create_entity(data, collection, role=None, sync=False):
    entity = Entity.create(data, collection)
    collection.touch()
    db.session.commit()
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.signed_id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.signed_id
def refresh_entity(entity, sync=False):
    if is_mapping(entity):
        entity_id = entity.get('id')
        collection_id = entity.get('collection_id')
    else:
        entity_id = entity.id
        collection_id = entity.collection_id
    cache.kv.delete(cache.object_key(Entity, entity_id))
    refresh_collection(collection_id, sync=sync)
def process(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
def index_aggregate(stage, collection, sync=False, entity_ids=None, mapping_id=None):
    """Project the contents of the collection's aggregator into the index."""
    entities = _fetch_entities(stage, collection, entity_ids=entity_ids)
    entities = (_process_entity(e, sync=sync) for e in entities)
    extra = {'job_id': stage.job.id, 'mapping_id': mapping_id}
    index_bulk(collection, entities, extra, sync=sync)
    refresh_collection(collection.id, sync=sync)
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collection's aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        aggregator.close()
        queue.remove()
def crawldir(path, language=None, foreign_id=None):
    """Crawl the given directory."""
    path = Path(path)
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(path)
    authz = Authz.from_role(Role.load_cli_user())
    config = {'foreign_id': foreign_id, 'label': path.name, 'casefile': False}
    create_collection(config, authz)
    collection = Collection.by_foreign_id(foreign_id)
    log.info('Crawling %s to %s (%s)...', path, foreign_id, collection.id)
    crawl_directory(collection, path)
    log.info('Complete. Make sure a worker is running :)')
    refresh_collection(collection.id)
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collection's aggregator into the index."""
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try to
        # pull a whole batch of them off the queue and do it at once.
        tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        for task in tasks:
            entity_ids.extend(ensure_list(task.payload.get("entity_ids")))
        stage.mark_done(len(tasks))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    # This will disable checksum security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)
    index_bulk(collection, _generate(), job_id=job_id)
    refresh_collection(collection.id)
def bulk(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    merge = get_flag('merge', default=False)
    # This will disable certain security measures in order to allow bulk
    # loading of document data.
    unsafe = get_flag('unsafe', default=False)
    unsafe = unsafe and request.authz.is_admin
    entities = ensure_list(request.get_json(force=True))
    bulk_write(collection, entities, merge=merge, unsafe=unsafe)
    refresh_collection(collection_id)
    return ('', 204)
def update(foreign_id=None, index=False, process=False, reset=False):
    """Re-index all the collections and entities."""
    update_roles()
    q = Collection.all(deleted=True)
    if foreign_id is not None:
        q = [get_collection(foreign_id)]
    for collection in q:
        if reset:
            reset_collection(collection, sync=True)
        refresh_collection(collection.id)
        index_collection(collection)
        if collection.deleted_at is not None:
            continue
        if index or process:
            payload = {'ingest': process}
            queue_task(collection, OP_PROCESS, payload=payload)
def mapping(collection_id):
    collection = get_db_collection(collection_id, request.authz.WRITE)
    require(request.authz.can_bulk_import())
    if not request.is_json:
        raise BadRequest()
    data = request.get_json().get(collection.foreign_id)
    for query in keys_values(data, 'queries', 'query'):
        try:
            model.make_mapping(query)
        except InvalidMapping as invalid:
            raise BadRequest(invalid)
    queue_task(collection, OP_BULKLOAD, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
def bulk_write(collection, entities, job_id=None, unsafe=False):
    """Write a set of entities - given as dicts - to the index."""
    # This is called mainly by the /api/2/collections/X/_bulk API.
    def _generate():
        for data in entities:
            if not is_mapping(data):
                raise InvalidData("Failed to read input data", errors=data)
            entity = model.get_proxy(data)
            if entity.id is None:
                raise InvalidData("No ID for entity", errors=entity.to_dict())
            if not unsafe:
                entity = remove_checksums(entity)
            yield _process_entity(entity)
    index_bulk(collection, _generate(), {'job_id': job_id})
    refresh_collection(collection.id)
def index_entities(stage, collection, iterable, sync=False):
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            stage.report_finished(len(entities))
            index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
            entities = []
    if len(entities):
        stage.report_finished(len(entities))
        index_bulk(collection, entities, job_id=stage.job.id, sync=sync)
    refresh_collection(collection)
def delete_entity(entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity.get('id'), clazz=Entity)
    obj = Entity.by_id(entity.get('id'))
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity.get('id'))
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity.get('id'), sync=sync)
    refresh_entity(entity.get('id'), sync=sync)
    refresh_collection(entity.get('collection_id'), sync=sync)
def upsert_entity(data, collection, sync=False):
    entity = None
    entity_id = collection.ns.sign(data.get('id'))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection, deleted=True)
        # TODO: migrate softly from index.
    if entity is None:
        entity = Entity.create(data, collection)
    else:
        entity.update(data, collection)
    collection.touch()
    db.session.commit()
    delete_aggregator_entity(collection, entity.id)
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.id
def index_entities(collection, iterable, sync=False):
    queue = get_queue(collection, OP_INDEX)
    queue.progress.mark_pending(len(iterable))
    entities = []
    for entity in iterable:
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=entity.to_dict())
        tag_entity(entity)
        entities.append(entity)
        if len(entities) >= BULK_PAGE:
            queue.progress.mark_finished(len(entities))
            index_bulk(collection, entities, sync=sync)
            entities = []
    if len(entities):
        queue.progress.mark_finished(len(entities))
        index_bulk(collection, entities, sync=sync)
    refresh_collection(collection)
def upsert_entity(data, collection, validate=True, sync=False):
    """Create or update an entity in the database.

    This has a side hustle of migrating entities created via the _bulk API
    or a mapper to a database entity in the event that it gets edited by
    the user.
    """
    entity = None
    entity_id = collection.ns.sign(data.get('id'))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection, deleted=True)
        # TODO: migrate softly from index.
    if entity is None:
        entity = Entity.create(data, collection, validate=validate)
    else:
        entity.update(data, collection, validate=validate)
    collection.touch()
    db.session.commit()
    delete_aggregator_entity(collection, entity.id)
    index.index_entity(entity, sync=sync)
    refresh_entity(entity.id, sync=sync)
    refresh_collection(collection.id, sync=sync)
    return entity.id
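# Hedged usage sketch: upsert_entity takes a plain entity dict plus the
# collection it belongs to; the collection object below is an assumption
# supplied by the caller (e.g. loaded via Collection.by_id).
data = {
    "id": "deadbeef",
    "schema": "Company",
    "properties": {"name": ["Siemens AG"], "jurisdiction": ["de"]},
}
entity_id = upsert_entity(data, collection, validate=True, sync=True)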
def delete_entity(collection, entity, deleted_at=None, sync=False):
    # This is recursive and will also delete any entities which
    # reference the given entity. Usually this is going to be child
    # documents, or directorships referencing a person. It's a pretty
    # dangerous operation, though.
    entity_id = collection.ns.sign(entity.get('id'))
    for adjacent in index.iter_adjacent(entity):
        log.warning("Recursive delete: %r", adjacent)
        delete_entity(collection, adjacent, deleted_at=deleted_at, sync=sync)
    flush_notifications(entity_id, clazz=Entity)
    obj = Entity.by_id(entity_id, collection=collection)
    if obj is not None:
        obj.delete(deleted_at=deleted_at)
    doc = Document.by_id(entity_id, collection=collection)
    if doc is not None:
        doc.delete(deleted_at=deleted_at)
    index.delete_entity(entity_id, sync=sync)
    Linkage.delete_by_entity(entity_id)
    Mapping.delete_by_table(entity_id)
    xref_index.delete_xref(collection, entity_id=entity_id, sync=sync)
    delete_aggregator_entity(collection, entity_id)
    refresh_entity(entity_id, sync=sync)
    refresh_collection(collection.id, sync=sync)
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1
        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id,
                     records_index,
                     records_total,
                     entities_count)
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}
    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
def bulk_write(collection, items, merge=True, unsafe=False):
    """Write a set of entities - given as dicts - to the index in bulk mode.

    This will perform validation but is dangerous as it means the
    application has no control over key generation and a few other
    aspects of building the entity.
    """
    namespace = Namespace(collection.foreign_id)
    entities = {}
    for item in items:
        if not is_mapping(item):
            raise InvalidData("Failed to read input data", errors=item)
        entity = model.get_proxy(item)
        if not unsafe:
            entity = namespace.apply(entity)
            entity = remove_checksums(entity)
        entity.context = {
            'bulk': True,
            'collection_id': collection.id
        }
        if entity.id is None:
            raise InvalidData("No ID for entity", errors=item)
        if entity.id in entities:
            entities[entity.id].merge(entity)
        else:
            entities[entity.id] = entity
        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities, merge=merge)
            entities = {}
    if len(entities):
        index.index_bulk(collection.id, entities, merge=merge)
    refresh_collection(collection)
def process(collection_id):
    """
    ---
    post:
      summary: Process a collection
      description: Start processing the collection with id `collection_id`
      parameters:
      - description: The collection ID.
        in: path
        name: collection_id
        required: true
        schema:
          minimum: 1
          type: integer
      - in: query
        name: ingest
        schema:
          type: boolean
      - in: query
        name: reset
        schema:
          type: boolean
      responses:
        '202':
          description: Accepted
      tags:
      - Collection
    """
    collection = get_db_collection(collection_id, request.authz.WRITE)
    # re-process the documents
    data = {'reset': get_flag('reset', True)}
    queue_task(collection, OP_PROCESS, job_id=get_session_id(), payload=data)
    collection.touch()
    db.session.commit()
    refresh_collection(collection_id)
    return ('', 202)
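# Hedged usage sketch, assuming the handler above is routed as
# POST /api/2/collections/<collection_id>/process (the URL and the API key
# header are assumptions, not taken from the source).
import requests

resp = requests.post(
    "https://aleph.example.org/api/2/collections/150/process",
    params={"reset": "true"},
    headers={"Authorization": "ApiKey 1234567890abcdef"},
)
assert resp.status_code == 202  # the task is queued; processing runs async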
def cancel(foreign_id):
    """Cancel all queued tasks for the dataset."""
    collection = get_collection(foreign_id)
    cancel_queue(collection)
    refresh_collection(collection.id)
def refresh_entity(collection, entity_id):
    cache.kv.delete(cache.object_key(Entity, entity_id))
    refresh_collection(collection.id)
def after_task(self, task):
    if task.job.is_done():
        collection = Collection.by_foreign_id(task.job.dataset.name)
        if collection is not None:
            refresh_collection(collection.id)
        task.job.remove()
def cleanup_job(self, job):
    if job.is_done():
        collection = Collection.by_foreign_id(job.dataset.name)
        if collection is not None:
            refresh_collection(collection.id)
        job.remove()