def load_fixtures(self):
    """Create the standard test fixtures: an admin user, a private and a
    public collection, a few entities in each, permissions for the system
    user/guest roles, and a fully built search index for both collections."""
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    self.private_coll = self.create_collection(
        foreign_id='test_private',
        label="Private Collection",
        category='grey',
        creator=self.admin
    )
    # Three "Banana" people differing only in birthDate — presumably used
    # by search/date-filter tests; confirm against the test suite.
    self._banana = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-08-21'
        }
    }, self.private_coll)
    self._banana2 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-03-21'
        }
    }, self.private_coll)
    self._banana3 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-05-21'
        }
    }, self.private_coll)
    # Logged-in users (SYSTEM_USER role) may read the private collection.
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = self.create_collection(
        foreign_id='test_public',
        label="Public Collection",
        category='news',
        creator=self.admin
    )
    self._kwazulu = self.create_entity({
        'schema': 'Company',
        'properties': {
            'name': ['KwaZulu'],
            'alias': ['kwazulu']
        }
    }, self.public_coll)
    # Anonymous visitors (SYSTEM_GUEST role) may read the public collection.
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Reset the public collection's aggregator and index it from the model.
    aggregator = get_aggregator(self.public_coll)
    aggregator.delete()
    aggregator.close()
    reindex_collection(self.public_coll, sync=True)
    # Reset the private aggregator, load sample entities from the ijson
    # fixture, then index everything synchronously.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
    aggregator.close()
    reindex_collection(self.private_coll, sync=True)
def bulk_load_query(queue, collection, query_id, query):
    """Run a mapping query and load the generated entities into the
    collection's aggregator.

    Progress is reported on the queue: the total record count is marked
    pending up front, and completions are reported in batches of 1000,
    with the final partial batch flushed after the loop.
    """
    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source)
    if records_total:
        queue.progress.mark_pending(records_total)
    aggregator = get_aggregator(collection)
    writer = aggregator.bulk()
    entities_count = 0
    records_count = 0
    for idx, record in enumerate(mapping.source.records, 1):
        records_count = idx
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            entities_count += 1
            # Fragment key ties each emitted entity back to the query and
            # the source record it came from.
            fragment = '%s-%s' % (query_id, idx)
            writer.put(entity, fragment=fragment)
        if idx > 0 and idx % 1000 == 0:
            queue.progress.mark_finished(1000)
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, idx,
                     records_total or 'streaming', entities_count)
    # BUG FIX: the trailing partial batch (records_count % 1000) was never
    # marked finished, so the queue's pending count could never drain to
    # zero for record counts that aren't multiples of 1000.
    remainder = records_count % 1000
    if remainder:
        queue.progress.mark_finished(remainder)
    writer.flush()
    aggregator.close()
    log.info("[%s] Query done (%s entities)",
             collection.foreign_id, entities_count)
def reindex_collection(collection, skip_errors=True, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache.

    :param skip_errors: passed through to the indexer; tolerate bad entities.
    :param sync: force a synchronous index refresh.
    :param flush: delete all indexed entities for the collection first.
    """
    from aleph.logic.mapping import map_to_aggregator
    from aleph.logic.profiles import profile_fragments
    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        if mapping.disabled:
            log.debug("[%s] Skip mapping: %r", collection, mapping)
            continue
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception:
            # More or less ignore broken models.
            log.exception("Failed mapping: %r", mapping)
    # Pull database-backed entities and profile fragments into the aggregator
    # before projecting everything into the search index.
    aggregate_model(collection, aggregator)
    profile_fragments(collection, aggregator)
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    index_aggregator(collection, aggregator, skip_errors=skip_errors, sync=sync)
    # Recompute collection statistics after the re-index.
    compute_collection(collection, force=True)
def process_collection(stage, collection, ingest=True, reset=False, sync=False):
    """Trigger a full re-parse of all documents and re-build the search index
    from the aggregator.

    :param reset: wipe the collection first; implies ingest.
    """
    # A reset wipes previously ingested fragments, so they must be re-ingested.
    ingest = ingest or reset
    if reset:
        reset_collection(collection, sync=True)
    aggregator = get_aggregator(collection)
    try:
        # Load all model entities into the aggregator under the 'db' fragment.
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            stage.report_finished(1)
        writer.flush()
        if ingest:
            # Re-run the ingest pipeline for every aggregated entity.
            for proxy in aggregator:
                ingest_entity(collection, proxy, job_id=stage.job.id)
        else:
            # No ingest needed: queue a plain re-index of the aggregator.
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id,
                       context={'sync': sync})
    finally:
        aggregator.close()
def prune_entity(collection, entity_id=None, job_id=None): """Prune handles the full deletion of an entity outside of the HTTP request cycle. This involves cleaning up adjacent entities like xref results, notifications and so on.""" # This is recursive and will also delete any entities which # reference the given entity. Usually this is going to be child # documents, or directoships referencing a person. It's a pretty # dangerous operation, though. log.info("[%s] Prune entity: %s", collection, entity_id) for adjacent in index.iter_adjacent(collection.id, entity_id): log.warning("Recursive delete: %s", adjacent.get("id")) delete_entity(collection, adjacent, job_id=job_id) flush_notifications(entity_id, clazz=Entity) obj = Entity.by_id(entity_id, collection=collection) if obj is not None: obj.delete() doc = Document.by_id(entity_id, collection=collection) if doc is not None: doc.delete() EntitySetItem.delete_by_entity(entity_id) Mapping.delete_by_table(entity_id) xref_index.delete_xref(collection, entity_id=entity_id) aggregator = get_aggregator(collection) aggregator.delete(entity_id=entity_id) refresh_entity(collection, entity_id) collection.touch() db.session.commit()
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete a collection and all of its contents: queued jobs, aggregator
    data, indexed entities, xref results and database records.

    :param keep_metadata: retain the collection row and its permissions.
    :param sync: force synchronous index deletes.
    """
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    try:
        aggregator.drop()
    finally:
        aggregator.close()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Re-use an existing deletion timestamp so repeated calls are idempotent.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        # Considering linkages metadata for now, might be wrong:
        Linkage.delete_by_collection(collection.id)
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id, sync=True)
def bulk_write(collection, entities, safe=False, role_id=None, mutable=True, index=True): """Write a set of entities - given as dicts - to the index.""" # This is called mainly by the /api/2/collections/X/_bulk API. aggregator = get_aggregator(collection) writer = aggregator.bulk() entity_ids = set() for data in entities: entity = model.get_proxy(data, cleaned=False) if entity.id is None: raise InvalidData("No ID for entity", errors=entity.to_dict()) entity = collection.ns.apply(entity) if safe: entity = remove_checksums(entity) entity.context = {"role_id": role_id, "mutable": mutable} for field in ("created_at", "updated_at"): timestamp = data.get(field) if timestamp is not None: dt = registry.date.to_datetime(timestamp) if dt is not None: entity.context[field] = dt.isoformat() writer.put(entity, origin="bulk") if index and len(entity_ids) < MAX_PAGE: entity_ids.add(entity.id) writer.flush() if index: if len(entity_ids) >= MAX_PAGE: entity_ids = None index_aggregator(collection, aggregator, entity_ids=entity_ids) refresh_collection(collection.id)
def update_entity(collection, entity_id=None):
    """Update xref and aggregator after an entity has been edited."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments
    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        # Casefile collections get cross-referenced on every edit.
        xref_entity(collection, proxy)
    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)
    # Inline name properties from adjacent entities. See the
    # docstring on `inline_names` for a more detailed discussion.
    prop = proxy.schema.get("namesMentioned")
    if prop is not None:
        entity_ids = proxy.get_type_values(registry.entity)
        names = set()
        for related in index.entities_by_ids(entity_ids):
            related = model.get_proxy(related)
            names.update(related.get_type_values(registry.name))
        if len(names) > 0:
            # Store the collected names as a separate fragment so they
            # merge into the entity at index time.
            name_proxy = model.make_entity(proxy.schema)
            name_proxy.id = proxy.id
            name_proxy.add(prop, names)
            aggregator.put(name_proxy, fragment="names")
    index_aggregator(collection, aggregator, entity_ids=[entity_id])
    refresh_entity(collection, proxy.id)
def upsert_entity(data, collection, authz=None, sync=False, sign=False,
                  job_id=None):
    """Create or update an entity in the database. This has a side effect
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.

    :returns: the signed id of the created or updated entity.
    """
    from aleph.logic.profiles import profile_fragments
    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    collection.touch()
    proxy = entity.to_proxy()
    # Replace any previous aggregator fragments for this entity with the
    # fresh model state.
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=proxy.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=proxy.id)
    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, proxy.id)
    # Defer the slower post-edit work (xref, NER) to a background task.
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id, entity_id=proxy.id)
    return entity.id
def load_mapping(collection, mapping_id, sync=False):
    """Flush and reload all entities generated by a mapping.

    On failure, the mapping status is set to FAILED and any partially
    generated fragments are deleted again.
    """
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    origin = mapping_origin(mapping.id)
    aggregator = get_aggregator(collection)
    # BUG FIX: the aggregator was previously leaked on the early return for
    # disabled mappings; the whole body is now guarded by try/finally so
    # close() runs on every path.
    try:
        aggregator.delete(origin=origin)
        delete_entities(collection.id, origin=origin, sync=True)
        if mapping.disabled:
            return log.info("Mapping is disabled: %s", mapping_id)
        publish(
            Events.LOAD_MAPPING,
            params={
                "collection": collection,
                "table": mapping.table_id
            },
            channels=[collection, mapping.role],
            actor_id=mapping.role_id,
        )
        try:
            map_to_aggregator(collection, mapping, aggregator)
            aggregate_model(collection, aggregator)
            index_aggregator(collection, aggregator, sync=sync)
            mapping.set_status(status=Status.SUCCESS)
            db.session.commit()
        except Exception as exc:
            mapping.set_status(status=Status.FAILED, error=str(exc))
            db.session.commit()
            # Roll back the partial load so no half-mapped entities linger.
            aggregator.delete(origin=origin)
    finally:
        aggregator.close()
def bulk_write(collection, entities, unsafe=False, role_id=None, index=True): """Write a set of entities - given as dicts - to the index.""" # This is called mainly by the /api/2/collections/X/_bulk API. now = datetime.utcnow().isoformat() aggregator = get_aggregator(collection) writer = aggregator.bulk() entity_ids = set() for data in entities: if not is_mapping(data): raise InvalidData("Failed to read input data", errors=data) entity = model.get_proxy(data) if entity.id is None: raise InvalidData("No ID for entity", errors=entity.to_dict()) entity = collection.ns.apply(entity) if not unsafe: entity = remove_checksums(entity) entity.context = { 'role_id': role_id, 'created_at': now, 'updated_at': now, } writer.put(entity, origin='bulk') if index and len(entity_ids) < MAX_PAGE: entity_ids.add(entity.id) writer.flush() if index: if len(entity_ids) >= MAX_PAGE: entity_ids = None index_aggregator(collection, aggregator, entity_ids=entity_ids) refresh_collection(collection.id)
def _query_mentions(collection):
    """Generate xref results for name mentions in the collection, and reify
    mentions with at least one match into proper entities stored in the
    aggregator.

    Yields (score, proxy, collection_id, match) tuples for each match.
    """
    aggregator = get_aggregator(collection, origin=ORIGIN)
    # Clear out entities from a previous mention-reification run.
    aggregator.delete(origin=ORIGIN)
    writer = aggregator.bulk()
    for proxy in _iter_mentions(collection):
        schemata = set()
        countries = set()
        for score, _, collection_id, match in _query_item(proxy):
            schemata.add(match.schema)
            countries.update(match.get_type_values(registry.country))
            yield score, proxy, collection_id, match
        if len(schemata):
            # Assign only those countries that are backed by one of
            # the matches:
            countries = countries.intersection(proxy.get("country"))
            proxy.set("country", countries)
            # Try to be more specific about schema:
            _merge_schemata(proxy, schemata)
            # Pick a principal name:
            proxy = name_entity(proxy)
            proxy.context["mutable"] = True
            log.debug("Reifying [%s]: %s", proxy.schema.name, proxy)
            writer.put(proxy, fragment="mention")
            # pprint(proxy.to_dict())
    writer.flush()
    aggregator.close()
def flush_mapping(collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping.

    Removes the mapping's fragments from the aggregator, deletes the
    indexed entities for that origin, and refreshes collection stats.
    """
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    aggregator = get_aggregator(collection)
    try:
        aggregator.delete(origin=origin)
    finally:
        # BUG FIX: the aggregator was never closed here, unlike the
        # sibling flush_mapping variant, leaking the store handle.
        aggregator.close()
    delete_entities(collection.id, origin=origin, sync=sync)
    update_collection(collection, sync=sync)
def reingest_collection(collection, job_id=None, index=False):
    """Trigger a re-ingest for all documents in the collection.

    :param index: re-index each document after ingest completes.
    """
    job_id = job_id or Job.random_id()
    # Drop previously ingested and analyzed fragments so the pipeline
    # output is rebuilt from scratch.
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=OP_ANALYZE)
    aggregator.delete(origin=OP_INGEST)
    aggregator.close()
    for document in Document.by_collection(collection.id):
        proxy = document.to_proxy(ns=collection.ns)
        ingest_entity(collection, proxy, job_id=job_id, index=index)
def load_mapping(stage, collection, mapping_id):
    """Flush and reload all entities generated by a mapping."""
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        return log.error("Could not find mapping: %s", mapping_id)
    # Remove any entities from a previous run of this mapping first.
    flush_mapping(stage, collection, mapping_id)
    publish(Events.LOAD_MAPPING,
            params={'collection': collection, 'table': mapping.table_id},
            channels=[collection, mapping.role],
            actor_id=mapping.role_id)
    mapper = make_mapper(collection, mapping)
    aggregator = get_aggregator(collection)
    try:
        writer = aggregator.bulk()
        entities_count = 0
        entity_ids = set()
        for idx, record in enumerate(mapper.source.records, 1):
            for entity in mapper.map(record).values():
                if entity.schema.is_a('Thing'):
                    # Link generated things back to their source table.
                    entity.add('proof', mapping.table_id)
                entity = collection.ns.apply(entity)
                entity_ids.add(entity.id)
                entities_count += 1
                fragment = '%s-%s' % (mapping.id, idx)
                writer.put(entity, fragment=fragment)
            if idx > 0 and idx % 500 == 0:
                # Hand off the accumulated ids for indexing in batches of
                # 500 source records.
                payload = {
                    'entity_ids': entity_ids,
                    'mapping_id': mapping.id
                }
                queue_task(collection, OP_INDEX,
                           job_id=stage.job.id,
                           payload=payload)
                entity_ids = set()
                stage.report_finished(500)
                log.info("[%s] Loaded %s records, %s entities...",
                         collection.foreign_id, idx, entities_count)
        writer.flush()
        # Queue indexing for the trailing batch of ids.
        payload = {
            'entity_ids': entity_ids,
            'mapping_id': mapping.id
        }
        queue_task(collection, OP_INDEX,
                   job_id=stage.job.id,
                   payload=payload)
        mapping.set_status(status=Mapping.SUCCESS)
        log.info("[%s] Mapping done (%s entities)",
                 mapping.id, entities_count)
    except Exception as exc:
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
    finally:
        aggregator.close()
def index_aggregate(queue, collection, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        index_entities(collection, aggregator, sync=sync)
        refresh_collection(collection.id, sync=sync)
        index_collection(collection, sync=sync)
        log.info("Aggregate indexed: %r", collection)
    finally:
        aggregator.close()
        # Drop the queue entry whether or not indexing succeeded.
        queue.remove()
def flush_mapping(stage, collection, mapping_id, sync=True):
    """Delete entities loaded by a mapping"""
    log.debug("Flushing entities for mapping: %s", mapping_id)
    origin = mapping_origin(mapping_id)
    # Remove the mapping's fragments from the aggregator...
    aggregator = get_aggregator(collection)
    aggregator.delete(origin=origin)
    aggregator.close()
    # ...and its entities from the search index.
    delete_entities(collection.id, origin=origin, sync=sync)
    collection.touch()
    db.session.commit()
    update_collection(collection, sync=sync)
def _fetch_entities(stage, collection, entity_id=None, batch=100):
    """Yield entities from the collection aggregator.

    When an entity_id is given, additional ids are drained from the task
    queue so a whole batch is indexed at once; when it is None, all
    aggregator entities are yielded.
    """
    aggregator = get_aggregator(collection)
    # BUG FIX: close() previously ran only after the final yield, so the
    # aggregator leaked whenever the consumer stopped iterating early;
    # try/finally runs it on generator close as well.
    try:
        if entity_id is not None:
            entity_id = ensure_list(entity_id)
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            for task in stage.get_tasks(limit=batch):
                entity_id.append(task.payload.get('entity_id'))
            stage.mark_done(len(entity_id) - 1)
        yield from aggregator.iterate(entity_id=entity_id)
    finally:
        aggregator.close()
def index_many(stage, collection, sync=False, entity_ids=None, batch=BATCH_SIZE):
    """Project the contents of the collections aggregator into the index.

    Given explicit entity_ids, further ids are drained from the task
    queue so one worker invocation indexes a whole batch at once.
    """
    if entity_ids is not None:
        entity_ids = ensure_list(entity_ids)
        # WEIRD: Instead of indexing a single entity, this will try
        # pull a whole batch of them off the queue and do it at once.
        pending = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
        entity_ids += [
            eid
            for task in pending
            for eid in ensure_list(task.payload.get("entity_ids"))
        ]
        stage.mark_done(len(pending))
    aggregator = get_aggregator(collection)
    index_aggregator(collection, aggregator, entity_ids=entity_ids, sync=sync)
    refresh_collection(collection.id)
def _fetch_entities(stage, collection, entity_ids=None, batch=100):
    """Yield entities from the collection aggregator.

    Given explicit entity_ids, additional id batches are drained from
    the task queue; when entity_ids is None, everything is yielded.
    """
    aggregator = get_aggregator(collection)
    # BUG FIX: close() previously ran only after the final yield, so the
    # aggregator leaked whenever the consumer stopped iterating early;
    # try/finally runs it on generator close as well.
    try:
        if entity_ids is not None:
            entity_ids = ensure_list(entity_ids)
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            tasks = stage.get_tasks(limit=max(1, batch - len(entity_ids)))
            for task in tasks:
                entity_ids.extend(ensure_list(task.payload.get('entity_ids')))
            # FIXME: this doesn't retain mapping_id properly.
            stage.mark_done(len(tasks))
        yield from aggregator.iterate(entity_id=entity_ids)
    finally:
        aggregator.close()
def process_collection(stage, collection, ingest=True, sync=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator."""
    aggregator = get_aggregator(collection)
    for proxy in _collection_proxies(collection):
        if ingest and proxy.schema.is_a(Document.SCHEMA):
            # Documents go through the ingest pipeline, which indexes them
            # once parsing completes.
            ingest_entity(collection, proxy, job_id=stage.job.id, sync=sync)
        else:
            # Plain entities are written to the aggregator and queued for
            # indexing directly.
            aggregator.put(proxy, fragment='db')
            queue_task(collection, OP_INDEX,
                       job_id=stage.job.id,
                       payload={'entity_id': proxy.id},
                       context={'sync': sync})
    aggregator.close()
def load_fixtures(self):
    """Create the standard test fixtures: an admin user, a private and a
    public collection with one entity each, role permissions, and a fully
    processed search index including sample entities from a fixture file."""
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    self.private_coll = self.create_collection(foreign_id='test_private',
                                               label="Private Collection",
                                               category='grey',
                                               casefile=False,
                                               creator=self.admin)
    self._banana = Entity.create(
        {
            'schema': 'Person',
            'properties': {
                'name': ['Banana'],
            }
        }, self.private_coll)
    # Logged-in users (SYSTEM_USER role) may read the private collection.
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = self.create_collection(foreign_id='test_public',
                                              label="Public Collection",
                                              category='news',
                                              casefile=False,
                                              creator=self.admin)
    self._kwazulu = Entity.create(
        {
            'schema': 'Company',
            'properties': {
                'name': ['KwaZulu'],
                'alias': ['kwazulu']
            }
        }, self.public_coll)
    # Anonymous visitors (SYSTEM_GUEST role) may read the public collection.
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Rebuild the public collection's index from the database model.
    drop_aggregator(self.public_coll)
    stage = get_stage(self.public_coll, OP_PROCESS)
    process_collection(stage, self.public_coll, ingest=False, sync=True)
    # Reset the private aggregator, load and index sample entities from
    # the ijson fixture one by one, then process the collection.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    stage = get_stage(self.private_coll, OP_PROCESS)
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
        index_aggregate(stage, self.private_coll,
                        entity_id=sample.id, sync=True)
    aggregator.close()
    process_collection(stage, self.private_coll, ingest=False, sync=True)
def setUp(self):
    """Set up the mapping API test fixtures: a collection with a clean
    aggregator, an admin and a plain user session, and three indexed
    entities — two Table entities backed by the experts.csv fixture and
    one LegalEntity."""
    super(MappingAPITest, self).setUp()
    self.col = self.create_collection(foreign_id="map1")
    aggregator = get_aggregator(self.col)
    aggregator.delete()
    _, self.headers = self.login(is_admin=True)
    self.rolex = self.create_user(foreign_id="user_3")
    _, self.headers_x = self.login(foreign_id="user_3")
    self.fixture = self.get_fixture_path("experts.csv")
    self.content_hash = archive.archive_file(self.fixture)
    # The two Table entities were previously built from three copy-pasted
    # dict literals; the shared stanza now lives in _index_table().
    self.ent = self._index_table("foo")
    self.ent2 = self._index_table("foo2")
    data = {
        "id": "bar",
        "schema": "LegalEntity",
        "properties": {
            "name": "John Doe"
        },
    }
    ent = EntityProxy.from_dict(model, data, cleaned=False)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)

def _index_table(self, entity_id):
    """Create, sign and index a Table entity backed by the CSV fixture;
    returns the indexed proxy."""
    data = {
        "id": entity_id,
        "schema": "Table",
        "properties": {
            "csvHash": self.content_hash,
            "contentHash": self.content_hash,
            "mimeType": "text/csv",
            "fileName": "experts.csv",
            "name": "experts.csv",
        },
    }
    ent = EntityProxy.from_dict(model, data, cleaned=False)
    ent.id = self.col.ns.sign(ent.id)
    index_proxy(self.col, ent)
    return ent
def reindex_collection(collection, sync=False, flush=False):
    """Re-index all entities from the model, mappings and aggregator cache.

    :param sync: force a synchronous index refresh.
    :param flush: delete all indexed entities for the collection first.
    """
    from aleph.logic.mapping import map_to_aggregator
    if flush:
        log.debug("[%s] Flushing...", collection)
        index.delete_entities(collection.id, sync=True)
    aggregator = get_aggregator(collection)
    for mapping in collection.mappings:
        try:
            map_to_aggregator(collection, mapping, aggregator)
        except Exception as ex:
            # More or less ignore broken models.
            # FIX: log.warn is a deprecated alias of log.warning.
            log.warning("Failed mapping [%s]: %s", mapping.id, ex)
    aggregate_model(collection, aggregator)
    index_aggregator(collection, aggregator, sync=sync)
    # Recompute collection statistics after the re-index.
    compute_collection(collection, sync=True)
def save_entityset_item(entityset, collection, entity_id, **data):
    """Change the association between an entity and an entityset. In the case
    of a profile, this may require re-indexing of the entity to update the
    associated profile_id.
    """
    item = EntitySetItem.save(entityset, entity_id,
                              collection_id=collection.id, **data)
    # Only a profile owned by this same collection affects how the entity
    # itself is indexed (via its profile_id).
    is_local_profile = (entityset.type == EntitySet.PROFILE
                        and entityset.collection_id == collection.id)
    if is_local_profile:
        from aleph.logic.profiles import profile_fragments
        aggregator = get_aggregator(collection)
        profile_fragments(collection, aggregator, entity_id=entity_id)
        index_aggregator(collection, aggregator, entity_ids=[entity_id])
        refresh_entity(collection, entity_id)
    refresh_entityset(entityset.id)
    return item
def update_entity(collection, entity_id=None, job_id=None):
    """Worker post-processing for entity changes. This action collects
    operations that should be done after each change to an entity but are
    too slow to run inside the request cycle. Update xref and aggregator,
    trigger NER and re-index."""
    from aleph.logic.xref import xref_entity
    from aleph.logic.profiles import profile_fragments
    log.info("[%s] Update entity: %s", collection, entity_id)
    entity = index.get_entity(entity_id)
    proxy = model.get_proxy(entity)
    if collection.casefile:
        # Casefile collections get cross-referenced on every edit.
        xref_entity(collection, proxy)
    aggregator = get_aggregator(collection, origin=MODEL_ORIGIN)
    profile_fragments(collection, aggregator, entity_id=entity_id)
    inline_names(aggregator, proxy)
    # Hand the entity to the processing pipeline (NER, indexing).
    pipeline_entity(collection, proxy, job_id=job_id)
def _fetch_entities(stage, collection, entity_id=None, batch=50):
    """Yield entities from the collection aggregator: everything when no
    entity_id is given, otherwise that entity plus further ids drained
    from the task queue."""
    aggregator = get_aggregator(collection)
    try:
        if entity_id is None:
            yield from aggregator
            return
        yield from aggregator.iterate(entity_id=entity_id)
        # WEIRD: Instead of indexing a single entity, this will try
        # pull a whole batch of them off the queue and do it at once.
        done = 0
        for task in stage.get_tasks(limit=batch):
            entity_id = task.payload.get('entity_id')
            for entity in aggregator.iterate(entity_id=entity_id):
                yield entity
            done += 1
        stage.mark_done(done)
    finally:
        aggregator.close()
def process_collection(collection, ingest=True, reset=False):
    """Trigger a full re-parse of all documents and re-build the search
    index from the aggregator.

    :param reset: wipe the collection before processing.
    """
    if reset:
        reset_collection(collection)
    aggregator = get_aggregator(collection)
    try:
        # Load all model entities into the aggregator under the 'db'
        # fragment, re-ingesting each one along the way if requested.
        writer = aggregator.bulk()
        for proxy in _collection_proxies(collection):
            writer.put(proxy, fragment='db')
            if ingest:
                ingest_entity(collection, proxy)
        writer.flush()
        if ingest:
            # Ingest will index entities itself; just wait for completion.
            ingest_wait(collection)
        else:
            index_entities(collection, aggregator)
    finally:
        aggregator.close()
def delete_collection(collection, keep_metadata=False, sync=False):
    """Delete a collection and all of its contents: queued jobs, aggregator
    data, indexed entities, xref results and database records.

    :param keep_metadata: retain the collection row and its permissions.
    :param sync: force synchronous index deletes.
    """
    cancel_queue(collection)
    # NOTE(review): aggregator.drop() is not wrapped in try/finally here,
    # so close() is skipped if drop() raises — confirm intent.
    aggregator = get_aggregator(collection)
    aggregator.drop()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Re-use an existing deletion timestamp so repeated calls are idempotent.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Mapping.delete_by_collection(collection.id)
    EntitySet.delete_by_collection(collection.id, deleted_at)
    Entity.delete_by_collection(collection.id)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id)
def index_aggregate(stage, collection, entity_id=None, sync=False):
    """Project the contents of the collections aggregator into the index."""
    aggregator = get_aggregator(collection)
    try:
        # With no entity_id, index the whole aggregator; otherwise collect
        # the requested entity plus a batch of queued ones.
        entities = aggregator
        if entity_id is not None:
            entities = list(aggregator.iterate(entity_id=entity_id))
            # WEIRD: Instead of indexing a single entity, this will try
            # pull a whole batch of them off the queue and do it at once.
            for task in stage.get_tasks(limit=50):
                entity_id = task.payload.get('entity_id')
                entities.extend(aggregator.iterate(entity_id=entity_id))
            stage.mark_done(len(entities) - 1)
        for entity in entities:
            log.debug("Index: %r", entity)
            # Invalidate cached copies before re-indexing.
            refresh_entity_id(entity.id)
        index_entities(stage, collection, entities, sync=sync)
    finally:
        aggregator.close()