def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection.

    Clears any results from a previous xref run, then indexes fresh
    matches for both entities and mentions, and finally triggers an
    asynchronous re-index of the collection.
    """
    # Consistency fix: log progress like the other version of this task
    # does, so long-running xref jobs are observable in the worker logs.
    log.info("[%s] Clearing previous xref state....", collection)
    delete_xref(collection, sync=True)
    # Entities previously created by xref (ORIGIN) are also stale.
    delete_entities(collection.id, origin=ORIGIN, sync=True)
    index_matches(collection, _query_entities(collection))
    index_matches(collection, _query_mentions(collection))
    # Mentions promoted during xref only become visible after a re-index.
    log.info("[%s] Xref done, re-indexing to reify mentions...", collection)
    reindex_collection(collection, sync=False)
def load_mapping(stage, collection, mapping_id, sync=False):
    """Flush and reload all entities generated by a mapping.

    :param stage: the queue stage this task runs in (unused here).
    :param collection: the collection owning the mapping.
    :param mapping_id: database ID of the mapping to (re-)run.
    :param sync: whether the final re-index should block until visible.
    """
    mapping = Mapping.by_id(mapping_id)
    if mapping is None:
        # Idiom fix: don't `return log.error(...)` — logging calls return
        # None; log and return explicitly instead.
        log.error("Could not find mapping: %s", mapping_id)
        return
    origin = mapping_origin(mapping.id)
    aggregator = get_aggregator(collection)
    # Drop everything previously generated by this mapping before reloading.
    aggregator.delete(origin=origin)
    delete_entities(collection.id, origin=origin, sync=True)
    if mapping.disabled:
        log.info("Mapping is disabled: %s", mapping_id)
        return
    # Notify subscribers that the mapping is being (re-)loaded.
    publish(
        Events.LOAD_MAPPING,
        params={"collection": collection, "table": mapping.table_id},
        channels=[collection, mapping.role],
        actor_id=mapping.role_id,
    )
    try:
        map_to_aggregator(collection, mapping, aggregator)
        mapping.set_status(status=Mapping.SUCCESS)
        db.session.commit()
        reindex_collection(collection, sync=sync)
    except Exception as exc:
        # Record the failure on the mapping and roll back its partial
        # output; the exception is deliberately not re-raised so the
        # worker task completes.
        mapping.set_status(status=Mapping.FAILED, error=str(exc))
        db.session.commit()
        aggregator.delete(origin=origin)
    finally:
        aggregator.close()
def test_directory_with_file(self):
    """Create a folder entity, nest a subdirectory under it, and check
    that the child is retrievable with the expected file name."""
    _, headers = self.login(is_admin=True)
    # Create the parent folder entity.
    folder_meta = {
        'file_name': 'directory',
        'foreign_id': 'directory',
        'schema': 'Folder',
        'collection_id': self.col.id,
    }
    res = self.client.post(self.url,
                           data={'meta': json.dumps(folder_meta)},
                           headers=headers)
    assert res.status_code == 201, res
    assert 'id' in res.json, res.json
    parent_id = res.json['id']
    # Create the child subdirectory, linked via its parent reference.
    child_meta = {
        'file_name': 'subdirectory',
        'foreign_id': 'subdirectory',
        'parent': {'id': parent_id},
        'collection_id': self.col.id,
    }
    res = self.client.post(self.url,
                           data={'meta': json.dumps(child_meta)},
                           headers=headers)
    assert res.status_code == 201, res
    reindex_collection(self.col)
    assert 'id' in res.json, res.json
    # Fetch the child back and confirm its fileName property.
    res = self.client.get('/api/2/entities/%s' % res.json['id'],
                          headers=headers)
    assert res.status_code == 200, res
    props = res.json.get('properties')
    assert 'subdirectory' in props['fileName'], res.json
def test_directory_with_file(self):
    """Create a folder entity and a nested subdirectory, then verify the
    child's fileName via the entities API."""
    _, headers = self.login(is_admin=True)

    def create(meta):
        # POST one entity metadata payload and assert it was created.
        payload = {"meta": json.dumps(meta)}
        response = self.client.post(self.url, data=payload, headers=headers)
        assert response.status_code == 201, response
        return response

    res = create({
        "file_name": "directory",
        "foreign_id": "directory",
        "schema": "Folder",
        "collection_id": self.col.id,
    })
    assert "id" in res.json, res.json
    directory = res.json["id"]
    res = create({
        "file_name": "subdirectory",
        "foreign_id": "subdirectory",
        "parent": {"id": directory},
        "collection_id": self.col.id,
    })
    reindex_collection(self.col)
    assert "id" in res.json, res.json
    # Read the child entity back and verify its file name property.
    url = "/api/2/entities/%s" % res.json["id"]
    res = self.client.get(url, headers=headers)
    assert res.status_code == 200, res
    props = res.json.get("properties")
    assert "subdirectory" in props["fileName"], res.json
def load_fixtures(self):
    """Populate the test database and search index with fixture data:
    an admin user, a private and a public collection, and a handful of
    sample entities."""
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    # Private collection: readable only by logged-in system users.
    self.private_coll = self.create_collection(
        foreign_id='test_private',
        label="Private Collection",
        category='grey',
        creator=self.admin
    )
    # Three near-duplicate people, useful for matching/dedupe tests.
    self._banana = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-08-21'
        }
    }, self.private_coll)
    self._banana2 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-03-21'
        }
    }, self.private_coll)
    self._banana3 = self.create_entity({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
            'birthDate': '1970-05-21'
        }
    }, self.private_coll)
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    # Public collection: readable by anonymous guests as well.
    self.public_coll = self.create_collection(
        foreign_id='test_public',
        label="Public Collection",
        category='news',
        creator=self.admin
    )
    self._kwazulu = self.create_entity({
        'schema': 'Company',
        'properties': {
            'name': ['KwaZulu'],
            'alias': ['kwazulu']
        }
    }, self.public_coll)
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Reset the public collection's aggregator and index its contents.
    aggregator = get_aggregator(self.public_coll)
    aggregator.delete()
    aggregator.close()
    reindex_collection(self.public_coll, sync=True)
    # The private collection additionally gets sample entities loaded
    # from the fixture file before indexing.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
    aggregator.close()
    reindex_collection(self.private_coll, sync=True)
def xref_collection(stage, collection):
    """Cross-reference all the entities and documents in a collection."""
    # Remove results and xref-generated entities from any previous run.
    log.info("[%s] Clearing previous xref state....", collection)
    delete_xref(collection, sync=True)
    delete_entities(collection.id, origin=ORIGIN, sync=True)
    # Index fresh matches for entities first, then for mentions.
    index_matches(collection, _query_entities(collection))
    index_matches(collection, _query_mentions(collection))
    # Mentions promoted during xref only become visible after re-indexing.
    log.info("[%s] Xref done, re-indexing to reify mentions...", collection)
    reindex_collection(collection, sync=False)
def dispatch_task(self, collection, task):
    """Route a queued task to the handler for its pipeline stage.

    :param collection: the collection the task operates on.
    :param task: the queue task, carrying stage, payload and context.
    """
    stage = task.stage
    payload = task.payload
    sync = task.context.get("sync", False)
    # Stage names are mutually exclusive, so use an elif chain rather
    # than independent ifs that keep comparing after a match is found.
    if stage.stage == OP_INDEX:
        index_many(stage, collection, sync=sync, **payload)
    elif stage.stage == OP_LOAD_MAPPING:
        load_mapping(stage, collection, **payload)
    elif stage.stage == OP_FLUSH_MAPPING:
        flush_mapping(stage, collection, sync=sync, **payload)
    elif stage.stage == OP_REINGEST:
        reingest_collection(collection, job_id=stage.job.id, **payload)
    elif stage.stage == OP_REINDEX:
        reindex_collection(collection, sync=sync, **payload)
    elif stage.stage == OP_XREF:
        xref_collection(stage, collection)
    elif stage.stage == OP_XREF_ITEM:
        xref_item(stage, collection, **payload)
    log.info("Task [%s]: %s (done)", task.job.dataset, stage.stage)
def load_entities(foreign_id, infile, unsafe=False):
    """Load FtM entities from the specified iJSON file.

    :param foreign_id: foreign ID of the target collection (created if
        it does not exist).
    :param infile: an open file object with one JSON entity per line.
    :param unsafe: passed through to ``bulk_write``.
    """
    collection = ensure_collection(foreign_id, foreign_id)

    def read_entities():
        # Stream the file line by line so arbitrarily large exports can
        # be loaded without holding them in memory. Idiom fix: iterate
        # the file with enumerate() instead of a count()/readline() loop.
        for idx, line in enumerate(infile, 1):
            if idx % 1000 == 0:
                log.info("[%s] Loaded %s entities from: %s",
                         collection, idx, infile.name)
            yield json.loads(line)

    role = Role.load_cli_user()
    # Write without indexing; one full re-index at the end is cheaper
    # than indexing each fragment as it is written.
    bulk_write(collection, read_entities(), unsafe=unsafe,
               role_id=role.id, index=False)
    reindex_collection(collection)
def handle(self, task):
    """Resolve the task's collection and route it to the handler for
    its pipeline stage.

    :param task: the queue task, carrying stage, payload and context.
    """
    stage = task.stage
    payload = task.payload
    collection = Collection.by_foreign_id(task.job.dataset.name)
    if collection is None:
        log.error("Collection not found: %s", task.job.dataset)
        return
    sync = task.context.get('sync', False)
    # Stage names are mutually exclusive, so use an elif chain rather
    # than independent ifs that keep comparing after a match is found.
    if stage.stage == OP_INDEX:
        index_many(stage, collection, sync=sync, **payload)
    elif stage.stage == OP_LOAD_MAPPING:
        load_mapping(stage, collection, **payload)
    elif stage.stage == OP_FLUSH_MAPPING:
        flush_mapping(stage, collection, sync=sync, **payload)
    elif stage.stage == OP_REINGEST:
        reingest_collection(collection, job_id=stage.job.id, **payload)
    elif stage.stage == OP_REINDEX:
        reindex_collection(collection, sync=sync, **payload)
    elif stage.stage == OP_XREF:
        xref_collection(stage, collection)
    elif stage.stage == OP_XREF_ITEM:
        xref_item(stage, collection, **payload)
    log.info("Task [%s]: %s (done)", task.job.dataset, stage.stage)
def reindex_casefiles(flush=False):
    """Re-index all the casefile collections."""
    # Walk every casefile collection and re-index it in turn.
    for casefile in Collection.all_casefiles():
        log.info("[%s] Starting to re-index", casefile)
        reindex_collection(casefile, flush=flush)
def reindex(foreign_id, flush=False):
    """Index all the aggregator contents for a collection."""
    # Resolve the foreign ID and hand straight off to the indexer.
    reindex_collection(get_collection(foreign_id), flush=flush)
def _reindex_collection(collection, flush=False):
    """Re-index one collection, logging (rather than raising) failures."""
    log.info("[%s] Starting to re-index", collection)
    try:
        reindex_collection(collection, flush=flush)
    except Exception:
        # Best-effort: a single broken collection should not abort a
        # bulk re-index run, so the error is logged and swallowed.
        log.exception("Failed to re-index: %s", collection)
def load_fixtures(self):
    """Populate the test database and search index with fixture data:
    an admin user, a private and a public collection, and a handful of
    sample entities."""
    self.admin = self.create_user(foreign_id="admin", is_admin=True)
    # Private collection: readable only by logged-in system users.
    self.private_coll = self.create_collection(
        foreign_id="test_private",
        label="Private Collection",
        category="grey",
        creator=self.admin,
    )
    # Three near-duplicate people, useful for matching/dedupe tests.
    self._banana = self.create_entity(
        {
            "schema": "Person",
            "properties": {
                "name": ["Banana"],
                "birthDate": "1970-08-21"
            },
        },
        self.private_coll,
    )
    self._banana2 = self.create_entity(
        {
            "schema": "Person",
            "properties": {
                "name": ["Banana"],
                "birthDate": "1970-03-21"
            },
        },
        self.private_coll,
    )
    self._banana3 = self.create_entity(
        {
            "schema": "Person",
            "properties": {
                "name": ["Banana ba Nana"],
                "birthDate": "1969-05-21",
                "deathDate": "1972-04-23",
            },
        },
        self.private_coll,
    )
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    # Public collection: readable by anonymous guests as well.
    self.public_coll = self.create_collection(
        foreign_id="test_public",
        label="Public Collection",
        category="news",
        creator=self.admin,
    )
    self._kwazulu = self.create_entity(
        {
            "schema": "Company",
            "properties": {
                "name": ["KwaZulu"],
                "alias": ["kwazulu"]
            },
        },
        self.public_coll,
    )
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Reset the public collection's aggregator and index its contents.
    aggregator = get_aggregator(self.public_coll)
    aggregator.delete()
    aggregator.close()
    reindex_collection(self.public_coll, sync=True)
    # The private collection additionally gets sample entities loaded
    # from the fixture file before indexing.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    for sample in read_entities(self.get_fixture_path("samples.ijson")):
        aggregator.put(sample, fragment="sample")
    aggregator.close()
    reindex_collection(self.private_coll, sync=True)
def op_reindex_handler(collection, task):
    """Handle a re-index task for the given collection."""
    # The task context may carry a "sync" flag; default to async.
    reindex_collection(
        collection,
        sync=task.context.get("sync", False),
        **task.payload,
    )