def cleanup_deleted():
    from aleph.model import Collection, Role
    from aleph.model import EntitySet, EntitySetItem

    EntitySetItem.cleanup_deleted()
    EntitySet.cleanup_deleted()
    Collection.cleanup_deleted()
    Role.cleanup_deleted()
    db.session.commit()
def decide_xref(xref, judgement, authz):
    """Store user feedback from an xref result as a profile-type EntitySet.

    The problem here is that we're trying to translate a single pair-wise
    user judgement into a merge or split judgement regarding a cluster of
    entities. This works for most cases, with the exception that a profile,
    once established, cannot be split in a way that preserves which entities
    were linked to which other entities originally."""
    if not isinstance(judgement, Judgement):
        judgement = Judgement(judgement)
    entity_id = xref.get("entity_id")
    collection = Collection.by_id(xref.get("collection_id"))
    entity_profile = EntitySet.by_entity_id(
        entity_id,
        judgements=[Judgement.POSITIVE],
        collection_ids=[collection.id],
        types=[EntitySet.PROFILE],
    ).first()
    match_id = xref.get("match_id")
    match_collection_id = xref.get("match_collection_id")
    match_profile = EntitySet.by_entity_id(
        match_id,
        judgements=[Judgement.POSITIVE],
        collection_ids=[collection.id],
        types=[EntitySet.PROFILE],
    ).first()
    # If we are undecided, and we stay undecided, not much to change.
    if entity_profile is None or match_profile is None:
        if judgement == Judgement.NO_JUDGEMENT:
            return
    if entity_profile is None:
        entity_profile = create_profile(collection, authz)
        profile_add_entities(
            entity_profile, entity_id, collection.id, None, Judgement.POSITIVE, authz
        )
    if judgement is Judgement.POSITIVE and match_profile is not None:
        # Case 1: both entities have profiles and the match is positive
        entity_profile = entity_profile.merge(match_profile, authz.id)
    else:
        # Case 2: any other judgement
        # NOTE: Another case of NEGATIVE judgements triggering a
        # `split_profile` could be useful, however it isn't implemented
        # here so that we don't lose judgements. This should be strongly
        # considered in order to reverse profile mergers. The question is:
        # what to do with old judgements on a pair when we do this?
        profile_add_entities(
            entity_profile, match_id, match_collection_id, entity_id, judgement, authz
        )
    db.session.commit()
    return entity_profile
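# Hypothetical usage sketch, not part of the module above: feeding one xref
# candidate decision into decide_xref. The xref payload mirrors the keys read
# by decide_xref; the concrete ids and the `authz` object are invented for
# illustration and would normally come from the request context.
def example_confirm_xref(authz):
    xref = {
        "entity_id": "entity-a",          # hypothetical entity id
        "collection_id": 1,               # hypothetical collection id
        "match_id": "entity-b",           # hypothetical match id
        "match_collection_id": 2,
    }
    # A positive judgement either merges two existing profiles or adds the
    # match to the entity's profile; plain strings such as "positive" are
    # coerced to Judgement values inside decide_xref.
    return decide_xref(xref, Judgement.POSITIVE, authz)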
def create_entityset(collection, data, authz):
    """Create an entity set. This will create or update any entities
    that already exist in the entityset and sign their IDs into the collection.
    """
    old_to_new_id_map = {}
    entity_ids = []
    for entity in data.pop("entities", []):
        old_id = entity.get("id")
        new_id = upsert_entity(entity, collection, sync=True)
        old_to_new_id_map[old_id] = new_id
        entity_ids.append(new_id)
    layout = data.get("layout", {})
    data["layout"] = replace_layout_ids(layout, old_to_new_id_map)
    entityset = EntitySet.create(data, collection, authz)
    for entity_id in entity_ids:
        save_entityset_item(entityset, collection, entity_id)
    publish(
        Events.CREATE_ENTITYSET,
        params={"collection": collection, "entityset": entityset},
        channels=[collection, authz.role],
        actor_id=authz.id,
    )
    return entityset
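# Hypothetical usage sketch, not part of the module above: the shape of the
# `data` payload accepted by create_entityset. The embedded entity, the
# temporary id and the layout contents are invented for illustration; only the
# keys actually read above ("entities", "layout") plus the EntitySet fields
# passed through to EntitySet.create are used.
def example_create_diagram(collection, authz):
    data = {
        "type": EntitySet.DIAGRAM,
        "label": "Example diagram",
        "entities": [
            {"id": "temp-1", "schema": "Person", "properties": {"name": "Jane Doe"}},
        ],
        # The temporary "temp-1" id is rewritten to the signed id returned by
        # upsert_entity via replace_layout_ids.
        "layout": {"vertices": [], "edges": []},
    }
    return create_entityset(collection, data, authz)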
def test_bulk_entitysets_api(self):
    role, headers = self.login(is_admin=True)
    authz = Authz.from_role(role)
    data = {"type": EntitySet.LIST, "label": "Foo"}
    eset = EntitySet.create(data, self.col, authz)
    db.session.commit()
    eset_id = eset.id
    data = json.dumps(
        [
            {
                "id": "4345800498380953840",
                "schema": "Person",
                "properties": {"name": "Osama bin Laden"},
            },
            {
                "id": "7598743983789743598",
                "schema": "Person",
                "properties": {"name": "Osama bin Laden"},
            },
        ]
    )
    url = "/api/2/collections/%s/_bulk?entityset_id=%s" % (self.col.id, eset_id)
    res = self.client.post(url, headers=headers, data=data)
    assert res.status_code == 204, res
    query = "/api/2/entitysets/%s/entities?filter:schema=Person" % eset_id
    res = self.client.get(query, headers=headers)
    assert res.json["total"] == 2, res.json
def get_deep_collection(collection):
    mappings = Mapping.by_collection(collection.id).count()
    entitysets = EntitySet.type_counts(collection_id=collection.id)
    return {
        "statistics": index.get_collection_stats(collection.id),
        "counts": {"mappings": mappings, "entitysets": entitysets},
        "status": get_status(collection),
        "shallow": False,
    }
def decide_pairwise(collection, entity, match_collection, match, judgement, authz):
    """Store user feedback from a pairwise judgement as a profile-type EntitySet.

    The problem here is that we're trying to translate a single pair-wise
    user judgement into a merge or split judgement regarding a cluster of
    entities. This works for most cases, with the exception that a profile,
    once established, cannot be split in a way that preserves which entities
    were linked to which other entities originally."""
    if not isinstance(judgement, Judgement):
        judgement = Judgement(judgement)
    # This will raise an InvalidData error if the two types are not compatible
    model.common_schema(entity.get("schema"), match.get("schema"))
    profile = EntitySet.by_entity_id(
        entity.get("id"),
        collection_ids=[collection.id],
        types=[EntitySet.PROFILE],
        judgements=[Judgement.POSITIVE],
    ).first()
    if profile is None:
        data = {"type": EntitySet.PROFILE, "label": "profile"}
        profile = EntitySet.create(data, collection, authz)
        item = save_entityset_item(
            profile,
            collection,
            entity.get("id"),
            judgement=Judgement.POSITIVE,
            added_by_id=authz.id,
        )
    item = save_entityset_item(
        profile,
        match_collection,
        match.get("id"),
        judgement=judgement,
        compared_to_entity_id=entity.get("id"),
        added_by_id=authz.id,
    )
    db.session.commit()
    if item is not None:
        return item.entityset
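# Hypothetical usage sketch, not part of the module above: recording that two
# entity payloads refer to different real-world parties. The entity/match
# dicts only need the keys decide_pairwise reads ("id" and "schema"); the ids
# and collections are assumptions for illustration.
def example_reject_pair(collection, match_collection, authz):
    entity = {"id": "entity-a", "schema": "Person"}
    match = {"id": "entity-b", "schema": "LegalEntity"}
    # common_schema() is checked first: Person vs. LegalEntity is compatible,
    # while unrelated schemata would raise InvalidData before anything is saved.
    return decide_pairwise(
        collection, entity, match_collection, match, Judgement.NEGATIVE, authz
    )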
def collection_profiles(collection_id, judgements=None, deleted=False):
    if judgements is not None:
        judgements = list(map(Judgement, judgements))
    entity_sets = EntitySet.by_collection_id(collection_id, types=[EntitySet.PROFILE])
    for entity_set in entity_sets:
        items = entity_set.profile(judgements=judgements, deleted=deleted).all()
        if items:
            yield (entity_set, items)
def delete_collection(collection, keep_metadata=False, sync=False):
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    aggregator.drop()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    deleted_at = collection.deleted_at or datetime.utcnow()
    Mapping.delete_by_collection(collection.id)
    EntitySet.delete_by_collection(collection.id, deleted_at)
    Entity.delete_by_collection(collection.id)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id)
def index():
    """Returns a list of entitysets for the role
    ---
    get:
      summary: List entitysets
      parameters:
      - description: The collection id.
        in: query
        name: 'filter:collection_id'
        required: true
        schema:
          minimum: 1
          type: integer
      - description: The type of the entity set
        in: query
        name: 'filter:type'
        required: false
        schema:
          type: string
      - description: Query string for searches
        in: query
        name: 'prefix'
        required: false
        schema:
          type: string
      responses:
        '200':
          content:
            application/json:
              schema:
                type: object
                allOf:
                - $ref: '#/components/schemas/QueryResponse'
                properties:
                  results:
                    type: array
                    items:
                      $ref: '#/components/schemas/EntitySet'
          description: OK
      tags:
      - EntitySet
    """
    parser = QueryParser(request.args, request.authz)
    types = parser.filters.get("type")
    q = EntitySet.by_authz(request.authz, types=types, prefix=parser.prefix)
    q = q.order_by(EntitySet.updated_at.desc())
    collection_ids = ensure_list(parser.filters.get("collection_id"))
    if len(collection_ids):
        q = q.filter(EntitySet.collection_id.in_(collection_ids))
    result = DatabaseQueryResult(request, q, parser=parser)
    return EntitySetSerializer.jsonify_result(result)
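# Hypothetical client-side sketch, not part of the module above: calling the
# listing endpoint documented in the docstring. The host, the /api/2/entitysets
# path, the ApiKey authorization header and the use of `requests` are
# assumptions for illustration; the query parameters match the documented
# filters.
import requests

def example_list_entitysets(host, api_key, collection_id):
    resp = requests.get(
        f"{host}/api/2/entitysets",
        params={"filter:collection_id": collection_id, "filter:type": "diagram"},
        headers={"Authorization": f"ApiKey {api_key}"},
    )
    resp.raise_for_status()
    # The response follows the QueryResponse shape, with EntitySet objects
    # under "results".
    return resp.json().get("results", [])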
def dump_profiles(outfile, foreign_id=None):
    """Export profile entityset items for the given collection."""
    entitysets = EntitySet.by_type(EntitySet.PROFILE)
    if foreign_id is not None:
        collection = get_collection(foreign_id)
        entitysets = entitysets.filter(EntitySet.collection_id == collection.id)
    encoder = JSONEncoder(sort_keys=True)
    for entityset in entitysets:
        for item in entityset.items():
            data = item.to_dict(entityset=entityset)
            data["entity"] = get_expanded_entity(data.get("entity_id"))
            data["compared_to_entity"] = get_expanded_entity(
                data.get("compared_to_entity_id")
            )
            outfile.write(encoder.encode(data) + "\n")
def get_deep_role(role):
    authz = Authz.from_role(role)
    alerts = Alert.by_role_id(role.id).count()
    exports = Export.by_role_id(role.id).count()
    casefiles = Collection.all_casefiles(authz=authz).count()
    entitysets = EntitySet.type_counts(authz=authz)
    return {
        "counts": {
            "alerts": alerts,
            "entitysets": entitysets,
            "casefiles": casefiles,
            "exports": exports,
        },
        "shallow": False,
    }
def get_entitysets_by_entity(
    entity_id, collection_ids=None, judgements=None, types=None, labels=None
):
    if judgements is not None:
        judgements = list(map(Judgement, judgements))
    entitysets = EntitySet.by_entity_id(
        entity_id,
        collection_ids=collection_ids,
        judgements=judgements,
        types=types,
        labels=labels,
    )
    return entitysets
def profile_fragments(collection, aggregator, entity_id=None):
    """In order to make the profile_id visible on entities in a collection,
    we generate stub entities in the FtM store that contain only a context.
    """
    aggregator.delete(origin=ORIGIN)
    writer = aggregator.bulk()
    profile_id = None
    for (profile_id, entity_id) in EntitySet.all_profiles(
        collection.id, entity_id=entity_id
    ):
        data = {"id": entity_id, "schema": Entity.THING, "profile_id": profile_id}
        writer.put(model.get_proxy(data), origin=ORIGIN)
    writer.flush()
    return profile_id
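# Hypothetical usage sketch, not part of the module above: regenerating the
# profile stub fragments for a whole collection after profile membership
# changes. get_aggregator is assumed to be the same helper used elsewhere in
# this listing (see delete_collection).
def example_refresh_profiles(collection):
    aggregator = get_aggregator(collection)
    # Rewrites the ORIGIN-scoped stub fragments so each entity in the FtM
    # store carries its current profile_id context.
    return profile_fragments(collection, aggregator)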
def _query_item(entity, entitysets=True):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(entity)
    if query == none_query():
        return
    log.debug("Candidate [%s]: %s", entity.schema.name, entity.caption)
    entityset_ids = EntitySet.entity_entitysets(entity.id) if entitysets else []
    query = {"query": query, "size": 50, "_source": ENTITY_SOURCE}
    index = entities_read_index(schema=list(entity.schema.matchable_schemata))
    result = es.search(index=index, body=query)
    for result in result.get("hits").get("hits"):
        result = unpack_result(result)
        if result is None:
            continue
        match = model.get_proxy(result)
        score = compare(model, entity, match)
        log.debug("Match: %s <[%.2f]> %s", entity.caption, score, match.caption)
        yield score, entity, result.get("collection_id"), match, entityset_ids
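# Hypothetical usage sketch, not part of the module above: consuming the
# generator to pick the single best-scoring candidate for an entity proxy.
# The 0.5 threshold is an invented cut-off for illustration.
def example_best_match(entity_proxy):
    best = None
    for score, _entity, collection_id, match, entityset_ids in _query_item(entity_proxy):
        if score >= 0.5 and (best is None or score > best[0]):
            best = (score, collection_id, match, entityset_ids)
    return best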
def setUp(self):
    super(ProfilesApiTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id="rolex")
    self.col1 = self.create_collection()
    self.grant(self.col1, self.rolex, True, True)
    authz = Authz.from_role(self.rolex)
    self.profile = EntitySet.create(
        {"label": "x", "type": EntitySet.PROFILE}, self.col1, authz
    )
    ent1 = {
        "schema": "LegalEntity",
        "properties": {
            "name": "Donald Trump",
            "address": "721 Fifth Avenue, New York, NY",
            "phone": "+12024561414",
        },
    }
    self.ent1 = self.create_entity(ent1, self.col1)
    index_entity(self.ent1)
    EntitySetItem.save(self.profile, self.ent1.id, collection_id=self.col1.id)
    self.col2 = self.create_collection()
    self.grant_publish(self.col2)
    ent2 = {
        "schema": "Person",
        "properties": {
            "name": "Donald J. Trump",
            "position": "45th President of the US",
            "phone": "+12024561414",
        },
    }
    self.ent2 = self.create_entity(ent2, self.col2)
    index_entity(self.ent2)
    EntitySetItem.save(self.profile, self.ent2.id, collection_id=self.col2.id)
    ent_false = {
        "schema": "LegalEntity",
        "properties": {"name": "Donald Trump, Jr", "email": "*****@*****.**"},
    }
    self.ent_false = self.create_entity(ent_false, self.col2)
    index_entity(self.ent_false)
    EntitySetItem.save(
        self.profile,
        self.ent_false.id,
        collection_id=self.col2.id,
        judgement=Judgement.NEGATIVE,
    )
    self.col3 = self.create_collection()
    ent3 = {
        "schema": "LegalEntity",
        "properties": {"name": "Donald John Trump", "birthDate": "1964"},
    }
    self.ent3 = self.create_entity(ent3, self.col3)
    index_entity(self.ent3)
    EntitySetItem.save(self.profile, self.ent3.id, collection_id=self.col3.id)
    db.session.commit()
def create_profile(collection, authz):
    data = {"type": EntitySet.PROFILE, "label": "profile"}
    return EntitySet.create(data, collection, authz)
def get_entityset(entityset_id):
    return EntitySet.by_id(entityset_id)
def get_entityset(entityset_id, action=Authz.READ):
    eset = obj_or_404(EntitySet.by_id(entityset_id))
    require(request.authz.can(eset.collection_id, action))
    return eset