def _generate(self):
    """Rebuild the Aho-Corasick automaton from all active entities.

    Skips the rebuild when nothing changed since the last run. Sets
    ``self.automaton`` to ``None`` when there are no terms at all.
    """
    latest = Entity.latest()
    # Guard against an empty entity table; the sibling _generate variant
    # has this check and without it the >= comparison is meaningless.
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest
    matches = {}
    q = Entity.all()
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        for term in entity.regex_terms:
            # setdefault replaces the manual if/else insertion.
            matches.setdefault(term, []).append(entity.id)
    if not len(matches):
        self.automaton = None
        return
    self.automaton = Automaton()
    for term, entities in matches.iteritems():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def _generate(self):
    """Rebuild the entity tagger's term index and batch regexes.

    Collects normalized terms for all active entities, then compiles
    word-boundary regexes over the terms in batches of ``BATCH_SIZE``.
    """
    latest = Entity.latest()
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest
    self.matches = defaultdict(set)
    query = Entity.all()
    query = query.options(joinedload('other_names'))
    query = query.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in query:
        for term in entity.regex_terms:
            self.matches[normalize_strong(term)].add(entity.id)
    # Very short terms produce too many false positives; drop them.
    terms = [t for t in self.matches.keys() if len(t) > 2]
    self.regexes = []
    offset = 0
    while True:
        batch = terms[offset:offset + BATCH_SIZE]
        if not len(batch):
            break
        pattern = '( |^)(%s)( |$)' % '|'.join(batch)
        self.regexes.append(re.compile(pattern))
        offset += BATCH_SIZE
    log.info('Generating entity tagger: %r (%s terms)', latest, len(terms))
def delete_collection(collection, keep_metadata=False, sync=False):
    # Remove a collection and its dependent data: queued jobs, aggregator
    # contents, notifications, index entries, xref results and DB rows.
    cancel_queue(collection)
    aggregator = get_aggregator(collection)
    try:
        aggregator.drop()
    finally:
        # Always release the aggregator handle, even if drop() fails.
        aggregator.close()
    flush_notifications(collection, sync=sync)
    index.delete_entities(collection.id, sync=sync)
    xref_index.delete_xref(collection, sync=sync)
    # Re-use an existing deletion timestamp so repeated runs are idempotent.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        # Considering linkages metadata for now, might be wrong:
        Linkage.delete_by_collection(collection.id)
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=True)
        Authz.flush()
    refresh_collection(collection.id, sync=True)
def delete_collection(collection_id, wait=False):
    # Deleting a collection affects many associated objects and requires
    # checks, so this is done manually and in detail here.
    q = db.session.query(Collection)
    q = q.filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        return
    log.info("Deleting collection [%r]: %r", collection.id, collection.label)
    deleted_at = datetime.utcnow()
    # Drop search-index data first, then the database rows.
    index_delete(collection_id, wait=wait)
    log.info("Delete cross-referencing matches...")
    Match.delete_by_collection(collection_id)
    log.info("Delete permissions...")
    Permission.delete_by_collection(collection_id, deleted_at=deleted_at)
    log.info("Delete documents...")
    Document.delete_by_collection(collection_id, deleted_at=deleted_at)
    log.info("Delete entities...")
    Entity.delete_by_collection(collection_id, deleted_at=deleted_at)
    collection.delete(deleted_at=deleted_at)
    db.session.commit()
def setUp(self):
    # Fixture: two collections and two similar LegalEntity records in the
    # first collection, both pushed to the search index.
    super(EntitiesTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id='user_3')
    self.col = Collection()
    self.col.label = 'Original Collection'
    self.col.foreign_id = 'test_coll_entities'
    db.session.add(self.col)
    self.col_other = Collection()
    self.col_other.label = 'Other Collection'
    self.col_other.foreign_id = 'test_coll_entities_other'
    db.session.add(self.col_other)
    db.session.flush()
    self.ent = Entity.create({
        'schema': 'LegalEntity',
        'properties': {
            'name': 'Winnie the Pooh',
            'country': 'pa',
            'summary': 'a fictional teddy bear created by A. A. Milne',
            'alias': ['Puh der Bär', 'Pooh Bear']
        }
    }, self.col)
    self.other = Entity.create({
        'schema': 'LegalEntity',
        'properties': {
            'name': 'Pu der Bär',
            'country': 'de',
            'description': 'he is a bear',
            'alias': ['Puh der Bär']
        }
    }, self.col)
    db.session.commit()
    index_entity(self.ent)
    index_entity(self.other)
def load_fixtures(self):
    # Fixture: one private and one public collection with a sample entity
    # each; system roles get access, then both collections are processed.
    self.private_coll = Collection.create({
        'foreign_id': 'test_private',
        'label': "Private Collection",
        'category': 'grey'
    })
    self._banana = Entity.create({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
        }
    }, self.private_coll)
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = Collection.create({
        'foreign_id': 'test_public',
        'label': "Public Collection",
        'category': 'news'
    })
    self._kwazulu = Entity.create({
        'schema': 'Company',
        'properties': {
            'name': ['KwaZulu'],
            'alias': ['kwazulu']
        }
    }, self.public_coll)
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    # Load sample entities into the private collection before processing.
    samples = read_entities(self.get_fixture_path('samples.ijson'))
    index_entities(self.private_coll, samples)
    process_collection(self.public_coll, ingest=False, reset=True)
    process_collection(self.private_coll, ingest=False, reset=True)
def upsert_entity(data, collection, authz=None, sync=False, sign=False,
                  job_id=None):
    """Create or update an entity in the database. This has a side hustle
    of migrating entities created via the _bulk API or a mapper to a
    database entity in the event that it gets edited by the user.
    """
    entity = None
    entity_id = collection.ns.sign(data.get("id"))
    if entity_id is not None:
        entity = Entity.by_id(entity_id, collection=collection)
    if entity is None:
        # No DB row yet: create one, attributing it to the acting role.
        role_id = authz.id if authz is not None else None
        entity = Entity.create(data, collection, sign=sign, role_id=role_id)
    else:
        entity.update(data, collection, sign=sign)
    proxy = entity.to_proxy()
    # Replace any previous fragments for this entity in the aggregator.
    aggregator = get_aggregator(collection)
    aggregator.delete(entity_id=entity.id)
    aggregator.put(proxy, origin=MODEL_ORIGIN)
    index.index_proxy(collection, proxy, sync=sync)
    refresh_entity(collection, entity.id)
    queue_task(collection, OP_UPDATE_ENTITY, job_id=job_id,
               entity_id=entity.id)
    return entity.id
def _generate(self):
    """Rebuild the automaton mapping name terms to (name, type) pairs.

    Returns early when there are no entities at all, or when nothing
    changed since the last build.
    """
    latest = Entity.latest()
    if latest is None:
        return
    if self.latest is not None and self.latest >= latest:
        return
    self.latest = latest
    matches = {}
    q = Entity.all()
    for entity in q:
        # The tag type is per-entity, not per-term: hoist the lookup out
        # of the inner loop instead of repeating it for every term.
        type_ = self.TYPES.get(entity.type)
        if type_ is None:
            continue
        for term in entity.regex_terms:
            matches.setdefault(term, []).append((entity.name, type_))
    if not len(matches):
        return
    for term, entities in matches.iteritems():
        self.automaton.add_word(term.encode('utf-8'), entities)
    self.automaton.make_automaton()
    log.info('Generated automaton with %s terms', len(matches))
def setUp(self):
    # Fixture: two collections and two similar LegalEntity records in the
    # first collection (no search-index step in this variant).
    super(EntitiesTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id='user_3')
    self.col = Collection()
    self.col.label = 'Original Collection'
    self.col.foreign_id = 'test_coll_entities'
    db.session.add(self.col)
    self.col_other = Collection()
    self.col_other.label = 'Other Collection'
    self.col_other.foreign_id = 'test_coll_entities_other'
    db.session.add(self.col_other)
    db.session.flush()
    self.ent = Entity.create({
        'schema': 'LegalEntity',
        'properties': {
            'name': 'Winnie the Pooh',
            'country': 'pa',
            'summary': 'a fictional teddy bear created by A. A. Milne',
            'alias': ['Puh der Bär', 'Pooh Bear']
        }
    }, self.col)
    self.other = Entity.create({
        'schema': 'LegalEntity',
        'properties': {
            'name': 'Pu der Bär',
            'country': 'de',
            'description': 'he is a bear',
            'alias': ['Puh der Bär']
        }
    }, self.col)
    db.session.commit()
def delete_entities(collection_id, deleted_at=None):
    # Soft-delete all entities of a collection, purge them from the
    # search index and drop their cross-referencing matches.
    deleted_at = deleted_at or datetime.utcnow()
    log.info("Deleting entities...")
    Entity.delete_by_collection(collection_id, deleted_at=deleted_at)
    index.delete_entities(collection_id)
    log.info("Deleting cross-referencing matches...")
    Match.delete_by_collection(collection_id, deleted_at=deleted_at)
def update(id):
    # Merge request data into the entity, then queue re-analysis.
    entity = obj_or_404(Entity.by_id(id))
    entity = Entity.save(get_data(entity=entity),
                         collection_id=entity.collection_id,
                         merge=arg_bool('merge'))
    db.session.commit()
    analyze_entity.delay(entity.id)
    return view(entity.id)
def cleanup_deleted():
    """Purge soft-deleted rows for each model class, then commit once."""
    from aleph.model import Alert, Entity, Collection
    from aleph.model import Permission, Role
    # Dependent objects are purged before their parents.
    for model in (Alert, Permission, Entity, Collection, Role):
        model.cleanup_deleted()
    db.session.commit()
def setUp(self):
    # Fixture: two collections; one entity in each sharing a wikipedia
    # identifier, plus an alert attached to the second entity.
    super(EntitiesTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id='user_3')
    self.col = Collection()
    self.col.label = 'Original Collection'
    self.col.foreign_id = 'test_coll_entities'
    db.session.add(self.col)
    self.col_other = Collection()
    self.col_other.label = 'Other Collection'
    self.col_other.foreign_id = 'test_coll_entities_other'
    db.session.add(self.col_other)
    db.session.flush()
    self.ent = Entity.save({
        'name': 'Winnie the Pooh',
        'jurisdiction_code': 'pa',
        'summary': 'a fictional teddy bear created by author A. A. Milne',
        'identifiers': [{
            'scheme': 'wikipedia',
            'identifier': 'en:Winnie-the-Pooh'
        }],
        'other_names': [{
            'name': u'Puh der Bär'
        }, {
            'name': 'Pooh Bear'
        }]
    }, [self.col])
    db.session.add(self.ent)
    db.session.flush()
    self.other = Entity.save({
        'name': 'Pu der Bär',
        'jurisdiction_code': 'de',
        'description': 'he is a bear',
        'identifiers': [{
            'scheme': 'wikipedia',
            'identifier': 'en:Winnie-the-Pooh'
        }, {
            'scheme': 'animals',
            'identifier': 'bears.winnie.pooh'
        }],
        'other_names': [{
            'name': u'Puh der Bär'
        }]
    }, [self.col_other])
    db.session.add(self.other)
    self.alert = Alert()
    self.alert.entity = self.other
    db.session.add(self.alert)
    db.session.commit()
def merge(id, other_id):
    # Merge ``other`` into ``entity``; both require write access.
    entity = obj_or_404(Entity.by_id(id))
    check_authz(entity, authz.WRITE)
    other = obj_or_404(Entity.by_id(other_id))
    check_authz(other, authz.WRITE)
    entity.merge(other)
    db.session.commit()
    # Re-index both records so the merge is reflected in search.
    update_entity(entity)
    update_entity(other)
    return view(entity.id)
def delete_collection(collection, sync=False):
    # Reset derived data first, then soft-delete the database rows and
    # finally drop the collection's search index.
    reset_collection(collection, sync=False)
    flush_notifications(collection)
    # Re-use an existing deletion timestamp for idempotent re-runs.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
    collection.delete(deleted_at=deleted_at)
    db.session.commit()
    index.delete_collection(collection.id, sync=sync)
    Authz.flush()
def load_entity(self, fk, name, schema):
    """Fetch an entity by foreign key, creating a pending one if absent."""
    existing = Entity.by_foreign_id(fk, self.collection.id, deleted=True)
    if existing is not None:
        return existing
    # Nothing found (even among deleted rows): create a pending entity.
    data = {
        'name': name,
        'schema': schema,
        'foreign_ids': [fk],
        'state': Entity.STATE_PENDING,
        'data': {}
    }
    return Entity.save(data, self.collection)
def load_fixture(name):
    # Load a fixture directory holding mapping.yaml plus data.csv and
    # create a List with one Entity per (label, category) pair.
    dir_name = os.path.join(fixtures_path, name)
    if not os.path.isdir(dir_name):
        raise ValueError("No such directory: %r" % dir_name)
    with open(os.path.join(dir_name, 'mapping.yaml'), 'rb') as fh:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; fixtures are presumably trusted here — confirm.
        data = yaml.load(fh)
    lst = List.by_label(data.get('list'))
    selectors = set()
    if lst is not None:
        # Re-create the list from scratch, keeping its old selectors.
        selectors = lst.terms
        lst.delete()
        db.session.commit()
    lst = List.create(
        {
            'label': data.get('list'),
            'public': data.get('public'),
            'users': []
        }, None)
    log.info("Loading %r", lst)
    mapping = data.get('mapping')
    default_category = data.get('default_category')
    assert default_category in CATEGORIES, default_category
    entities = defaultdict(set)
    with open(os.path.join(dir_name, 'data.csv'), 'rb') as fh:
        for row in unicodecsv.DictReader(fh):
            label = row.get(mapping.get('label', 'label'))
            if label is None:
                continue
            category = row.get(mapping.get('category', 'category'))
            category = category or default_category
            selectors = [row.get(mapping.get('selector', 'selector'))]
            selectors = [s for s in selectors if s]
            entities[(label, category)].update(selectors)
    for (label, category), selectors in entities.items():
        data = {
            'label': label,
            'category': category,
            'selectors': selectors,
            'list': lst
        }
        try:
            Entity.create(data, None)
        except Invalid, inv:
            log.warn("Failed: %s", inv)
def cleanup_deleted():
    """Purge soft-deleted rows across all models, children first."""
    from aleph.model import Alert, Entity, Collection
    from aleph.model import Permission, Role, Document
    from aleph.model import Diagram, Mapping
    # Keep the purge order: dependent models before their parents.
    for model in (Mapping, Diagram, Document, Alert,
                  Permission, Entity, Collection, Role):
        model.cleanup_deleted()
    db.session.commit()
def update(id):
    # Update an entity; target collections are restricted to those the
    # user can write plus those already holding the entity.
    entity = obj_or_404(Entity.by_id(id))
    check_authz(entity, authz.WRITE)
    data = request_data()
    data['id'] = entity.id
    possible_collections = authz.collections(authz.WRITE)
    possible_collections.extend([c.id for c in entity.collections])
    data['collections'] = [c for c in get_collections(data)
                           if c.id in possible_collections]
    entity = Entity.save(data, merge=arg_bool('merge'))
    db.session.commit()
    update_entity(entity)
    return view(entity.id)
def delete_collection(collection, sync=False):
    # Remove a collection: notifications, aggregator, DB rows, indexes.
    flush_notifications(collection)
    drop_aggregator(collection)
    # Re-use an existing deletion timestamp for idempotent re-runs.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Match.delete_by_collection(collection.id, deleted_at=deleted_at)
    Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
    collection.delete(deleted_at=deleted_at)
    db.session.commit()
    index.delete_collection(collection.id, sync=sync)
    index.delete_entities(collection.id, sync=False)
    refresh_collection(collection.id)
    Authz.flush()
def setUp(self):
    # Fixture: a single collection holding one entity.
    super(CollectionsApiTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id='user_3')
    self.col = Collection()
    self.col.label = 'Test Collection'
    self.col.foreign_id = 'test_coll_entities_api'
    db.session.add(self.col)
    db.session.flush()
    self.ent = Entity()
    self.ent.collection = self.col
    self.ent.update({'name': 'Winnie the Pooh'})
    db.session.add(self.ent)
    db.session.commit()
def update(id):
    # Update an entity; target collections are restricted to those the
    # user can write plus those already holding the entity.
    entity = obj_or_404(Entity.by_id(id))
    check_authz(entity, authz.WRITE)
    data = request_data()
    data['id'] = entity.id
    possible_collections = authz.collections(authz.WRITE)
    possible_collections.extend([c.id for c in entity.collections])
    data['collections'] = [
        c for c in get_collections(data)
        if c.id in possible_collections
    ]
    entity = Entity.save(data, merge=arg_bool('merge'))
    db.session.commit()
    update_entity(entity)
    return view(entity.id)
def setUp(self):
    # Fixture: two collections; one entity in each sharing a wikipedia
    # identifier, plus an alert attached to the second entity.
    super(EntitiesTestCase, self).setUp()
    self.rolex = self.create_user(foreign_id='user_3')
    self.col = Collection()
    self.col.label = 'Original Collection'
    self.col.foreign_id = 'test_coll_entities'
    db.session.add(self.col)
    self.col_other = Collection()
    self.col_other.label = 'Other Collection'
    self.col_other.foreign_id = 'test_coll_entities_other'
    db.session.add(self.col_other)
    db.session.flush()
    self.ent = Entity.save({
        'name': 'Winnie the Pooh',
        'collections': [self.col],
        'jurisdiction_code': 'pa',
        'summary': 'a fictional teddy bear created by author A. A. Milne',
        'identifiers': [{
            'scheme': 'wikipedia',
            'identifier': 'en:Winnie-the-Pooh'
        }],
        'other_names': [{
            'name': u'Puh der Bär'
        }, {
            'name': 'Pooh Bear'
        }]
    })
    db.session.add(self.ent)
    db.session.flush()
    self.other = Entity.save({
        'name': 'Pu der Bär',
        'collections': [self.col_other],
        'jurisdiction_code': 'de',
        'description': 'he is a bear',
        'identifiers': [{
            'scheme': 'wikipedia',
            'identifier': 'en:Winnie-the-Pooh'
        }, {
            'scheme': 'animals',
            'identifier': 'bears.winnie.pooh'
        }],
        'other_names': [{
            'name': u'Puh der Bär'
        }]
    })
    db.session.add(self.other)
    self.alert = Alert()
    self.alert.entity = self.other
    db.session.add(self.alert)
    db.session.commit()
def delete_collection(collection, keep_metadata=False, sync=False):
    # Reset derived data, soft-delete dependent rows, then drop the
    # collection itself unless its metadata is to be kept.
    reset_collection(collection, sync=False)
    # Re-use an existing deletion timestamp for idempotent re-runs.
    deleted_at = collection.deleted_at or datetime.utcnow()
    Entity.delete_by_collection(collection.id, deleted_at=deleted_at)
    Mapping.delete_by_collection(collection.id, deleted_at=deleted_at)
    Diagram.delete_by_collection(collection.id, deleted_at=deleted_at)
    Document.delete_by_collection(collection.id)
    if not keep_metadata:
        Permission.delete_by_collection(collection.id, deleted_at=deleted_at)
        collection.delete(deleted_at=deleted_at)
    db.session.commit()
    if not keep_metadata:
        index.delete_collection(collection.id, sync=sync)
        Authz.flush()
    refresh_collection(collection.id, sync=True)
def load_fixtures(self):
    # Fixture: private and public collections with sample entities;
    # the processing pipeline is run over both.
    self.admin = self.create_user(foreign_id='admin', is_admin=True)
    self.private_coll = self.create_collection(foreign_id='test_private',
                                               label="Private Collection",
                                               category='grey',
                                               casefile=False,
                                               creator=self.admin)
    self._banana = Entity.create({
        'schema': 'Person',
        'properties': {
            'name': ['Banana'],
        }
    }, self.private_coll)
    user = Role.by_foreign_id(Role.SYSTEM_USER)
    Permission.grant(self.private_coll, user, True, False)
    self.public_coll = self.create_collection(foreign_id='test_public',
                                              label="Public Collection",
                                              category='news',
                                              casefile=False,
                                              creator=self.admin)
    self._kwazulu = Entity.create({
        'schema': 'Company',
        'properties': {
            'name': ['KwaZulu'],
            'alias': ['kwazulu']
        }
    }, self.public_coll)
    visitor = Role.by_foreign_id(Role.SYSTEM_GUEST)
    Permission.grant(self.public_coll, visitor, True, False)
    db.session.commit()
    drop_aggregator(self.public_coll)
    stage = get_stage(self.public_coll, OP_PROCESS)
    process_collection(stage, self.public_coll, ingest=False, sync=True)
    # Feed sample entities into the private collection's aggregator and
    # index each one synchronously before processing.
    aggregator = get_aggregator(self.private_coll)
    aggregator.delete()
    stage = get_stage(self.private_coll, OP_PROCESS)
    for sample in read_entities(self.get_fixture_path('samples.ijson')):
        aggregator.put(sample, fragment='sample')
        index_aggregate(stage, self.private_coll, entity_id=sample.id,
                        sync=True)
    aggregator.close()
    process_collection(stage, self.private_coll, ingest=False, sync=True)
def delete(id):
    """Delete the entity identified by ``id`` (requires write access)."""
    target = obj_or_404(Entity.by_id(id))
    check_authz(target, authz.WRITE)
    delete_entity(target)
    db.session.commit()
    # Record the deletion in the audit/event log.
    log_event(request, entity_id=target.id)
    return jsonify({'status': 'ok'})
def crawl_collection(self, collection):
    # Mirror a remote Spindle collection locally: sync its permissions
    # and entities, then emit the collected terms.
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        aliases = [on.get('alias') for on in entity.get('other_names', [])]
        ent = Entity.by_foreign_id(entity.get('id'), collection, {
            'name': entity.get('name'),
            'category': SCHEMATA.get(entity.get('$schema'), OTHER),
            'data': entity,
            'selectors': aliases
        })
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    # Remove local entities that no longer exist upstream.
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
def all():
    """List active entities readable by the current user, paginated."""
    readable = Collection.id.in_(authz.collections(authz.READ))
    q = (Entity.all()
         .filter(Entity.state == Entity.STATE_ACTIVE)
         .filter(Entity.collections.any(readable))
         .order_by(Entity.id.asc()))
    return jsonify(Pager(q, limit=100))
def delete(id):
    """Delete a watchlist entity and queue asynchronous re-analysis."""
    ent = obj_or_404(Entity.by_id(id))
    authz.require(authz.watchlist_write(ent.watchlist_id))
    ent.delete()
    db.session.commit()
    # Kick off background re-analysis for the removed entity.
    analyze_entity.delay(id)
    return jsonify({"status": "ok"})
def crawl_collection(self, collection):
    # Mirror a remote Spindle collection into a watchlist: sync its
    # permissions and entities, then emit only the changed terms.
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    watchlist = Watchlist.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(watchlist, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", watchlist.label)
    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    previous_terms = watchlist.terms
    updated_terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        aliases = [on.get('alias') for on in entity.get('other_names', [])]
        ent = Entity.by_foreign_id(entity.get('id'), watchlist, {
            'name': entity.get('name'),
            'category': SCHEMATA.get(entity.get('$schema'), OTHER),
            'data': entity,
            'selectors': aliases
        })
        updated_terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)
    # Delete entities gone upstream; the symmetric difference yields the
    # terms that were either added or removed by this crawl.
    watchlist.delete_entities(spare=existing_entities)
    terms = previous_terms.symmetric_difference(updated_terms)
    self.emit_watchlist(watchlist, terms)
def build_automaton(self):
    """Build an Aho-Corasick automaton over known entity names.

    Each matched form maps to a list of (name, tag) pairs. Returns
    ``None`` when no usable names exist.
    """
    q = Entity.all()
    q = q.filter(Entity.schema.in_(self.TYPES.keys()))
    matches = {}
    for entity in q:
        tag = self.TYPES.get(entity.schema)
        if tag is None:
            continue
        for name in entity.names:
            # Skip missing and overly long names.
            if name is None or len(name) > 120:
                continue
            match = self.match_form(name)
            if match is None:
                continue
            # setdefault replaces the manual if/else insertion.
            matches.setdefault(match, []).append((name, tag))
    if not len(matches):
        return
    automaton = Automaton()
    for term, entities in matches.iteritems():
        automaton.add_word(term, entities)
    automaton.make_automaton()
    return automaton
def delete(id):
    """Delete an entity after verifying write access, then re-index."""
    ent = obj_or_404(Entity.by_id(id))
    check_authz(ent, authz.WRITE)
    ent.delete()
    db.session.commit()
    # Propagate the deletion to the search index.
    update_entity(ent)
    return jsonify({'status': 'ok'})
def transform_facets(aggregations):
    # Reshape raw ES aggregation output into sources, per-list entity
    # facets and attribute facets.
    coll = aggregations.get('all', {}).get('ftr', {}).get('collections', {})
    coll = coll.get('buckets', [])
    lists = {}
    for list_id in get_list_facets(request.args):
        key = 'list_%s' % list_id
        ents = aggregations.get(key, {}).get('inner', {})
        ents = ents.get('entities', {}).get('buckets', [])
        # Resolve all bucket entity IDs to DB objects in a single query.
        objs = Entity.by_id_set([e.get('key') for e in ents])
        entities = []
        for entity in ents:
            entity['entity'] = objs.get(entity.get('key'))
            # Drop buckets whose entity could not be resolved.
            if entity['entity'] is not None:
                entities.append(entity)
        lists[list_id] = entities
    attributes = {}
    for attr in request.args.getlist('attributefacet'):
        key = 'attr_%s' % attr
        vals = aggregations.get(key, {}).get('inner', {})
        vals = vals.get('values', {}).get('buckets', [])
        attributes[attr] = vals
    return {
        'sources': coll,
        'lists': lists,
        'attributes': attributes
    }
def load_entities():
    # Load all active entities into the graph within one transaction.
    tx = get_graph().begin()
    q = Entity.all()
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for entity in q:
        load_entity(tx, entity)
    tx.commit()
def records_query(document_id, args, size=5):
    """Build an ES records query from free text and requested entities.

    Each term is matched against both the raw text field and its
    latinized form. Returns ``None`` when there is nothing to search.
    """
    terms = []
    text = args.get('q', '').strip()
    if len(text):
        terms.append(text)
    entities = Entity.by_id_set(args.getlist('entity'))
    for entity in entities.values():
        terms.extend(entity.terms)
    if not len(terms):
        return None
    shoulds = []
    for term in terms:
        shoulds.append({
            'match': {
                'text': {
                    'query': term,
                    'boost': 10,
                    'operator': 'and'
                }
            }
        })
        shoulds.append({
            'match': {
                'text_latin': {
                    'query': latinize_text(term),
                    'operator': 'and'
                }
            }
        })
    q = {
        'bool': {
            'minimum_should_match': 1,
            'should': shoulds
        }
    }
    if document_id is not None:
        q['bool']['must'] = {
            'term': {'document_id': document_id}
        }
    # Only bad user input should trigger the fallback — a bare except
    # here would also mask unrelated programming errors.
    try:
        snippet = int(args.get('snippet', 150))
    except (TypeError, ValueError):
        snippet = 150
    return {
        'size': size,
        'query': q,
        'highlight': {
            'fields': {
                'text': {'fragment_size': snippet},
                'text_latin': {'fragment_size': snippet}
            }
        },
        '_source': ['document_id', 'sheet', 'row_id', 'page']
    }
def crawl(self):
    # Import closed and open Investigative Dashboard tickets as watchlist
    # entities, then emit only the terms changed by this crawl.
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    watchlist = Watchlist.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(watchlist, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    previous_terms = watchlist.terms
    updated_terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        for req in data.get('paginator', {}).get('object_list'):
            # Skip ticket types we have no entity category mapping for.
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), watchlist, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            updated_terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)
    # Delete entities gone upstream; the symmetric difference yields the
    # terms that were added or removed.
    watchlist.delete_entities(spare=existing_entities)
    terms = previous_terms.symmetric_difference(updated_terms)
    self.emit_watchlist(watchlist, terms)
def load_entity(self, name, schema):
    # Resolve or create an entity keyed by a normalized identifier under
    # this loader's origin scheme.
    identifier = name.lower().strip()
    q = db.session.query(EntityIdentifier)
    # Prefer live identifiers: deleted ones sort first only to make the
    # nullsfirst ordering deterministic.
    q = q.order_by(EntityIdentifier.deleted_at.desc().nullsfirst())
    q = q.filter(EntityIdentifier.scheme == self.origin)
    q = q.filter(EntityIdentifier.identifier == identifier)
    ident = q.first()
    if ident is not None:
        if ident.deleted_at is None:
            # TODO: add to collections? Security risk here.
            return ident.entity_id
        if ident.entity.deleted_at is None:
            # Identifier was deleted but the entity lives on: do nothing.
            return None
    data = {
        'name': name,
        '$schema': schema,
        'state': Entity.STATE_PENDING,
        'identifiers': [{
            'scheme': self.origin,
            'identifier': identifier
        }]
    }
    entity = Entity.save(data, self.collections)
    return entity.id
def delete(id):
    """Delete an entity from its collection and queue re-analysis."""
    ent = obj_or_404(Entity.by_id(id))
    authz.require(authz.collection_write(ent.collection_id))
    ent.delete()
    db.session.commit()
    # Kick off background re-analysis for the removed entity.
    analyze_entity.delay(id)
    return jsonify({'status': 'ok'})
def prune_entity(collection, entity_id=None, job_id=None): """Prune handles the full deletion of an entity outside of the HTTP request cycle. This involves cleaning up adjacent entities like xref results, notifications and so on.""" # This is recursive and will also delete any entities which # reference the given entity. Usually this is going to be child # documents, or directoships referencing a person. It's a pretty # dangerous operation, though. log.info("[%s] Prune entity: %s", collection, entity_id) for adjacent in index.iter_adjacent(collection.id, entity_id): log.warning("Recursive delete: %s", adjacent.get("id")) delete_entity(collection, adjacent, job_id=job_id) flush_notifications(entity_id, clazz=Entity) obj = Entity.by_id(entity_id, collection=collection) if obj is not None: obj.delete() doc = Document.by_id(entity_id, collection=collection) if doc is not None: doc.delete() EntitySetItem.delete_by_entity(entity_id) Mapping.delete_by_table(entity_id) xref_index.delete_xref(collection, entity_id=entity_id) aggregator = get_aggregator(collection) aggregator.delete(entity_id=entity_id) refresh_entity(collection, entity_id) collection.touch() db.session.commit()
def test():
    """Remove every entity node from the graph in one transaction."""
    from aleph.model import Entity
    graph = get_graph()
    tx = graph.begin()
    for eid in Entity.all_ids():
        remove_entity(tx, eid)
    tx.commit()
def crawl(self):
    # Import closed and open Investigative Dashboard tickets as
    # collection entities, then emit all collected terms.
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        for req in data.get('paginator', {}).get('object_list'):
            # Skip ticket types with no entity category mapping.
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)
    # Remove local entities that no longer exist upstream.
    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()
    self.emit_collection(collection, terms)
def fetch_entity(entity_id):
    """Fetch an entity from the search index, enriched with DB data."""
    result = get_entity(entity_id)
    db_obj = Entity.by_id(entity_id)
    if db_obj is not None:
        # The database row carries the raw source data blob.
        result['data'] = db_obj.data
    return result, db_obj
def analyze_collection(collection_id):
    """Re-analyze the elements of this collection, documents and entities."""
    Entity.delete_dangling(collection_id)
    db.session.commit()
    q = db.session.query(Collection).filter(Collection.id == collection_id)
    collection = q.first()
    if collection is None:
        log.error("No collection with ID: %r", collection_id)
        # Bail out: without the early return, the code below would raise
        # AttributeError on collection.id.
        return
    # re-process the documents
    analyze_documents(collection.id)
    # re-process entities
    for entity in collection.entities:
        update_entity_full(entity.id)
def emit_entity(self, collection, data):
    # Persist the entity, re-index it and count it towards this crawl.
    entity = Entity.save(data, [collection], merge=True)
    db.session.commit()
    log.info("Entity [%s]: %s", entity.id, entity.name)
    update_entity(entity)
    self.increment_count()
    return entity
def load_document(tx, document):
    # Create a graph node for the document and link it to the e-mails,
    # phone numbers and entities it mentions.
    if tx is None:
        return
    log.info("Graph load [%s]: %r", document.id, document.meta)
    meta = document.meta
    # NOTE(review): alephTitle is set from document.type, not a title —
    # looks intentional but worth confirming against the node schema.
    node = DocumentNode.merge(tx, name=meta.title,
                              alephTitle=document.type,
                              fileName=meta.file_name,
                              fingerprint=document.id,
                              alephDocument=document.id)
    add_to_collections(tx, node, document.collections,
                       alephDocument=document.id)
    for email in meta.emails:
        enode = EmailNode.merge(tx, name=email, fingerprint=email)
        MENTIONS.merge(tx, node, enode, alephDocument=document.id)
        add_to_collections(tx, enode, document.collections,
                           alephDocument=document.id)
    for phone in meta.phone_numbers:
        pnode = PhoneNode.merge(tx, name=phone, fingerprint=phone)
        MENTIONS.merge(tx, node, pnode, alephDocument=document.id)
        add_to_collections(tx, pnode, document.collections,
                           alephDocument=document.id)
    for entity in Entity.all_by_document(document.id):
        enode = load_entity(tx, entity)
        MENTIONS.merge(tx, node, enode, alephDocument=document.id,
                       alephEntity=entity.id)
    return node
def format_results(query):
    # Flatten raw ES hits into plain dicts, resolving entity IDs to
    # names and source IDs to labels with per-call caching.
    sources = {}
    entities = {}
    results = []
    for row in raw_iter(query):
        src = row.get('_source')
        data = {}
        for name, value in src.items():
            if isinstance(value, dict) or name in SKIP_FIELDS:
                continue
            if name == 'entities':
                # Resolve any entity IDs not yet cached, in one query.
                load_ids = []
                for entity_id in value:
                    if entity_id not in entities:
                        load_ids.append(entity_id)
                if len(load_ids):
                    for id, ent in Entity.by_id_set(load_ids).items():
                        entities[id] = ent.name
                value = ', '.join([entities.get(e) for e in value
                                   if entities.get(e) is not None])
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            if name == 'source_id':
                # WARNING: don't to one query per row
                if value not in sources:
                    source = Source.by_id(value)
                    if source is None:
                        sources[value] = '[Deleted source %s]' % value
                    else:
                        sources[value] = source.label
                value = sources[value]
            data[name] = value
        results.append(data)
    return results
def load_fixture(name):
    # Load a fixture directory holding mapping.yaml plus data.csv and
    # create a List with one Entity per (label, category) pair.
    dir_name = os.path.join(fixtures_path, name)
    if not os.path.isdir(dir_name):
        raise ValueError("No such directory: %r" % dir_name)
    with open(os.path.join(dir_name, 'mapping.yaml'), 'rb') as fh:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input; fixtures are presumably trusted here — confirm.
        data = yaml.load(fh)
    lst = List.by_label(data.get('list'))
    selectors = set()
    if lst is not None:
        # Re-create the list from scratch, keeping its old selectors.
        selectors = lst.terms
        lst.delete()
        db.session.commit()
    lst = List.create({
        'label': data.get('list'),
        'public': data.get('public'),
        'users': []
    }, None)
    log.info("Loading %r", lst)
    mapping = data.get('mapping')
    default_category = data.get('default_category')
    assert default_category in CATEGORIES, default_category
    entities = defaultdict(set)
    with open(os.path.join(dir_name, 'data.csv'), 'rb') as fh:
        for row in unicodecsv.DictReader(fh):
            label = row.get(mapping.get('label', 'label'))
            if label is None:
                continue
            category = row.get(mapping.get('category', 'category'))
            category = category or default_category
            selectors = [row.get(mapping.get('selector', 'selector'))]
            selectors = [s for s in selectors if s]
            entities[(label, category)].update(selectors)
    for (label, category), selectors in entities.items():
        data = {'label': label,
                'category': category,
                'selectors': selectors,
                'list': lst}
        try:
            Entity.create(data, None)
        except Invalid, inv:
            log.warn("Failed: %s", inv)
def delete(id):
    """Delete a list entity and refresh its selectors asynchronously."""
    ent = obj_or_404(Entity.by_id(id))
    authz.require(authz.list_write(ent.list_id))
    # Capture the selectors before deletion so they can be refreshed.
    selectors = ent.terms
    ent.delete()
    db.session.commit()
    refresh_selectors.delay(list(selectors))
    return jsonify({"status": "ok"})
def create():
    """Create a list entity from the request payload (requires write)."""
    data = EntityForm().deserialize(request_data())
    authz.require(data["list"])
    authz.require(authz.list_write(data["list"].id))
    ent = Entity.create(data, current_user)
    db.session.commit()
    # Refresh the selectors derived from the new entity's terms.
    refresh_selectors.delay(list(ent.terms))
    return view(ent.id)
def emit_entity(self, collection, data):
    # Save the entity into the collection, queue full re-processing and
    # remember it in the per-collection cache.
    data['collections'] = [collection]
    entity = Entity.save(data, merge=True)
    db.session.flush()
    update_entity_full.delay(entity.id)
    log.info("Entity [%s]: %s", entity.id, entity.name)
    self.entity_cache[collection.id].append(entity)
    return entity
def all():
    """Return the IDs of all active entities the user may read."""
    collection_id = request.args.getlist('collection_id')
    collection_id = authz.collections_intersect(authz.READ, collection_id)
    clause = Collection.id.in_(collection_id)
    q = (Entity.all_ids()
         .filter(Entity.state == Entity.STATE_ACTIVE)
         .filter(Entity.deleted_at == None)  # noqa
         .filter(Entity.collections.any(clause)))
    return jsonify({'results': [r[0] for r in q]})
def _generate():
    # Stream bulk-index actions for every entity in the collection, then
    # append the collection-level documents.
    # NOTE(review): ``collection`` comes from the enclosing scope — this
    # is presumably a nested helper; confirm against the outer function.
    for entity in Entity.by_collection(collection.id):
        entity_id, index, body = index_operation(entity.to_dict())
        yield {
            '_id': entity_id,
            '_index': index,
            '_source': body
        }
    yield from generate_collection_docs(collection)