def test_update_collections_via_doc_update(self):
    url = '/api/1/documents/1000'
    ores = self.client.get(url)
    user = self.login()
    Permission.grant_collection(1000, user, True, True)
    can_write = Collection.create({'label': "Write"}, user)
    no_write = Collection.create({'label': "No-write"})
    db.session.commit()

    data = ores.json.copy()
    data['collection_id'].append(can_write.id)
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 200, res
    assert can_write.id in res.json['collection_id'], res.json

    data = ores.json.copy()
    data['collection_id'] = [no_write.id]
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 200, res
    assert no_write.id not in res.json['collection_id'], res.json
    assert 1000 in res.json['collection_id'], res.json

    data = ores.json.copy()
    data['collection_id'] = ['foo']
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 400, res
def test_update_collections(self):
    url = '/api/1/documents/1000/collections'
    ores = self.client.get(url)
    user = self.login()
    can_write = Collection.create({'label': "Write"}, user)
    no_write = Collection.create({'label': "No-write"})
    db.session.commit()

    data = list(ores.json)
    data.append(can_write.id)
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 200, res
    assert can_write.id in res.json, res.json

    data = [no_write.id]
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 200, res
    assert no_write.id not in res.json, res.json
    assert 1000 in res.json, res.json

    data = ['foo']
    res = self.client.post(url, data=json.dumps(data),
                           content_type='application/json')
    assert res.status_code == 400, res
def load_collection(self, data):
    foreign_id = data.get('foreign_id')
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        collection = Collection.create(data)
        db.session.commit()
        update_collection(collection)
    return collection
def cleanup_deleted():
    from aleph.model import Alert, Entity, Collection
    from aleph.model import Permission, Role

    Alert.cleanup_deleted()
    Permission.cleanup_deleted()
    Entity.cleanup_deleted()
    Collection.cleanup_deleted()
    Role.cleanup_deleted()
    db.session.commit()
def analyze(foreign_id=None):
    """Re-analyze documents in the given collection (or throughout)."""
    if foreign_id:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        analyze_collection.delay(collection.id)
    else:
        for collection in Collection.all():
            analyze_collection.delay(collection.id)
def crawl(self):
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        for req in data.get('paginator', {}).get('object_list'):
            category = REQUEST_TYPES.get(req.get('ticket_type'))
            if category is None:
                continue
            ent = Entity.by_foreign_id(str(req.get('id')), collection, {
                'name': req.get('name'),
                'category': category,
                'data': req,
                'selectors': [req.get('name')]
            })
            terms.update(ent.terms)
            existing_entities.append(ent.id)
            log.info(" # %s (%s)", ent.name, ent.category)

    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()

    self.emit_collection(collection, terms)
def collections(self, action):
    if action in self._collections:
        return self._collections.get(action)

    prefix_key = cache.key(self.PREFIX)
    key = cache.key(self.PREFIX, action, self.id)
    collections = cache.get_list(key)
    if len(collections):
        collections = [int(c) for c in collections]
        self._collections[action] = collections
        log.debug("[C] Authz: %s (%s): %s", self, action, collections)
        return collections

    if self.is_admin:
        q = Collection.all_ids()
    else:
        q = db.session.query(Permission.collection_id)
        q = q.filter(Permission.deleted_at == None)  # noqa
        q = q.filter(Permission.role_id.in_(self.roles))
        if action == self.READ:
            q = q.filter(Permission.read == True)  # noqa
        if action == self.WRITE:
            q = q.filter(Permission.write == True)  # noqa
        q = q.distinct()
    # log.info("Query: %s", q)
    collections = [c for (c,) in q.all()]
    log.debug("Authz: %s (%s): %s", self, action, collections)
    cache.kv.sadd(prefix_key, key)
    cache.set_list(key, collections)
    self._collections[action] = collections
    return collections
def get_collection(collection_id):
    """Fetch a collection from the index."""
    if collection_id is None:
        return
    key = cache.object_key(Collection, collection_id)
    data = cache.get_complex(key)
    if data is not None:
        return data

    collection = Collection.by_id(collection_id)
    if collection is None:
        return

    data = collection.to_dict()
    stats = get_collection_stats(collection.id)
    data['count'] = stats['count']
    data['schemata'] = stats['schemata']

    # If no countries or languages are given, take the most common
    # ones from the data.
    countries = ensure_list(collection.countries)
    countries = countries or stats['countries'].keys()
    data['countries'] = registry.country.normalize_set(countries)

    languages = ensure_list(collection.languages)
    languages = languages or stats['languages'].keys()
    data['languages'] = registry.language.normalize_set(languages)

    cache.set_complex(key, data, expire=cache.EXPIRE)
    return data
def crawl_collection(self, collection):
    if not len(collection.get('subjects', [])):
        return
    url = urljoin(self.URL, '/api/collections/%s' % collection.get('id'))
    collection = Collection.by_foreign_id(url, {
        'label': collection.get('title')
    })
    res = requests.get('%s/permissions' % url, headers=self.HEADERS)
    for perm in res.json().get('results', []):
        Permission.grant_foreign(collection, perm.get('role'),
                                 perm.get('read'), perm.get('write'))
    log.info(" > Spindle collection: %s", collection.label)

    res = requests.get('%s/entities' % url, headers=self.HEADERS)
    terms = set()
    existing_entities = []
    for entity in res.json().get('results', []):
        if entity.get('name') is None:
            continue
        aliases = [on.get('alias') for on in entity.get('other_names', [])]
        ent = Entity.by_foreign_id(entity.get('id'), collection, {
            'name': entity.get('name'),
            'category': SCHEMATA.get(entity.get('$schema'), OTHER),
            'data': entity,
            'selectors': aliases
        })
        terms.update(ent.terms)
        existing_entities.append(ent.id)
        log.info(" # %s (%s)", ent.name, ent.category)

    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()

    self.emit_collection(collection, terms)
def collections(self, action):
    if self.is_admin:
        return [c for (c,) in Collection.all_ids()]
    if action in self._collections:
        return self._collections.get(action)

    key = self.id or "anonymous"
    collections = cache.kv.hget(self.ACCESS, key)
    if collections:
        self._collections = json.loads(collections)
    else:
        reads = set()
        writes = set()
        q = db.session.query(Permission)
        q = q.filter(Permission.role_id.in_(self.roles))
        for perm in q.all():
            if perm.read:
                reads.add(perm.collection_id)
            if perm.write:
                writes.add(perm.collection_id)
        self._collections = {
            self.READ: list(reads),
            self.WRITE: list(writes)
        }
        log.debug("Authz: %s: %r", self, self._collections)
        cache.kv.hset(self.ACCESS, key, json.dumps(self._collections))
    return self._collections.get(action, [])
def sample_entities(secret, properties, schematas, seed, sample_pct,
                    limit, outfile):
    """Sample random entities."""
    random.seed(seed)
    authz = Authz.from_role(Role.load_cli_user())
    collections = list(Collection.all_by_secret(secret, authz))
    random.shuffle(collections)
    iter_proxies_kwargs = {
        "authz": authz,
        "schemata": schematas or None,
        "randomize": True,
        "random_seed": seed,
    }
    n_entities = 0
    for collection in collections:
        for entity in iter_proxies(collection_id=collection.id,
                                   **iter_proxies_kwargs):
            if properties and not any(
                    entity.properties.get(prop) for prop in properties):
                continue
            if not sample_pct or random.random() < sample_pct:
                write_object(outfile, entity)
                n_entities += 1
                if limit and n_entities >= limit:
                    return
def bulk_load_query(collection_id, query):
    collection = Collection.by_id(collection_id)
    if collection is None:
        log.warning("Collection does not exist: %s", collection_id)
        return

    namespace = Namespace(collection.foreign_id)
    mapping = model.make_mapping(query, key_prefix=collection.foreign_id)
    records_total = len(mapping.source) or 'streaming'
    entities = {}
    entities_count = 0
    for records_index, record in enumerate(mapping.source.records, 1):
        for entity in mapping.map(record).values():
            entity = namespace.apply(entity)
            # When loading from a tabular data source, we will often
            # encounter mappings where the same entity is emitted
            # multiple times in short sequence, e.g. when the data
            # describes all the directors of a single company.
            if entity.id in entities:
                entities[entity.id].merge(entity)
            else:
                entities[entity.id] = entity
                entities_count += 1

        if records_index > 0 and records_index % 1000 == 0:
            log.info("[%s] Loaded %s records (%s), %s entities...",
                     collection.foreign_id, records_index,
                     records_total, entities_count)

        if len(entities) >= BULK_PAGE:
            index.index_bulk(collection.id, entities)
            entities = {}

    index.index_bulk(collection.id, entities)
    refresh_collection(collection)
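# A hedged sketch of the kind of `query` mapping the function above consumes
# (followthemoney mapping syntax; the URL, key and column names below are
# made up for illustration, not taken from the codebase):
example_query = {
    "csv_url": "https://example.org/companies.csv",
    "entities": {
        "company": {
            "schema": "Company",
            "keys": ["regno"],
            "properties": {
                "name": {"column": "company_name"},
                "registrationNumber": {"column": "regno"},
            },
        },
    },
}
# Each source record is mapped to one or more entities, merged by id and
# pushed to the index in pages of BULK_PAGE:
# bulk_load_query(collection_id=42, query=example_query)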
def peek_query(args):
    if not isinstance(args, MultiDict):
        args = MultiDict(args)
    text = args.get('q', '').strip()
    q = text_query(text)

    filters = parse_filters(args)
    for entity in args.getlist('entity'):
        filters.append(('entities.id', entity))
    q = filter_query(q, filters, [])
    q = add_filter(q, {
        'not': {
            'terms': {
                'collection_id': authz.collections(authz.READ)
            }
        }
    })
    q = {
        'query': q,
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 30}
            }
        },
        '_source': False
    }
    result = get_es().search(index=get_es_index(), body=q,
                             doc_type=TYPE_DOCUMENT)
    aggs = result.get('aggregations', {}).get('collections', {})
    buckets = aggs.get('buckets', [])
    q = Collection.all_by_ids([b['key'] for b in buckets])
    q = q.filter(Collection.creator_id != None)  # noqa
    objs = {o.id: o for o in q.all()}
    roles = {}
    for bucket in buckets:
        collection = objs.get(bucket.get('key'))
        if collection is None or collection.private:
            continue
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }

    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    total = result.get('hits', {}).get('total')
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })
def collections(action):
    """Pre-load collection authorisation info and cache the result.

    This is the core authorisation function, and is called at least once
    per request. It will query and cache the IDs of all collections the
    current user is authorised to read or write.
    """
    if not hasattr(request, 'auth_collections'):
        public_roles = get_public_roles()
        request.auth_collections = {READ: set(), WRITE: set(), PUBLIC: set()}
        q = db.session.query(Permission.collection_id,
                             Permission.role_id,
                             Permission.read,
                             Permission.write)
        q = q.filter(Permission.deleted_at == None)  # noqa
        q = q.filter(Permission.role_id.in_(request.auth_roles))
        q = q.filter(Permission.collection_id != None)  # noqa
        for collection_id, role_id, read, write in q:
            if read or write:
                request.auth_collections[READ].add(collection_id)
                if role_id in public_roles:
                    request.auth_collections[PUBLIC].add(collection_id)
            if write and request.logged_in:
                request.auth_collections[WRITE].add(collection_id)
        if is_admin():
            q = Collection.all_ids().filter(Collection.deleted_at == None)  # noqa
            for collection_id, in q:
                request.auth_collections[READ].add(collection_id)
                request.auth_collections[WRITE].add(collection_id)
    return list(request.auth_collections.get(action, []))
def delete(id):
    collection = obj_or_404(Collection.by_id(id))
    authz.require(authz.collection_write(id))
    delete_collection.apply_async([collection.id], queue=USER_QUEUE,
                                  routing_key=USER_ROUTING_KEY)
    log_event(request)
    return jsonify({'status': 'ok'})
def get_results(query, limit):
    collections = {}
    for i, row in enumerate(scan_iter(query)):
        if i >= limit:
            return
        data = {
            'file_url': url_for('documents_api.file',
                                document_id=row.get('_id'))
        }
        for name, value in row.get('_source').items():
            if name == 'collection_id':
                colls = []
                for coll in value:
                    if coll not in collections:
                        source = Collection.by_id(coll)
                        if source is None:
                            collections[coll] = '[Deleted collection %s]' % coll
                        else:
                            collections[coll] = source.label
                    colls.append(collections[coll])
                value = ', '.join(sorted(colls))
                name = 'collections'
            if name not in FIELDS:
                continue
            if isinstance(value, (list, tuple, set)):
                value = ', '.join(value)
            data[name] = value
        yield data
def test_delete_collection(self):
    collection = Collection.by_id(1000)
    res = self.client.get('/api/2/search?q="mention fruit"')
    assert res.json['total'] == 1, res.json
    delete_collection(collection)
    res = self.client.get('/api/2/search?q="mention fruit"')
    assert res.json['total'] == 0, res.json
def crawl(self):
    url = urljoin(self.host, '/ticket/all_closed/?format=json')
    collection = Collection.by_foreign_id(url, {
        'label': 'Investigative Dashboard Requests'
    })
    Permission.grant_foreign(collection, 'idashboard:occrp_staff',
                             True, False)
    existing_entities = []
    terms = set()
    db.session.flush()
    for endpoint in ['all_closed', 'all_open']:
        url = urljoin(self.host, '/ticket/%s/?format=json' % endpoint)
        data = self.session.get(url).json()
        for req in data.get('paginator', {}).get('object_list'):
            ent = self.update_entity(req, collection)
            if ent is not None:
                terms.update(ent.terms)
                existing_entities.append(ent.id)
                log.info(" # %s", ent.name)

    for entity in collection.entities:
        if entity.id not in existing_entities:
            entity.delete()

    self.emit_collection(collection, terms)
def update_collection_access(collection_id):
    """Re-write all entities in this collection to reflect updated roles."""
    collection = Collection.by_id(collection_id, deleted=True)
    if collection is None:
        return
    log.info("Update roles [%s]: %s", collection.foreign_id, collection.label)
    index.update_collection_roles(collection)
def load_collection(self):
    if not hasattr(self, '_collection'):
        self._collection = Collection.by_foreign_id('polyglot:ner', {
            'label': 'Automatically Extracted Persons and Companies',
            'public': True
        })
    return self._collection
def create(collection_id):
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    network = Network.create(request_data(), collection, request.auth_role)
    db.session.commit()
    log_event(request)
    return view(collection_id, network.id)
def delete(id):
    authz.require(authz.collection_write(id))
    collection = obj_or_404(Collection.by_id(id))
    analyze_terms.delay(collection.terms)
    collection.delete()
    db.session.commit()
    return jsonify({'status': 'ok'})
def update(id):
    authz.require(authz.collection_write(id))
    collection = obj_or_404(Collection.by_id(id))
    collection.update(request_data())
    db.session.add(collection)
    db.session.commit()
    return view(id)
def find_collection(self, foreign_id, data):
    collection = Collection.by_foreign_id(foreign_id, data)
    if not hasattr(self, 'entity_cache'):
        self.entity_cache = {}
    self.entity_cache[collection.id] = []
    db.session.flush()
    return collection
def ingest_upload(collection_id):
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    log_event(request)
    try:
        meta = json.loads(request.form.get('meta', '{}'))
    except Exception as ex:
        raise BadRequest(unicode(ex))

    metas = []
    for storage in request.files.values():
        file_meta = meta.copy()
        file_meta['mime_type'] = storage.mimetype
        file_meta['file_name'] = storage.filename
        validate(file_meta, 'metadata.json#')
        file_meta = Metadata.from_data(file_meta)
        file_meta.crawler_id = 'user_upload:%s' % request.auth_role.id
        file_meta.crawler_run = make_textid()
        sec_fn = os.path.join(get_upload_folder(),
                              secure_filename(storage.filename))
        storage.save(sec_fn)
        ingest_file(collection.id, file_meta, sec_fn, move=True,
                    queue=USER_QUEUE, routing_key=USER_ROUTING_KEY)
        metas.append(file_meta)
    return jsonify({'status': 'ok', 'metadata': metas})
def crawl(self, directory=None, collection=None, meta={}):
    collection = collection or directory
    collection = Collection.create({
        'foreign_id': 'directory:%s' % slugify(collection),
        'label': collection
    })
    db.session.commit()
    collection_id = collection.id

    if os.path.isfile(directory):
        self.crawl_file(collection_id, directory, meta)

    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(collection_id, file_path, meta)
def create():
    authz.require(authz.logged_in())
    collection = Collection.create(request_data(), request.auth_role)
    db.session.commit()
    update_collection(collection)
    log_event(request)
    return view(collection.id)
def get_collections(data):
    collections = []
    for coll_id in data.get('collections'):
        if isinstance(coll_id, dict):
            coll_id = coll_id.get('id')
        collections.append(coll_id)
    return Collection.all_by_ids(collections).all()
def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True,
                              sync=True, reset_sync=True)
        else:
            refresh_collection(collection.id, sync=True)
            compute_collection(collection, sync=True)
def process(id):
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_write(collection))
    analyze_collection.apply_async([collection.id], queue=USER_QUEUE,
                                   routing_key=USER_ROUTING_KEY)
    log_event(request)
    return jsonify({'status': 'ok'})
def load_collection(self):
    if not hasattr(self, '_collection'):
        self._collection = Collection.by_foreign_id(
            'polyglot:ner', {
                'label': 'Automatically Extracted Persons and Companies',
                'public': True
            })
    return self._collection
def bulk_load(config):
    """Bulk load entities from a CSV file or SQL database.

    This is done by mapping the rows in the source data to entities and
    links which can be understood by the entity index.
    """
    for foreign_id, data in config.items():
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            data['foreign_id'] = foreign_id
            data['label'] = data.get('label', foreign_id)
            collection = Collection.create(data)
            db.session.commit()
            index_collection(collection)
        for query in dict_list(data, 'queries', 'query'):
            bulk_load_query.apply_async([collection.id, query], priority=6)
def delete(id):
    collection = obj_or_404(Collection.by_id(id))
    authz.require(authz.collection_write(id))
    collection.delete()
    for entity in collection.entities:
        update_entity(entity)
    db.session.commit()
    return jsonify({'status': 'ok'})
def create():
    require(request.authz.logged_in)
    data = parse_request(schema=CollectionSchema)
    data['managed'] = False
    collection = Collection.create(data, request.authz.role)
    db.session.commit()
    update_collection(collection)
    return view(collection.id)
def create():
    require(request.authz.logged_in)
    data = parse_request(CollectionSchema)
    role = Role.by_id(request.authz.id)
    collection = Collection.create(data, role)
    db.session.commit()
    update_collection(collection)
    return view(collection.id)
def get_collections(data):
    collections = []
    collection_id = data.get('collection_id') or []
    for coll_id in collection_id:
        if isinstance(coll_id, dict):
            coll_id = coll_id.get('id')
        collections.append(coll_id)
    return Collection.all_by_ids(collections).all()
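# A minimal usage sketch (illustrative payload, not from the API): the helper
# accepts collection references either as plain ids or as nested objects, so
# both forms below resolve to the same database lookup.
# doc_data = {'collection_id': [1000, {'id': 1001, 'label': 'Leaks'}]}
# matching = get_collections(doc_data)  # -> Collection rows 1000 and 1001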
def test_delete_source(self):
    collection = Collection.by_id(1000)
    res = self.client.get('/api/1/query?q="mention fruit"')
    assert res.json['total'] == 1, res.json
    delete_collection(collection.id)
    optimize_search()
    res = self.client.get('/api/1/query?q="mention fruit"')
    assert res.json['total'] == 0, res.json
def test_delete_collection(self):
    collection = Collection.by_id(1000)
    url = '/api/2/entities?filter:schemata=Thing&q="mention fruit"'
    res = self.client.get(url)
    assert res.json['total'] == 1, res.json
    delete_collection(collection)
    res = self.client.get(url)
    assert res.json['total'] == 0, res.json
def ensure_collection(foreign_id, label):
    authz = Authz.from_role(Role.load_cli_user())
    config = {
        'foreign_id': foreign_id,
        'label': label,
    }
    create_collection(config, authz)
    return Collection.by_foreign_id(foreign_id)
def test_load_sqlite(self):
    count = Collection.all().count()
    assert 0 == count, count

    yml_path = self.get_fixture_path('kek.yml')
    config = load_config_file(yml_path)
    bulk_load(config)
    flush_index()

    count = Collection.all().count()
    assert 1 == count, count

    res = self.client.get('/api/2/entities?q=friede+springer')
    assert res.status_code == 200, res
    assert res.json['total'] == 1, res.json
    res0 = res.json['results'][0]
    assert res0['id'] == '9895ccc1b3d6444ccc6371ae239a7d55c748a714', res0
def upgrade_collections():
    for collection in Collection.all(deleted=True):
        if collection.deleted_at is not None:
            delete_collection(collection, keep_metadata=True, sync=True)
        else:
            compute_collection(collection, force=True)
    # Update the global cache:
    compute_collections()
def matches(id, other_id):
    collection = obj_or_404(Collection.by_id(id))
    require(request.authz.can_read(collection.id))
    require(request.authz.can_read(other_id))
    parser = QueryParser(request.args, request.authz, limit=10)
    q = Match.find_by_collection(collection.id, other_id)
    result = MatchQueryResult(request, q, parser=parser, schema=MatchSchema)
    return jsonify(result)
def delete(collection_id, id):
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection.id))
    network = obj_or_404(Network.by_id_collection(id, collection))
    network.delete()
    db.session.commit()
    log_event(request)
    return jsonify({'status': 'ok'})
def update(collection_id, id):
    collection = obj_or_404(Collection.by_id(collection_id))
    authz.require(authz.collection_write(collection_id))
    network = obj_or_404(Network.by_id_collection(id, collection))
    network.update(request_data())
    log_event(request)
    db.session.commit()
    return view(collection_id, network.id)
def publish(foreign_id):
    """Make a collection visible to all users."""
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        raise ValueError("No such collection: %r" % foreign_id)
    role = Role.by_foreign_id(Role.SYSTEM_GUEST)
    update_permission(role, collection, True, False)
    db.session.commit()
def collection(self):
    if not hasattr(self, '_collection'):
        self._collection = Collection.create({
            'foreign_id': self.COLLECTION_ID,
            'label': self.COLLECTION_LABEL or self.COLLECTION_ID
        })
        db.session.commit()
    db.session.add(self._collection)
    return self._collection
def _resolve_collections(self, cache):
    collections = set()
    for (type_, id_) in cache.keys():
        if type_ == Collection:
            collections.add(id_)
    if not len(collections):
        return
    for coll in Collection.all_by_ids(collections, deleted=True):
        cache[(Collection, str(coll.id))] = coll
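# A hedged sketch of how this resolver is fed; the cache layout is assumed
# from the keys used above: entries are keyed by (model class, stringified
# id) and start out as empty placeholders until a resolver fills them in.
# cache = {
#     (Collection, '5'): None,   # hypothetical pending lookup
#     (Role, '3'): None,         # ignored by this resolver
# }
# serializer._resolve_collections(cache)   # hypothetical serializer instance;
# afterwards cache[(Collection, '5')] holds the loaded Collection row.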
def crawl_collection(self, engine, foreign_id, data):
    collection = Collection.create({
        'foreign_id': foreign_id,
        'label': data.get('label')
    })
    db.session.commit()
    meta_base = data.get('meta', {})
    for name, query in data.get('queries', {}).items():
        self.crawl_query(engine, collection, meta_base, name, query)
def rdf(foreign_id):
    """Generate RDF triples for the given collection."""
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        raise ValueError("No such collection: %r" % foreign_id)
    for line in export_collection(collection):
        line = line.strip().decode('utf-8')
        if len(line):
            print(line)
def publish(foreign_id):
    """Make a collection visible to all users."""
    collection = Collection.by_foreign_id(foreign_id)
    if collection is None:
        raise ValueError("No such collection: %r" % foreign_id)
    role = Role.by_foreign_id(Role.SYSTEM_GUEST)
    editor = Role.load_cli_user()
    update_permission(role, collection, True, False, editor_id=editor.id)
    update_collection(collection)
def create():
    request.authz.require(request.authz.logged_in)
    data = request_data()
    data['managed'] = False
    collection = Collection.create(data, request.authz.role)
    db.session.commit()
    update_collection(collection)
    log_event(request)
    return jsonify(collection)
def update(id):
    collection = obj_or_404(Collection.by_id(id))
    request.authz.require(request.authz.collection_write(collection))
    collection.update(request_data())
    db.session.add(collection)
    db.session.commit()
    update_collection(collection)
    log_event(request)
    return view(id)
def deletepending(foreign_id=None):
    """Delete any pending entities and related items."""
    collection_id = None
    if foreign_id is not None:
        collection = Collection.by_foreign_id(foreign_id)
        if collection is None:
            raise ValueError("No such collection: %r" % foreign_id)
        collection_id = collection.id
    delete_pending(collection_id=collection_id)
def delete(id):
    collection = obj_or_404(Collection.by_id(id))
    authz.require(authz.collection_write(id))
    # TODO: race condition-ish...
    for entity in collection.entities:
        analyze_entity.delay(entity.id)
    collection.delete()
    db.session.commit()
    return jsonify({'status': 'ok'})