def get_instance_stats(authz):
    # Compute entity stats:
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    {'term': {'schemata': Entity.THING}}
                ]
            }
        },
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}}
        }
    }
    result = es.search(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')

    # Compute collection stats (should we return categories?)
    query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    result = es.search(index=collections_index(), body=query)
    data['collections'] = result.get('hits').get('total')
    return data

def execute_tabular_query(document_id, table_id, args, query):
    """Execute a query against records and return a set of results."""
    result = es.search(index=es_index, doc_type=TYPE_RECORD, body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query['from'],
        'limit': query['size'],
        'total': hits.get('total'),
        'next': None
    }
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        # args is a Werkzeug MultiDict; iterlists() yields (key, values).
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('table.rows', document_id=document_id,
                                 table_id=table_id, **params)
    for rec in hits.get('hits', []):
        record = rec.get('_source').get('raw')
        record['_id'] = rec.get('_source', {}).get('row_id')
        output['results'].append(record)
    return output

def get_sitemap_entities(collection_id):
    filters = [
        {'term': {'collection_id': collection_id}},
        {'term': {'schemata': Entity.THING}},
    ]
    query = {
        'query': {'bool': {'filter': filters}},
        'size': MAX_PAGE,
        'sort': [{'updated_at': 'desc'}],
        '_source': {'includes': ['schema', 'updated_at']}
    }
    index = entities_read_index(Entity.THING)
    res = es.search(index=index, body=query)
    for hit in res.get('hits', {}).get('hits', []):
        source = hit.get('_source')
        source['id'] = hit.get('_id')
        yield source

def _query_item(entity):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(entity)
    if query == none_query():
        return
    query = {
        "query": query,
        "size": 100,
        "_source": {"includes": PROXY_INCLUDES}
    }
    matchable = list(entity.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    for doc in result.get("hits").get("hits"):
        doc = unpack_result(doc)
        if doc is None:
            continue
        match = model.get_proxy(doc)
        score = compare(model, entity, match)
        if score >= SCORE_CUTOFF:
            log.debug("Match: %s <[%.2f]> %s", entity.caption,
                      score, match.caption)
            yield score, entity, doc.get("collection_id"), match

def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            # Cache entities only briefly to avoid filling up the cache:
            if cached:
                key = cache.object_key(Entity, entity.get('id'))
                cache.set_complex(key, entity, expire=60 * 60)
            yield entity

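# A minimal sketch of the _source_spec helper referenced above. Its real
# implementation is not shown in this section, so this is an assumption:
# it builds the Elasticsearch '_source' filter from optional field lists.
def _source_spec(includes, excludes):
    includes = ensure_list(includes)  # reuses ensure_list from above
    excludes = ensure_list(excludes)
    return {'includes': includes, 'excludes': excludes}
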
def get_notifications(role, since=None, parser=None):
    """Fetch a stream of notifications for the given role."""
    channels = get_role_channels(role)
    filters = [{'terms': {'channels': channels}}]
    if since is not None:
        filters.append({'range': {'created_at': {'gt': since}}})
    must_not = [{'term': {'actor_id': role.id}}]
    query = {
        'size': 30,
        'query': {'bool': {'filter': filters, 'must_not': must_not}},
        'sort': [{'created_at': {'order': 'desc'}}]
    }
    if parser is not None:
        query['size'] = parser.limit
        query['from'] = parser.offset
    return es.search(index=notifications_index(), body=query)

def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    try:
        query = alert_query(alert, authz)
        index = entities_read_index(schema=Entity.THING)
        result = es.search(index=index, body=query)
    except RequestError as exc:
        log.error("Invalid query [%s]: %r", alert.query, exc.error)
        alert.delete()
        db.session.commit()
        return
    for doc in result.get("hits").get("hits", []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info("Alert [%s]: %s", alert.query, entity.get("id"))
        params = {
            "alert": alert,
            "role": alert.role,
            "entity": entity.get("id"),
            "collection": entity.get("collection_id"),
        }
        channels = [alert.role]
        # channels.append(channel_tag(collection_id, Collection))
        publish(Events.MATCH_ALERT, params=params, channels=channels)
    alert.update()
    db.session.commit()

def search(self):
    """Execute the query as assembled."""
    # log.info("Search index: %s", self.get_index())
    result = es.search(index=self.get_index(), body=self.get_body())
    log.info("Took: %sms", result.get('took'))
    # log.info("%s", pformat(result.get('profile')))
    return result

def get_collection_facet(collection_id, facet, refresh=False):
    """Compute some statistics on the content of a collection."""
    key = cache.object_key(Collection, collection_id, facet)
    data = cache.get_complex(key)
    if not refresh and data is not None:
        return data
    query = {'term': {'collection_id': collection_id}}
    query = {
        'size': 0,
        'query': {'bool': {'filter': [query]}},
        'aggs': {
            'values': {'terms': {'field': facet, 'size': 300}},
            'total': {'cardinality': {'field': facet}}
        }
    }
    schemata = set()
    facet_type = registry.groups.get(facet)
    if facet_type is not None:
        schemata = model.get_type_schemata(facet_type)
    result = es.search(index=entities_read_index(schema=schemata),
                       body=query,
                       request_timeout=3600,
                       timeout='20m')
    aggregations = result.get('aggregations')
    values = {}
    for bucket in aggregations.get('values').get('buckets', []):
        values[bucket['key']] = bucket['doc_count']
    data = {
        'values': values,
        'total': aggregations.get('total').get('value', 0)
    }
    cache.set_complex(key, data, expires=cache.EXPIRE)
    return data

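# Note: the 'cardinality' aggregation above is approximate (HyperLogLog++).
# Elasticsearch accepts an optional precision_threshold to trade memory for
# accuracy on smaller facets; a hedged variant of the aggs block, not part
# of the original function:
aggs_with_precision = {
    'values': {'terms': {'field': 'countries', 'size': 300}},
    'total': {'cardinality': {'field': 'countries',
                              'precision_threshold': 3000}}
}
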
def suggest_entities(prefix, authz, min_count=0, schemas=None, size=5):
    """Auto-complete API."""
    options = []
    if prefix is not None and len(prefix.strip()):
        q = {'match_phrase_prefix': {'name': prefix.strip()}}
        if min_count > 0:
            q = add_filter(q, {'range': {'doc_count': {'gte': min_count}}})
        if schemas is not None and len(schemas):
            q = add_filter(q, {'terms': {'$schema': schemas}})
        # TODO: is this correct? should we allow filter by dataset entities?
        q = add_filter(q, {'terms': {'collection_id': authz.collections_read}})
        q = {
            'size': size,
            'sort': [{'doc_count': 'desc'}, '_score'],
            'query': q,
            '_source': ['name', 'schema', 'fingerprints', 'doc_count']
        }
        ref = ascii_text(prefix)
        result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
        for res in result.get('hits', {}).get('hits', []):
            ent = res.get('_source')
            terms = [ascii_text(t) for t in ent.pop('fingerprints', [])]
            ent['match'] = ref in terms
            ent['score'] = res.get('_score')
            ent['id'] = res.get('_id')
            options.append(ent)
    return {'prefix': prefix, 'results': options}

def get_instance_stats(authz):
    query = {
        'size': 0,
        'query': {'terms': {'roles': list(authz.roles)}},
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}}
        }
    }
    result = es.search(index=entities_index(),
                       doc_type=entity_type(),
                       body=query)
    aggregations = result.get('aggregations')
    data = {'count': result.get('hits').get('total'), 'schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['schemata'][key] = schema.get('doc_count')
    return data

def search(self):
    """Execute the query as assembled."""
    # pprint(self.get_body())
    return es.search(index=self.get_index(),
                     body=self.get_body(),
                     request_cache=True,
                     request_timeout=REQUEST_TIMEOUT)

def entities_by_ids(ids, schemata=None, cached=False,
                    includes=None, excludes=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    ids = ensure_list(ids)
    if not len(ids):
        return
    index = entities_read_index(schema=schemata)
    query = {'ids': {'values': ids}}
    # query = {'bool': {'filter': query}}
    query = {
        'query': query,
        '_source': _source_spec(includes, excludes),
        'size': MAX_PAGE
    }
    result = es.search(index=index, body=query)
    for doc in result.get('hits', {}).get('hits', []):
        entity = unpack_result(doc)
        if entity is not None:
            if cached:
                _cache_entity(entity)
            yield entity

def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    if not alert.role.is_alertable:
        return
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits').get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {'alert': alert, 'role': alert.role, 'entity': entity}
        publish(Events.MATCH_ALERT,
                actor_id=entity.get('uploader_id'),
                params=params)
    alert.update()
    db.session.commit()
    db.session.close()

def check_alert(alert_id):
    alert = Alert.by_id(alert_id)
    if alert is None or alert.role is None:
        return
    log.info("Check alert [%s]: %s", alert.id, alert.query)
    authz = Authz.from_role(alert.role)
    query = alert_query(alert, authz)
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    for doc in result.get('hits').get('hits', []):
        entity = unpack_result(doc)
        if entity is None:
            continue
        log.info('Alert [%s]: %s', alert.query, entity.get('name'))
        params = {
            'alert': alert,
            'role': alert.role,
            'entity': entity.get('id')
        }
        publish(Events.MATCH_ALERT, params=params, channels=[alert.role])
        db.session.flush()
    alert.update()
    db.session.commit()
    db.session.close()

def get_instance_stats(authz):
    query = {
        'size': 0,
        'query': {'terms': {'roles': list(authz.roles)}},
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
            'types': {'terms': {'field': '_type', 'size': len(TYPES)}}
        }
    }
    result = es.search(index=es_index, doc_type=list(TYPES.keys()), body=query)
    aggregations = result.get('aggregations')
    data = {'$total': result.get('hits').get('total'), '$schemata': {}}
    for schema in aggregations.get('schema').get('buckets'):
        key = schema.get('key')
        data['$schemata'][key] = schema.get('doc_count')
    for doc_type in aggregations.get('types').get('buckets'):
        key = TYPES.get(doc_type.get('key'))
        data[key] = doc_type.get('doc_count')
    return data

def search(self):
    """Execute the query as assembled."""
    # log.info("Search index: %s", self.get_index())
    result = es.search(index=self.get_index(), body=self.get_body())
    log.info("[%s] took: %sms", self.to_text(), result.get("took"))
    # log.info("%s", pformat(self.get_body()))
    # log.info("%s", pformat(self.parser.filters))
    return result

def console():
    query = {
        'query': {
            'term': {
                # 'collection': 'sec-edgar',
                'filed_at': '20160216'
            }
        }
    }
    res = es.search(body=query, index='aleph')
    import ipdb; ipdb.set_trace()

def get_dataset_countries(dataset_name):
    """Create a list of the top 300 countries mentioned in a dataset."""
    q = {'term': {'dataset': dataset_name}}
    aggs = {'countries': {'terms': {'field': 'countries', 'size': 300}}}
    q = {'size': 0, 'query': q, 'aggregations': aggs}
    result = es.search(index=es_index, doc_type=TYPE_ENTITY, body=q)
    result = result.get('aggregations', {}).get('countries', {})
    return [b.get('key') for b in result.get('buckets', [])]

def execute_documents_query(args, q):
    """Execute the query and return a set of results."""
    result = es.search(index=es_index, doc_type=TYPE_DOCUMENT, body=q)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': q['from'],
        'limit': q['size'],
        'total': hits.get('total'),
        'next': None,
        'facets': {},
        'watchlists': {}
    }
    convert_aggregations(result, output, args)
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('search.query', **params)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], args)
        if sq is not None:
            # msearch bodies are NDJSON: a header line, then a query line.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        document['api_url'] = url_for('document.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('document.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    if len(sub_queries):
        res = es.msearch(index=es_index, doc_type=TYPE_RECORD,
                         body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                for hit in sqhits.get('hits', []):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    record['score'] = hit.get('_score')
                    record['text'] = hit.get('highlight', {}).get('text')
                    doc['records']['results'].append(record)
                doc['records']['total'] = sqhits.get('total', 0)
    return output

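# The msearch body assembled above is NDJSON: one header object per
# sub-query (empty here), each followed by the query object. A standalone
# illustration of the same wire format (hypothetical document_id value):
import json

msearch_body = '\n'.join([
    json.dumps({}),
    json.dumps({'query': {'term': {'document_id': 42}}, 'size': 5}),
])
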
def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{"ids": {"values": [xref_id]}}]
    if collection_id is not None:
        filters.append({"term": {"collection_id": collection_id}})
    query = {"query": {"bool": {"filter": filters}}, "size": 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get("hits", {}).get("hits", []):
        return unpack_result(doc)

def search(self):
    """Execute the query as assembled."""
    result = es.search(index=self.get_index(),
                       body=self.get_body(),
                       request_cache=True,
                       request_timeout=REQUEST_TIMEOUT)
    log.info("Took: %sms", result.get('took'))
    # log.info("%s", pformat(result))
    return result

def get_xref(xref_id, collection_id=None):
    """Get an xref match combo by its ID."""
    filters = [{'ids': {'values': [xref_id]}}]
    if collection_id is not None:
        filters.append({'term': {'collection_id': collection_id}})
    query = {'query': {'bool': {'filter': filters}}, 'size': 1}
    result = es.search(index=xref_index(), body=query)
    for doc in result.get('hits', {}).get('hits', []):
        return unpack_result(doc)

def search_safe(*args, **kwargs):
    # This is not supposed to be used in every location where search is
    # run, but only where it's a backend search that we could back off of
    # without hurting UX.
    for attempt in count():
        try:
            return es.search(*args, **kwargs)
        except Exception as exc:
            log.warning("Search error: %s", exc)
            backoff_cluster(failures=attempt)

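# backoff_cluster is referenced above but not defined in this section.
# A minimal sketch under the assumption of capped exponential sleep; the
# real helper may also consult cluster health before retrying:
import time

def backoff_cluster(failures=0):
    seconds = min(2 ** failures, 60)
    time.sleep(seconds)
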
def console():
    query = {
        'query': {
            'term': {
                # 'collection': 'sec-edgar',
                'filed_at': '20160216'
            }
        }
    }
    res = es.search(body=query, index='aleph')
    import ipdb
    ipdb.set_trace()

def result(self):
    if self._result is None:
        q = self.query.copy()
        q['from'] = self._offset
        q['size'] = self._limit
        # HACKY: skip aggregations when paging past the first page.
        if q['from'] > 0:
            del q['aggregations']
        self._result = es.search(index=es_index,
                                 doc_type=self.doc_type,
                                 body=q)
    return self._result

def result(self):
    if self._result is None:
        q = self.query.copy()
        q['from'] = self._offset
        q['size'] = self._limit
        # HACKY: skip aggregations when paging past the first page.
        if q['from'] > 0:
            del q['aggregations']
        # es.search takes the request body via `body=`, not `query=`.
        self._result = es.search(index=es_index,
                                 doc_type=self.doc_type,
                                 body=q)
    return self._result

def search(self):
    """Execute the query as assembled."""
    # log.info("Search index: %s", self.get_index())
    result = es.search(index=self.get_index(), body=self.get_body())
    # log.info(
    #     f"Elasticsearch query [{self.to_text()}] took {result.get('took')}ms",
    #     query=self.to_text(),
    #     took=result.get("took"),
    # )
    # log.info("%s", pformat(self.get_body()))
    # log.info("%s", pformat(self.parser.filters))
    return result

def peek_query(state):
    """Peek into hidden collections.

    This allows users to retrieve an approximate result count of a given
    query against those collections which they are not authorised to view.
    It is a rudimentary collaboration mechanism.
    """
    filters = state.filters
    cq = Collection.all()
    cq = cq.filter(not_(Collection.id.in_(state.authz.collections_read)))
    cq = cq.filter(Collection.creator_id != None)  # noqa
    cq = cq.filter(Collection.private != True)  # noqa
    collections = {c.id: c for c in cq}
    filters['collection_id'] = collections.keys()
    q = text_query(state.text)
    q = {
        'query': filter_query(q, filters),
        'size': 0,
        'aggregations': {
            'collections': {
                'terms': {'field': 'collection_id', 'size': 1000}
            }
        },
        '_source': False
    }
    result = es.search(index=es_index, body=q, doc_type=TYPE_DOCUMENT)
    roles = {}
    total = 0
    aggs = result.get('aggregations', {}).get('collections', {})
    for bucket in aggs.get('buckets', []):
        collection = collections.get(bucket.get('key'))
        if collection is None or collection.creator is None:
            continue
        total += bucket.get('doc_count')
        if collection.creator_id in roles:
            roles[collection.creator_id]['total'] += bucket.get('doc_count')
        else:
            roles[collection.creator_id] = {
                'name': collection.creator.name,
                'email': collection.creator.email,
                'total': bucket.get('doc_count')
            }
    roles = sorted(roles.values(), key=lambda r: r['total'], reverse=True)
    roles = [format_total(r) for r in roles]
    return format_total({
        'roles': roles,
        'active': total > 0,
        'total': total
    })

def execute_basic(doc_type, query):
    """Common part of running a particular query."""
    result = es.search(index=es_index, doc_type=doc_type, body=query)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': query.get('from', 0),
        'limit': query.get('size'),
        'total': hits.get('total'),
        'next': None
    }
    return result, hits, output

def raw_iter(query, total=10000):
    for page in count(0):
        query['from'] = PAGE * page
        if query['from'] >= total:
            return
        query['size'] = PAGE
        result = es.search(index=es_index, doc_type=DOC_TYPE, body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc
        if not hits.get('total') > PAGE * (page + 1):
            return

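# Deep from/size pagination like the above is capped by the index setting
# index.max_result_window (10,000 by default). For full exports, the stock
# scroll helper from elasticsearch-py avoids that limit. A sketch reusing
# the same query dict (drop any 'from'/'size' keys before passing it):
from elasticsearch import helpers

def scan_iter(query):
    for doc in helpers.scan(es, index=es_index, query=query):
        yield doc
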
def get_instance_stats(authz):
    # Compute entity stats:
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    # {'term': {'schemata': Entity.THING}}
                ]
            }
        }
    }
    entities = es.search(index=entities_index(), body=query)

    # Compute collection stats (should we return categories?)
    query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    collections = es.search(index=collections_index(), body=query)
    return {
        'entities': entities.get('hits').get('total'),
        'collections': collections.get('hits').get('total')
    }

def raw_iter(query):
    for page in count(0):
        query['from'] = PAGE * page
        query['size'] = PAGE
        result = es.search(index=es_index, doc_type=TYPE_DOCUMENT, body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc
        if not hits.get('total') > PAGE * (page + 1):
            return

def get_notifications(role, since=None):
    """Fetch a stream of notifications for the given role."""
    channels = get_role_channels(role)
    filters = [{"terms": {"channels": channels}}]
    if since is not None:
        filters.append({"range": {"created_at": {"gt": since}}})
    must_not = [{"term": {"actor_id": role.id}}]
    query = {
        "size": 30,
        "query": {"bool": {"filter": filters, "must_not": must_not}},
        "sort": [{"created_at": {"order": "desc"}}],
    }
    return es.search(index=notifications_index(), body=query)

def search_safe(*args, **kwargs):
    # This is not supposed to be used in every location where search is
    # run, but only where it's a backend search that we could back off of
    # without hurting UX.
    for attempt in range(REQUEST_RETRIES):
        try:
            kwargs['doc_type'] = 'doc'
            return es.search(*args, **kwargs)
        except RequestError:
            raise
        except Exception as exc:
            log.warning("Search error: %s", exc)
            backoff_cluster(failures=attempt)

def random_docs(howmany=100, offset=0):
    index = 'aleph'
    query = {
        "from": offset,
        "size": howmany,
        "query": {
            "function_score": {
                "functions": [{"random_score": {"seed": 42}}]
            }
        }
    }
    results = es.search(index=index, body=query)
    return results['hits']['hits']

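# Example invocation of the sampler above (assumes a configured `es`
# client and an existing 'aleph' index; hit fields depend on the mapping):
docs = random_docs(howmany=10)
for doc in docs:
    print(doc['_id'], doc['_score'])
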
def raw_iter(query, total=10000):
    for page in count(0):
        query['from'] = PAGE * page
        if query['from'] >= total:
            return
        query['size'] = PAGE
        # es.search takes the request body via `body=`, not `query=`.
        result = es.search(index=es_index, doc_type=DOC_TYPE, body=query)
        hits = result.get('hits', {})
        for doc in hits.get('hits', []):
            yield doc
        if not hits.get('total') > PAGE * (page + 1):
            return

def replace_es(query, updatefunc, index='aleph_test', howmany=10):
    perpage = 50
    start = 522050  # resume offset from a previous run
    for offset in range(start, howmany, perpage):
        print('# %s' % offset)
        results = es.search(index=index, body=query,
                            from_=offset, size=min(perpage, howmany))
        for result in results['hits']['hits']:
            newbody = updatefunc(result['_source'])
            if not newbody:
                print('skipping item')
                continue
            updated = es.index(index=result['_index'],
                               doc_type=result['_type'],
                               id=result['_id'],
                               body=newbody)
            # The document must already exist, i.e. this is an overwrite:
            assert updated['created'] == False  # noqa

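# Per-document es.index calls are slow for large rewrites. A hedged
# alternative using the stock scan and bulk helpers from elasticsearch-py
# (same updatefunc contract; replace_es_bulk is a hypothetical name):
from elasticsearch import helpers

def replace_es_bulk(query, updatefunc, index='aleph_test'):
    actions = []
    for result in helpers.scan(es, index=index, query=query):
        newbody = updatefunc(result['_source'])
        if not newbody:
            continue
        actions.append({
            '_op_type': 'index',
            '_index': result['_index'],
            '_id': result['_id'],
            '_source': newbody,
        })
    helpers.bulk(es, actions)
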
def xref_item(proxy, collection_ids=None):
    """Cross-reference an entity or document, given as an indexed document."""
    query = match_query(proxy, collection_ids=collection_ids)
    if query == none_query():
        return
    query = {
        'query': query,
        'size': 100,
        '_source': {'includes': ['schema', 'properties', 'collection_id']}
    }
    matchable = list(proxy.schema.matchable_schemata)
    index = entities_read_index(schema=matchable)
    result = es.search(index=index, body=query)
    results = result.get('hits').get('hits')
    for doc in results:
        doc = unpack_result(doc)
        if doc is not None:
            other = model.get_proxy(doc)
            score = compare(model, proxy, other)
            if score >= SCORE_CUTOFF:
                yield score, doc.get('collection_id'), other

def get_collection_stats(collection_id):
    """Compute some statistics on the content of a collection."""
    log.info("Generating collection stats: %s", collection_id)
    query = {'term': {'collection_id': collection_id}}
    query = {
        'size': 0,
        'query': {'bool': {'filter': [query]}},
        'aggs': {
            'schemata': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 1000}},
            'languages': {'terms': {'field': 'languages', 'size': 1000}},
        }
    }
    index = entities_read_index(schema=Entity.THING)
    result = es.search(index=index, body=query)
    aggregations = result.get('aggregations', {})
    data = {'count': 0}
    for facet in ['schemata', 'countries', 'languages']:
        data[facet] = {}
        for bucket in aggregations.get(facet, {}).get('buckets', []):
            data[facet][bucket['key']] = bucket['doc_count']
    if len(data['schemata']):
        data['count'] = sum(data['schemata'].values())
    return data

def available_attributes(args, sources=None, lists=None):
    q = attributes_query(args, sources=sources, lists=lists)
    # es.search takes the request body via `body=`, not `query=`.
    result = es.search(index=es_index, doc_type=DOC_TYPE, body=q)
    result = result.get("aggregations", {}).get("attributes", {})
    result = {r.get("key"): False for r in result.get("buckets", [])}
    return {"fields": CORE_FIELDS, "attributes": result}

def available_attributes(args, sources=None, lists=None):
    q = attributes_query(args, sources=sources, lists=lists)
    result = es.search(index=es_index, doc_type=DOC_TYPE, body=q)
    result = result.get('aggregations', {}).get('attributes', {})
    result = {r.get('key'): False for r in result.get('buckets', [])}
    return {'fields': CORE_FIELDS, 'attributes': result}