def _filters_faceted_query(authz, queries):
    """Run a batch of filter aggregations and return doc counts by alias.

    ``queries`` is an iterable of ``(index, alias, filter)`` triples; the
    filters are grouped per index so a single msearch covers all of them.
    """
    by_index = {}
    for index, alias, filter_ in queries:
        by_index.setdefault(index, {})[alias] = filter_

    body = []
    for index, filters in by_index.items():
        body.append({'index': index})
        body.append({
            'size': 0,
            'query': {'bool': {'filter': [authz_query(authz)]}},
            'aggs': {'counters': {'filters': {'filters': filters}}}
        })

    counts = {}
    if not body:
        return counts
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        buckets = resp.get('aggregations', {}).get('counters', {})
        for alias, bucket in buckets.get('buckets', {}).items():
            # Keep a previously-seen count if this bucket lacks a doc_count.
            counts[alias] = bucket.get('doc_count', counts.get(alias, 0))
    return counts
def execute_documents_query(args, q):
    """Execute the query and return a set of results.

    Runs the main document search, builds pagination info, then issues a
    batched sub-query per document to attach matching records.
    """
    result = es.search(index=es_index, doc_type=TYPE_DOCUMENT, body=q)
    hits = result.get('hits', {})
    output = {
        'status': 'ok',
        'results': [],
        'offset': q['from'],
        'limit': q['size'],
        'total': hits.get('total'),
        'next': None,
        'facets': {},
        'watchlists': {}
    }
    convert_aggregations(result, output, args)

    # Build the "next page" URL, preserving all query args except offset.
    next_offset = output['offset'] + output['limit']
    if output['total'] > next_offset:
        params = {'offset': next_offset}
        for k, v in args.iterlists():
            if k in ['offset']:
                continue
            params[k] = v
        output['next'] = url_for('search.query', **params)

    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        sq = records_query(document['id'], args)
        if sq is not None:
            # msearch body alternates a (here empty) header and the query.
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        document['api_url'] = url_for('document.view',
                                      document_id=doc.get('_id'))
        document['data_url'] = url_for('document.file',
                                       document_id=doc.get('_id'))
        output['results'].append(document)

    if len(sub_queries):
        res = es.msearch(index=es_index, doc_type=TYPE_RECORD,
                         body='\n'.join(sub_queries))
        for doc in output['results']:
            for sq in res.get('responses', []):
                sqhits = sq.get('hits', {})
                matched = False
                # 'hits' is a list of results; default must be [] not {}.
                for hit in sqhits.get('hits', []):
                    record = hit.get('_source')
                    if doc['id'] != record.get('document_id'):
                        continue
                    matched = True
                    record['score'] = hit.get('_score')
                    record['text'] = hit.get('highlight', {}).get('text')
                    doc['records']['results'].append(record)
                # BUGFIX: only take the total from the response belonging to
                # this document; previously the last response always won.
                if matched:
                    doc['records']['total'] = sqhits.get('total', 0)
    return output
def _counted_msearch(queries, authz, limit=0):
    """Run batched queries to count or retrieve entities with certain property values."""
    # With limit == 0 we only need aggregation counts, so sub-queries that
    # target the same index can share a single msearch entry. When entities
    # must actually be retrieved (limit > 0), each filter gets its own entry.
    grouped = {}
    for (index, key), query in sorted(queries.items()):
        group_key = index if limit == 0 else (index, key)
        group = grouped.get(group_key)
        if group is None:
            grouped[group_key] = {
                "index": index,
                "filters": [query],
                "counts": {key: query},
            }
        else:
            group["filters"].append(query)
            group["counts"][key] = query
    log.debug("Counts: %s queries, %s groups", len(queries), len(grouped))

    body = []
    for group in grouped.values():
        body.append({"index": group.get("index")})
        filters = group.get("filters")
        if limit == 0 and len(filters) > 1:
            # Collapse into one disjunction; per-filter counts still come
            # out of the aggregation below.
            filters = [{"bool": {"should": filters, "minimum_should_match": 1}}]
        filters.append(authz_query(authz))
        body.append({
            "size": limit,
            "query": {"bool": {"filter": filters}},
            "aggs": {"counts": {"filters": {"filters": group.get("counts")}}},
            "_source": ENTITY_SOURCE,
        })

    counts = {}
    # FIXME: This doesn't actually retain context on which query a particular
    # entity is a result from. Doesn't matter if all we do in the end is stuff
    # everything into an FtMGraph and then traverse for adjacency.
    entities = []
    if not body:
        return entities, counts
    for resp in es.msearch(body=body).get("responses", []):
        for hit in resp.get("hits", {}).get("hits", []):
            entities.append(unpack_result(hit))
        buckets = resp.get("aggregations", {}).get("counts", {}).get("buckets", {})
        for key, count in buckets.items():
            counts[key] = count.get("doc_count", 0)
    return entities, counts
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = ['names', 'emails', 'phones', 'addresses', 'identifiers']
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None:
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not len(queries):
        return []

    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        # BUGFIX: guard against a missing total (None) before comparing —
        # `None > 0` raises TypeError on Python 3.
        if total is not None and total > 0:
            qvalue = quote(value.encode('utf-8'))
            key = ('filter:%s' % field, qvalue)
            results.append({
                'id': query_string([key]),
                'value': value,
                'field': field,
                'count': total
            })
    results.sort(key=lambda p: p['count'], reverse=True)
    return results
def checksums_count(checksums):
    """Query how many documents mention a checksum."""
    schemata = model.get_type_schemata(registry.checksum)
    index = entities_read_index(schemata)
    body = []
    for checksum in checksums:
        body.append({"index": index})
        body.append({"size": 0, "query": {"term": {"checksums": checksum}}})
    responses = es.msearch(body=body).get("responses", [])
    # Responses come back in request order, so they zip against checksums.
    for checksum, result in zip(checksums, responses):
        total = result.get("hits", {}).get("total", {}).get("value", 0)
        yield checksum, total
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model[entity.get('schema')]

    # Generate all the possible mention locations.
    properties = []
    queries = []
    for prop in model.properties:
        if not prop.is_entity:
            continue
        if not schema.is_a(prop.range):
            continue

        field = 'properties.%s' % prop.name
        queries.append({})
        queries.append({
            'size': 0,
            'query': {
                'bool': {
                    'filter': [
                        authz_query(authz),
                        {'term': {'schemata': prop.schema.name}},
                        {'term': {field: entity.get('id')}},
                    ]
                }
            }
        })
        properties.append(prop)

    # Avoid issuing an empty msearch when no property can reference us.
    if not len(queries):
        return []

    # Run a count search (with schema facet?)
    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for prop, resp in zip(properties, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        # BUGFIX: guard against a missing total (None) before comparing —
        # `None > 0` raises TypeError on Python 3.
        if total is not None and total > 0:
            results.append({
                'count': total,
                'property': prop,
                'schema': prop.schema.name
            })
    return results
def _filters_faceted_query(facets, authz=None):
    """Count facet values via a batch of filter aggregations.

    ``facets`` is an iterable of ``(index, alias, group, field, value)``
    tuples. Returns a mapping of alias -> doc count.
    """
    grouped_values = {}
    aliased_filters = {}
    for index, alias, group, field, value in facets:
        aliased_filters.setdefault(index, {})[alias] = \
            field_filter_query(field, value)
        index_groups = grouped_values.setdefault(index, {})
        index_groups.setdefault(group, []).append(value)

    body = []
    for index, aliased in aliased_filters.items():
        shoulds = [
            field_filter_query(field, values)
            for field, values in grouped_values[index].items()
        ]
        filter_ = [] if authz is None else [authz_query(authz)]
        body.append({'index': index})
        body.append({
            'size': 0,
            'query': {
                'bool': {
                    'should': shoulds,
                    'filter': filter_,
                    'minimum_should_match': 1
                }
            },
            'aggs': {'counters': {'filters': {'filters': aliased}}}
        })

    results = {}
    if not body:
        return results
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, bucket in aggs.get('buckets', {}).items():
            results[alias] = bucket.get('doc_count', results.get(alias, 0))
    return results
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    proxy = model.get_proxy(entity)
    Thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    pivots = []
    body = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for value in proxy.get_type_values(type_):
            # Skip values too generic to make a meaningful pivot.
            if type_.specificity(value) < 0.1:
                continue
            schemata = [s for s in model.get_type_schemata(type_)
                        if s.is_a(Thing)]
            body.append({'index': entities_read_index(schemata)})
            body.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(type_.group, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((type_.group, value))

    if not body:
        return

    res = es.msearch(body=body)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
def entity_tags(entity, authz):
    """Do a search on tags of an entity."""
    # NOTE: This must also work for documents.
    FIELDS = ['names', 'emails', 'phones', 'addresses', 'identifiers']
    pivots = []
    body = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            if value is None or not len(value):
                continue
            body.append({})
            body.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))

    if not body:
        return

    res = es.msearch(index=entities_read_index(), body=body)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
def execute(self):
    """Run the compiled queries and untangle the returned responses."""
    queries = self.compile()
    if not len(queries):
        return []
    body = []
    for (_, index, query) in queries:
        body.append({'index': index})
        body.append(query)
    response = es.msearch(body=body)
    responses = response.get('responses', [])
    for ((patterns, _, _), result) in zip(queries, responses):
        hits = result.get('hits', {}).get('hits', [])
        # BUGFIX: previously this list was bound to `results`, shadowing the
        # msearch response variable inside the loop.
        unpacked = [unpack_result(r) for r in hits]
        aggs = result.get('aggregations', {}).get('counters', {})
        counters = aggs.get('buckets', {})
        for pattern in patterns:
            count = counters.get(pattern.id, {}).get('doc_count')
            pattern.apply(count, unpacked)
    return self.patterns
def _filters_faceted_query(authz, facets):
    """Count facet values through a single batched msearch request.

    ``facets`` is an iterable of ``(index, alias, group, field, value)``
    tuples; returns a mapping of alias -> doc count.
    """
    grouped_values = {}
    aliased_filters = {}
    for index, alias, group, field, value in facets:
        aliased_filters.setdefault(index, {})[alias] = \
            field_filter_query(field, value)
        index_groups = grouped_values.setdefault(index, {})
        index_groups.setdefault(group, []).append(value)

    body = []
    for index, aliased in aliased_filters.items():
        shoulds = [
            field_filter_query(field, values)
            for field, values in grouped_values[index].items()
        ]
        body.append({'index': index})
        body.append({
            'size': 0,
            'query': {
                'bool': {
                    'should': shoulds,
                    'filter': [authz_query(authz)],
                    'minimum_should_match': 1
                }
            },
            'aggs': {'counters': {'filters': {'filters': aliased}}}
        })

    counts = {}
    if not body:
        return counts
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, bucket in aggs.get('buckets', {}).items():
            counts[alias] = bucket.get('doc_count', counts.get(alias, 0))
    return counts
def entities_query(state, fields=None, facets=True, doc_counts=False):
    """Parse a user query string, compose and execute a query."""
    if state.has_text:
        q = {
            "query_string": {
                "query": state.text,
                "fields": ['name^5', 'names^2', 'text'],
                "default_operator": "AND",
                "use_dis_max": True
            }
        }
    else:
        q = match_all()

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=True)
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        # Collections need scoped (global) aggregation treatment.
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        aggs = aggregate(state, q, aggs, facets)

    if state.sort == 'doc_count':
        sort = [{'doc_count': 'desc'}, '_score']
    elif state.sort == 'score':
        sort = ['_score', {'name_sort': 'asc'}]
    else:
        sort = [{'name_sort': 'asc'}]

    q = {
        'sort': sort,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        'size': state.limit,
        'from': state.offset,
        '_source': fields or DEFAULT_FIELDS
    }

    result, hits, output = execute_basic(TYPE_ENTITY, q)
    output['facets'] = parse_facet_result(state, result)
    sub_queries = []
    for doc in hits.get('hits', []):
        entity = doc.get('_source')
        entity['id'] = doc.get('_id')
        entity['score'] = doc.get('_score')
        entity['api_url'] = url_for('entities_api.view', id=doc.get('_id'))
        output['results'].append(entity)

        # Per-entity sub-query counting documents that mention it.
        sq = {'term': {'entities.id': entity['id']}}
        sq = add_filter(sq, {'terms': {
            'collection_id': state.authz.collections_read}})
        sq = {'size': 0, 'query': sq}
        sub_queries.append(json.dumps({}))
        sub_queries.append(json.dumps(sq))

    if doc_counts and len(sub_queries):
        # Get the number of matching documents for each entity.
        body = '\n'.join(sub_queries)
        res = es.msearch(index=es_index, doc_type=TYPE_DOCUMENT, body=body)
        # BUGFIX: default to an empty list — zip(…, None) would raise a
        # TypeError if the response lacked 'responses'. Also avoid
        # re-binding `res` as the loop variable.
        responses = res.get('responses', [])
        for (entity, resp) in zip(output['results'], responses):
            entity['doc_count'] = resp.get('hits', {}).get('total')
    return output
def documents_query(state, fields=None, facets=True, since=None):
    """Parse a user query string, compose and execute a query."""
    # This used to be several functions, but it's actually incredibly
    # procedural and so it's been linearised into one function. To really
    # clean this up, I think it should be based around an object model of
    # some sort.
    q = text_query(state.text)

    if state.raw_query:
        q = {"bool": {"must": [q, state.raw_query]}}

    q = authz_filter(q, state.authz, roles=False)

    # Used by alerting to find only updated results:
    if since is not None:
        q = add_filter(q, {"range": {"created_at": {"gt": since}}})

    # Sorting. BUGFIX: these were two independent `if` statements, so the
    # 'newest' ordering always fell into the final `else` and was
    # overwritten by the default score sort.
    if state.sort == 'newest':
        sort = [{'dates': 'desc'}, {'created_at': 'desc'}, '_score']
    elif state.sort == 'oldest':
        sort = [{'dates': 'asc'}, {'created_at': 'asc'}, '_score']
    else:
        sort = ['_score']

    # TODO: find a better way to handle "slightly special" aggregations like
    # entities and collections.
    aggs = {'scoped': {'global': {}, 'aggs': {}}}
    if facets:
        facets = list(state.facet_names)
        if 'collections' in facets:
            aggs = facet_collections(state, q, aggs)
            facets.remove('collections')
        if 'entities' in facets:
            aggs = facet_entities(state, aggs)
            facets.remove('entities')
        aggs = aggregate(state, q, aggs, facets)

    # allow plug-ins to post-process the query.
    signals.document_query_process.send(q=q, state=state)

    q = {
        'sort': sort,
        'size': state.limit,
        'from': state.offset,
        'query': filter_query(q, state.filters),
        'aggregations': aggs,
        '_source': fields or DEFAULT_FIELDS
    }
    result, hits, output = execute_basic(TYPE_DOCUMENT, q)

    # This will add labels and other contextual information.
    output['facets'] = parse_facet_result(state, result)

    # After the main query has run, a sub-query will be run for each returned
    # result in order to find relevant records for result highlighting.
    sub_shoulds = records_query_shoulds(state)
    sub_queries = []
    for doc in hits.get('hits', []):
        document = doc.get('_source')
        document['id'] = int(doc.get('_id'))
        document['score'] = doc.get('_score')
        document['records'] = {'results': [], 'total': 0}
        collection_id = document.get('collection_id')
        try:
            # FIXME: there's got to be a nicer way of doing this....
            document['public'] = state.authz.collection_public(collection_id)
        except Exception:
            # BUGFIX: never a bare `except:` — it would also swallow
            # SystemExit / KeyboardInterrupt.
            document['public'] = None
        sq = records_query_internal(document['id'], sub_shoulds)
        if sq is not None:
            sub_queries.append(json.dumps({}))
            sub_queries.append(json.dumps(sq))
        output['results'].append(document)

    if not len(sub_queries):
        return output

    body = '\n'.join(sub_queries)
    res = es.msearch(index=es_index, doc_type=TYPE_RECORD, body=body)
    for doc in output['results']:
        for sq in res.get('responses', []):
            sqhits = sq.get('hits', {})
            matched = False
            # 'hits' is a list of results; the default must be [] not {}.
            for hit in sqhits.get('hits', []):
                record = hit.get('_source')
                if doc['id'] != record['document_id']:
                    continue
                matched = True
                hlt = hit.get('highlight', {})
                texts = hlt.get('text', []) or hlt.get('text_latin', [])
                texts = [clean_highlight(t) for t in texts]
                texts = [t for t in texts if len(t)]
                if len(texts):
                    record['text'] = texts[0]
                doc['records']['results'].append(record)
            # BUGFIX: only take the total from the response that actually
            # belongs to this document; previously the last response always
            # overwrote every document's record total.
            if matched:
                doc['records']['total'] = sqhits.get('total', 0)
    return output