def get_instance_stats(authz):
    """Return entity counts (total and per-schema) plus the number of
    collections visible under the given authorization context."""
    # Count all "thing"-like entities the user may see, bucketed by schema.
    entity_query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    {'term': {'schemata': Entity.THING}},
                ]
            }
        },
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}}
        }
    }
    result = es.search(index=entities_index(), body=entity_query)
    aggregations = result.get('aggregations')
    data = {
        'count': result.get('hits').get('total'),
        'schemata': {}
    }
    for bucket in aggregations.get('schema').get('buckets'):
        data['schemata'][bucket.get('key')] = bucket.get('doc_count')
    # Compute collection stats (should we return categories?)
    coll_query = {'size': 0, 'query': {'bool': {'filter': [authz_query(authz)]}}}
    result = es.search(index=collections_index(), body=coll_query)
    data['collections'] = result.get('hits').get('total')
    return data
def get_filters(self):
    """Apply query filters from the user interface."""
    filters = []
    if self.AUTHZ_FIELD is not None:
        # This enforces the authorization (access control) rules on
        # a particular query by comparing the collections a user is
        # authorized for with the one on the document.
        if self.parser.authz and not self.parser.authz.is_admin:
            filters.append(
                authz_query(self.parser.authz, field=self.AUTHZ_FIELD)
            )
    range_filters = dict()
    for field, values in self.parser.filters.items():
        if field in self.SKIP_FILTERS:
            continue
        if field in self.parser.facet_names:
            continue
        # Collect all range query filters for a field in a single query.
        if field.startswith(("gt:", "gte:", "lt:", "lte:")):
            op, field = field.split(":", 1)
            range_filters.setdefault(field, {})[op] = list(values)[0]
            continue
        filters.append(field_filter_query(field, values))
    for field, ops in range_filters.items():
        filters.append(range_filter_query(field, ops))
    return filters
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    conditions = []
    if authz is not None:
        conditions.append(authz_query(authz))
    if collection_id is not None:
        conditions.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        conditions.append({'terms': {'schemata': ensure_list(schemata)}})
    # Restrict the returned document body to the requested fields.
    source_spec = {}
    if ensure_list(includes):
        source_spec['includes'] = ensure_list(includes)
    if ensure_list(excludes):
        source_spec['excludes'] = ensure_list(excludes)
    query = {
        'query': {'bool': {'filter': conditions}},
        'sort': ['_doc'],
        '_source': source_spec,
    }
    # Very long scroll timeout: consumers may process results slowly.
    for hit in scan(es, index=entities_index(), query=query, scroll='1410m'):
        yield unpack_result(hit)
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    conditions = []
    if authz is not None:
        conditions.append(authz_query(authz))
    if collection_id is not None:
        conditions.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        conditions.append({'terms': {'schemata': ensure_list(schemata)}})
    query = {
        'query': {'bool': {'filter': conditions}},
        '_source': _source_spec(includes, excludes),
    }
    index = entities_read_index(schema=schemata)
    # Very long scroll timeout: consumers may process results slowly.
    for hit in scan(es, index=index, query=query, scroll='1410m'):
        entity = unpack_result(hit)
        if entity is not None:
            yield entity
def alert_query(alert, authz):
    """Construct a search query to find new matching entities and
    documents for a particular alert. Update handling is done via a
    timestamp of the latest known result."""
    # Many users have bookmarked complex queries, otherwise we'd use a
    # precise match query.
    should = [{
        "query_string": {
            "query": alert.query,
            "lenient": True,
            "default_field": "text",
            "default_operator": "AND",
            "minimum_should_match": "90%",
        }
    }]
    filters = [authz_query(authz)]
    if alert.notified_at is not None:
        # Only surface results changed since the last notification run.
        filters.append({
            "range": {"updated_at": {"gt": alert.notified_at.isoformat()}}
        })
    return {
        "size": 50,
        "_source": {"includes": ["collection_id"]},
        "query": {
            "bool": {
                "should": should,
                "filter": filters,
                "minimum_should_match": 1,
            }
        },
    }
def generate_sitemap(collection_id):
    """Generate entries for a collection-based sitemap.xml file."""
    # cf. https://www.sitemaps.org/protocol.html
    query = {
        'query': {
            'bool': {
                'filter': [
                    {'term': {'collection_id': collection_id}},
                    {'term': {'schemata': Entity.THING}},
                    # Only entities visible to anonymous users belong in
                    # a public sitemap.
                    authz_query(Authz.from_role(None)),
                ]
            }
        },
        '_source': {'includes': ['schemata', 'updated_at']},
    }
    hits = scan(es, index=entities_index(), query=query)
    # strictly, the limit for sitemap.xml is 50,000
    for hit in islice(hits, 49500):
        source = hit.get('_source', {})
        # Keep only the date portion of the ISO timestamp.
        updated_at = source.get('updated_at', '').split('T', 1)[0]
        if Document.SCHEMA in source.get('schemata', []):
            url = document_url(hit.get('_id'))
        else:
            url = entity_url(hit.get('_id'))
        yield (url, updated_at)
def alert_query(alert, authz):
    """Construct a search query to find new matching entities and
    documents for a particular alert. Update handling is done via a
    timestamp of the latest known result."""
    # Many users have bookmarked complex queries, otherwise we'd use a
    # precise match query.
    query = {
        'simple_query_string': {
            'query': alert.query,
            'fields': ['text'],
            'default_operator': 'AND',
            'minimum_should_match': '90%'
        }
    }
    filters = [authz_query(authz)]
    # Guard against alerts that have never been notified: a range clause
    # of {'gt': None} is not a valid Elasticsearch query. For first-time
    # alerts we simply match without a time cutoff.
    if alert.notified_at is not None:
        filters.append({'range': {'created_at': {'gt': alert.notified_at}}})
    return {
        'size': MAX_PAGE,
        'query': {
            'bool': {
                'should': [query],
                'filter': filters,
                'minimum_should_match': 1
            }
        }
    }
def alert_query(alert, authz):
    """Construct a search query to find new matching entities and
    documents for a particular alert. Update handling is done via a
    timestamp of the latest known result."""
    # Many users have bookmarked complex queries, otherwise we'd use a
    # precise match query.
    should = [{
        'query_string': {
            'query': alert.query,
            'fields': ['fingerprints.text^3', 'text'],
            'default_operator': 'AND',
            'minimum_should_match': '90%'
        }
    }]
    filters = [authz_query(authz)]
    if alert.notified_at is not None:
        # Only surface results changed since the last notification run.
        filters.append({
            'range': {'updated_at': {'gt': alert.notified_at.isoformat()}}
        })
    return {
        'size': 50,
        '_source': {'includes': ['name']},
        'query': {
            'bool': {
                'should': should,
                'filter': filters,
                'minimum_should_match': 1
            }
        }
    }
def _filters_faceted_query(authz, queries):
    """Count matches for each named filter via a batched msearch,
    grouping the filters by their target index."""
    by_index = {}
    for (idx, alias, filter_) in queries:
        by_index.setdefault(idx, {})[alias] = filter_
    body = []
    for (idx, filters) in by_index.items():
        body.append({'index': idx})
        body.append({
            'size': 0,
            'query': {'bool': {'filter': [authz_query(authz)]}},
            'aggs': {'counters': {'filters': {'filters': filters}}},
        })
    results = {}
    if not len(body):
        return results
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
def iter_matches(collection, authz):
    """Scan all matching xref results, does not support sorting."""
    # Access is checked against the *other* collection in each match.
    conditions = [
        {'term': {'collection_id': collection.id}},
        authz_query(authz, field='match_collection_id'),
    ]
    body = {'query': {'bool': {'filter': conditions}}}
    for hit in scan(es, index=xref_index(), query=body):
        yield unpack_result(hit)
def _entities_query(filters, authz, collection_id, schemata):
    """Build a bool/filter entity query from optional criteria."""
    conditions = filters or []
    if authz is not None:
        conditions.append(authz_query(authz))
    if collection_id is not None:
        conditions.append({"term": {"collection_id": collection_id}})
    if ensure_list(schemata):
        conditions.append({"terms": {"schemata": ensure_list(schemata)}})
    return {"bool": {"filter": conditions}}
def iter_matches(collection, authz):
    """Scan all matching xref results, does not support sorting."""
    # Authorization applies to the matched (other) collection.
    body = {
        "query": {
            "bool": {
                "filter": [
                    {"term": {"collection_id": collection.id}},
                    authz_query(authz, field="match_collection_id"),
                ]
            }
        }
    }
    for hit in scan(es, index=xref_index(), query=body):
        yield unpack_result(hit)
def _entities_query(filters, authz, collection_id, schemata):
    """Assemble a bool/filter query from the given optional criteria."""
    conditions = filters or []
    if authz is not None:
        conditions.append(authz_query(authz))
    if collection_id is not None:
        conditions.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        conditions.append({'terms': {'schemata': ensure_list(schemata)}})
    return {'bool': {'filter': conditions}}
def __init__(self, graph, authz=None, collection_ids=None):
    """Store the graph and build the base filters that every generated
    query will carry (access control and collection scoping)."""
    self.graph = graph
    self.authz = authz
    self.patterns = []
    self.filters = []
    if authz is not None:
        self.filters.append(authz_query(authz))
    if collection_ids is not None:
        self.filters.append(
            field_filter_query('collection_id', collection_ids)
        )
def _counted_msearch(queries, authz, limit=0):
    """Run batched queries to count or retrieve entities with certain
    property values."""
    # The default case for this is that we want to retrieve only the
    # counts for a bunch of filtered sub-queries. In this case, we can
    # group the queries by the affected index.
    # In some cases, the expand API wants to actually retrieve entities.
    # Then, we need to make one query per filter.
    grouped = {}
    for (index, key), sub_query in sorted(queries.items()):
        group_key = index if limit == 0 else (index, key)
        if group_key in grouped:
            grouped[group_key]["filters"].append(sub_query)
            grouped[group_key]["counts"][key] = sub_query
        else:
            grouped[group_key] = {
                "index": index,
                "filters": [sub_query],
                "counts": {key: sub_query},
            }
    log.debug("Counts: %s queries, %s groups", len(queries), len(grouped))
    body = []
    for group in grouped.values():
        body.append({"index": group.get("index")})
        filters = group.get("filters")
        if limit == 0 and len(filters) > 1:
            # Pure counting: one disjunctive query per index is enough,
            # the counts come back from the filters aggregation.
            filters = [{"bool": {"should": filters, "minimum_should_match": 1}}]
        filters.append(authz_query(authz))
        body.append({
            "size": limit,
            "query": {"bool": {"filter": filters}},
            "aggs": {"counts": {"filters": {"filters": group.get("counts")}}},
            "_source": ENTITY_SOURCE,
        })
    counts = {}
    # FIXME: This doesn't actually retain context on which query a particular
    # entity is a result from. Doesn't matter if all we do in the end is stuff
    # everything into an FtMGraph and then traverse for adjacency.
    entities = []
    if not len(body):
        return entities, counts
    response = es.msearch(body=body)
    for resp in response.get("responses", []):
        for hit in resp.get("hits", {}).get("hits", []):
            entities.append(unpack_result(hit))
        buckets = resp.get("aggregations", {}).get("counts", {}).get("buckets", {})
        for key, count in buckets.items():
            counts[key] = count.get("doc_count", 0)
    return entities, counts
def entity_tags(entity, authz):
    """Do a search on tags of an entity, returning how often each tag
    value is mentioned by *other* entities, sorted by count."""
    # NOTE: This must also work for documents.
    FIELDS = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in FIELDS:
        for value in entity.get(field, []):
            # Skip missing and empty values: an empty string makes a
            # meaningless filter query (matches the fix in the generator
            # variant of this function).
            if value is None or not len(value):
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            # Exclude the entity itself from its own counts.
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))
    if not len(queries):
        return []
    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        # 'total' may be absent on a failed sub-response; comparing None
        # with an int raises TypeError, so guard explicitly.
        if total is not None and total > 0:
            qvalue = quote(value.encode('utf-8'))
            key = ('filter:%s' % field, qvalue)
            results.append({
                'id': query_string([key]),
                'value': value,
                'field': field,
                'count': total
            })
    results.sort(key=lambda p: p['count'], reverse=True)
    return results
def get_filters(self):
    """Apply query filters from the user interface."""
    # Fields handled as facets (or explicitly skipped) are excluded here.
    excluded = [*self.SKIP_FILTERS, *self.parser.facet_names]
    filters = self.get_filters_list(excluded)
    if self.AUTHZ_FIELD is not None:
        # This enforces the authorization (access control) rules on
        # a particular query by comparing the collections a user is
        # authorized for with the one on the document.
        if self.parser.authz and not self.parser.authz.is_admin:
            filters.append(
                authz_query(self.parser.authz, field=self.AUTHZ_FIELD)
            )
    return filters
def get_instance_stats(authz):
    """Return the number of entities and collections visible to authz."""
    entity_query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    authz_query(authz),
                    # {'term': {'schemata': Entity.THING}}
                ]
            }
        }
    }
    entities = es.search(index=entities_index(), body=entity_query)
    # Compute collection stats (should we return categories?)
    coll_query = {
        'size': 0,
        'query': {'bool': {'filter': [authz_query(authz)]}}
    }
    collections = es.search(index=collections_index(), body=coll_query)
    return {
        'entities': entities.get('hits').get('total'),
        'collections': collections.get('hits').get('total'),
    }
def entity_references(entity, authz):
    """Given a particular entity, find all the references to it from other
    entities, grouped by the property where they are used."""
    schema = model[entity.get('schema')]
    # Generate all the possible mention locations.
    properties = []
    queries = []
    for prop in model.properties:
        if not prop.is_entity:
            continue
        # Only properties whose range can hold this entity's schema.
        if not schema.is_a(prop.range):
            continue
        field = 'properties.%s' % prop.name
        queries.append({})
        queries.append({
            'size': 0,
            'query': {
                'bool': {
                    'filter': [
                        authz_query(authz),
                        {'term': {'schemata': prop.schema.name}},
                        {'term': {field: entity.get('id')}},
                    ]
                }
            }
        })
        properties.append(prop)
    # An msearch with an empty body is an error; nothing can reference
    # an entity that no property ranges over.
    if not len(queries):
        return []
    # Run a count search (with schema facet?)
    res = es.msearch(index=entities_index(), body=queries)
    results = []
    for prop, resp in zip(properties, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        # 'total' may be absent on a failed sub-response; comparing None
        # with an int raises TypeError, so guard explicitly.
        if total is not None and total > 0:
            results.append({
                'count': total,
                'property': prop,
                'schema': prop.schema.name
            })
    return results
def alert_query(alert, authz):
    """Construct a search query to find new matching entities and
    documents for a particular alert. Update handling is done via a
    timestamp of the latest known result."""
    entity = get_entity(alert.entity_id)
    should = []
    if entity is None and not alert.query_text:
        # TODO: do we want to delete the alert here?
        should.append({'match_none': {}})
    if alert.query_text:
        # Many users have bookmarked complex queries, otherwise we'd use a
        # precise match query.
        should.append({
            'simple_query_string': {
                'query': alert.query_text,
                'fields': ['text'],
                'default_operator': 'AND',
                'minimum_should_match': '90%'
            }
        })
    if entity is not None:
        # Match both the exact indexed terms and free-text mentions.
        for field in ['names', 'fingerprints', 'emails', 'phones']:
            for value in ensure_list(entity.get(field)):
                should.append({'term': {field: value}})
                should.append({
                    'multi_match': {
                        'query': value,
                        'fields': ['text']
                    }
                })
    return {
        'query': {
            'bool': {
                'should': should,
                'filter': [
                    {'range': {'created_at': {'gt': alert.notified_at}}},
                    authz_query(authz),
                ],
                'minimum_should_match': 1
            }
        }
    }
def get_filters(self):
    """Apply query filters from the user interface."""
    filters = []
    # This enforces the authorization (access control) rules on
    # a particular query by comparing the collections a user is
    # authorized for with the one on the document.
    authz = self.parser.authz
    if authz and not authz.is_admin:
        filters.append(authz_query(authz))
    for field, values in self.parser.filters.items():
        if field in self.SKIP_FILTERS:
            continue
        # Fields that are faceted get their filters applied elsewhere.
        if field in self.parser.facet_names:
            continue
        filters.append(field_filter_query(field, values))
    return filters
def get_filters(self):
    """Apply query filters from the user interface."""
    parser = self.parser
    filters = []
    # Enforce access control by comparing the collections the user is
    # authorized for with the one recorded on each document.
    if parser.authz and not parser.authz.is_admin:
        filters.append(authz_query(parser.authz))
    applicable = (
        (field, values)
        for field, values in parser.filters.items()
        if field not in self.SKIP_FILTERS
        and field not in parser.facet_names
    )
    for field, values in applicable:
        filters.append(field_filter_query(field, values))
    return filters
def _filters_faceted_query(facets, authz=None):
    """Count matches for each facet alias via a batched msearch.

    Facets are grouped by index; within one index, values of the same
    group are OR-ed together as should-clauses while per-alias filters
    feed a filters aggregation that yields the counts."""
    group_values = {}
    by_index = {}
    for (idx, alias, group, field, value) in facets:
        by_index.setdefault(idx, {})[alias] = field_filter_query(field, value)
        group_values.setdefault(idx, {}).setdefault(group, []).append(value)
    body = []
    for (idx, counters) in by_index.items():
        shoulds = []
        for group, values in group_values[idx].items():
            shoulds.append(field_filter_query(group, values))
        conditions = []
        if authz is not None:
            conditions.append(authz_query(authz))
        query = {
            'bool': {
                'should': shoulds,
                'filter': conditions,
                'minimum_should_match': 1
            }
        }
        body.append({'index': idx})
        body.append({
            'size': 0,
            'query': query,
            'aggs': {'counters': {'filters': {'filters': counters}}}
        })
    results = {}
    if not len(body):
        return results
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
def entity_tags(entity, authz):
    """Do a search on tags of an entity, yielding (group, value, count)
    triples for values mentioned by other entities."""
    proxy = model.get_proxy(entity)
    thing = model.get(Entity.THING)
    types = [registry.name, registry.email, registry.identifier,
             registry.iban, registry.phone, registry.address]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for type_ in types:
        if type_.group is None:
            continue
        for value in proxy.get_type_values(type_):
            # Very generic values make poor pivots; skip them.
            if type_.specificity(value) < 0.1:
                continue
            schemata = model.get_type_schemata(type_)
            schemata = [s for s in schemata if s.is_a(thing)]
            queries.append({'index': entities_read_index(schemata)})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(type_.group, value)
                        ],
                        'must_not': [
                            # Exclude the entity itself from its counts.
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((type_.group, value))
    if not len(queries):
        return
    res = es.msearch(body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
def iter_entities(authz=None, collection_id=None, schemata=None,
                  includes=None, excludes=None):
    """Scan all entities matching the given criteria."""
    conditions = []
    if authz is not None:
        conditions.append(authz_query(authz))
    if collection_id is not None:
        conditions.append({'term': {'collection_id': collection_id}})
    if ensure_list(schemata):
        conditions.append({'terms': {'schemata': ensure_list(schemata)}})
    body = {
        'query': {'bool': {'filter': conditions}},
        '_source': _source_spec(includes, excludes)
    }
    # Long scroll timeout: downstream consumers may be slow.
    index = entities_read_index(schema=schemata)
    for hit in scan(es, index=index, query=body, scroll='1410m'):
        entity = unpack_result(hit)
        if entity is not None:
            yield entity
def entity_tags(entity, authz):
    """Do a search on tags of an entity, yielding (field, value, count)
    triples for values also mentioned by other entities."""
    # NOTE: This must also work for documents.
    tag_fields = [
        'names',
        'emails',
        'phones',
        'addresses',
        'identifiers'
    ]
    pivots = []
    queries = []
    # Go through all the tags which apply to this entity, and find how
    # often they've been mentioned in other entities.
    for field in tag_fields:
        for value in entity.get(field, []):
            # None or empty values cannot form a sensible filter.
            if value is None or not len(value):
                continue
            queries.append({})
            queries.append({
                'size': 0,
                'query': {
                    'bool': {
                        'filter': [
                            authz_query(authz),
                            field_filter_query(field, value)
                        ],
                        'must_not': [
                            # Don't count the entity itself.
                            {'ids': {'values': [entity.get('id')]}},
                        ]
                    }
                }
            })
            pivots.append((field, value))
    if not len(queries):
        return
    res = es.msearch(index=entities_read_index(), body=queries)
    for (field, value), resp in zip(pivots, res.get('responses', [])):
        total = resp.get('hits', {}).get('total')
        if total is not None and total > 0:
            yield (field, value, total)
def _filters_faceted_query(authz, facets):
    """Count matches per facet alias using one msearch request.

    Facets sharing an index are combined: same-group values become
    OR-ed should clauses, and per-alias filters drive the counts via a
    filters aggregation."""
    group_values = {}
    by_index = {}
    for (idx, alias, group, field, value) in facets:
        by_index.setdefault(idx, {})[alias] = field_filter_query(field, value)
        group_values.setdefault(idx, {}).setdefault(group, []).append(value)
    body = []
    for (idx, counters) in by_index.items():
        shoulds = []
        for group, values in group_values[idx].items():
            shoulds.append(field_filter_query(group, values))
        query = {
            'bool': {
                'should': shoulds,
                'filter': [authz_query(authz)],
                'minimum_should_match': 1
            }
        }
        body.append({'index': idx})
        body.append({
            'size': 0,
            'query': query,
            'aggs': {'counters': {'filters': {'filters': counters}}}
        })
    results = {}
    if not len(body):
        return results
    res = es.msearch(body=body)
    for resp in res.get('responses', []):
        aggs = resp.get('aggregations', {}).get('counters', {})
        for alias, value in aggs.get('buckets', {}).items():
            results[alias] = value.get('doc_count', results.get(alias, 0))
    return results
def iter_entities_by_ids(ids, authz=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    for offset in range(0, len(ids), MAX_PAGE):
        chunk = ids[offset:offset + MAX_PAGE]
        if not len(chunk):
            return
        query = bool_query()
        query['bool']['filter'].append({'ids': {'values': chunk}})
        if authz is not None:
            query['bool']['filter'].append(authz_query(authz))
        body = {
            'query': query,
            '_source': {
                'includes': ['schema', 'properties', 'collection_id',
                             'created_at']
            },
            'size': min(MAX_PAGE, len(chunk) * 2)
        }
        # Skip the request cache: these lookups are rarely repeated.
        result = search_safe(index=entity_index(), body=body,
                             request_cache=False)
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity
def entities_by_ids(ids, authz=None, schemata=None):
    """Iterate over unpacked entities based on a search for the given
    entity IDs."""
    for offset in range(0, len(ids), MAX_PAGE):
        chunk = ids[offset:offset + MAX_PAGE]
        if not len(chunk):
            return
        query = bool_query()
        query['bool']['filter'].append({'ids': {'values': chunk}})
        if authz is not None:
            query['bool']['filter'].append(authz_query(authz))
        body = {
            'query': query,
            # The full extracted text can be very large; leave it out.
            '_source': {'excludes': ['text']},
            'size': min(MAX_PAGE, len(chunk))
        }
        index = entities_read_index(schema=schemata)
        # Tolerate missing per-schema indexes.
        result = search_safe(index=index, body=body, ignore=[404])
        for doc in result.get('hits', {}).get('hits', []):
            entity = unpack_result(doc)
            if entity is not None:
                yield entity
def get_filters(self):
    """Extend the inherited filters with a mandatory access control clause."""
    inherited = super(AuthzQuery, self).get_filters()
    inherited.append(authz_query(self.parser.authz))
    return inherited