class ESAlchemy(object):
    """SQLAlchemy-flavored read adapter over an Elasticsearch index.

    Wraps an ``HQESQuery`` for ``index_name`` and converts raw ES hits
    into ``ESAlchemyRow`` objects typed per the data source ``config``.
    """

    def __init__(self, index_name, config):
        self.index_name = index_name
        self.config = config
        self.es = HQESQuery(index_name)

    def __getitem__(self, sliced_or_int):
        # Delegate slicing/indexing to the underlying ES query, then
        # convert each raw hit dict into a typed row.
        hits = self.es[sliced_or_int]
        hits = [self._hit_to_row(hit) for hit in hits]
        if isinstance(sliced_or_int, (int, long)):
            # Integer index means a single row, not a one-element list.
            return hits[0]
        return hits

    def _hit_to_row(self, hit):
        """Convert a raw ES hit into an ESAlchemyRow, coercing
        datetime/date columns from their ISO string representations."""
        def mapping_to_datatype(column, value):
            # Falsy values (None, '') pass through unconverted.
            if not value:
                return value
            datatype = column.datatype
            if datatype == 'datetime':
                try:
                    return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")
                except ValueError:
                    # Retry with microseconds: some documents store
                    # fractional-second timestamps.
                    return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
            elif datatype == 'date':
                # NOTE: returns a datetime (midnight), not a date object.
                return datetime.datetime.strptime(value, "%Y-%m-%d")
            return value

        return ESAlchemyRow(self.column_ordering, {
            col.database_column_name: mapping_to_datatype(col, hit[col.database_column_name])
            for col in self.columns
        })

    @property
    def columns(self):
        # Column definitions come from the data source config's indicators.
        return self.config.indicators.get_columns()

    @property
    @memoized
    def column_ordering(self):
        # Database column names, in indicator order; memoized since the
        # config doesn't change over this object's lifetime.
        return [col.database_column_name for col in self.columns]

    @property
    def column_descriptions(self):
        return [{"name": col} for col in self.column_ordering]

    def count(self):
        return self.es.count()

    def distinct_values(self, column, size):
        """Return up to ``size`` distinct values of ``column``, appending
        ``None`` when any documents lack the field entirely."""
        # missing aggregation can be removed on upgrade to ES 2.0
        missing_agg_name = column + '_missing'
        query = self.es.terms_aggregation(column, column, size=size, sort_field="_term").size(0)
        query = query.aggregation(MissingAggregation(missing_agg_name, column))
        results = query.run()
        missing_result = getattr(results.aggregations, missing_agg_name).result
        result = getattr(results.aggregations, column).keys
        if missing_result['doc_count'] > 0:
            # Represent "field absent" as a None entry in the value list.
            result.append(None)
        return result
def _get_aggregated_query(self, start, limit):
    """Build and run an ES query nesting one terms aggregation per
    aggregation column, with the report's metric aggregations innermost.

    ``start``/``limit`` only bound the number of buckets returned
    (ES gives back the first ``start + limit``); paging happens
    client-side afterwards.
    """
    max_size = (start or 0) + (limit or 0)
    query = HQESQuery(self.table_name).size(0)
    # renamed loop var: `filter` shadowed the builtin
    for es_filter in self.filters:
        query = query.filter(es_filter)

    # Innermost bucket keys on the last aggregation column and carries
    # the per-column metric aggregations.
    innermost_agg_col = self.aggregation_columns[-1]
    innermost_agg = TermsAggregation(innermost_agg_col, innermost_agg_col)
    for col in self.top_level_columns:
        for agg in col.aggregations(self.config, self.lang):
            innermost_agg.aggregation(agg)

    top_agg = innermost_agg
    # go through aggregations in reverse order so that they are nested properly
    # todo: Refactor NestedTermAggregationsHelper to support this use case
    for agg_column in self.aggregation_columns[:-1][::-1]:
        top_agg = TermsAggregation(agg_column, agg_column).aggregation(top_agg)
    top_agg.size(max_size)

    if self.order_by:
        # todo sort by more than one column
        # todo sort by by something other than the first aggregate column
        col, desc = self.order_by[0]
        if col == self.aggregation_columns[0] or col == self.top_level_columns[0].field:
            top_agg = top_agg.order('_term', desc)
    query = query.aggregation(top_agg)
    return query.run()
def test_cleanup_before_run(self):
    # _clean_before_run() should zero out `size` (aggregation-only
    # queries need no hits) while leaving the rest of the body intact,
    # and must not mutate the original query.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [
                        {"match_all": {}}
                    ]
                },
                "query": {"match_all": {}}
            }
        },
        "aggs": {
            "by_day": {
                "date_histogram": {
                    "field": "date",
                    "interval": "day",
                    "time_zone": "-01:00"
                }
            }
        },
        "size": SIZE_LIMIT
    }
    expected_output = deepcopy(json_output)
    expected_output['size'] = 0
    query = HQESQuery('forms').date_histogram('by_day', 'date', 'day', '-01:00')
    self.checkQuery(query, json_output)
    self.checkQuery(query._clean_before_run(), expected_output)
def case_owners(self):
    """Collect every owner id (users, groups, locations) whose cases
    this report should show, based on the EMWF filter selections."""
    # Get user ids for each user that match the demo_user, admin, Unknown Users, or All Mobile Workers filters
    user_types = EMWF.selected_user_types(self.request)
    user_type_filters = []
    if HQUserType.ADMIN in user_types:
        user_type_filters.append(user_es.admin_users())
    if HQUserType.UNKNOWN in user_types:
        # "Unknown" users also pulls in web users.
        user_type_filters.append(user_es.unknown_users())
        user_type_filters.append(user_es.web_users())
    if HQUserType.DEMO_USER in user_types:
        user_type_filters.append(user_es.demo_users())
    if HQUserType.REGISTERED in user_types:
        user_type_filters.append(user_es.mobile_users())
    if len(user_type_filters) > 0:
        # Inactive users included: their historical cases still count.
        special_q = user_es.UserES().domain(self.domain).OR(*user_type_filters).show_inactive()
        special_user_ids = special_q.run().doc_ids
    else:
        special_user_ids = []

    # Get user ids for each user that was specifically selected
    selected_user_ids = EMWF.selected_user_ids(self.request)

    # Get group ids for each group that was specified
    selected_reporting_group_ids = EMWF.selected_reporting_group_ids(self.request)
    selected_sharing_group_ids = EMWF.selected_sharing_group_ids(self.request)

    # Get user ids for each user in specified reporting groups
    report_group_q = HQESQuery(index="groups").domain(self.domain)\
        .doc_type("Group")\
        .filter(filters.term("_id", selected_reporting_group_ids))\
        .fields(["users"])
    user_lists = [group["users"] for group in report_group_q.run().hits]
    selected_reporting_group_users = list(set().union(*user_lists))

    # Get ids for each sharing group that contains a user from selected_reporting_group_users OR a user that was specifically selected
    share_group_q = HQESQuery(index="groups").domain(self.domain)\
        .doc_type("Group")\
        .filter(filters.term("case_sharing", True))\
        .filter(filters.term("users", selected_reporting_group_users+selected_user_ids+special_user_ids))\
        .fields([])
    sharing_group_ids = share_group_q.run().doc_ids

    # De-duplicate all id sources into a single owner list.
    owner_ids = list(set().union(
        special_user_ids,
        selected_user_ids,
        selected_sharing_group_ids,
        selected_reporting_group_users,
        sharing_group_ids
    ))
    if HQUserType.COMMTRACK in user_types:
        owner_ids.append("commtrack-system")
    if HQUserType.DEMO_USER in user_types:
        owner_ids.append("demo_user_group_id")
    owner_ids += self.location_sharing_owner_ids()
    owner_ids += self.location_reporting_owner_ids()
    return owner_ids
def case_owners(self):
    """Collect every owner id (special users, selected users/groups,
    locations and their children, sharing groups) for this report's
    EMWF filter selections."""
    # Get user ids for each user that match the demo_user, admin,
    # Unknown Users, or All Mobile Workers filters
    mobile_user_and_group_slugs = self.request.GET.getlist(EMWF.slug)
    user_types = EMWF.selected_user_types(mobile_user_and_group_slugs)
    special_owner_ids = self.get_special_owner_ids(
        admin=HQUserType.ADMIN in user_types,
        unknown=HQUserType.UNKNOWN in user_types,
        demo=HQUserType.DEMO_USER in user_types,
        commtrack=HQUserType.COMMTRACK in user_types,
    )

    # Get user ids for each user that was specifically selected
    selected_user_ids = EMWF.selected_user_ids(mobile_user_and_group_slugs)

    # Get group ids for each group that was specified
    selected_reporting_group_ids = EMWF.selected_reporting_group_ids(mobile_user_and_group_slugs)
    selected_sharing_group_ids = EMWF.selected_sharing_group_ids(mobile_user_and_group_slugs)

    # Show cases owned by any selected locations, user locations, or their children
    loc_ids = set(EMWF.selected_location_ids(mobile_user_and_group_slugs) +
                  get_users_location_ids(self.domain, selected_user_ids))
    location_owner_ids = get_locations_and_children(loc_ids).location_ids()

    # Get user ids for each user in specified reporting groups
    report_group_q = HQESQuery(index="groups").domain(self.domain)\
        .doc_type("Group")\
        .filter(filters.term("_id", selected_reporting_group_ids))\
        .fields(["users"])
    user_lists = [group["users"] for group in report_group_q.run().hits]
    selected_reporting_group_users = list(set().union(*user_lists))

    # Get ids for each sharing group that contains a user from selected_reporting_group_users OR a user that was specifically selected
    share_group_q = (HQESQuery(index="groups")
                     .domain(self.domain)
                     .doc_type("Group")
                     .term("case_sharing", True)
                     .term("users", (selected_reporting_group_users + selected_user_ids))
                     .fields([]))
    sharing_group_ids = share_group_q.run().doc_ids

    # De-duplicate all id sources into a single owner list.
    owner_ids = list(set().union(
        special_owner_ids,
        selected_user_ids,
        selected_sharing_group_ids,
        selected_reporting_group_users,
        sharing_group_ids,
        location_owner_ids,
    ))
    return owner_ids
def test_nested_filter(self):
    # A nested() filter should wrap the inner range filter in a
    # {"nested": {"path": ..., "filter": ...}} clause, with dates
    # serialized as ISO strings.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "nested": {
                            "path": "actions",
                            "filter": {
                                "range": {
                                    "actions.date": {
                                        "gte": "2015-01-01",
                                        "lt": "2015-02-01"
                                    }
                                }
                            }
                        }
                    }, {
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "size": SIZE_LIMIT
    }
    start, end = date(2015, 1, 1), date(2015, 2, 1)
    query = (HQESQuery('cases').nested(
        "actions",
        filters.date_range("actions.date", gte=start, lt=end)))
    self.checkQuery(query, json_output)
def test_flatten_field_dicts(self):
    """ESQuerySet should flatten single-element list field values in
    each hit's _source into plain scalars."""
    example_response = {
        'hits': {'hits': [{
            '_source': {
                'domains': ['joesproject'],
            }
        }, {
            '_source': {
                'domains': ['mikesproject']
            }
        }
        ],
        },
    }
    hits = [
        {
            'domains': 'joesproject',
        },
        {
            'domains': 'mikesproject',
        }
    ]
    # NOTE(review): 'domain' does not match the 'domains' key in the
    # response above — presumably a typo, but the flattening under test
    # doesn't depend on it; confirm intent before "fixing".
    fields = ['domain']
    response = ESQuerySet(
        example_response,
        HQESQuery('forms').fields(fields)
    )
    # assertEqual: assertEquals is a deprecated alias
    self.assertEqual(response.hits, hits)
def test_exclude_source(self):
    """With exclude_source(), hits collapse to bare document ids."""
    hits = ['8063dff5-460b-46f2-b4d0-5871abfd97d4', 'dc1376cd-0869-4c13-a267-365dfc2fa754']
    response = ESQuerySet(
        self.example_response,
        HQESQuery('forms').exclude_source()
    )
    # assertEqual: assertEquals is a deprecated alias
    self.assertEqual(response.hits, hits)
def test_terms_aggregation_with_order(self):
    # order() should add an "order" list to the terms aggregation,
    # defaulting to ascending on the given sort field.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "name": {
                "terms": {
                    "field": "name",
                    "size": 1000000,
                    "order": [{
                        "sort_field": "asc"
                    }]
                },
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        TermsAggregation('name', 'name').order('sort_field'))
    self.checkQuery(query, json_output)
def test_nested_aggregation(self):
    # NestedAggregation should emit a {"nested": {"path": ...}} clause
    # under the given aggregation name.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "case_actions": {
                "nested": {
                    "path": "actions"
                }
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        NestedAggregation(
            'case_actions',
            'actions',
        ))
    self.checkQuery(query, json_output)
def test_date_histogram(self):
    # date_histogram() should produce a date_histogram aggregation with
    # field, interval, and timezone offset.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "by_day": {
                "date_histogram": {
                    "field": "date",
                    "interval": "day",
                    "time_zone": "-01:00"
                }
            }
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('forms').date_histogram('by_day', 'date', 'day', '-01:00')
    self.checkQuery(query, json_output)
def test_nesting_aggregations(self):
    # Verifies two things at once: a FilterAggregation nested inside a
    # TermsAggregation appears under that term's "aggs", and a sibling
    # FiltersAggregation with add_filter() builds a named filters map.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "users": {
                "terms": {
                    "field": "user_id",
                    "size": SIZE_LIMIT
                },
                "aggs": {
                    "closed": {
                        "filter": {
                            "term": {
                                "closed": True
                            }
                        }
                    }
                }
            },
            "total_by_status": {
                "filters": {
                    "filters": {
                        "closed": {
                            "term": {
                                "closed": True
                            }
                        },
                        "open": {
                            "term": {
                                "closed": False
                            }
                        }
                    }
                }
            }
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregations([
        TermsAggregation("users", 'user_id').aggregation(
            FilterAggregation('closed', filters.term('closed', True))),
        FiltersAggregation('total_by_status').add_filter(
            'closed', filters.term('closed', True)).add_filter('open', filters.term('closed', False))
    ])
    self.checkQuery(query, json_output)
def test_extended_stats_aggregation(self):
    # ExtendedStatsAggregation should pass both the field and the
    # optional script through to the extended_stats body.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "name_stats": {
                "extended_stats": {
                    "field": "name",
                    "script": "MY weird script"
                }
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        ExtendedStatsAggregation('name_stats', 'name', script='MY weird script'))
    self.checkQuery(query, json_output)
def test_missing_aggregation(self):
    # MissingAggregation should emit {"missing": {"field": ...}} under
    # the given aggregation name.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "missing_user_id": {
                "missing": {
                    "field": "user_id"
                }
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        MissingAggregation(
            'missing_user_id',
            'user_id',
        ))
    self.checkQuery(query, json_output)
def _get_total_aggregated_results(self):
    """Run a hits-free query whose aggregations compute the totals for
    every column marked ``calculate_total``."""
    query = HQESQuery(self.table_name).size(0)
    for report_filter in self.filters:
        query = query.filter(report_filter)
    # One metric aggregation per totalled column, in column order.
    totals_aggregations = [
        agg
        for col in self.top_level_columns if col.calculate_total
        for agg in col.aggregations(self.config, self.lang)
    ]
    return query.aggregations(totals_aggregations).run().aggregations
def test_basic_query_string_query(self):
    """A plain search term is wildcard-wrapped with no explicit fields."""
    expected = {
        "query_string": {
            "query": "*foo*",
            "default_operator": "AND",
            "fields": None,
        }
    }
    self.assertHasQuery(HQESQuery('forms').search_string_query("foo"), expected)
def test_query_size(self): json_output = { "query": { "filtered": { "filter": { "and": [ {"match_all": {}} ] }, "query": {"match_all": {}} } }, "size": 0 } # use `is not None`; 0 or 1000000 == 1000000 self.checkQuery(HQESQuery('forms').size(0), json_output) json_output['size'] = 123 self.checkQuery(HQESQuery('forms').size(123), json_output)
def test_query_with_fields(self):
    """A simple search term targets the supplied default fields."""
    search_fields = ['name', 'type', 'date']
    query = HQESQuery('forms').search_string_query("foo", search_fields)
    expected = {
        "query_string": {
            "query": "*foo*",
            "default_operator": "AND",
            "fields": ['name', 'type', 'date'],
        }
    }
    self.assertHasQuery(query, expected)
def test_complex_query_with_fields(self):
    """A search string containing operators falls back to
    simple_query_string and ignores the default fields."""
    query = HQESQuery('forms').search_string_query(
        "name: foo", ['name', 'type', 'date'])
    expected = {
        "simple_query_string": {
            "query": "name: foo",
            "default_operator": "AND",
            "fields": None,
        }
    }
    self.assertHasQuery(query, expected)
def _get_aggregated_query(self, start, limit):
    """Build and run an aggregated ES query keyed on the FIRST
    aggregation column only; additional aggregation columns are not
    yet supported (see todo below)."""
    # Buckets beyond start+limit are dropped by ES; paging is client-side.
    max_size = (start or 0) + (limit or 0)
    query = HQESQuery(self.table_name).size(0)
    for filter in self.filters:
        query = query.filter(filter)
    top_agg = TermsAggregation(self.aggregation_columns[0], self.aggregation_columns[0], size=max_size)
    for agg_column in self.aggregation_columns[1:]:
        # todo support multiple aggregations
        pass
    aggregations = []
    for col in self.top_level_columns:
        if col.type == 'expanded':
            # Expanded columns contribute one aggregation per sub-column.
            for sub_col in get_expanded_column_config(self.config, col, 'en').columns:
                aggregations.append(sub_col.aggregation)
        elif col.type == 'field':
            if col.aggregation == 'sum':
                # todo push this to the column
                aggregations.append(SumAggregation(col.field, col.field))
    for agg in aggregations:
        top_agg = top_agg.aggregation(agg)
    if self.order_by:
        # todo sort by more than one column
        # todo sort by by something other than the first aggregate column
        col, desc = self.order_by[0]
        if col == self.aggregation_columns[0] or col == self.top_level_columns[0].field:
            # Order buckets by document count, not term value.
            top_agg = top_agg.order('_count', desc)
    query = query.aggregation(top_agg)
    return query.run()
def _get_aggregated_query(self, start, limit):
    """Build and run an ES query nesting one terms aggregation per
    aggregation column (sized to start+limit), with the report's metric
    aggregations attached to the innermost bucket."""
    max_size = (start or 0) + (limit or 0)
    query = HQESQuery(self.table_name).size(0)
    for filter in self.filters:
        query = query.filter(filter)

    # Innermost bucket keys on the last aggregation column and carries
    # the per-column metric aggregations.
    innermost_agg_col = self.aggregation_columns[-1]
    innermost_agg = TermsAggregation(innermost_agg_col, innermost_agg_col, size=max_size)
    for col in self.top_level_columns:
        for agg in col.aggregations(self.config, self.lang):
            innermost_agg.aggregation(agg)

    top_agg = innermost_agg
    # go through aggregations in reverse order so that they are nested properly
    # todo: Refactor NestedTermAggregationsHelper to support this use case
    for agg_column in self.aggregation_columns[:-1][::-1]:
        top_agg = TermsAggregation(agg_column, agg_column, size=max_size).aggregation(top_agg)

    if self.order_by:
        # todo sort by more than one column
        # todo sort by by something other than the first aggregate column
        col, desc = self.order_by[0]
        valid_columns = (
            self.aggregation_columns[0],
            self.top_level_columns[0].field,
            self.top_level_columns[0].column_id,
        )
        if col in valid_columns:
            top_agg = top_agg.order('_term', desc)
    query = query.aggregation(top_agg)
    return query.run()
def test_basic_query(self):
    """A fresh HQESQuery is a filtered match_all capped at SIZE_LIMIT."""
    expected = {
        "query": {
            "filtered": {
                "filter": {"and": [{"match_all": {}}]},
                "query": {"match_all": {}},
            }
        },
        "size": SIZE_LIMIT,
    }
    self.checkQuery(HQESQuery('forms'), expected)
def test_nested_terms_helper(self):
    # NestedTermAggregationsHelper should nest the terms in order
    # (app_id -> user_id) with the sum aggregation innermost.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "app_id": {
                "terms": {
                    "field": "app_id",
                    "size": SIZE_LIMIT,
                },
                "aggs": {
                    "user_id": {
                        "terms": {
                            "field": "user_id",
                            "size": SIZE_LIMIT,
                        },
                        "aggs": {
                            "balance": {
                                "sum": {
                                    "field": "balance"
                                }
                            }
                        }
                    }
                }
            }
        },
        "size": SIZE_LIMIT
    }
    base_query = HQESQuery('cases')
    query = NestedTermAggregationsHelper(
        base_query=base_query,
        terms=[
            AggregationTerm('app_id', 'app_id'),
            AggregationTerm('user_id', 'user_id')
        ],
        inner_most_aggregation=SumAggregation('balance', 'balance')).query
    self.checkQuery(query, json_output)
class ESAlchemy(object):
    """SQLAlchemy-flavored read adapter over an Elasticsearch index.

    Wraps an ``HQESQuery`` for ``index_name`` and converts raw ES hits
    into ``ESAlchemyRow`` objects typed per the data source ``config``.
    """

    def __init__(self, index_name, config):
        self.index_name = index_name
        self.config = config
        self.es = HQESQuery(index_name)

    def __getitem__(self, sliced_or_int):
        # Delegate slicing/indexing to the underlying ES query, then
        # convert each raw hit dict into a typed row.
        hits = self.es[sliced_or_int]
        hits = [self._hit_to_row(hit) for hit in hits]
        if isinstance(sliced_or_int, (int, long)):
            # Integer index means a single row, not a one-element list.
            return hits[0]
        return hits

    def _hit_to_row(self, hit):
        """Convert a raw ES hit into an ESAlchemyRow, coercing
        datetime/date columns from their ISO string representations."""
        def mapping_to_datatype(column, value):
            # Falsy values (None, '') pass through unconverted.
            if not value:
                return value
            datatype = column.datatype
            if datatype == 'datetime':
                try:
                    return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S")
                except ValueError:
                    # Retry with microseconds: some documents store
                    # fractional-second timestamps.
                    return datetime.datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%f")
            elif datatype == 'date':
                # NOTE: returns a datetime (midnight), not a date object.
                return datetime.datetime.strptime(value, "%Y-%m-%d")
            return value

        return ESAlchemyRow(self.column_ordering, {
            col.database_column_name: mapping_to_datatype(col, hit[col.database_column_name])
            for col in self.columns
        })

    @property
    def columns(self):
        # Column definitions come from the data source config's indicators.
        return self.config.indicators.get_columns()

    @property
    @memoized
    def column_ordering(self):
        # Database column names, in indicator order; memoized since the
        # config doesn't change over this object's lifetime.
        return [col.database_column_name for col in self.columns]

    @property
    def column_descriptions(self):
        return [{"name": col} for col in self.column_ordering]

    def count(self):
        return self.es.count()
def _get_query(self, start=None, limit=None):
    """Build and run the sorted, paged, filtered ES query for this
    data source, fetching only the required fields."""
    query = HQESQuery(self.table_name).source(self.required_fields)
    # Apply each sort column without resetting previously added sorts.
    for column, order in self.order_by:
        is_descending = order == DESCENDING
        query = query.sort(column, desc=is_descending, reset_sort=False)
    if start:
        query = query.start(start)
    if limit:
        query = query.size(limit)
    for report_filter in self.filters:
        query = query.filter(report_filter)
    return query.run()
def test_range_aggregation(self):
    # RangeAggregation should emit keyed ranges: open-ended start/end
    # produce "to"/"from" only, and an explicit key is passed through.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "aggs": {
            "by_date": {
                "range": {
                    "field": "name",
                    "keyed": True,
                    "ranges": [
                        {
                            "to": "c"
                        },
                        {
                            "from": "f"
                        },
                        {
                            "from": "k",
                            "to": "p",
                            "key": "k-p"
                        },
                    ]
                }
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        RangeAggregation('by_date', 'name', [
            AggregationRange(end='c'),
            AggregationRange(start='f'),
            AggregationRange(start='k', end='p', key='k-p')
        ]))
    self.checkQuery(query, json_output)
def case_owners(self):
    """Collect every owner id (users, groups) whose cases this report
    should show, based on the EMWF filter selections."""
    # Get user ids for each user that match the demo_user, admin, Unknown Users, or All Mobile Workers filters
    user_types = EMWF.selected_user_types(self.request)
    user_type_filters = []
    if HQUserType.ADMIN in user_types:
        user_type_filters.append(user_es.admin_users())
    if HQUserType.UNKNOWN in user_types:
        # "Unknown" users also pulls in web users.
        user_type_filters.append(user_es.unknown_users())
        user_type_filters.append(user_es.web_users())
    if HQUserType.DEMO_USER in user_types:
        user_type_filters.append(user_es.demo_users())
    if HQUserType.REGISTERED in user_types:
        user_type_filters.append(user_es.mobile_users())
    if len(user_type_filters) > 0:
        special_q = user_es.UserES().domain(
            self.domain).OR(*user_type_filters)
        special_user_ids = special_q.run().doc_ids
    else:
        special_user_ids = []

    # Get user ids for each user that was specifically selected
    selected_user_ids = EMWF.selected_user_ids(self.request)

    # Get group ids for each group that was specified
    selected_reporting_group_ids = EMWF.selected_reporting_group_ids(
        self.request)
    selected_sharing_group_ids = EMWF.selected_sharing_group_ids(
        self.request)

    # Get user ids for each user in specified reporting groups
    report_group_q = HQESQuery(index="groups").domain(self.domain)\
        .doc_type("Group")\
        .filter(filters.term("_id", selected_reporting_group_ids))\
        .fields(["users"])
    user_lists = [group["users"] for group in report_group_q.run().hits]
    selected_reporting_group_users = list(set().union(*user_lists))

    # Get ids for each sharing group that contains a user from selected_reporting_group_users OR a user that was specifically selected
    share_group_q = HQESQuery(index="groups").domain(self.domain)\
        .doc_type("Group")\
        .filter(filters.term("case_sharing", True))\
        .filter(filters.term("users", selected_reporting_group_users+selected_user_ids+special_user_ids))\
        .fields([])
    sharing_group_ids = share_group_q.run().doc_ids

    # De-duplicate all id sources into a single owner list.
    owner_ids = list(set().union(special_user_ids,
                                 selected_user_ids,
                                 selected_sharing_group_ids,
                                 selected_reporting_group_users,
                                 sharing_group_ids))
    # Reuse user_types computed above rather than re-parsing the request.
    if HQUserType.COMMTRACK in user_types:
        owner_ids.append("commtrack-system")
    return owner_ids
def test_source_include(self):
    """.source() adds a top-level _source clause to the query body."""
    expected = {
        "query": {
            "filtered": {
                "filter": {"and": [{"match_all": {}}]},
                "query": {"match_all": {}},
            }
        },
        "size": SIZE_LIMIT,
        "_source": "source_obj",
    }
    self.checkQuery(HQESQuery('forms').source('source_obj'), expected)
def test_sort(self):
    # sort() replaces the existing sort by default; passing
    # reset_sort=False appends instead, preserving earlier sort keys.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "size": SIZE_LIMIT,
        "sort": [{
            "timeEnd": {
                "order": "asc"
            }
        }],
    }
    query = (
        HQESQuery('forms')
        .sort('timeEnd')
    )
    self.checkQuery(query, json_output)

    # A second sort() call replaces the first...
    json_output['sort'] = [
        {
            "timeStart": {
                "order": "asc"
            }
        },
    ]
    self.checkQuery(query.sort('timeStart'), json_output)

    # ...unless reset_sort=False, which appends it.
    json_output['sort'] = [
        {
            "timeEnd": {
                "order": "asc"
            }
        },
        {
            "timeStart": {
                "order": "asc"
            }
        },
    ]
    self.checkQuery(query.sort('timeStart', reset_sort=False), json_output)
def test_result_parsing_basic(self):
    """Filter-aggregation doc counts are reachable as attributes on the
    parsed result set."""
    query = HQESQuery('cases').aggregations([
        FilterAggregation('closed', filters.term('closed', True)),
        FilterAggregation('open', filters.term('closed', False)),
    ])
    raw_result = {
        "aggregations": {
            "closed": {"doc_count": 1},
            "open": {"doc_count": 2},
        }
    }
    queryset = ESQuerySet(raw_result, deepcopy(query))
    self.assertEqual(queryset.aggregations.closed.doc_count, 1)
    self.assertEqual(queryset.aggregations.open.doc_count, 2)
def test_not_or_rewrite(self):
    # NOT(OR(a, b)) should be emitted as bool/must_not wrapping a
    # bool/should clause, not as literal "not"/"or" filters.
    json_output = {
        "query": {
            "bool": {
                "filter": [
                    {
                        "bool": {
                            "must_not": {
                                "bool": {
                                    "should": [
                                        {
                                            "term": {
                                                "type": "A"
                                            }
                                        },
                                        {
                                            "term": {
                                                "type": "B"
                                            }
                                        }
                                    ]
                                }
                            }
                        }
                    },
                    {
                        "match_all": {}
                    }
                ],
                "must": {
                    "match_all": {}
                }
            }
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').filter(
        filters.NOT(
            filters.OR(filters.term('type', 'A'), filters.term('type', 'B'))
        )
    )
    self.checkQuery(query, json_output)
def es_histogram(histo_type, domains=None, startdate=None, enddate=None,
                 interval="day", filters=None):
    """Return date-histogram facet data for the given index.

    :param histo_type: index name; also keys DATE_FIELDS / ADD_TO_ES_FILTER
    :param domains: optional domain list to restrict the query
    :param startdate: inclusive lower bound on the date field
    :param enddate: inclusive upper bound on the date field
    :param interval: histogram bucket interval (default "day")
    :param filters: optional extra ES filter(s); default was a mutable
        ``[]`` — now ``None`` to avoid the shared-mutable-default pitfall
        (behavior is unchanged: both are falsy and only read)
    """
    from corehq.apps.es.es_query import HQESQuery
    date_field = DATE_FIELDS[histo_type]
    query = (
        HQESQuery(index=histo_type)
        .range_filter(date_field, gte=startdate, lte=enddate)
    )
    # Index-specific baseline filters configured for this histogram type.
    for filter_ in ADD_TO_ES_FILTER.get(histo_type, []):
        query = query.filter(filter_)
    if domains is not None:
        query = query.domain(domains)
    if filters:
        query = query.filter(filters)
    query = query.date_histogram('histo', date_field, interval)
    ret_data = query.run().aggregations.histo.as_facet_result()
    return ret_data
def test_exclude_source(self):
    """exclude_source() sets a top-level ``_source: False`` while the
    rest of the (bool-style) query body is unaffected."""
    query = HQESQuery('forms').domain('test-exclude').exclude_source()
    expected = {
        "query": {
            "bool": {
                "filter": [
                    {"term": {"domain.exact": "test-exclude"}},
                    {"match_all": {}},
                ],
                "must": {"match_all": {}},
            }
        },
        "_source": False,
        "size": SIZE_LIMIT,
    }
    self.checkQuery(query, expected)
def test_top_hits_aggregation(self):
    # TopHitsAggregation should map field/is_ascending/size/include to
    # the corresponding sort/_source/size clauses of a top_hits agg.
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [
                        {"match_all": {}}
                    ]
                },
                "query": {"match_all": {}}
            }
        },
        "aggs": {
            "name_top_hits": {
                "top_hits": {
                    "sort": [{
                        "my_awesome_field": {
                            "order": "desc"
                        }
                    }],
                    "_source": {
                        "include": [
                            "title"
                        ]
                    },
                    "size": 2
                },
            },
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').aggregation(
        TopHitsAggregation(
            'name_top_hits',
            field='my_awesome_field',
            is_ascending=False,
            size=2,
            include=['title'])
    )
    self.checkQuery(query, json_output)
def test_not_and_rewrite(self):
    # NOT(AND(a, b)) is rewritten via De Morgan's law into
    # OR(NOT(a), NOT(b)).
    json_output = {
        "query": {
            "filtered": {
                "filter": {
                    "and": [{
                        'or': (
                            {
                                "not": {
                                    "term": {
                                        "type": "A"
                                    }
                                }
                            },
                            {
                                "not": {
                                    "term": {
                                        "type": "B"
                                    }
                                }
                            },
                        )
                    }, {
                        "match_all": {}
                    }]
                },
                "query": {
                    "match_all": {}
                }
            }
        },
        "size": SIZE_LIMIT
    }
    query = HQESQuery('cases').filter(
        filters.NOT(
            filters.AND(filters.term('type', 'A'), filters.term('type', 'B'))))
    self.checkQuery(query, json_output)
def __init__(self, index_name, config):
    # index_name: name of the ES index to query
    # config: data source config supplying the indicator columns
    self.index_name = index_name
    self.config = config
    self.es = HQESQuery(index_name)