def query_elasticsearch(self, filters) -> dict:
    filter_query = QueryWithFilters.generate_awards_elasticsearch_query(filters)
    s = AwardSearch().filter(filter_query)
    # Bucket the filtered awards with a "filters" aggregation: one named bucket
    # per award-type category, each matching that category's type codes
    s.aggs.bucket(
        "types",
        "filters",
        filters={category: Q("terms", type=types) for category, types in all_award_types_mappings.items()},
    )
    results = s.handle_execute()

    contracts = results.aggregations.types.buckets.contracts.doc_count
    idvs = results.aggregations.types.buckets.idvs.doc_count
    grants = results.aggregations.types.buckets.grants.doc_count
    direct_payments = results.aggregations.types.buckets.direct_payments.doc_count
    loans = results.aggregations.types.buckets.loans.doc_count
    other = results.aggregations.types.buckets.other_financial_assistance.doc_count

    response = {
        "contracts": contracts,
        "direct_payments": direct_payments,
        "grants": grants,
        "idvs": idvs,
        "loans": loans,
        "other": other,
    }

    return response
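For context, the same "filters" aggregation can be built with plain elasticsearch-dsl, outside the project's `AwardSearch` / `QueryWithFilters` wrappers. The sketch below is a minimal, self-contained version: the connection, the `award-search` index name, and the stand-in mapping for `all_award_types_mappings` are all assumptions, not the project's actual wiring.

```python
from elasticsearch_dsl import Q, Search, connections

# Connection details and index name are assumptions for this sketch
connections.create_connection(hosts=["http://localhost:9200"])

# Stand-in for all_award_types_mappings: category name -> award type codes
award_type_mappings = {
    "contracts": ["A", "B", "C", "D"],
    "idvs": ["IDV_A", "IDV_B", "IDV_C"],
    "grants": ["02", "03", "04", "05"],
}

s = Search(index="award-search").extra(size=0)  # only bucket counts are needed, not documents
s.aggs.bucket(
    "types",
    "filters",
    filters={category: Q("terms", type=codes) for category, codes in award_type_mappings.items()},
)

results = s.execute()
counts = {name: bucket.doc_count for name, bucket in results.aggregations.types.buckets.to_dict().items()}
print(counts)  # e.g. {"contracts": 123, "idvs": 45, "grants": 678}
```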
def build_elasticsearch_search_with_aggregation(self, filter_query: ES_Q) -> Optional[AwardSearch]:
    # Create the initial search using filters
    search = AwardSearch().filter(filter_query)

    # Check number of unique terms (buckets) for performance and restrictions on maximum buckets allowed
    bucket_count = get_number_of_unique_terms_for_awards(filter_query, f"{self.agg_key}.hash")

    if bucket_count == 0:
        return None
    else:
        # Add 1 to handle null case since murmur3 doesn't support "null_value" property
        bucket_count += 1

    # Add 100 to make sure that we consider enough records in each shard for accurate results
    group_by_agg_key = A("terms", field=self.agg_key, size=bucket_count, shard_size=bucket_count + 100)
    sum_aggregations = get_scaled_sum_aggregations(self.metric_field)
    sum_field = sum_aggregations["sum_field"]
    search.aggs.bucket("group_by_agg_key", group_by_agg_key).metric("sum_field", sum_field)

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
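A hypothetical caller for this helper might look like the following. The bucket names (`group_by_agg_key`, `sum_field`) match the aggregations registered above; everything else, including the surrounding endpoint wiring and the response shape, is an assumption for illustration.

```python
# Hedged sketch of endpoint wiring around build_elasticsearch_search_with_aggregation
filter_query = QueryWithFilters.generate_awards_elasticsearch_query(filters)
search = self.build_elasticsearch_search_with_aggregation(filter_query)

if search is None:
    results = []  # zero unique terms: nothing to aggregate
else:
    response = search.handle_execute()
    # Each bucket carries its term key plus the "sum_field" metric attached above
    results = [
        {"key": bucket.key, "total": bucket.sum_field.value}
        for bucket in response.aggregations.group_by_agg_key.buckets
    ]
```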
def get_number_of_unique_terms_for_awards(filter_query: ES_Q, field: str) -> int:
    """
    Returns the count for a specific filter_query.
    NOTE: Counts below the precision_threshold are expected to be close to accurate (per the Elasticsearch
    documentation). Since aggregations do not support more than 10k buckets this value is hard coded to 11k
    to ensure that endpoints using Elasticsearch do not cross the 10k threshold. Elasticsearch endpoints
    should be implemented with a safeguard in case this count is above 10k.
    """
    return _get_number_of_unique_terms(AwardSearch().filter(filter_query), field)
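`_get_number_of_unique_terms` is internal and not shown here, but the docstring implies a cardinality aggregation with its precision threshold pinned above the 10k bucket ceiling. A minimal sketch under that assumption:

```python
from elasticsearch_dsl import A

def _get_number_of_unique_terms_sketch(search, field: str) -> int:
    # Cardinality is an approximate distinct count; pinning the threshold at 11k
    # keeps counts at or near the 10k bucket limit close to exact
    search.aggs.metric("unique_count", A("cardinality", field=field, precision_threshold=11000))
    search.update_from_dict({"size": 0})  # aggregation only, no documents
    return search.execute().aggregations.unique_count.value
```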
def query_elasticsearch(self) -> list:
    filter_query = QueryWithFilters.generate_awards_elasticsearch_query(self.filters)
    sort_field = self.get_elastic_sort_by_fields()
    sorts = [{field: self.pagination["sort_order"]} for field in sort_field]
    record_num = (self.pagination["page"] - 1) * self.pagination["limit"]

    # Random page jumping was removed due to performance concerns
    if (self.last_record_sort_value is None and self.last_record_unique_id is not None) or (
        self.last_record_sort_value is not None and self.last_record_unique_id is None
    ):
        # Malformed request: search_after needs both values or neither
        raise Exception(
            "Using search_after functionality in Elasticsearch requires both last_record_sort_value and last_record_unique_id."
        )
    if record_num >= settings.ES_AWARDS_MAX_RESULT_WINDOW and (
        self.last_record_unique_id is None and self.last_record_sort_value is None
    ):
        raise UnprocessableEntityException(
            "Page #{page} with limit {limit} is over the maximum result limit {es_limit}. Please provide the "
            "'last_record_sort_value' and 'last_record_unique_id' to paginate sequentially.".format(
                page=self.pagination["page"],
                limit=self.pagination["limit"],
                es_limit=settings.ES_AWARDS_MAX_RESULT_WINDOW,
            )
        )

    if self.last_record_sort_value is not None and self.last_record_unique_id is not None:
        # search_after values were provided in the API request; request one
        # extra result to check for a next page
        search = (
            AwardSearch()
            .filter(filter_query)
            .sort(*sorts)
            .extra(search_after=[self.last_record_sort_value, self.last_record_unique_id])[
                : self.pagination["limit"] + 1
            ]
        )
    else:
        # No search_after values and within the result window; use regular from/size paging
        search = AwardSearch().filter(filter_query).sort(*sorts)[record_num : record_num + self.pagination["limit"]]

    response = search.handle_execute()

    return response
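A hypothetical caller illustrating the deep-pagination handshake: because one extra hit is requested, a row beyond the limit signals a next page, and the last returned hit's sort tuple is echoed back as the next request's `last_record_sort_value` / `last_record_unique_id`. The consumer side here is an assumption; only the attribute names mirror the function above.

```python
# Hedged sketch of the search_after handshake on the consumer side
hits = list(self.query_elasticsearch())
has_next = len(hits) > self.pagination["limit"]
rows = hits[: self.pagination["limit"]]

next_sort_value, next_unique_id = None, None
if has_next:
    # elasticsearch-dsl exposes each hit's sort tuple via hit.meta.sort;
    # these two values seed the next page's search_after
    next_sort_value, next_unique_id = rows[-1].meta.sort
```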
def query_elasticsearch(self) -> list:
    filter_query = QueryWithFilters.generate_awards_elasticsearch_query(self.filters)
    sort_field = self.get_elastic_sort_by_fields()
    sorts = [{field: self.pagination["sort_order"]} for field in sort_field]
    record_num = (self.pagination["page"] - 1) * self.pagination["limit"]

    if self.last_value and self.last_id:
        # search_after values were provided; page sequentially from them
        search = (
            AwardSearch()
            .filter(filter_query)
            .sort(*sorts)
            .extra(search_after=[self.last_value, self.last_id])[0 : self.pagination["limit"]]
        )
    else:
        # Otherwise use regular from/size paging
        search = AwardSearch().filter(filter_query).sort(*sorts)[record_num : record_num + self.pagination["limit"]]

    response = search.handle_execute()

    return response
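Note that this variant gates search_after on truthiness (`if self.last_value and self.last_id`) rather than the explicit `is not None` checks of the previous variant, so a legitimate falsy sort value such as `0` would silently fall back to from/size paging. Below is a sketch of sequential paging against this handler; `process` is a hypothetical consumer and the loop contract is an assumption.

```python
# Hedged sketch: walk all pages sequentially via search_after
last_value, last_id = None, None
while True:
    self.last_value, self.last_id = last_value, last_id
    page = list(self.query_elasticsearch())
    if not page:
        break  # no more results
    process(page)  # hypothetical consumer of one page of hits
    # Carry the final hit's sort tuple into the next iteration
    last_value, last_id = page[-1].meta.sort
```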
def build_elasticsearch_search_with_aggregations(self) -> Optional[AwardSearch]:
    """
    Using the provided ES_Q object, creates an AwardSearch object with the necessary aggregations applied.
    """
    # Create the initial search using filters
    search = AwardSearch().filter(self.filter_query)

    # As of this writing, settings.ES_ROUTING_FIELD is the only high-cardinality aggregation that
    # we support. Since the Elasticsearch clusters are routed by this field we don't care to get a count of
    # unique buckets, but instead we use the upper_limit and don't allow an upper_limit > 10k.
    if self.bucket_count == 0:
        return None
    elif self.agg_key == settings.ES_ROUTING_FIELD:
        size = self.bucket_count
        shard_size = size
        group_by_agg_key_values = {
            "order": [
                {self.sort_column_mapping[self.pagination.sort_key]: self.pagination.sort_order},
                {self.sort_column_mapping["id"]: self.pagination.sort_order},
            ]
        }
        bucket_sort_values = None
    else:
        size = self.bucket_count
        shard_size = self.bucket_count + 100
        group_by_agg_key_values = {}
        bucket_sort_values = {
            "sort": [
                {self.sort_column_mapping[self.pagination.sort_key]: {"order": self.pagination.sort_order}},
                {self.sort_column_mapping["id"]: {"order": self.pagination.sort_order}},
            ]
        }

    if shard_size > 10000:
        raise ForbiddenException(
            "Current filters return too many unique items. Narrow filters to return results or use downloads."
        )

    # Define all aggregations needed to build the response
    group_by_agg_key_values.update({"field": self.agg_key, "size": size, "shard_size": shard_size})
    group_by_agg_key = A("terms", **group_by_agg_key_values)
    sum_aggregations = {
        mapping: get_scaled_sum_aggregations(mapping, self.pagination) for mapping in self.sum_column_mapping.values()
    }

    search.aggs.bucket(self.agg_group_name, group_by_agg_key)
    for field, sum_aggregation in sum_aggregations.items():
        search.aggs[self.agg_group_name].metric(field, sum_aggregation["sum_field"])

    if bucket_sort_values:
        bucket_sort_aggregation = A("bucket_sort", **bucket_sort_values)
        search.aggs[self.agg_group_name].pipeline("pagination_aggregation", bucket_sort_aggregation)

    # If provided, break down primary bucket aggregation into sub-aggregations based on a sub_agg_key
    if self.sub_agg_key:
        self.extend_elasticsearch_search_with_sub_aggregation(search)

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
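For reference, here is a raw-DSL rendering of the terms + sum + bucket_sort shape this builder produces in the non-routed branch. The field names, sizes, and sort order are illustrative assumptions; only the aggregation names and nesting follow the code above.

```python
# Hedged sketch of the request body's "aggs" section (illustrative values)
aggs = {
    "group_by_agg_key": {
        "terms": {"field": "recipient_agg_key", "size": 500, "shard_size": 600},
        "aggs": {
            # One sum metric per entry in sum_column_mapping
            "total_obligation": {"sum": {"field": "total_obligation"}},
            # bucket_sort pipeline orders the parent terms buckets for pagination
            "pagination_aggregation": {
                "bucket_sort": {
                    "sort": [
                        {"total_obligation": {"order": "desc"}},
                        {"_key": {"order": "asc"}},  # deterministic tiebreaker on the bucket key
                    ]
                }
            },
        },
    }
}
```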