def build_elasticsearch_search_with_aggregation(self, filter_query: ES_Q) -> Optional[TransactionSearch]:
    # Create the initial search using filters
    search = TransactionSearch().filter(filter_query)

    # Check number of unique terms (buckets) for performance and restrictions on maximum buckets allowed
    bucket_count = get_number_of_unique_terms(filter_query, f"{self.agg_key}.hash")
    if bucket_count == 0:
        return None

    # Add 100 to make sure that we consider enough records in each shard for accurate results
    group_by_agg_key = A("terms", field=self.agg_key, size=bucket_count, shard_size=bucket_count + 100)

    sum_aggregations = get_scaled_sum_aggregations(self.obligation_column)
    sum_field = sum_aggregations["sum_field"]

    search.aggs.bucket("group_by_agg_key", group_by_agg_key).metric("sum_field", sum_field)

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
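# For context, a minimal sketch of what `get_scaled_sum_aggregations` is assumed to
# return: a dict of elasticsearch-dsl aggregation objects keyed by role. The keys
# ("sum_field", "sum_bucket_sort", "sum_bucket_truncate") come from how the callers
# in this section use the result; the "* 100" value script is an assumption inferred
# from the `Decimal("100")` division in `obtain_recipient_totals` further below.
from typing import Dict

from elasticsearch_dsl import A


def get_scaled_sum_aggregations_sketch(field_to_sum: str, pagination=None) -> Dict[str, A]:
    # Sum monetary values as scaled integers (value * 100) so shard-level floating
    # point error cannot creep into the totals; callers divide the final value by
    # Decimal("100") to recover dollars.
    sum_field = A("sum", field=field_to_sum, script={"source": "_value * 100"})

    # Pipeline aggregations that sort (and, for the truncating variant, slice) the
    # parent terms buckets by the "sum_field" metric defined above.
    sum_bucket_sort = A("bucket_sort", sort=[{"sum_field": {"order": "desc"}}])
    sum_bucket_truncate = A(
        "bucket_sort",
        sort=[{"sum_field": {"order": "desc"}}],
        size=pagination.upper_limit if pagination else 10,
    )

    return {
        "sum_field": sum_field,
        "sum_bucket_sort": sum_bucket_sort,
        "sum_bucket_truncate": sum_bucket_truncate,
    }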
def build_elasticsearch_search_with_aggregations(self, filter_query: ES_Q) -> Optional[TransactionSearch]:
    """
    Using the provided ES_Q object creates a TransactionSearch object with the necessary applied aggregations.
    """
    # Create the filtered Search object
    search = TransactionSearch().filter(filter_query)

    sum_aggregations = get_scaled_sum_aggregations("generated_pragmatic_obligation", self.pagination)

    # Need to handle high cardinality categories differently; this assumes that the Search object references
    # an Elasticsearch cluster that has a "routing" equal to "self.category.agg_key"
    if self.category.name in self.high_cardinality_categories:
        # 10k is the maximum number of allowed buckets
        size = self.pagination.upper_limit
        shard_size = size
        sum_bucket_sort = sum_aggregations["sum_bucket_truncate"]
        group_by_agg_key_values = {"order": {"sum_field": "desc"}}
    else:
        # Get count of unique buckets; terminate early if there are no buckets matching criteria
        bucket_count = get_number_of_unique_terms(filter_query, f"{self.category.agg_key}.hash")
        if bucket_count == 0:
            return None
        else:
            # Add 100 to make sure that we consider enough records in each shard for accurate results;
            # only needed for non high-cardinality fields since those are being routed
            size = bucket_count
            shard_size = bucket_count + 100
            sum_bucket_sort = sum_aggregations["sum_bucket_sort"]
            group_by_agg_key_values = {}

    if shard_size > 10000:
        logger.warning(f"Max number of buckets reached for aggregation key: {self.category.agg_key}.")
        raise ElasticsearchConnectionException(
            "Current filters return too many unique items. Narrow filters to return results."
        )

    # Define all aggregations needed to build the response
    group_by_agg_key_values.update({"field": self.category.agg_key, "size": size, "shard_size": shard_size})
    group_by_agg_key = A("terms", **group_by_agg_key_values)
    sum_field = sum_aggregations["sum_field"]

    # Apply the aggregations to the TransactionSearch object
    search.aggs.bucket("group_by_agg_key", group_by_agg_key).metric("sum_field", sum_field).pipeline(
        "sum_bucket_sort", sum_bucket_sort
    )

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
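# A quick way to see what the method above actually sends to Elasticsearch is to
# build the same shape with plain elasticsearch-dsl and dump it. Everything here is
# illustrative: the field names and sizes are hypothetical, and the plain sum agg
# stands in for the scaled "sum_field" produced by `get_scaled_sum_aggregations`.
from elasticsearch_dsl import A, Search

sketch = Search()
group_by_agg_key = A("terms", field="awarding_agency_agg_key", size=500, shard_size=600)
sum_field = A("sum", field="generated_pragmatic_obligation")
sum_bucket_sort = A("bucket_sort", sort=[{"sum_field": {"order": "desc"}}], size=10)

sketch.aggs.bucket("group_by_agg_key", group_by_agg_key).metric("sum_field", sum_field).pipeline(
    "sum_bucket_sort", sum_bucket_sort
)
sketch.update_from_dict({"size": 0})

# The resulting request body nests the sum metric and the bucket_sort pipeline
# inside the terms aggregation, with "size": 0 suppressing the document hits:
# {"aggs": {"group_by_agg_key": {"terms": {...}, "aggs": {"sum_field": {...},
#  "sum_bucket_sort": {"bucket_sort": {...}}}}}, "size": 0}
print(sketch.to_dict())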
def extend_elasticsearch_search_with_sub_aggregation(self, search: AwardSearch):
    """
    This template method is called if the `self.sub_agg_key` is supplied, in order to post-process the query
    and inject a sub-aggregation on a secondary dimension (that is subordinate to the first agg_key's dimension).

    Example: Subtier Agency spending rolled up to Toptier Agency spending
    """
    sub_bucket_count = get_number_of_unique_terms_for_awards(self.filter_query, f"{self.sub_agg_key}.hash")
    size = sub_bucket_count
    shard_size = sub_bucket_count + 100
    sub_group_by_sub_agg_key_values = {}

    if shard_size > 10000:
        raise ForbiddenException(
            "Current filters return too many unique items. Narrow filters to return results or use downloads."
        )

    # Sub-aggregation to append to primary agg
    sub_group_by_sub_agg_key_values.update(
        {
            "field": self.sub_agg_key,
            "size": size,
            "shard_size": shard_size,
            "order": [
                {self.sort_column_mapping[self.pagination.sort_key]: self.pagination.sort_order},
                {self.sort_column_mapping["id"]: self.pagination.sort_order},
            ],
        }
    )
    sub_group_by_sub_agg_key = A("terms", **sub_group_by_sub_agg_key_values)

    sum_aggregations = {
        mapping: get_scaled_sum_aggregations(mapping) for mapping in self.sum_column_mapping.values()
    }

    # Append sub-agg to primary agg, and include the sub-agg's sum metric aggs too
    search.aggs[self.agg_group_name].bucket(self.sub_agg_group_name, sub_group_by_sub_agg_key)
    for field, sum_aggregation in sum_aggregations.items():
        search.aggs[self.agg_group_name].aggs[self.sub_agg_group_name].metric(field, sum_aggregation["sum_field"])
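# Shape of the nested (primary -> sub) aggregation the method above produces,
# sketched with hypothetical agg keys and group names: toptier agency buckets,
# each broken down into subtier agency buckets that carry their own sum metric.
from elasticsearch_dsl import A, Search

nested_sketch = Search()
nested_sketch.aggs.bucket("group_by_agg_key", A("terms", field="toptier_agency_agg_key", size=100))
nested_sketch.aggs["group_by_agg_key"].bucket(
    "group_by_sub_agg_key", A("terms", field="subtier_agency_agg_key", size=100, shard_size=200)
)
nested_sketch.aggs["group_by_agg_key"].aggs["group_by_sub_agg_key"].metric(
    "sum_obligation", A("sum", field="generated_pragmatic_obligation")
)
print(nested_sketch.to_dict())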
def build_elasticsearch_search_with_aggregations(self) -> Optional[AwardSearch]:
    """
    Using the instance's ES_Q filter query creates an AwardSearch object with the necessary applied aggregations.
    """
    # Create the initial search using filters
    search = AwardSearch().filter(self.filter_query)

    # As of writing this the value of settings.ES_ROUTING_FIELD is the only high cardinality aggregation that
    # we support. Since the Elasticsearch clusters are routed by this field we don't care to get a count of
    # unique buckets, but instead we use the upper_limit and don't allow an upper_limit > 10k.
    if self.bucket_count == 0:
        return None
    elif self.agg_key == settings.ES_ROUTING_FIELD:
        size = self.bucket_count
        shard_size = size
        group_by_agg_key_values = {
            "order": [
                {self.sort_column_mapping[self.pagination.sort_key]: self.pagination.sort_order},
                {self.sort_column_mapping["id"]: self.pagination.sort_order},
            ]
        }
        bucket_sort_values = None
    else:
        size = self.bucket_count
        shard_size = self.bucket_count + 100
        group_by_agg_key_values = {}
        bucket_sort_values = {
            "sort": [
                {self.sort_column_mapping[self.pagination.sort_key]: {"order": self.pagination.sort_order}},
                {self.sort_column_mapping["id"]: {"order": self.pagination.sort_order}},
            ]
        }

    if shard_size > 10000:
        raise ForbiddenException(
            "Current filters return too many unique items. Narrow filters to return results or use downloads."
        )

    # Define all aggregations needed to build the response
    group_by_agg_key_values.update({"field": self.agg_key, "size": size, "shard_size": shard_size})
    group_by_agg_key = A("terms", **group_by_agg_key_values)

    sum_aggregations = {
        mapping: get_scaled_sum_aggregations(mapping, self.pagination)
        for mapping in self.sum_column_mapping.values()
    }

    search.aggs.bucket(self.agg_group_name, group_by_agg_key)
    for field, sum_aggregation in sum_aggregations.items():
        search.aggs[self.agg_group_name].metric(field, sum_aggregation["sum_field"])

    if bucket_sort_values:
        bucket_sort_aggregation = A("bucket_sort", **bucket_sort_values)
        search.aggs[self.agg_group_name].pipeline("pagination_aggregation", bucket_sort_aggregation)

    # If provided, break down primary bucket aggregation into sub-aggregations based on a sub_agg_key
    if self.sub_agg_key:
        self.extend_elasticsearch_search_with_sub_aggregation(search)

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
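# The "pagination_aggregation" above is a bucket_sort pipeline: Elasticsearch first
# builds every terms bucket, then the pipeline sorts the buckets by the sum metric
# (with the id column as a tie-breaker) and slices out a single page. A hypothetical
# page-3, 10-per-page slice over metrics named "sum_obligation" / "award_id":
from elasticsearch_dsl import A

bucket_sort_aggregation = A(
    "bucket_sort",
    sort=[{"sum_obligation": {"order": "desc"}}, {"award_id": {"order": "desc"}}],
    # "from" is a reserved word in Python, so pass it through a dict expansion
    **{"from": 20, "size": 10},  # skip the first two pages of buckets, keep ten
)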
def obtain_recipient_totals(recipient_id, children=False, year="latest"):
    """Extract the total amount and transaction count for the recipient_hash given the time frame

    Args:
        recipient_id: string of hash(duns, name)-[recipient-level]
        children: whether or not to group by children
        year: the year the totals/counts are based on

    Returns:
        list of dictionaries representing hashes and their totals/counts
    """
    filters = reshape_filters(recipient_id=recipient_id, year=year)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(filters)

    search = TransactionSearch().filter(filter_query)

    if children:
        group_by_field = "recipient_agg_key"
    elif recipient_id[-2:] == "-P":
        group_by_field = "parent_recipient_hash"
    else:
        group_by_field = "recipient_hash"

    bucket_count = get_number_of_unique_terms_for_transactions(filter_query, f"{group_by_field}.hash")
    if bucket_count == 0:
        return []

    # Not setting the shard_size since the number of child recipients under a
    # parent recipient will not exceed 10k
    group_by_recipient = A("terms", field=group_by_field, size=bucket_count)
    sum_obligation = get_scaled_sum_aggregations("generated_pragmatic_obligation")["sum_field"]
    filter_loans = A("filter", terms={"type": list(loan_type_mapping.keys())})
    sum_face_value_loan = get_scaled_sum_aggregations("face_value_loan_guarantee")["sum_field"]

    search.aggs.bucket("group_by_recipient", group_by_recipient)
    search.aggs["group_by_recipient"].metric("sum_obligation", sum_obligation)
    search.aggs["group_by_recipient"].bucket("filter_loans", filter_loans)
    search.aggs["group_by_recipient"]["filter_loans"].metric("sum_face_value_loan", sum_face_value_loan)

    response = search.handle_execute()
    response_as_dict = response.aggs.to_dict()
    recipient_info_buckets = response_as_dict.get("group_by_recipient", {}).get("buckets", [])

    result_list = []

    for bucket in recipient_info_buckets:
        result = {}
        if children:
            recipient_info = json.loads(bucket.get("key"))
            hash_with_level = recipient_info.get("hash_with_level") or None
            result = {
                "recipient_hash": hash_with_level[:-2] if hash_with_level else None,
                "recipient_unique_id": recipient_info.get("unique_id"),
                "recipient_name": recipient_info.get("name"),
            }
        loan_info = bucket.get("filter_loans", {})
        result.update(
            {
                "total_obligation_amount": int(bucket.get("sum_obligation", {"value": 0})["value"])
                / Decimal("100"),
                "total_obligation_count": bucket.get("doc_count", 0),
                "total_face_value_loan_amount": int(loan_info.get("sum_face_value_loan", {"value": 0})["value"])
                / Decimal("100"),
                "total_face_value_loan_count": loan_info.get("doc_count", 0),
            }
        )
        result_list.append(result)
    return result_list
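# Worked example of the scaled-integer convention used above: the sum metrics are
# assumed to arrive from Elasticsearch as value * 100 (see the sketch near the top
# of this section), so a returned value of 1234567.0 represents $12,345.67.
from decimal import Decimal

raw_sum = {"value": 1234567.0}  # hypothetical fragment of an aggregation response
dollars = int(raw_sum["value"]) / Decimal("100")
assert dollars == Decimal("12345.67")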