    def build_elasticsearch_search_with_aggregation(
            self, filter_query: ES_Q) -> Optional[TransactionSearch]:
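        """
        Using the provided ES_Q object, creates a TransactionSearch object with the necessary aggregation applied.
        """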
        # Create the initial search using filters
        search = TransactionSearch().filter(filter_query)

        # Check the number of unique terms (buckets), both for performance and to respect the maximum allowed bucket count
        bucket_count = get_number_of_unique_terms(filter_query,
                                                  f"{self.agg_key}.hash")

        if bucket_count == 0:
            return None

        # Add 100 to make sure that we consider enough records in each shard for accurate results
        group_by_agg_key = A("terms",
                             field=self.agg_key,
                             size=bucket_count,
                             shard_size=bucket_count + 100)
        sum_aggregations = get_scaled_sum_aggregations(self.obligation_column)
        sum_field = sum_aggregations["sum_field"]

        search.aggs.bucket("group_by_agg_key",
                           group_by_agg_key).metric("sum_field", sum_field)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search
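
A minimal usage sketch, not part of the original snippet: it assumes the response-handling pattern from the last example on this page (`handle_execute` plus `response.aggs.to_dict()`) and the cents-scaled sum convention, and `spending_by_key` is a hypothetical caller name.

from decimal import Decimal

def spending_by_key(builder, filter_query):
    # Hypothetical caller: build the aggregation-only search and execute it.
    search = builder.build_elasticsearch_search_with_aggregation(filter_query)
    if search is None:
        return []  # no buckets matched the filters

    response = search.handle_execute()
    buckets = response.aggs.to_dict().get("group_by_agg_key", {}).get("buckets", [])

    # Each bucket carries the grouped key, its document count, and the summed metric.
    return [
        {
            "key": bucket["key"],
            "count": bucket["doc_count"],
            "obligation": int(bucket["sum_field"]["value"]) / Decimal("100"),
        }
        for bucket in buckets
    ]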
Example #2
    def build_elasticsearch_search_with_aggregations(
            self, filter_query: ES_Q) -> Optional[TransactionSearch]:
        """
        Using the provided ES_Q object, creates a TransactionSearch object with the necessary aggregations applied.
        """
        # Create the filtered Search Object
        search = TransactionSearch().filter(filter_query)

        sum_aggregations = get_scaled_sum_aggregations(
            "generated_pragmatic_obligation", self.pagination)

        # Need to handle high cardinality categories differently; this assumes that the Search object references
        # an Elasticsearch cluster that has a "routing" equal to "self.category.agg_key"
        if self.category.name in self.high_cardinality_categories:
            # 10k is the maximum number of allowed buckets
            size = self.pagination.upper_limit
            shard_size = size
            sum_bucket_sort = sum_aggregations["sum_bucket_truncate"]
            group_by_agg_key_values = {"order": {"sum_field": "desc"}}
        else:
            # Get count of unique buckets; terminate early if there are no buckets matching criteria
            bucket_count = get_number_of_unique_terms(
                filter_query, f"{self.category.agg_key}.hash")
            if bucket_count == 0:
                return None
            else:
                # Add 100 to make sure that we consider enough records in each shard for accurate results;
                # Only needed for non high-cardinality fields since those are being routed
                size = bucket_count
                shard_size = bucket_count + 100
                sum_bucket_sort = sum_aggregations["sum_bucket_sort"]
                group_by_agg_key_values = {}

        if shard_size > 10000:
            logger.warning(
                f"Max number of buckets reached for aggregation key: {self.category.agg_key}."
            )
            raise ElasticsearchConnectionException(
                "Current filters return too many unique items. Narrow filters to return results."
            )

        # Define all aggregations needed to build the response
        group_by_agg_key_values.update({
            "field": self.category.agg_key,
            "size": size,
            "shard_size": shard_size
        })
        group_by_agg_key = A("terms", **group_by_agg_key_values)

        sum_field = sum_aggregations["sum_field"]

        # Apply the aggregations to the TransactionSearch object
        search.aggs.bucket("group_by_agg_key", group_by_agg_key).metric(
            "sum_field", sum_field
        ).pipeline("sum_bucket_sort", sum_bucket_sort)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search
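
A quick way to inspect what the method above builds (a hypothetical helper, not from the source): elasticsearch-dsl's `Search.to_dict()` renders the full request body, which makes the difference between the high-cardinality branch (`sum_bucket_truncate`, ordered by `sum_field`) and the default branch (`sum_bucket_sort`) easy to see.

import json

def dump_request_body(builder, filter_query):
    # Hypothetical inspection helper: print the request body the builder produced.
    search = builder.build_elasticsearch_search_with_aggregations(filter_query)
    if search is None:
        print("No matching buckets; no request was built.")
        return
    # to_dict() is the standard elasticsearch-dsl way to view the final query/aggs.
    print(json.dumps(search.to_dict(), indent=2, default=str))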
Example #3
    def extend_elasticsearch_search_with_sub_aggregation(
            self, search: AwardSearch):
        """
        This template method is called if `self.sub_agg_key` is supplied, in order to post-process the query and
        inject a sub-aggregation on a secondary dimension (that is subordinate to the first agg_key's dimension).

        Example: Subtier Agency spending rolled up to Toptier Agency spending
        """
        sub_bucket_count = get_number_of_unique_terms_for_awards(
            self.filter_query, f"{self.sub_agg_key}.hash")
        size = sub_bucket_count
        shard_size = sub_bucket_count + 100
        sub_group_by_sub_agg_key_values = {}

        if shard_size > 10000:
            raise ForbiddenException(
                "Current filters return too many unique items. Narrow filters to return results or use downloads."
            )

        # Sub-aggregation to append to primary agg
        sub_group_by_sub_agg_key_values.update({
            "field": self.sub_agg_key,
            "size": size,
            "shard_size": shard_size,
            "order": [
                {self.sort_column_mapping[self.pagination.sort_key]: self.pagination.sort_order},
                {self.sort_column_mapping["id"]: self.pagination.sort_order},
            ],
        })
        sub_group_by_sub_agg_key = A("terms",
                                     **sub_group_by_sub_agg_key_values)

        sum_aggregations = {
            mapping: get_scaled_sum_aggregations(mapping)
            for mapping in self.sum_column_mapping.values()
        }

        # Append sub-agg to primary agg, and include the sub-agg's sum metric aggs too
        search.aggs[self.agg_group_name].bucket(self.sub_agg_group_name,
                                                sub_group_by_sub_agg_key)
        for field, sum_aggregation in sum_aggregations.items():
            search.aggs[self.agg_group_name].aggs[self.sub_agg_group_name].metric(
                field, sum_aggregation["sum_field"]
            )
Example #4
    def build_elasticsearch_search_with_aggregations(
            self) -> Optional[AwardSearch]:
        """
        Using the instance's ES_Q filter query, creates an AwardSearch object with the necessary aggregations applied.
        """
        # Create the initial search using filters
        search = AwardSearch().filter(self.filter_query)

        # As of this writing, settings.ES_ROUTING_FIELD is the only high-cardinality aggregation key we support.
        # Since the Elasticsearch clusters are routed by this field we don't need a count of unique buckets;
        # instead we use the upper_limit and don't allow an upper_limit > 10k.
        if self.bucket_count == 0:
            return None
        elif self.agg_key == settings.ES_ROUTING_FIELD:
            size = self.bucket_count
            shard_size = size
            group_by_agg_key_values = {
                "order": [
                    {self.sort_column_mapping[self.pagination.sort_key]: self.pagination.sort_order},
                    {self.sort_column_mapping["id"]: self.pagination.sort_order},
                ]
            }
            bucket_sort_values = None
        else:
            size = self.bucket_count
            shard_size = self.bucket_count + 100
            group_by_agg_key_values = {}
            bucket_sort_values = {
                "sort": [
                    {self.sort_column_mapping[self.pagination.sort_key]: {"order": self.pagination.sort_order}},
                    {self.sort_column_mapping["id"]: {"order": self.pagination.sort_order}},
                ]
            }

        if shard_size > 10000:
            raise ForbiddenException(
                "Current filters return too many unique items. Narrow filters to return results or use downloads."
            )

        # Define all aggregations needed to build the response
        group_by_agg_key_values.update({
            "field": self.agg_key,
            "size": size,
            "shard_size": shard_size
        })
        group_by_agg_key = A("terms", **group_by_agg_key_values)

        sum_aggregations = {
            mapping: get_scaled_sum_aggregations(mapping, self.pagination)
            for mapping in self.sum_column_mapping.values()
        }

        search.aggs.bucket(self.agg_group_name, group_by_agg_key)
        for field, sum_aggregation in sum_aggregations.items():
            search.aggs[self.agg_group_name].metric(field, sum_aggregation["sum_field"])

        if bucket_sort_values:
            bucket_sort_aggregation = A("bucket_sort", **bucket_sort_values)
            search.aggs[self.agg_group_name].pipeline("pagination_aggregation",
                                                      bucket_sort_aggregation)

        # If provided, break down primary bucket aggregation into sub-aggregations based on a sub_agg_key
        if self.sub_agg_key:
            self.extend_elasticsearch_search_with_sub_aggregation(search)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search
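
For reference, a standalone sketch of how the elasticsearch-dsl `A` factory and the `bucket_sort` pipeline compose into the request fragment this method emits; the field names are illustrative only, not the real `agg_key` or sort columns.

from elasticsearch_dsl import A, Search

# Illustrative field names, not the real aggregation key or sort columns.
group_by = A("terms", field="example_agg_key", size=50, shard_size=150)
bucket_sort = A("bucket_sort", sort=[{"example_sum": {"order": "desc"}}])

search = Search()
search.aggs.bucket("group_by_agg_key", group_by)
search.aggs["group_by_agg_key"].pipeline("pagination_aggregation", bucket_sort)
search.update_from_dict({"size": 0})

# search.to_dict() now contains the terms bucket with its pipeline, roughly:
# {"size": 0, "aggs": {"group_by_agg_key": {"terms": {...},
#     "aggs": {"pagination_aggregation": {"bucket_sort": {...}}}}}}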
Example #5
def obtain_recipient_totals(recipient_id, children=False, year="latest"):
    """Extract the total amount and transaction count for the recipient_hash given the time frame

    Args:
        recipient_id: string of hash(duns, name)-[recipient-level]
        children: whether or not to group by children
        year: the year the totals/counts are based on
    Returns:
        list of dictionaries representing hashes and their totals/counts
    """
    filters = reshape_filters(recipient_id=recipient_id, year=year)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        filters)

    search = TransactionSearch().filter(filter_query)

    if children:
        group_by_field = "recipient_agg_key"
    elif recipient_id[-2:] == "-P":
        group_by_field = "parent_recipient_hash"
    else:
        group_by_field = "recipient_hash"

    bucket_count = get_number_of_unique_terms_for_transactions(
        filter_query, f"{group_by_field}.hash")

    if bucket_count == 0:
        return []

    # Not setting the shard_size since the number of child recipients under a
    # parent recipient will not exceed 10k
    group_by_recipient = A("terms", field=group_by_field, size=bucket_count)

    sum_obligation = get_scaled_sum_aggregations(
        "generated_pragmatic_obligation")["sum_field"]

    filter_loans = A("filter", terms={"type": list(loan_type_mapping.keys())})
    sum_face_value_loan = get_scaled_sum_aggregations(
        "face_value_loan_guarantee")["sum_field"]

    search.aggs.bucket("group_by_recipient", group_by_recipient)
    search.aggs["group_by_recipient"].metric("sum_obligation", sum_obligation)
    search.aggs["group_by_recipient"].bucket("filter_loans", filter_loans)
    search.aggs["group_by_recipient"]["filter_loans"].metric(
        "sum_face_value_loan", sum_face_value_loan)

    response = search.handle_execute()
    response_as_dict = response.aggs.to_dict()
    recipient_info_buckets = response_as_dict.get("group_by_recipient",
                                                  {}).get("buckets", [])

    result_list = []

    for bucket in recipient_info_buckets:
        result = {}
        if children:
            recipient_info = json.loads(bucket.get("key"))
            hash_with_level = recipient_info.get("hash_with_level") or None
            result = {
                "recipient_hash": hash_with_level[:-2] if hash_with_level else None,
                "recipient_unique_id": recipient_info.get("unique_id"),
                "recipient_name": recipient_info.get("name"),
            }
        loan_info = bucket.get("filter_loans", {})
        result.update({
            "total_obligation_amount": int(bucket.get("sum_obligation", {"value": 0})["value"]) / Decimal("100"),
            "total_obligation_count": bucket.get("doc_count", 0),
            "total_face_value_loan_amount": int(loan_info.get("sum_face_value_loan", {"value": 0})["value"])
            / Decimal("100"),
            "total_face_value_loan_count": loan_info.get("doc_count", 0),
        })
        result_list.append(result)

    return result_list
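
A hedged usage sketch for the function above: the hash is a made-up placeholder (real values follow the `hash(duns, name)-[recipient-level]` pattern from the docstring), and the call assumes a reachable Elasticsearch cluster behind `TransactionSearch`.

# Placeholder parent-recipient hash, not a real record.
EXAMPLE_PARENT_RECIPIENT = "00000000-0000-0000-0000-000000000000-P"

totals = obtain_recipient_totals(EXAMPLE_PARENT_RECIPIENT, children=True, year="latest")
for row in totals:
    print(
        row["recipient_name"],
        row["total_obligation_amount"],   # Decimal dollars (scaled down from cents above)
        row["total_obligation_count"],
    )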