Example #1
    def build_elasticsearch_search_with_aggregation(
            self, filter_query: ES_Q) -> Optional[AwardSearch]:
        # Create the initial search using filters
        search = AwardSearch().filter(filter_query)

        # Check the number of unique terms (buckets) up front, both for performance and to respect the maximum number of buckets allowed
        bucket_count = get_number_of_unique_terms_for_awards(
            filter_query, f"{self.agg_key}.hash")

        if bucket_count == 0:
            return None
        else:
            # Add 1 to handle null case since murmur3 doesn't support "null_value" property
            bucket_count += 1

        # Add 100 to make sure that we consider enough records in each shard for accurate results
        group_by_agg_key = A("terms",
                             field=self.agg_key,
                             size=bucket_count,
                             shard_size=bucket_count + 100)
        sum_aggregations = get_scaled_sum_aggregations(self.metric_field)
        sum_field = sum_aggregations["sum_field"]

        search.aggs.bucket("group_by_agg_key",
                           group_by_agg_key).metric("sum_field", sum_field)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search
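
For context, here is a minimal, hypothetical sketch of how a caller might consume the search returned by a method like the one above, assuming AwardSearch behaves like a standard elasticsearch-dsl Search object. The "handler" instance, the variable names, and the sum-metric attribute access are illustrative assumptions, not taken from the original code:

def print_award_totals(handler, filter_query) -> None:
    # "handler" is a hypothetical instance of the class that defines the method above.
    search = handler.build_elasticsearch_search_with_aggregation(filter_query)
    if search is None:
        # No matching buckets; nothing to report.
        return

    # Standard elasticsearch-dsl execution; only aggregation results come back since size is 0.
    response = search.execute()
    for bucket in response.aggregations.group_by_agg_key.buckets:
        # Each bucket carries the term key, a document count, and the "sum_field" metric.
        print(bucket.key, bucket.doc_count, bucket.sum_field.value)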
Example #2
    def build_elasticsearch_search_with_aggregations(
            self) -> Optional[AwardSearch]:
        """
        Create an AwardSearch object from the provided ES_Q filter, with the necessary aggregations applied.
        """
        # Create the initial search using filters
        search = AwardSearch().filter(self.filter_query)

        # As of writing this, settings.ES_ROUTING_FIELD is the only high-cardinality aggregation key that we
        # support. Since the Elasticsearch clusters are routed by this field we don't need a count of unique
        # buckets; instead we use the upper_limit and don't allow an upper_limit > 10k.
        if self.bucket_count == 0:
            return None
        elif self.agg_key == settings.ES_ROUTING_FIELD:
            size = self.bucket_count
            shard_size = size
            group_by_agg_key_values = {
                "order": [
                    {
                        self.sort_column_mapping[self.pagination.sort_key]:
                        self.pagination.sort_order
                    },
                    {
                        self.sort_column_mapping["id"]:
                        self.pagination.sort_order
                    },
                ]
            }
            bucket_sort_values = None
        else:
            size = self.bucket_count
            shard_size = self.bucket_count + 100
            group_by_agg_key_values = {}
            bucket_sort_values = {
                "sort": [
                    {
                        self.sort_column_mapping[self.pagination.sort_key]: {
                            "order": self.pagination.sort_order
                        }
                    },
                    {
                        self.sort_column_mapping["id"]: {
                            "order": self.pagination.sort_order
                        }
                    },
                ]
            }

        if shard_size > 10000:
            raise ForbiddenException(
                "Current filters return too many unique items. Narrow filters to return results or use downloads."
            )

        # Define all aggregations needed to build the response
        group_by_agg_key_values.update({
            "field": self.agg_key,
            "size": size,
            "shard_size": shard_size
        })
        group_by_agg_key = A("terms", **group_by_agg_key_values)

        sum_aggregations = {
            mapping: get_scaled_sum_aggregations(mapping, self.pagination)
            for mapping in self.sum_column_mapping.values()
        }

        search.aggs.bucket(self.agg_group_name, group_by_agg_key)
        for field, sum_aggregation in sum_aggregations.items():
            search.aggs[self.agg_group_name].metric(
                field, sum_aggregation["sum_field"])

        if bucket_sort_values:
            bucket_sort_aggregation = A("bucket_sort", **bucket_sort_values)
            search.aggs[self.agg_group_name].pipeline("pagination_aggregation",
                                                      bucket_sort_aggregation)

        # If provided, break down primary bucket aggregation into sub-aggregations based on a sub_agg_key
        if self.sub_agg_key:
            self.extend_elasticsearch_search_with_sub_aggregation(search)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search
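
To show the terms + bucket_sort wiring used above in isolation, here is a minimal self-contained sketch built directly with elasticsearch_dsl. The index name, field names, and sizes are placeholder assumptions, not values from the original code:

from elasticsearch_dsl import A, Search

search = Search(index="awards")
search.update_from_dict({"size": 0})  # only aggregation results are needed

# Group by a hypothetical key field, leaving headroom on each shard for accuracy.
group_by = A("terms", field="recipient_agg_key", size=500, shard_size=600)
search.aggs.bucket("group_by_agg_key", group_by)

# Sum a hypothetical metric field inside each bucket.
search.aggs["group_by_agg_key"].metric("sum_field", A("sum", field="obligation_amount"))

# Order and page the buckets by the summed metric, mirroring the bucket_sort pipeline step above.
bucket_sort = A("bucket_sort", sort=[{"sum_field": {"order": "desc"}}], size=10)
search.aggs["group_by_agg_key"].pipeline("pagination_aggregation", bucket_sort)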