Ejemplo n.º 1
0
    def query_elasticsearch(self, filters) -> list:
        """Count matching awards per top-level award category in one query.

        Builds a single "filters" aggregation with one sub-bucket per award
        category and reads each bucket's doc_count from the response.
        """
        query = QueryWithFilters.generate_awards_elasticsearch_query(filters)
        search = AwardSearch().filter(query)

        # One named sub-bucket per category, each matching that category's type codes.
        category_filters = {
            category: Q("terms", type=types)
            for category, types in all_award_types_mappings.items()
        }
        search.aggs.bucket("types", "filters", filters=category_filters)

        results = search.handle_execute()
        buckets = results.aggregations.types.buckets

        return {
            "contracts": buckets.contracts.doc_count,
            "direct_payments": buckets.direct_payments.doc_count,
            "grants": buckets.grants.doc_count,
            "idvs": buckets.idvs.doc_count,
            "loans": buckets.loans.doc_count,
            "other": buckets.other_financial_assistance.doc_count,
        }
Ejemplo n.º 2
0
    def build_elasticsearch_search_with_aggregation(
            self, filter_query: ES_Q) -> Optional[AwardSearch]:
        """Build an AwardSearch with a terms bucket and a scaled-sum metric.

        Returns None when the filters match no unique agg-key values at all.
        """
        search = AwardSearch().filter(filter_query)

        # Count distinct agg-key values up front so the terms aggregation can be
        # sized accurately and the maximum-bucket restriction is respected.
        bucket_count = get_number_of_unique_terms_for_awards(
            filter_query, f"{self.agg_key}.hash")
        if bucket_count == 0:
            return None
        # murmur3 hashing has no "null_value" support, so reserve one extra
        # bucket to cover the null case.
        bucket_count += 1

        # Pad shard_size by 100 so each shard considers enough records for
        # accurate results.
        terms_agg = A(
            "terms",
            field=self.agg_key,
            size=bucket_count,
            shard_size=bucket_count + 100,
        )
        sum_field = get_scaled_sum_aggregations(self.metric_field)["sum_field"]
        search.aggs.bucket("group_by_agg_key", terms_agg).metric(
            "sum_field", sum_field)

        # Only the aggregations matter; skip returning documents entirely.
        search.update_from_dict({"size": 0})

        return search
def get_number_of_unique_terms_for_awards(filter_query: ES_Q, field: str) -> int:
    """
    Return the approximate count of unique values of *field* for awards
    matching *filter_query*.

    NOTE: Counts below the precision_threshold are expected to be close to accurate (per the Elasticsearch
          documentation). Since aggregations do not support more than 10k buckets this value is hard coded to
          11k to ensure that endpoints using Elasticsearch do not cross the 10k threshold. Elasticsearch endpoints
          should be implemented with a safeguard in case this count is above 10k.
    """
    search = AwardSearch().filter(filter_query)
    return _get_number_of_unique_terms(search, field)
    def query_elasticsearch(self) -> list:
        """Execute the filtered award search with offset or search_after paging.

        Pages within settings.ES_AWARDS_MAX_RESULT_WINDOW use regular from/size
        offset paging. Beyond that window the caller must supply BOTH
        last_record_sort_value and last_record_unique_id so search_after can
        paginate sequentially (random page jumping was removed due to
        performance concerns).

        Returns:
            The executed Elasticsearch response.

        Raises:
            UnprocessableEntityException: if only one of the search_after
                values is supplied, or if the requested page exceeds the
                maximum result window without search_after values.
        """
        filter_query = QueryWithFilters.generate_awards_elasticsearch_query(
            self.filters)
        sort_fields = self.get_elastic_sort_by_fields()
        sorts = [{field: self.pagination["sort_order"]} for field in sort_fields]
        record_num = (self.pagination["page"] - 1) * self.pagination["limit"]

        has_sort_value = self.last_record_sort_value is not None
        has_unique_id = self.last_record_unique_id is not None

        # search_after requires both values; exactly one supplied is a malformed
        # request. Raise UnprocessableEntityException (consistent with the
        # window check below) instead of a bare Exception, which surfaced as a
        # server error rather than a client error.
        if has_sort_value != has_unique_id:
            raise UnprocessableEntityException(
                "Using search_after functionality in Elasticsearch requires both last_record_sort_value and last_record_unique_id."
            )
        if record_num >= settings.ES_AWARDS_MAX_RESULT_WINDOW and not (
                has_sort_value or has_unique_id):
            raise UnprocessableEntityException(
                "Page #{page} with limit {limit} is over the maximum result limit {es_limit}. Please provide the 'last_record_sort_value' and 'last_record_unique_id' to paginate sequentially."
                .format(
                    page=self.pagination["page"],
                    limit=self.pagination["limit"],
                    es_limit=settings.ES_AWARDS_MAX_RESULT_WINDOW,
                ))

        # Search_after values are provided in the API request - use search after
        if has_sort_value and has_unique_id:
            # Fetch one extra result so the caller can detect a next page.
            search = (
                AwardSearch()
                .filter(filter_query)
                .sort(*sorts)
                .extra(search_after=[
                    self.last_record_sort_value, self.last_record_unique_id
                ])[:self.pagination["limit"] + 1]
            )
        # no values, within result window, use regular elasticsearch
        else:
            search = AwardSearch().filter(filter_query).sort(
                *sorts)[record_num:record_num + self.pagination["limit"]]

        return search.handle_execute()
Ejemplo n.º 5
0
    def query_elasticsearch(self) -> list:
        """Run the filtered, sorted award search and return the executed response.

        Uses search_after paging when both last_value and last_id are set;
        otherwise falls back to page/limit offset slicing.
        """
        filter_query = QueryWithFilters.generate_awards_elasticsearch_query(
            self.filters)
        sort_fields = self.get_elastic_sort_by_fields()
        sorts = [{field: self.pagination["sort_order"]} for field in sort_fields]

        limit = self.pagination["limit"]
        if self.last_value and self.last_id:
            # Sequential paging via Elasticsearch search_after.
            search = (
                AwardSearch()
                .filter(filter_query)
                .sort(*sorts)
                .extra(search_after=[self.last_value, self.last_id])[0:limit]
            )
        else:
            # Standard offset paging from the requested page number.
            offset = (self.pagination["page"] - 1) * limit
            search = AwardSearch().filter(filter_query).sort(
                *sorts)[offset:offset + limit]

        return search.handle_execute()
Ejemplo n.º 6
0
    def build_elasticsearch_search_with_aggregations(
            self) -> Optional[AwardSearch]:
        """
        Using the provided ES_Q object creates an AwardSearch object with the necessary applied aggregations.

        Returns None when the filters match zero buckets.

        Raises:
            ForbiddenException: if the computed shard_size would exceed the
                10k bucket limit supported by Elasticsearch aggregations.
        """
        # Create the initial search using filters
        search = AwardSearch().filter(self.filter_query)

        # As of writing this the value of settings.ES_ROUTING_FIELD is the only high cardinality aggregation that
        # we support. Since the Elasticsearch clusters are routed by this field we don't care to get a count of
        # unique buckets, but instead we use the upper_limit and don't allow an upper_limit > 10k.
        if self.bucket_count == 0:
            return None
        elif self.agg_key == settings.ES_ROUTING_FIELD:
            size = self.bucket_count
            shard_size = size
            # Routed field: order within the terms aggregation itself;
            # no bucket_sort pipeline is needed.
            group_by_agg_key_values = {
                "order": [
                    {
                        self.sort_column_mapping[self.pagination.sort_key]:
                        self.pagination.sort_order
                    },
                    {
                        self.sort_column_mapping["id"]:
                        self.pagination.sort_order
                    },
                ]
            }
            bucket_sort_values = None
        else:
            size = self.bucket_count
            # Add 100 so each shard considers enough records for accurate results.
            shard_size = self.bucket_count + 100
            group_by_agg_key_values = {}
            # Non-routed field: sort/paginate buckets via a bucket_sort pipeline.
            bucket_sort_values = {
                "sort": [
                    {
                        self.sort_column_mapping[self.pagination.sort_key]: {
                            "order": self.pagination.sort_order
                        }
                    },
                    {
                        self.sort_column_mapping["id"]: {
                            "order": self.pagination.sort_order
                        }
                    },
                ]
            }

        if shard_size > 10000:
            raise ForbiddenException(
                "Current filters return too many unique items. Narrow filters to return results or use downloads."
            )

        # Define all aggregations needed to build the response
        group_by_agg_key_values.update({
            "field": self.agg_key,
            "size": size,
            "shard_size": shard_size
        })
        group_by_agg_key = A("terms", **group_by_agg_key_values)

        # FIX: the loop below previously reused the name "sum_aggregations" as
        # its loop variable, shadowing and rebinding the dict mid-iteration
        # (it only worked because the .items() iterator was created first).
        # Use distinct names to remove the trap.
        sum_aggregations_by_field = {
            mapping: get_scaled_sum_aggregations(mapping, self.pagination)
            for mapping in self.sum_column_mapping.values()
        }

        search.aggs.bucket(self.agg_group_name, group_by_agg_key)
        for field, sum_aggregation in sum_aggregations_by_field.items():
            search.aggs[self.agg_group_name].metric(
                field, sum_aggregation["sum_field"])

        if bucket_sort_values:
            bucket_sort_aggregation = A("bucket_sort", **bucket_sort_values)
            search.aggs[self.agg_group_name].pipeline("pagination_aggregation",
                                                      bucket_sort_aggregation)

        # If provided, break down primary bucket aggregation into sub-aggregations based on a sub_agg_key
        if self.sub_agg_key:
            self.extend_elasticsearch_search_with_sub_aggregation(search)

        # Set size to 0 since we don't care about documents returned
        search.update_from_dict({"size": 0})

        return search