def query_elasticsearch(self, time_periods: list) -> list:
    """Run the filtered, aggregated transaction search and shape its aggs into the response list."""
    es_filter = QueryWithFilters.generate_transactions_elasticsearch_query(self.filters)
    txn_search = TransactionSearch().filter(es_filter)
    self.apply_elasticsearch_aggregations(txn_search)
    es_response = txn_search.handle_execute()
    return self.build_elasticsearch_result(es_response.aggs, time_periods)
def build_elasticsearch_search_with_aggregation(self, filter_query: ES_Q) -> Optional[TransactionSearch]:
    """Build a filtered TransactionSearch carrying a terms bucket plus a scaled-sum metric.

    Returns None when the filters match zero unique terms so the caller can
    short-circuit instead of executing an empty aggregation.
    """
    # Unique-term count doubles as the bucket size and as the early-exit signal
    unique_terms = get_number_of_unique_terms(filter_query, f"{self.agg_key}.hash")
    if unique_terms == 0:
        return None

    search = TransactionSearch().filter(filter_query)
    # Pad shard_size by 100 so each shard considers enough records for accurate results
    terms_agg = A("terms", field=self.agg_key, size=unique_terms, shard_size=unique_terms + 100)
    scaled_sums = get_scaled_sum_aggregations(self.obligation_column)
    search.aggs.bucket("group_by_agg_key", terms_agg).metric("sum_field", scaled_sums["sum_field"])

    # Only aggregation results matter, so do not return any documents
    search.update_from_dict({"size": 0})
    return search
def get_total_results(keyword):
    """Return per-award-type bucket counts for a keyword search, or None on failure.

    Runs a "filters" aggregation with one bucket per alias in
    INDEX_ALIASES_TO_AWARD_TYPES over transactions matching the sanitized keyword.

    Args:
        keyword: raw keyword text; sanitized before being sent to Elasticsearch
    Returns:
        The "buckets" dict from the "types" aggregation, or None when
        Elasticsearch returned nothing or an unexpected payload.
    """
    group_by_agg_key_values = {
        "filters": {category: {"terms": {"type": types}} for category, types in INDEX_ALIASES_TO_AWARD_TYPES.items()}
    }
    aggs = A("filters", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("types", aggs)
    response = search.handle_execute()
    if response is not None:
        try:
            return response["aggregations"]["types"]["buckets"]
        except KeyError:
            # logger.exception (not .error) captures the traceback, consistent with
            # get_sum_and_count_aggregation_results, so the bad payload shape is diagnosable
            logger.exception("Unexpected Response")
    else:
        logger.error("No Response")
    return None
def get_number_of_unique_terms(filter_query: ES_Q, field: str) -> int:
    """Return an approximate count of distinct values of `field` under the given filters."""
    cardinality = A("cardinality", field=field)
    search = TransactionSearch().filter(filter_query)
    search.aggs.metric("field_count", cardinality)
    aggregation_results = search.handle_execute().aggs.to_dict()
    # Default to zero when the aggregation is missing from the response
    return aggregation_results.get("field_count", {"value": 0})["value"]
def get_sum_and_count_aggregation_results(keyword):
    """Return the prime award count and rounded obligation sum for a keyword search.

    Returns None when Elasticsearch gives no response, or when the response is
    missing the expected aggregation keys (logged with traceback).
    """
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("prime_awards_obligation_amount", {"sum": {"field": "transaction_amount"}})
    search.aggs.bucket("prime_awards_count", {"value_count": {"field": "transaction_id"}})
    response = search.handle_execute()
    if response is None:
        return None
    try:
        aggregations = response["aggregations"]
        return {
            "prime_awards_count": aggregations["prime_awards_count"]["value"],
            "prime_awards_obligation_amount": round(aggregations["prime_awards_obligation_amount"]["value"], 2),
        }
    except KeyError:
        logger.exception("Unexpected Response")
def build_elasticsearch_search_with_aggregations(self, filter_query: ES_Q) -> Optional[TransactionSearch]:
    """
    Using the provided ES_Q object creates a TransactionSearch object with the necessary applied aggregations.

    Returns None when the filters match zero buckets. Raises
    ElasticsearchConnectionException when the computed shard_size would exceed
    the 10k bucket ceiling.
    """
    # Create the filtered Search Object
    search = TransactionSearch().filter(filter_query)

    sum_aggregations = get_scaled_sum_aggregations("generated_pragmatic_obligation", self.pagination)

    # Need to handle high cardinality categories differently; this assumes that the Search object references
    # an Elasticsearch cluster that has a "routing" equal to "self.category.agg_key"
    if self.category.name in self.high_cardinality_categories:
        # 10k is the maximum number of allowed buckets
        size = self.pagination.upper_limit
        shard_size = size
        sum_bucket_sort = sum_aggregations["sum_bucket_truncate"]
        # High-cardinality path orders buckets by the summed metric descending
        group_by_agg_key_values = {"order": {"sum_field": "desc"}}
    else:
        # Get count of unique buckets; terminate early if there are no buckets matching criteria
        bucket_count = get_number_of_unique_terms(filter_query, f"{self.category.agg_key}.hash")
        if bucket_count == 0:
            return None
        else:
            # Add 100 to make sure that we consider enough records in each shard for accurate results;
            # Only needed for non high-cardinality fields since those are being routed
            size = bucket_count
            shard_size = bucket_count + 100
            sum_bucket_sort = sum_aggregations["sum_bucket_sort"]
            group_by_agg_key_values = {}

    # Guard against crossing Elasticsearch's maximum bucket limit
    if shard_size > 10000:
        logger.warning(f"Max number of buckets reached for aggregation key: {self.category.agg_key}.")
        raise ElasticsearchConnectionException(
            "Current filters return too many unique items. Narrow filters to return results."
        )

    # Define all aggregations needed to build the response
    group_by_agg_key_values.update({"field": self.category.agg_key, "size": size, "shard_size": shard_size})
    group_by_agg_key = A("terms", **group_by_agg_key_values)
    sum_field = sum_aggregations["sum_field"]

    # Apply the aggregations to the TransactionSearch object
    search.aggs.bucket("group_by_agg_key", group_by_agg_key).metric("sum_field", sum_field).pipeline(
        "sum_bucket_sort", sum_bucket_sort
    )

    # Set size to 0 since we don't care about documents returned
    search.update_from_dict({"size": 0})

    return search
def get_number_of_unique_terms_for_transactions(filter_query: ES_Q, field: str) -> int:
    """
    Return the unique-value count of `field` among transactions matching filter_query.

    NOTE: Counts below the precision_threshold are expected to be close to accurate (per the Elasticsearch
    documentation). Since aggregations do not support more than 10k buckets this value is hard coded to 11k
    to ensure that endpoints using Elasticsearch do not cross the 10k threshold. Elasticsearch endpoints
    should be implemented with a safeguard in case this count is above 10k.
    """
    filtered_search = TransactionSearch().filter(filter_query)
    return _get_number_of_unique_terms(filtered_search, field)
def get_sum_aggregation_results(keyword, field="transaction_amount"):
    """Return the raw aggregations dict holding the sum of `field` for a keyword search, or None.

    NOTE(review): this passes the filters as {"keywords": <str>} while sibling helpers use
    {"keyword_search": [<str>]} — presumably both filter keys are supported; verify against
    QueryWithFilters before unifying.
    """
    sum_agg = A("sum", field=field)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keywords": es_minimal_sanitize(keyword)}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("transaction_sum", sum_agg)
    response = search.handle_execute()
    return response["aggregations"] if response else None
def get_number_of_unique_terms(filter_query: ES_Q, field: str) -> int:
    """
    Approximate the number of distinct values of `field` for the given filter_query.

    NOTE: This is only reliable for 40k or fewer unique values. Per the Elasticsearch
    cardinality-aggregation documentation: "The maximum supported value is 40000,
    thresholds above this number will have the same effect as a threshold of 40000"
    """
    search = TransactionSearch().filter(filter_query)
    search.aggs.metric("field_count", A("cardinality", field=field))
    results = search.handle_execute().aggs.to_dict()
    # Missing aggregation means zero matching values
    return results.get("field_count", {"value": 0})["value"]
def create_es_search(scope, search_text, country=None, state=None):
    """
    Build a TransactionSearch whose bool query matches city names starting with the
    user's typed text, optionally narrowed by country and state.

    Args:
        scope: which city field was chosen for searching `pop` (place of performance)
            or `recipient_location`
        search_text: the text the user is typing in and sent to the backend
        country: optional country selected by user
        state: optional state selected by user
    Returns:
        A filtered TransactionSearch object (the docs previously said "dictionary";
        the function has always returned a search object).
    """
    # The base query: wildcard term-level prefix match on the keyword sub-field
    bool_clauses = {"must": [{"wildcard": {"{}_city_name.keyword".format(scope): search_text + "*"}}]}

    if country == "USA":
        # Match either spelling of the US, or documents lacking a country code entirely —
        # null country codes are being considered as USA country codes
        bool_clauses["should"] = [build_country_match(scope, "USA"), build_country_match(scope, "UNITED STATES")]
        bool_clauses["should"].append({"bool": {"must_not": {"exists": {"field": "{}_country_code".format(scope)}}}})
        bool_clauses["minimum_should_match"] = 1
    else:
        if country != ALL_FOREIGN_COUNTRIES:
            # A specific non-USA country was selected
            bool_clauses["must"].append({"match": {"{scope}_country_code".format(scope=scope): country}})
        # Nested "should not" bool excluding USA-coded documents, to get everything non-USA
        bool_clauses["should"] = [
            {
                "bool": {
                    "must": {"exists": {"field": "{}_country_code".format(scope)}},
                    "must_not": [
                        {"match": {"{}_country_code".format(scope): "USA"}},
                        {"match_phrase": {"{}_country_code".format(scope): "UNITED STATES"}},
                    ],
                }
            }
        ]
        bool_clauses["minimum_should_match"] = 1

    if state:
        # If a state was provided, include it in the filter to limit hits
        bool_clauses["must"].append({"match": {"{}_state_code".format(scope): es_sanitize(state).upper()}})

    return TransactionSearch().filter(ES_Q("bool", **bool_clauses))
def post(self, request):
    """Returns boolean of whether a download request is greater than the max limit."""
    models = [{"name": "subawards", "key": "subawards", "type": "boolean", "default": False}]
    models.extend(copy.deepcopy(AWARD_FILTER))
    self.original_filters = request.data.get("filters")
    json_request = TinyShield(models).block(request.data)

    # An absent filters key means "count everything"
    filters = json_request.get("filters", {})

    if json_request["subawards"]:
        total_count = subaward_filter(filters).count()
    else:
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(filters)
        total_count = TransactionSearch().filter(filter_query).handle_count()
        if total_count is None:
            # Treat an unreachable/failed count as zero rather than erroring
            total_count = 0

    payload = {
        "calculated_transaction_count": total_count,
        "maximum_transaction_limit": settings.MAX_DOWNLOAD_LIMIT,
        "transaction_rows_gt_limit": total_count > settings.MAX_DOWNLOAD_LIMIT,
        "messages": [
            get_generic_filters_message(self.original_filters.keys(), [elem["name"] for elem in AWARD_FILTER])
        ],
    }
    return Response(payload)
def get_download_ids(keyword, field, size=10000):
    """
    Yield lists of at most `size` transaction ids matching a keyword search, one list per partition.

    Note: this only works for fields in ES of integer type.
    """
    max_iterations = DOWNLOAD_QUERY_SIZE // size
    totals_by_type = get_total_results(keyword)
    if totals_by_type is None:
        logger.error("Error retrieving total results. Max number of attempts reached")
        return

    total = sum(totals_by_type[category]["doc_count"] for category in INDEX_ALIASES_TO_AWARD_TYPES.keys())
    # At least one iteration, but never more than the download query budget allows
    n_iter = min(max(1, (total // size) + 1), max_iterations)

    for partition in range(n_iter):
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
            {"keyword_search": [es_minimal_sanitize(keyword)]}
        )
        search = TransactionSearch().filter(filter_query)
        # Partitioned terms aggregation: each iteration fetches a disjoint slice of ids
        terms_agg = A(
            "terms",
            field=field,
            include={"partition": partition, "num_partitions": n_iter},
            size=size,
            shard_size=size,
        )
        search.aggs.bucket("results", terms_agg)
        response = search.handle_execute()
        if response is None:
            raise Exception("Breaking generator, unable to reach cluster")
        yield [bucket["key"] for bucket in response["aggregations"]["results"]["buckets"]]
def post(self, request):
    """Validate the keyword-search payload, query Elasticsearch, and return one page of results."""
    models = [
        {
            "name": "fields",
            "key": "fields",
            "type": "array",
            "array_type": "text",
            "text_type": "search",
            "optional": False,
        }
    ]
    models.extend(copy.deepcopy(AWARD_FILTER))
    models.extend(copy.deepcopy(PAGINATION))
    # keywords, award_type_codes, and sort are mandatory for this endpoint
    for m in models:
        if m["name"] in ("keywords", "award_type_codes", "sort"):
            m["optional"] = False
    validated_payload = TinyShield(models).block(request.data)

    # Reject pages that fall beyond Elasticsearch's max result window
    record_num = (validated_payload["page"] - 1) * validated_payload["limit"]
    if record_num >= settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW:
        raise UnprocessableEntityException(
            "Page #{page} of size {limit} is over the maximum result limit ({es_limit}). Consider using custom data downloads to obtain large data sets.".format(
                page=validated_payload["page"],
                limit=validated_payload["limit"],
                es_limit=settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW,
            )
        )

    if validated_payload["sort"] not in validated_payload["fields"]:
        raise InvalidParameterException("Sort value not found in fields: {}".format(validated_payload["sort"]))

    if "filters" in validated_payload and "no intersection" in validated_payload["filters"]["award_type_codes"]:
        # "Special case": there will never be results when the website provides this value
        return Response(
            {
                "limit": validated_payload["limit"],
                "results": [],
                "page_metadata": {
                    "page": validated_payload["page"],
                    "next": None,
                    "previous": None,
                    "hasNext": False,
                    "hasPrevious": False,
                },
            }
        )

    sorts = {TRANSACTIONS_LOOKUP[validated_payload["sort"]]: validated_payload["order"]}
    lower_limit = (validated_payload["page"] - 1) * validated_payload["limit"]
    # One extra record past the page boundary so pagination can detect a next page
    upper_limit = (validated_payload["page"]) * validated_payload["limit"] + 1
    # Rewrite the "keywords" filter into the sanitized "keyword_search" form expected downstream
    validated_payload["filters"]["keyword_search"] = [
        es_minimal_sanitize(x) for x in validated_payload["filters"]["keywords"]
    ]
    validated_payload["filters"].pop("keywords")
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(validated_payload["filters"])
    search = TransactionSearch().filter(filter_query).sort(sorts)[lower_limit:upper_limit]
    response = search.handle_execute()
    return Response(self.build_elasticsearch_result(validated_payload, response))
def test_top_1_fails_with_es_transactions_routed_dangerously(client, monkeypatch, elasticsearch_transaction_index, db):
    """
    This confirms vulnerability of high-cardinality aggregations documented in DEV-4685, that leads to
    inaccurate summing and ordering of sums when taking less buckets than the term cardinality.

    This is shown by manually applying a routing key (using a key value stuck in ``awards.piid`` field here
    as the routing key value) on index so that documents are distributed as below.

    NOTE: This requires an ES cluster with at least 3 shards for the transaction index.
    Ours should be defaulted to 5.

    Recipient  shard0        shard1        shard2
    Biz 1      $2.00         $7.00         $3.00
               -             $2.00, $3.00, $5.00
    Biz 2      $13.00        $6.00, $3.00  $2.00, $3.00, $4.00

    **IF THIS TEST FAILS**
    - Did our cluster structure change to not be 5 shards per the transaction index?
    - Did the transaction<->award DB linkage change?
    - Did we change ES version or config?
    - Investigate if Elasticsearch has changed the way they do routing or hash routing key values
    """
    # Setup data for this test
    recipient1 = uuid.uuid4()
    recipient2 = uuid.uuid4()

    # Recipient Lookup
    mommy.make("recipient.RecipientLookup", id=1, recipient_hash=recipient1, legal_business_name="Biz 1", duns="111")
    mommy.make("recipient.RecipientLookup", id=2, recipient_hash=recipient2, legal_business_name="Biz 2", duns="222")

    # Transaction FPDS: (id, award_id, amount, action_date, duns, name)
    _make_fpds_transaction(1, 1, 2.00, "2020-01-01", "111", "Biz 1")
    _make_fpds_transaction(2, 3, 7.00, "2020-02-02", "111", "Biz 1")
    _make_fpds_transaction(3, 3, 3.00, "2020-03-03", "111", "Biz 1")
    _make_fpds_transaction(4, 2, 2.00, "2020-01-02", "111", "Biz 1")
    _make_fpds_transaction(5, 2, 3.00, "2020-02-03", "111", "Biz 1")
    _make_fpds_transaction(6, 2, 5.00, "2020-03-04", "111", "Biz 1")
    _make_fpds_transaction(7, 2, 6.00, "2020-01-03", "222", "Biz 2")
    _make_fpds_transaction(8, 2, 3.00, "2020-02-04", "222", "Biz 2")
    _make_fpds_transaction(9, 3, 2.00, "2020-03-05", "222", "Biz 2")
    _make_fpds_transaction(10, 3, 3.00, "2020-01-04", "222", "Biz 2")
    _make_fpds_transaction(11, 3, 4.00, "2020-02-05", "222", "Biz 2")
    _make_fpds_transaction(12, 1, 13.00, "2020-03-06", "222", "Biz 2")

    # Awards
    # Jam a routing key value into the piid field, and use the derived piid value for routing documents to shards later
    mommy.make("awards.Award", id=1, latest_transaction_id=12, piid="shard_zero")
    mommy.make("awards.Award", id=2, latest_transaction_id=6, piid="shard_one")
    mommy.make("awards.Award", id=3, latest_transaction_id=9, piid="shard_two")

    # Push DB data into the test ES cluster
    # NOTE: Force routing of documents by the piid field, which will separate them into 3 groups, leading to an
    # inaccurate sum and ordering of sums
    # Using piid (derived from the transaction's award) to route transaction documents to shards
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index, routing="piid")

    search = TransactionSearch()
    total = search.handle_count()
    assert total == 12, "Should have seen 12 documents indexed for this test"

    # size=1/shard_size=1 deliberately under-fetches buckets to expose the inaccuracy
    group_by_agg = A("terms", field="recipient_hash", size=1, shard_size=1, order={"sum_agg": "desc"})
    sum_agg = A("sum", field="generated_pragmatic_obligation")
    search.aggs.bucket("results", group_by_agg).metric("sum_agg", sum_agg)
    logging.getLogger("console").debug(
        f"=>->=>->=>-> WILL RUN THIS ES QUERY: \n {search.extra(size=0).to_dict()}"
    )
    response = search.extra(size=0).handle_execute().to_dict()
    results = []
    for bucket in response["aggregations"]["results"]["buckets"]:
        results.append({"key": bucket["key"], "sum": bucket["sum_agg"]["value"]})
    print(results)
    assert len(results) == 1
    # The routing sabotage should make ES pick the WRONG (lesser) recipient with a wrong sum
    assert results[0]["key"] == str(
        recipient1
    ), "This botched 'Top 1' sum agg should have incorrectly chosen the lesser recipient"
    assert results[0][
        "sum"] == 20.0, "The botched 'Top 1' sum agg should have incorrectly summed up recipient totals"
def test_top_1_with_es_transactions_routed_by_recipient(client, monkeypatch, elasticsearch_transaction_index, db):
    """
    This tests the approach to compensating for high-cardinality aggregations documented in DEV-4685,
    to ensure accuracy and completeness of aggregations and sorting even when taking less buckets than
    the term cardinality.

    Without the code to route indexing of transaction documents in elasticsearch to shards by the
    `recipient_agg_key`, which was added to
    :meth:`usaspending_api.etl.elasticsearch_loader_helpers.csv_chunk_gen`, the below agg queries
    should lead to inaccurate results, as shown in the DEV-4538. With routing by recipient, documents
    will be allocated to shards as below.

    Recipient  (all of a recipient's docs land on the same shard)
    Biz 1      $2.00, $7.00, $3.00, $2.00, $3.00, $5.00
    Biz 2      $6.00, $3.00, $2.00, $3.00, $4.00, $13.00

    **IF THIS TEST FAILS**
    - Are we still using the TestElasticSearchIndex fixture to help with pushing test data to ES?
    - Did TestElasticSearchIndex indexing / routing behavior change?
    - Did our cluster structure change to not be 5 shards per the transaction index?
    - Did the transaction<->recipient DB linkage change?
    - Did we change ES version or config?
    - Investigate if Elasticsearch has changed the way they do routing or hash routing key values
    """
    # Setup data for this test
    recipient1 = uuid.uuid4()
    recipient2 = uuid.uuid4()

    # Recipient Lookup
    mommy.make("recipient.RecipientLookup", id=1, recipient_hash=recipient1, legal_business_name="Biz 1", duns="111")
    mommy.make("recipient.RecipientLookup", id=2, recipient_hash=recipient2, legal_business_name="Biz 2", duns="222")

    # Transaction FPDS: (id, award_id, amount, action_date, duns, name)
    _make_fpds_transaction(1, 1, 2.00, "2020-01-01", "111", "Biz 1")
    _make_fpds_transaction(2, 3, 7.00, "2020-02-02", "111", "Biz 1")
    _make_fpds_transaction(3, 3, 3.00, "2020-03-03", "111", "Biz 1")
    _make_fpds_transaction(4, 2, 2.00, "2020-01-02", "111", "Biz 1")
    _make_fpds_transaction(5, 2, 3.00, "2020-02-03", "111", "Biz 1")
    _make_fpds_transaction(6, 2, 5.00, "2020-03-04", "111", "Biz 1")
    _make_fpds_transaction(7, 2, 6.00, "2020-01-03", "222", "Biz 2")
    _make_fpds_transaction(8, 2, 3.00, "2020-02-04", "222", "Biz 2")
    _make_fpds_transaction(9, 3, 2.00, "2020-03-05", "222", "Biz 2")
    _make_fpds_transaction(10, 3, 3.00, "2020-01-04", "222", "Biz 2")
    _make_fpds_transaction(11, 3, 4.00, "2020-02-05", "222", "Biz 2")
    _make_fpds_transaction(12, 1, 13.00, "2020-03-06", "222", "Biz 2")

    # Awards
    mommy.make("awards.Award", id=1, latest_transaction_id=12)
    mommy.make("awards.Award", id=2, latest_transaction_id=6)
    mommy.make("awards.Award", id=3, latest_transaction_id=9)

    # Push DB data into the test ES cluster (default routing — by recipient)
    setup_elasticsearch_test(monkeypatch, elasticsearch_transaction_index)

    search = TransactionSearch()
    total = search.handle_count()
    assert total == 12, "Should have seen 12 documents indexed for this test"

    # size=1/shard_size=1 under-fetches buckets; routing by recipient keeps the result accurate anyway
    group_by_agg = A("terms", field="recipient_hash", size=1, shard_size=1, order={"sum_agg": "desc"})
    sum_agg = A("sum", field="generated_pragmatic_obligation")
    search.aggs.bucket("results", group_by_agg).metric("sum_agg", sum_agg)
    logging.getLogger("console").debug(
        f"=>->=>->=>-> WILL RUN THIS ES QUERY: \n {search.extra(size=0).to_dict()}"
    )
    response = search.extra(size=0).handle_execute().to_dict()
    results = []
    for bucket in response["aggregations"]["results"]["buckets"]:
        results.append({"key": bucket["key"], "sum": bucket["sum_agg"]["value"]})
    assert len(results) == 1
    # With correct routing, the true top recipient (Biz 2, $31 total) is chosen with the exact sum
    assert results[0]["key"] == str(
        recipient2
    ), "The 'Top 1' sum agg incorrectly chose the recipient with a lesser total sum"
    assert results[0][
        "sum"] == 31.0, "The 'Top 1' sum agg incorrectly summed up recipient totals"
def obtain_recipient_totals(recipient_id, children=False, year="latest"):
    """Extract the total amount and transaction count for the recipient_hash given the time frame

    Args:
        recipient_id: string of hash(duns, name)-[recipient-level]
        children: whether or not to group by children
        year: the year the totals/counts are based on
    Returns:
        list of dictionaries representing hashes and their totals/counts
        (empty list when no buckets match the filters)
    """
    filters = reshape_filters(recipient_id=recipient_id, year=year)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(filters)

    search = TransactionSearch().filter(filter_query)

    # Choose the grouping field: children of a recipient, a parent recipient ("-P" suffix),
    # or a single recipient hash
    if children:
        group_by_field = "recipient_agg_key"
    elif recipient_id[-2:] == "-P":
        group_by_field = "parent_recipient_hash"
    else:
        group_by_field = "recipient_hash"

    bucket_count = get_number_of_unique_terms_for_transactions(filter_query, f"{group_by_field}.hash")
    if bucket_count == 0:
        return []

    # Not setting the shard_size since the number of child recipients under a
    # parent recipient will not exceed 10k
    group_by_recipient = A("terms", field=group_by_field, size=bucket_count)
    sum_obligation = get_scaled_sum_aggregations("generated_pragmatic_obligation")["sum_field"]
    # Loan metrics only apply to loan-type transactions, so nest them under a filter bucket
    filter_loans = A("filter", terms={"type": list(loan_type_mapping.keys())})
    sum_face_value_loan = get_scaled_sum_aggregations("face_value_loan_guarantee")["sum_field"]

    search.aggs.bucket("group_by_recipient", group_by_recipient)
    search.aggs["group_by_recipient"].metric("sum_obligation", sum_obligation)
    search.aggs["group_by_recipient"].bucket("filter_loans", filter_loans)
    search.aggs["group_by_recipient"]["filter_loans"].metric("sum_face_value_loan", sum_face_value_loan)

    response = search.handle_execute()
    response_as_dict = response.aggs.to_dict()
    recipient_info_buckets = response_as_dict.get("group_by_recipient", {}).get("buckets", [])

    result_list = []
    for bucket in recipient_info_buckets:
        result = {}
        if children:
            # For children, the bucket key is a JSON-encoded recipient agg key
            recipient_info = json.loads(bucket.get("key"))
            hash_with_level = recipient_info.get("hash_with_level") or None
            result = {
                # Strip the "-X" level suffix from the hash
                "recipient_hash": hash_with_level[:-2] if hash_with_level else None,
                "recipient_unique_id": recipient_info.get("unique_id"),
                "recipient_name": recipient_info.get("name"),
            }
        loan_info = bucket.get("filter_loans", {})
        # Sums are scaled integers (cents); divide by 100 to restore Decimal dollars
        result.update(
            {
                "total_obligation_amount": int(bucket.get("sum_obligation", {"value": 0})["value"])
                / Decimal("100"),
                "total_obligation_count": bucket.get("doc_count", 0),
                "total_face_value_loan_amount": int(loan_info.get("sum_face_value_loan", {"value": 0})["value"])
                / Decimal("100"),
                "total_face_value_loan_count": loan_info.get("doc_count", 0),
            }
        )
        result_list.append(result)
    return result_list