def query_elasticsearch(self, time_periods: list) -> list:
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(self.filters)
    search = TransactionSearch().filter(filter_query)
    self.apply_elasticsearch_aggregations(search)
    response = search.handle_execute()
    return self.build_elasticsearch_result(response.aggs, time_periods)
def get_total_results(keyword):
    group_by_agg_key_values = {
        "filters": {
            category: {"terms": {"type": types}}
            for category, types in INDEX_ALIASES_TO_AWARD_TYPES.items()
        }
    }
    aggs = A("filters", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("types", aggs)
    response = search.handle_execute()

    if response is not None:
        try:
            return response["aggregations"]["types"]["buckets"]
        except KeyError:
            logger.error("Unexpected Response")
    else:
        logger.error("No Response")

    return None
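# Hedged usage sketch (the helper name is hypothetical, not part of this module):
# each bucket returned by get_total_results is keyed by a category from
# INDEX_ALIASES_TO_AWARD_TYPES and carries a doc_count, so a grand total can be
# derived like this.
def _example_total_across_categories(keyword):
    buckets = get_total_results(keyword)
    if buckets is None:
        return 0
    return sum(bucket["doc_count"] for bucket in buckets.values())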
def get_sum_and_count_aggregation_results(keyword):
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("prime_awards_obligation_amount", {"sum": {"field": "transaction_amount"}})
    search.aggs.bucket("prime_awards_count", {"value_count": {"field": "transaction_id"}})
    response = search.handle_execute()

    if response is not None:
        try:
            results = {
                "prime_awards_count": response["aggregations"]["prime_awards_count"]["value"],
                "prime_awards_obligation_amount": round(
                    response["aggregations"]["prime_awards_obligation_amount"]["value"], 2
                ),
            }
            return results
        except KeyError:
            logger.exception("Unexpected Response")
    return None
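# Usage sketch (the helper name is hypothetical): on success the returned dict
# carries exactly the two metrics declared above; both failure paths yield None.
def _example_print_prime_award_totals(keyword):
    totals = get_sum_and_count_aggregation_results(keyword)
    if totals:
        print(totals["prime_awards_count"], totals["prime_awards_obligation_amount"])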
def get_sum_aggregation_results(keyword, field="transaction_amount"):
    group_by_agg_key_values = {"field": field}
    aggs = A("sum", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keywords": es_minimal_sanitize(keyword)}
    )
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("transaction_sum", aggs)
    response = search.handle_execute()

    if response:
        return response["aggregations"]
    else:
        return None
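# Usage sketch (the helper name is hypothetical): the caller receives the raw
# "aggregations" mapping, so the sum lives under the "transaction_sum" name
# declared above; "value" is the standard Elasticsearch sum-aggregation payload.
def _example_transaction_sum(keyword):
    aggregations = get_sum_aggregation_results(keyword)
    return aggregations["transaction_sum"]["value"] if aggregations else None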
def get_number_of_unique_terms(filter_query: ES_Q, field: str) -> int:
    """
    Returns the number of unique values of `field` among transactions matching filter_query.

    NOTE: This will only work when the number of unique values is 40k or less. This is
    captured in the Elasticsearch documentation for the cardinality aggregation:
    "The maximum supported value is 40000, thresholds above this number will have the
    same effect as a threshold of 40000"
    """
    search = TransactionSearch().filter(filter_query)
    cardinality_aggregation = A("cardinality", field=field)
    search.aggs.metric("field_count", cardinality_aggregation)
    response = search.handle_execute()
    response_dict = response.aggs.to_dict()
    return response_dict.get("field_count", {"value": 0})["value"]
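# Hedged sketch (the function name is hypothetical, not part of the module): the
# 40k ceiling noted in the docstring comes from the cardinality aggregation's
# precision_threshold option, which can be pinned explicitly at the cost of extra
# memory per shard; whether that trade-off is acceptable here is an assumption,
# not a recommendation.
def _example_cardinality_with_pinned_precision(filter_query: ES_Q, field: str) -> int:
    search = TransactionSearch().filter(filter_query)
    search.aggs.metric("field_count", A("cardinality", field=field, precision_threshold=40000))
    response = search.handle_execute()
    return response.aggs.to_dict().get("field_count", {"value": 0})["value"]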
def get_download_ids(keyword, field, size=10000):
    """
    Returns a generator that yields lists of transaction ids in chunks of at most `size`.

    Note: this only works for ES fields of integer type.
    """
    n_iter = DOWNLOAD_QUERY_SIZE // size

    results = get_total_results(keyword)
    if results is None:
        logger.error("Error retrieving total results. Max number of attempts reached")
        return

    total = sum(results[category]["doc_count"] for category in INDEX_ALIASES_TO_AWARD_TYPES.keys())
    required_iter = (total // size) + 1
    n_iter = min(max(1, required_iter), n_iter)

    for i in range(n_iter):
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
            {"keyword_search": [es_minimal_sanitize(keyword)]}
        )
        search = TransactionSearch().filter(filter_query)
        group_by_agg_key_values = {
            "field": field,
            "include": {"partition": i, "num_partitions": n_iter},
            "size": size,
            "shard_size": size,
        }
        aggs = A("terms", **group_by_agg_key_values)
        search.aggs.bucket("results", aggs)
        response = search.handle_execute()
        if response is None:
            raise Exception("Breaking generator, unable to reach cluster")
        results = []
        for result in response["aggregations"]["results"]["buckets"]:
            results.append(result["key"])
        yield results
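# Usage sketch: consume the generator batch by batch. process_batch is a
# hypothetical consumer, and "transaction_id" is used as the integer-typed
# field the docstring requires.
def _example_stream_download_ids(keyword, process_batch):
    for id_chunk in get_download_ids(keyword, field="transaction_id"):
        process_batch(id_chunk)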
def post(self, request):
    models = [
        {
            "name": "fields",
            "key": "fields",
            "type": "array",
            "array_type": "text",
            "text_type": "search",
            "optional": False,
        }
    ]
    models.extend(copy.deepcopy(AWARD_FILTER))
    models.extend(copy.deepcopy(PAGINATION))
    for m in models:
        if m["name"] in ("keywords", "award_type_codes", "sort"):
            m["optional"] = False
    validated_payload = TinyShield(models).block(request.data)

    record_num = (validated_payload["page"] - 1) * validated_payload["limit"]
    if record_num >= settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW:
        raise UnprocessableEntityException(
            "Page #{page} of size {limit} is over the maximum result limit ({es_limit}). "
            "Consider using custom data downloads to obtain large data sets.".format(
                page=validated_payload["page"],
                limit=validated_payload["limit"],
                es_limit=settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW,
            )
        )

    if validated_payload["sort"] not in validated_payload["fields"]:
        raise InvalidParameterException("Sort value not found in fields: {}".format(validated_payload["sort"]))

    if "filters" in validated_payload and "no intersection" in validated_payload["filters"]["award_type_codes"]:
        # "Special case": there will never be results when the website provides this value
        return Response(
            {
                "limit": validated_payload["limit"],
                "results": [],
                "page_metadata": {
                    "page": validated_payload["page"],
                    "next": None,
                    "previous": None,
                    "hasNext": False,
                    "hasPrevious": False,
                },
            }
        )

    sorts = {TRANSACTIONS_LOOKUP[validated_payload["sort"]]: validated_payload["order"]}
    lower_limit = (validated_payload["page"] - 1) * validated_payload["limit"]
    # Fetch one record past the page boundary so the presence of a next page can be detected
    upper_limit = validated_payload["page"] * validated_payload["limit"] + 1
    validated_payload["filters"]["keyword_search"] = [
        es_minimal_sanitize(x) for x in validated_payload["filters"]["keywords"]
    ]
    validated_payload["filters"].pop("keywords")
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(validated_payload["filters"])
    search = TransactionSearch().filter(filter_query).sort(sorts)[lower_limit:upper_limit]
    response = search.handle_execute()

    return Response(self.build_elasticsearch_result(validated_payload, response))
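# Illustrative request body for the endpoint above (field names follow the
# TinyShield models; the values are invented). "keywords", "award_type_codes",
# and "sort" are forced to be required, and "sort" must appear in "fields":
#
#     {
#         "filters": {"keywords": ["transport"], "award_type_codes": ["A", "B", "C", "D"]},
#         "fields": ["Award ID", "Recipient Name"],
#         "sort": "Recipient Name",
#         "order": "desc",
#         "page": 1,
#         "limit": 35
#     }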
def obtain_recipient_totals(recipient_id, children=False, year="latest"):
    """Extract the total amount and transaction count for the recipient_hash given the time frame

    Args:
        recipient_id: string of hash(duns, name)-[recipient-level]
        children: whether or not to group by children
        year: the year the totals/counts are based on
    Returns:
        list of dictionaries representing hashes and their totals/counts
    """
    filters = reshape_filters(recipient_id=recipient_id, year=year)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(filters)
    search = TransactionSearch().filter(filter_query)

    if children:
        group_by_field = "recipient_agg_key"
    elif recipient_id[-2:] == "-P":
        group_by_field = "parent_recipient_hash"
    else:
        group_by_field = "recipient_hash"

    bucket_count = get_number_of_unique_terms_for_transactions(filter_query, f"{group_by_field}.hash")
    if bucket_count == 0:
        return []

    # Not setting the shard_size since the number of child recipients under a
    # parent recipient will not exceed 10k
    group_by_recipient = A("terms", field=group_by_field, size=bucket_count)
    sum_obligation = get_scaled_sum_aggregations("generated_pragmatic_obligation")["sum_field"]
    filter_loans = A("filter", terms={"type": list(loan_type_mapping.keys())})
    sum_face_value_loan = get_scaled_sum_aggregations("face_value_loan_guarantee")["sum_field"]

    search.aggs.bucket("group_by_recipient", group_by_recipient)
    search.aggs["group_by_recipient"].metric("sum_obligation", sum_obligation)
    search.aggs["group_by_recipient"].bucket("filter_loans", filter_loans)
    search.aggs["group_by_recipient"]["filter_loans"].metric("sum_face_value_loan", sum_face_value_loan)

    response = search.handle_execute()
    response_as_dict = response.aggs.to_dict()
    recipient_info_buckets = response_as_dict.get("group_by_recipient", {}).get("buckets", [])

    result_list = []
    for bucket in recipient_info_buckets:
        result = {}
        if children:
            recipient_info = json.loads(bucket.get("key"))
            hash_with_level = recipient_info.get("hash_with_level") or None
            result = {
                "recipient_hash": hash_with_level[:-2] if hash_with_level else None,
                "recipient_unique_id": recipient_info.get("unique_id"),
                "recipient_name": recipient_info.get("name"),
            }
        loan_info = bucket.get("filter_loans", {})
        result.update(
            {
                "total_obligation_amount": int(bucket.get("sum_obligation", {"value": 0})["value"])
                / Decimal("100"),
                "total_obligation_count": bucket.get("doc_count", 0),
                "total_face_value_loan_amount": int(loan_info.get("sum_face_value_loan", {"value": 0})["value"])
                / Decimal("100"),
                "total_face_value_loan_count": loan_info.get("doc_count", 0),
            }
        )
        result_list.append(result)

    return result_list
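# Shape of the aggregation response this function walks (values illustrative;
# the key names come from the buckets/metrics declared above):
#
#     {
#         "group_by_recipient": {
#             "buckets": [
#                 {
#                     "key": "...",
#                     "doc_count": 12,
#                     "sum_obligation": {"value": 123456},
#                     "filter_loans": {"doc_count": 3, "sum_face_value_loan": {"value": 7890}},
#                 }
#             ]
#         }
#     }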