Example #1
    def query_elasticsearch(self, time_periods: list) -> list:
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
            self.filters)
        search = TransactionSearch().filter(filter_query)
        self.apply_elasticsearch_aggregations(search)
        response = search.handle_execute()
        return self.build_elasticsearch_result(response.aggs, time_periods)
Example #2
def get_total_results(keyword):
    group_by_agg_key_values = {
        "filters": {
            category: {
                "terms": {
                    "type": types
                }
            }
            for category, types in INDEX_ALIASES_TO_AWARD_TYPES.items()
        }
    }
    aggs = A("filters", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]})
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("types", aggs)
    response = search.handle_execute()

    if response is not None:
        try:
            return response["aggregations"]["types"]["buckets"]
        except KeyError:
            logger.error("Unexpected Response")
            return None
    else:
        logger.error("No Response")
        return None
Example #3
def get_sum_and_count_aggregation_results(keyword):
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keyword_search": [es_minimal_sanitize(keyword)]})
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("prime_awards_obligation_amount",
                       {"sum": {
                           "field": "transaction_amount"
                       }})
    search.aggs.bucket("prime_awards_count",
                       {"value_count": {
                           "field": "transaction_id"
                       }})
    response = search.handle_execute()

    if response is not None:
        try:
            results = {}
            results["prime_awards_count"] = response["aggregations"][
                "prime_awards_count"]["value"]
            results["prime_awards_obligation_amount"] = round(
                response["aggregations"]["prime_awards_obligation_amount"]
                ["value"], 2)
            return results
        except KeyError:
            logger.exception("Unexpected Response")
            return None
    else:
        return None
Example #4
def get_number_of_unique_terms(filter_query: ES_Q, field: str) -> int:
    search = TransactionSearch().filter(filter_query)
    cardinality_aggregation = A("cardinality", field=field)
    search.aggs.metric("field_count", cardinality_aggregation)
    response = search.handle_execute()
    response_dict = response.aggs.to_dict()
    return response_dict.get("field_count", {"value": 0})["value"]
Example #5
def get_sum_aggregation_results(keyword, field="transaction_amount"):
    group_by_agg_key_values = {"field": field}
    aggs = A("sum", **group_by_agg_key_values)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        {"keywords": es_minimal_sanitize(keyword)})
    search = TransactionSearch().filter(filter_query)
    search.aggs.bucket("transaction_sum", aggs)
    response = search.handle_execute()

    if response:
        return response["aggregations"]
    else:
        return None
Example #6
def get_number_of_unique_terms(filter_query: ES_Q, field: str) -> int:
    """
    Returns the count for a specific filter_query.
    NOTE: This will only work when the number of unique values is 40k or less. This is captured in the Elasticsearch
    documentation for the cardinality aggregation:
        "The maximum supported value is 40000, thresholds above this number will
        have the same effect as a threshold of 40000"
    """
    search = TransactionSearch().filter(filter_query)
    cardinality_aggregation = A("cardinality", field=field)
    search.aggs.metric("field_count", cardinality_aggregation)
    response = search.handle_execute()
    response_dict = response.aggs.to_dict()
    return response_dict.get("field_count", {"value": 0})["value"]
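A minimal usage sketch of the cardinality helper above, reusing the module-level names from these examples; the keyword and the "recipient_hash" field are illustrative placeholders, not values taken from the original source:

# Hypothetical usage: count distinct recipients matching a keyword.
filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
    {"keyword_search": [es_minimal_sanitize("solar panel")]})
unique_recipient_count = get_number_of_unique_terms(filter_query, "recipient_hash")
# Per the docstring, counts above the 40k cardinality threshold are not exact.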
Example #7
def get_download_ids(keyword, field, size=10000):
    """
    returns a generator that
    yields list of transaction ids in chunksize SIZE

    Note: this only works for fields in ES of integer type.
    """
    n_iter = DOWNLOAD_QUERY_SIZE // size

    results = get_total_results(keyword)
    if results is None:
        logger.error(
            "Error retrieving total results. Max number of attempts reached")
        return
    total = sum(results[category]["doc_count"]
                for category in INDEX_ALIASES_TO_AWARD_TYPES.keys())
    required_iter = (total // size) + 1
    n_iter = min(max(1, required_iter), n_iter)
    for i in range(n_iter):
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
            {"keyword_search": [es_minimal_sanitize(keyword)]})
        search = TransactionSearch().filter(filter_query)
        group_by_agg_key_values = {
            "field": field,
            "include": {
                "partition": i,
                "num_partitions": n_iter
            },
            "size": size,
            "shard_size": size,
        }
        aggs = A("terms", **group_by_agg_key_values)
        search.aggs.bucket("results", aggs)
        response = search.handle_execute()
        if response is None:
            raise Exception("Breaking generator, unable to reach cluster")
        results = []
        for result in response["aggregations"]["results"]["buckets"]:
            results.append(result["key"])
        yield results
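A sketch of how the generator above might be consumed; the keyword, field, and chunk size are placeholder values, not taken from the original code:

# Hypothetical consumer: collect matching transaction ids chunk by chunk.
transaction_ids = []
for id_chunk in get_download_ids("space exploration", field="transaction_id", size=5000):
    # each chunk is a list of at most `size` ids
    transaction_ids.extend(id_chunk)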
Example #8
    def post(self, request):

        models = [{
            "name": "fields",
            "key": "fields",
            "type": "array",
            "array_type": "text",
            "text_type": "search",
            "optional": False,
        }]
        models.extend(copy.deepcopy(AWARD_FILTER))
        models.extend(copy.deepcopy(PAGINATION))
        for m in models:
            if m["name"] in ("keywords", "award_type_codes", "sort"):
                m["optional"] = False
        validated_payload = TinyShield(models).block(request.data)

        record_num = (validated_payload["page"] -
                      1) * validated_payload["limit"]
        if record_num >= settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW:
            raise UnprocessableEntityException(
                "Page #{page} of size {limit} is over the maximum result limit ({es_limit}). Consider using custom data downloads to obtain large data sets."
                .format(
                    page=validated_payload["page"],
                    limit=validated_payload["limit"],
                    es_limit=settings.ES_TRANSACTIONS_MAX_RESULT_WINDOW,
                ))

        if validated_payload["sort"] not in validated_payload["fields"]:
            raise InvalidParameterException(
                "Sort value not found in fields: {}".format(
                    validated_payload["sort"]))

        if "filters" in validated_payload and "no intersection" in validated_payload[
                "filters"]["award_type_codes"]:
            # "Special case": there will never be results when the website provides this value
            return Response({
                "limit": validated_payload["limit"],
                "results": [],
                "page_metadata": {
                    "page": validated_payload["page"],
                    "next": None,
                    "previous": None,
                    "hasNext": False,
                    "hasPrevious": False,
                },
            })
        sorts = {
            TRANSACTIONS_LOOKUP[validated_payload["sort"]]:
            validated_payload["order"]
        }
        lower_limit = (validated_payload["page"] -
                       1) * validated_payload["limit"]
        upper_limit = (
            validated_payload["page"]) * validated_payload["limit"] + 1
        validated_payload["filters"]["keyword_search"] = [
            es_minimal_sanitize(x)
            for x in validated_payload["filters"]["keywords"]
        ]
        validated_payload["filters"].pop("keywords")
        filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
            validated_payload["filters"])
        search = TransactionSearch().filter(filter_query).sort(
            sorts)[lower_limit:upper_limit]
        response = search.handle_execute()
        return Response(
            self.build_elasticsearch_result(validated_payload, response))
Example #9
def obtain_recipient_totals(recipient_id, children=False, year="latest"):
    """Extract the total amount and transaction count for the recipient_hash given the time frame

    Args:
        recipient_id: string of hash(duns, name)-[recipient-level]
        children: whether or not to group by children
        year: the year the totals/counts are based on
    Returns:
        list of dictionaries representing hashes and their totals/counts
    """
    filters = reshape_filters(recipient_id=recipient_id, year=year)
    filter_query = QueryWithFilters.generate_transactions_elasticsearch_query(
        filters)

    search = TransactionSearch().filter(filter_query)

    if children:
        group_by_field = "recipient_agg_key"
    elif recipient_id[-2:] == "-P":
        group_by_field = "parent_recipient_hash"
    else:
        group_by_field = "recipient_hash"

    bucket_count = get_number_of_unique_terms_for_transactions(
        filter_query, f"{group_by_field}.hash")

    if bucket_count == 0:
        return []

    # Not setting the shard_size since the number of child recipients under a
    # parent recipient will not exceed 10k
    group_by_recipient = A("terms", field=group_by_field, size=bucket_count)

    sum_obligation = get_scaled_sum_aggregations(
        "generated_pragmatic_obligation")["sum_field"]

    filter_loans = A("filter", terms={"type": list(loan_type_mapping.keys())})
    sum_face_value_loan = get_scaled_sum_aggregations(
        "face_value_loan_guarantee")["sum_field"]

    search.aggs.bucket("group_by_recipient", group_by_recipient)
    search.aggs["group_by_recipient"].metric("sum_obligation", sum_obligation)
    search.aggs["group_by_recipient"].bucket("filter_loans", filter_loans)
    search.aggs["group_by_recipient"]["filter_loans"].metric(
        "sum_face_value_loan", sum_face_value_loan)

    response = search.handle_execute()
    response_as_dict = response.aggs.to_dict()
    recipient_info_buckets = response_as_dict.get("group_by_recipient",
                                                  {}).get("buckets", [])

    result_list = []

    for bucket in recipient_info_buckets:
        result = {}
        if children:
            recipient_info = json.loads(bucket.get("key"))
            hash_with_level = recipient_info.get("hash_with_level") or None
            result = {
                "recipient_hash":
                hash_with_level[:-2] if hash_with_level else None,
                "recipient_unique_id": recipient_info.get("unique_id"),
                "recipient_name": recipient_info.get("name"),
            }
        loan_info = bucket.get("filter_loans", {})
        result.update({
            "total_obligation_amount":
            int(bucket.get("sum_obligation", {"value": 0})["value"]) /
            Decimal("100"),
            "total_obligation_count":
            bucket.get("doc_count", 0),
            "total_face_value_loan_amount":
            int(loan_info.get("sum_face_value_loan", {"value": 0})["value"]) /
            Decimal("100"),
            "total_face_value_loan_count":
            loan_info.get("doc_count", 0),
        })
        result_list.append(result)

    return result_list
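A sketch of calling obtain_recipient_totals; the recipient hash and "-C" level suffix below are made-up placeholders in the hash-[recipient-level] format the docstring describes:

# Hypothetical call: fetch totals for a single (non-parent) recipient hash.
totals = obtain_recipient_totals(
    "00077a9a-5a70-8919-fd19-330762af6b84-C", children=False, year="2019")
for row in totals:
    print(row["total_obligation_amount"], row["total_obligation_count"])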