Example #1
    def _groupby_aggs(
        self,
        query_compiler: "QueryCompiler",
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: bool = True,
    ) -> Tuple[List[str], Dict[str, Any]]:
        """
        Calculate groupby aggregations and return the raw results.

        Parameters
        ----------
        query_compiler:
            A QueryCompiler instance
        by:
            A list of columns to group by
        pd_aggs:
            A list of pandas aggregations to perform
        dropna:
            Drop None values if True.
            TODO Not yet implemented
        is_dataframe_agg:
            Whether multiple aggregations or a single aggregation was requested
        numeric_only:
            If True, restrict aggregations to numeric and boolean fields;
            otherwise non-numeric results may be returned as NaN/NaT

        Returns
        -------
        headers: Column names on which a MultiIndex is to be applied
        response: Dictionary of groupby aggregated values, keyed by column
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(
            by=by)

        # Use defaultdict to avoid having to initialize each column's list
        response: Dict[str, List[Any]] = defaultdict(list)

        if numeric_only:
            agg_fields = [
                field for field in agg_fields
                if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)
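        # (e.g. pandas "mean" maps to the ES "avg" agg; aggs such as "std"/"var"
        # come back as ('extended_stats', <stat>) tuples, handled in the loop below)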

        # Construct Query
        for by_field in by_fields:
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(name=f"groupby_{by_field.column}",
                                            field=by_field.aggregatable_es_field_name)

        for field in agg_fields:
            for es_agg in es_aggs:
                if not field.is_es_agg_compatible(es_agg):
                    continue

                # If multiple pandas aggs map to the same ES agg
                # (e.g. 'extended_stats'), the second call is a no-op
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{field.es_field_name}",
                        es_agg[0],
                        field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{field.es_field_name}",
                        es_agg,
                        field.aggregatable_es_field_name,
                    )

        # Composite aggregation
        body.composite_agg_start(size=DEFAULT_PAGINATION_SIZE,
                                 name="groupby_buckets",
                                 dropna=dropna)

        def bucket_generator() -> Generator[List[Dict[str, Any]], None, List[Dict[str, Any]]]:
            """
            e.g.
            "aggregations": {
                "groupby_buckets": {
                    "after_key": {"total_quantity": 8},
                    "buckets": [
                        {
                            "key": {"total_quantity": 1},
                            "doc_count": 87,
                            "taxful_total_price_avg": {"value": 48.035978536496216},
                        }
                    ],
                }
            }
            Yields
            ------
            The current page of composite aggregation buckets.
            If an 'after_key' is present, it is used to fetch the next page of buckets.

            """
            while True:
                res = query_compiler._client.search(
                    index=query_compiler._index_pattern,
                    size=0,
                    body=body.to_search_body(),
                )

                # Pagination Logic
                composite_buckets = res["aggregations"]["groupby_buckets"]
                if "after_key" in composite_buckets:

                    # yield the bucket which contains the result
                    yield composite_buckets["buckets"]

                    body.composite_agg_after_key(
                        name="groupby_buckets",
                        after_key=composite_buckets["after_key"],
                    )
                else:
                    # No 'after_key' in the response means there are no more pages
                    return composite_buckets["buckets"]

        for buckets in bucket_generator():
            # We receive the response row-wise
            for bucket in buckets:
                # groupby columns are added to the result in the order they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                    # Datetimes always come back as epoch-millisecond integers; convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")

                    response[by_field.column].append(bucket_key)

                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    is_dataframe_agg=is_dataframe_agg,
                )
                # Copy the calculated agg values into the response
                for key, value in agg_calculation.items():
                    if isinstance(value, list):
                        for pd_agg, val in zip(pd_aggs, value):
                            response[f"{key}_{pd_agg}"].append(val)
                    else:
                        response[key].append(value)

        return [field.column for field in agg_fields], response
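
The pagination in bucket_generator above is the standard Elasticsearch composite
aggregation loop: fetch a page of buckets, then feed the response's "after_key" back
into the next request until no "after_key" is returned. Below is a minimal standalone
sketch of the same loop using the official elasticsearch-py client (the localhost URL,
the "ecommerce" index, and the "category.keyword" field are assumptions for
illustration):

    from typing import Any, Dict, Generator, List

    from elasticsearch import Elasticsearch

    def composite_pages(
        es: Elasticsearch, index: str, body: Dict[str, Any]
    ) -> Generator[List[Dict[str, Any]], None, None]:
        """Yield pages of composite aggregation buckets until exhausted."""
        while True:
            res = es.search(index=index, size=0, body=body)
            agg = res["aggregations"]["groupby_buckets"]
            if agg["buckets"]:
                yield agg["buckets"]
            if "after_key" not in agg:
                return  # no more pages
            # Resume the next page where the previous one left off
            body["aggs"]["groupby_buckets"]["composite"]["after"] = agg["after_key"]

    # Hypothetical usage: group "ecommerce" docs by category, 100 buckets per page
    es = Elasticsearch("http://localhost:9200")
    body = {
        "aggs": {
            "groupby_buckets": {
                "composite": {
                    "size": 100,
                    "sources": [
                        {"category": {"terms": {"field": "category.keyword"}}}
                    ],
                }
            }
        }
    }
    for page in composite_pages(es, "ecommerce", body):
        for bucket in page:
            print(bucket["key"], bucket["doc_count"])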
Example #2
    def aggs_groupby(
        self,
        query_compiler: "QueryCompiler",
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: Optional[bool] = True,
    ) -> pd.DataFrame:
        """
        Construct a groupby aggregation DataFrame.

        Parameters
        ----------
        query_compiler:
            A QueryCompiler instance
        by:
            A list of columns to group by
        pd_aggs:
            A list of pandas aggregations to perform
        dropna:
            Drop None values if True.
            TODO Not yet implemented
        is_dataframe_agg:
            Whether a multi-aggregation groupby or a single aggregation was called
        numeric_only:
            If True, restrict aggregations to numeric and boolean fields;
            otherwise non-numeric results may be returned as NaN/NaT

        Returns
        -------
            A DataFrame containing the grouped and aggregated data
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}"
            )

        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)

        # Use defaultdict to avoid having to initialize each column's list
        results: Dict[str, List[Any]] = defaultdict(list)

        if numeric_only:
            agg_fields = [
                field for field in agg_fields if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # To return for creating multi-index on columns
        headers = [agg_field.column for agg_field in agg_fields]

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        # Construct Query
        for by_field in by_fields:
            if by_field.aggregatable_es_field_name is None:
                raise ValueError(
                    f"Cannot use {by_field.column!r} with groupby() because "
                    f"it has no aggregatable fields in Elasticsearch"
                )
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(
                name=f"groupby_{by_field.column}",
                field=by_field.aggregatable_es_field_name,
            )

        for agg_field in agg_fields:
            for es_agg in es_aggs:
                # Skip if the field isn't compatible, or if the agg is
                # 'value_count' (that value is pulled from bucket.doc_count)
                if not agg_field.is_es_agg_compatible(es_agg):
                    continue

                # If multiple pandas aggs map to the same ES agg
                # (e.g. 'extended_stats'), the second call is a no-op
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{agg_field.es_field_name}",
                        es_agg[0],
                        agg_field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{agg_field.es_field_name}",
                        es_agg,
                        agg_field.aggregatable_es_field_name,
                    )

        # Composite aggregation
        body.composite_agg_start(
            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
        )

        for buckets in self.bucket_generator(query_compiler, body):
            # We receive the response row-wise
            for bucket in buckets:
                # groupby columns are added to the result in the order they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                    # Datetimes always come back as epoch-millisecond integers; convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")

                    results[by_field.column].append(bucket_key)

                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    # We set 'True' here because we want the value
                    # unpacking to always be in 'dataframe' mode.
                    is_dataframe_agg=True,
                )

                # Copy the calculated agg values into the results
                for key, value in agg_calculation.items():
                    if not isinstance(value, list):
                        results[key].append(value)
                        continue
                    for pd_agg, val in zip(pd_aggs, value):
                        results[f"{key}_{pd_agg}"].append(val)

        agg_df = pd.DataFrame(results).set_index(by)

        if is_dataframe_agg:
            # Convert header columns to MultiIndex
            agg_df.columns = pd.MultiIndex.from_product([headers, pd_aggs])
        else:
            # Convert header columns to Index
            agg_df.columns = pd.Index(headers)

        return agg_df
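
For context, aggs_groupby ultimately services eland's pandas-style groupby API. A
minimal usage sketch, assuming a local cluster with the Kibana sample flight data
loaded (the URL, index name, and column names are assumptions):

    import eland as ed

    df = ed.DataFrame(
        "http://localhost:9200",
        es_index_pattern="kibana_sample_data_flights",
    )

    # Single aggregation: result columns stay a flat pd.Index
    # (the is_dataframe_agg=False branch above)
    print(df.groupby("DestCountry").mean(numeric_only=True))

    # Multiple aggregations: result columns become the pd.MultiIndex
    # built with pd.MultiIndex.from_product (the is_dataframe_agg=True branch)
    print(df.groupby(["DestCountry", "Cancelled"]).agg(["min", "max"]))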