Example #1
    def count(self, query_compiler):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        # Elasticsearch _count is very efficient, so it is used to return results here. This means that
        # data frames that have a restricted size or sort params will not return valid results
        # (_count doesn't support size).
        # Longer term we may fall back to pandas, but this may result in loading the whole index into memory.
        if self._size(query_params, post_processing) is not None:
            raise NotImplementedError(
                f"Requesting count with additional query and processing parameters "
                f"not supported {query_params} {post_processing}")

        # Only return requested field_names
        fields = query_compiler.get_field_names(include_scripted_fields=False)

        counts = {}
        for field in fields:
            body = Query(query_params.query)
            body.exists(field, must=True)

            field_exists_count = query_compiler._client.count(
                index=query_compiler._index_pattern,
                body=body.to_count_body())["count"]
            counts[field] = field_exists_count

        return build_pd_series(data=counts, index=fields)
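
For reference, a minimal sketch of the per-field request this builds: body.exists(field, must=True) wraps an exists query in a bool/must clause and the result is sent to the _count API. The index and field names below are placeholders, not values from this snippet.

# Hypothetical equivalent of one iteration of the loop above.
count_body = {
    "query": {
        "bool": {
            "must": [{"exists": {"field": "AvgTicketPrice"}}]  # placeholder field
        }
    }
}
# With the official client this would be roughly:
# es.count(index="flights", body=count_body)["count"]   # placeholder index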
Example #2
    def _terms_aggs(
        self, query_compiler: "QueryCompiler", func: str, es_size: int
    ) -> pd.Series:
        """
        Parameters
        ----------
        es_size: int
            Parameter used by Series.value_counts()

        Returns
        -------
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}"
            )

        # Get just aggregatable field_names
        aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

        body = Query(query_params.query)

        for field in aggregatable_field_names.keys():
            body.terms_aggs(field, func, field, es_size=es_size)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
        )

        results = {}

        for key in aggregatable_field_names.keys():
            # key is aggregatable field, value is label
            # e.g. key=category.keyword, value=category
            for bucket in response["aggregations"][key]["buckets"]:
                results[bucket["key"]] = bucket["doc_count"]

        try:
            # get first value in dict (key is .keyword)
            name = list(aggregatable_field_names.values())[0]
        except IndexError:
            name = None

        return build_pd_series(results, name=name)
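
This method backs Series.value_counts(); for each aggregatable field it adds a terms aggregation sized by es_size. A rough sketch of the aggregation it builds for a single field (index, field and size values are illustrative):

# Hypothetical aggs section for one aggregatable field, e.g. "category.keyword".
search_body = {
    "aggs": {
        "category.keyword": {
            "terms": {"field": "category.keyword", "size": 10}  # size == es_size
        }
    }
}
# The buckets in response["aggregations"]["category.keyword"]["buckets"] are then
# folded into a pandas Series mapping each bucket key to its doc_count.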
Example #3
    def __init__(self):
        self.query = Query()
        self.sort_field: Optional[str] = None
        self.sort_order: Optional[SortOrder] = None
        self.size: Optional[int] = None
        self.fields: Optional[List[str]] = None
        self.script_fields: Optional[Dict[str, Dict[str, Any]]] = None
Example #4
    def index_count(self, query_compiler, field):
        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)

        # Size is dictated by operations
        if size is not None:
            # TODO - this is not necessarily valid as the field may not exist in ALL these docs
            return size

        body = Query(query_params.query)
        body.exists(field, must=True)

        return query_compiler._client.count(
            index=query_compiler._index_pattern,
            body=body.to_count_body())["count"]
Example #5
    def es_info(self, query_compiler, buf):
        buf.write("Operations:\n")
        buf.write(f" tasks: {self._tasks}\n")

        query_params, post_processing = self._resolve_tasks(query_compiler)
        size, sort_params = Operations._query_params_to_size_and_sort(query_params)
        _source = query_compiler._mappings.get_field_names()

        script_fields = query_params.script_fields
        query = Query(query_params.query)
        body = query.to_search_body()
        if script_fields is not None:
            body["script_fields"] = script_fields

        buf.write(f" size: {size}\n")
        buf.write(f" sort_params: {sort_params}\n")
        buf.write(f" _source: {_source}\n")
        buf.write(f" body: {body}\n")
        buf.write(f" post_processing: {post_processing}\n")
Example #6
    def describe(self, query_compiler):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        numeric_source_fields = query_compiler._mappings.numeric_source_fields()

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        body = Query(query_params.query)

        for field in numeric_source_fields:
            body.metric_aggs("extended_stats_" + field, "extended_stats",
                             field)
            body.metric_aggs("percentiles_" + field, "percentiles", field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in numeric_source_fields:
            values = list()
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["count"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["avg"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["std_deviation"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["min"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["25.0"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["50.0"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["75.0"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["max"])

            # only keep the field if at least one value is not None
            if values.count(None) < len(values):
                results[field] = values

        df = pd.DataFrame(
            data=results,
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )

        return df
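
For each numeric field, describe() requests an extended_stats and a percentiles aggregation in the same search. A small sketch of the aggregation section for one field (the field name is a placeholder):

# Hypothetical aggs section for a single numeric field.
aggs = {
    "extended_stats_AvgTicketPrice": {
        "extended_stats": {"field": "AvgTicketPrice"}
    },
    "percentiles_AvgTicketPrice": {
        "percentiles": {"field": "AvgTicketPrice"}
    },
}
# Elasticsearch's default percentiles include 25.0, 50.0 and 75.0, which is why
# the values can be read back via ["values"]["25.0"] etc. above.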
Example #7
    def index_matches_count(self, query_compiler, field, items):
        query_params, post_processing = self._validate_index_operation(
            query_compiler, items)

        body = Query(query_params.query)

        if field == Index.ID_INDEX_FIELD:
            body.ids(items, must=True)
        else:
            body.terms(field, items, must=True)

        return query_compiler._client.count(
            index=query_compiler._index_pattern,
            body=body.to_count_body())["count"]
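
Depending on whether the index field is the document _id or an ordinary field, the count body uses an ids query or a terms query. A rough sketch of the two shapes (values are placeholders):

# Hypothetical count bodies for the two branches above.
ids_body = {"query": {"bool": {"must": [{"ids": {"values": ["0", "1", "2"]}}]}}}
terms_body = {"query": {"bool": {"must": [{"terms": {"some_field": ["a", "b"]}}]}}}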
Example #8
    def _es_results(self, query_compiler, collector):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size, sort_params = Operations._query_params_to_size_and_sort(
            query_params)

        script_fields = query_params.script_fields
        query = Query(query_params.query)

        body = query.to_search_body()
        if script_fields is not None:
            body["script_fields"] = script_fields

        # Only return requested field_names
        _source = query_compiler.get_field_names(include_scripted_fields=False)
        if _source:
            # For query_compiler._client.search we could add _source
            # as a parameter, or add this value in body.
            #
            # If _source is a parameter it is encoded into the URL.
            #
            # If _source is a large number of fields (1000+) then this can result in an
            # extremely long url and a `too_long_frame_exception`. Therefore, add
            # _source to the body rather than as a _source parameter
            body["_source"] = _source
        else:
            body["_source"] = False

        es_results = None

        # If size=None use scan not search - then post sort results when in df
        # If size>10000 use scan
        is_scan = False
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
                try:

                    es_results = query_compiler._client.search(
                        index=query_compiler._index_pattern,
                        size=size,
                        sort=sort_params,
                        body=body,
                    )
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {
                        "index": query_compiler._index_pattern,
                        "size": size,
                        "sort": sort_params,
                        "body": body,
                    }
                    print("Elasticsearch error:", error)
                    raise
        else:
            is_scan = True
            es_results = scan(
                client=query_compiler._client,
                index=query_compiler._index_pattern,
                query=body,
            )
            # create post sort
            if sort_params is not None:
                post_processing.append(SortFieldAction(sort_params))

        if is_scan:
            while True:
                partial_result, df = query_compiler._es_results_to_pandas(
                    es_results, collector.batch_size(),
                    collector.show_progress)
                df = self._apply_df_post_processing(df, post_processing)
                collector.collect(df)
                if not partial_result:
                    break
        else:
            partial_result, df = query_compiler._es_results_to_pandas(
                es_results)
            df = self._apply_df_post_processing(df, post_processing)
            collector.collect(df)
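
The scan branch relies on the helpers module of the official Python client (the import is outside this excerpt). A minimal sketch, with a placeholder host, index and field, of the two retrieval paths this method chooses between:

# Hypothetical illustration of the search vs. scan paths used above.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch("http://localhost:9200")                          # placeholder host
body = {"query": {"match_all": {}}, "_source": ["AvgTicketPrice"]}   # placeholder field

# size within the result window: a single search request
hits = es.search(index="flights", size=500, body=body)["hits"]["hits"]

# no size (or a very large one): stream every document with the scan helper,
# then sort in pandas afterwards via SortFieldAction
for hit in scan(client=es, index="flights", query=body):
    pass  # each hit is converted to pandas in batches by the caller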
Example #9
    def _hist_aggs(self, query_compiler, num_bins):
        # Get histogram bins and weights for numeric field_names
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        numeric_source_fields = query_compiler._mappings.numeric_source_fields()

        body = Query(query_params.query)

        results = self._metric_aggs(query_compiler, ["min", "max"],
                                    numeric_only=True)
        min_aggs = {}
        max_aggs = {}
        for field, (min_agg, max_agg) in results.items():
            min_aggs[field] = min_agg
            max_aggs[field] = max_agg

        for field in numeric_source_fields:
            body.hist_aggs(field, field, min_aggs[field], max_aggs[field],
                           num_bins)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        # results are like
        # "aggregations" : {
        #     "DistanceKilometers" : {
        #       "buckets" : [
        #         {
        #           "key" : 0.0,
        #           "doc_count" : 2956
        #         },
        #         {
        #           "key" : 1988.1482421875,
        #           "doc_count" : 768
        #         },
        #         ...

        bins = {}
        weights = {}

        # There is one more bin than weights
        # len(bins) = len(weights) + 1

        # bins = [  0.  36.  72. 108. 144. 180. 216. 252. 288. 324. 360.]
        # len(bins) == 11
        # weights = [10066.,   263.,   386.,   264.,   273.,   390.,   324.,   438.,   261.,   394.]
        # len(weights) == 10

        # ES returns
        # weights = [10066.,   263.,   386.,   264.,   273.,   390.,   324.,   438.,   261.,   252.,    142.]
        # So sum last 2 buckets
        for field in numeric_source_fields:

            # in case of series, let plotting.ed_hist_series throw an exception
            if not response.get("aggregations"):
                continue

            # in case of dataframe, throw warning that field is excluded
            if not response["aggregations"].get(field):
                warnings.warn(
                    f"{field} has no meaningful histogram interval and will be excluded. "
                    f"All values 0.",
                    UserWarning,
                )
                continue

            buckets = response["aggregations"][field]["buckets"]

            bins[field] = []
            weights[field] = []

            for bucket in buckets:
                bins[field].append(bucket["key"])

                if bucket == buckets[-1]:
                    weights[field][-1] += bucket["doc_count"]
                else:
                    weights[field].append(bucket["doc_count"])

        df_bins = pd.DataFrame(data=bins)
        df_weights = pd.DataFrame(data=weights)
        return df_bins, df_weights
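
The returned pair follows the numpy histogram convention of one more bin edge than weight. A minimal, self-contained sketch (assuming matplotlib, with made-up values standing in for the frames returned above) of how a caller could plot one column:

# Hypothetical plotting of one field from the (df_bins, df_weights) pair.
import matplotlib.pyplot as plt
import pandas as pd

df_bins = pd.DataFrame({"DistanceKilometers": [0.0, 120.0, 240.0, 360.0]})   # bin edges
df_weights = pd.DataFrame({"DistanceKilometers": [10, 25, 7]})               # counts per bin

edges = df_bins["DistanceKilometers"].to_numpy()       # len == len(weights) + 1
weights = df_weights["DistanceKilometers"].to_numpy()
plt.hist(edges[:-1], bins=edges, weights=weights)      # standard pre-binned histogram recipe
plt.show()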
Example #10
    def _metric_aggs(self,
                     query_compiler: "QueryCompiler",
                     pd_aggs,
                     numeric_only=True):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        results = {}
        fields = query_compiler._mappings.all_source_fields()
        if numeric_only:
            fields = [
                field for field in fields
                if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        for field in fields:
            for es_agg in es_aggs:
                if not field.is_es_agg_compatible(es_agg):
                    continue

                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{field.es_field_name}",
                        es_agg[0],
                        field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{field.es_field_name}",
                        es_agg,
                        field.aggregatable_es_field_name,
                    )

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        """
        Results are like (for 'sum', 'min')

             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
        min    1.000205e+02        0.000000e+00   0.000000e+00               0
        """
        for field in fields:
            values = []
            for es_agg, pd_agg in zip(es_aggs, pd_aggs):

                # If the field and agg aren't compatible we add a NaN
                if not field.is_es_agg_compatible(es_agg):
                    values.append(np.float64(np.NaN))
                    continue

                if isinstance(es_agg, tuple):
                    agg_value = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"]

                    # Pull multiple values from 'percentiles' result.
                    if es_agg[0] == "percentiles":
                        agg_value = agg_value["values"]

                    agg_value = agg_value[es_agg[1]]

                    # Need to convert 'Population' stddev and variance
                    # from Elasticsearch into 'Sample' stddev and variance
                    # which is what pandas uses.
                    if es_agg[1] in ("std_deviation", "variance"):
                        # Neither transformation works with count <=1
                        count = response["aggregations"][
                            f"{es_agg[0]}_{field.es_field_name}"]["count"]

                        # All of the below calculations result in NaN if count<=1
                        if count <= 1:
                            agg_value = np.float64(np.NaN)

                        elif es_agg[1] == "std_deviation":
                            agg_value *= count / (count - 1.0)

                        else:  # es_agg[1] == "variance"
                            # sample_std=\sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # population_std=\sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
                            # sample_std=\sqrt{\frac{N}{N-1}\,population_std^2}
                            agg_value = np.sqrt((count / (count - 1.0)) *
                                                agg_value * agg_value)
                else:
                    agg_value = response["aggregations"][
                        f"{es_agg}_{field.es_field_name}"]
                    if "value_as_string" in agg_value and field.is_timestamp:
                        agg_value = elasticsearch_date_to_pandas_date(
                            agg_value["value_as_string"], field.es_date_format)
                    else:
                        agg_value = agg_value["value"]

                # These aggregations maintain the column datatype
                if pd_agg in ("max", "min"):
                    agg_value = field.np_dtype.type(agg_value)

                # Null usually means there were no results.
                if agg_value is None:
                    agg_value = np.float64(np.NaN)

                values.append(agg_value)

            results[field.index] = values if len(values) > 1 else values[0]

        return results
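
The formulas in the variance branch can be sanity-checked directly with numpy: the sample variance is N/(N-1) times the population variance that extended_stats reports, and the sample standard deviation is the square root of that product. A small sketch with made-up data:

# Hypothetical check of the population -> sample conversion described above.
import numpy as np

x = np.array([1.0, 2.0, 4.0, 7.0])   # placeholder data
n = len(x)

pop_var = np.var(x)    # ddof=0, what extended_stats reports as "variance"
pop_std = np.std(x)    # ddof=0, what extended_stats reports as "std_deviation"

sample_var = n / (n - 1.0) * pop_var
sample_std = np.sqrt(n / (n - 1.0) * pop_std ** 2)

assert np.isclose(sample_var, np.var(x, ddof=1))
assert np.isclose(sample_std, np.std(x, ddof=1))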
Example #11
    def test_copy(self):
        q = Query()

        q.exists("field_a")
        q.exists("field_b", must=False)

        print(q.to_search_body())

        q1 = Query(q)

        q.exists("field_c", must=False)
        q1.exists("field_c1", must=False)

        print(q.to_search_body())
        print(q1.to_search_body())
Example #12
    def _groupby_aggs(
        self,
        query_compiler: "QueryCompiler",
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: bool = True,
    ) -> Tuple[List[str], Dict[str, Any]]:
        """
        This method is used to calculate groupby aggregations

        Parameters
        ----------
        query_compiler:
            A Query compiler
        by:
            a list of columns on which groupby operations have to be performed
        pd_aggs:
            a list of aggregations to be performed
        dropna:
            Drop None values if True.
            TODO Not yet implemented
        is_dataframe_agg:
            Whether a multi-aggregation or a single agg is called.
        numeric_only:
            return either numeric values or NaN/NaT

        Returns
        -------
        headers: columns on which MultiIndex has to be applied
        response: dictionary of groupby aggregated values
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(
            by=by)

        # Used defaultdict to avoid initialization of columns with lists
        response: Dict[str, List[Any]] = defaultdict(list)

        if numeric_only:
            agg_fields = [
                field for field in agg_fields
                if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        # Construct Query
        for by_field in by_fields:
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(name=f"groupby_{by_field.column}",
                                            field=by_field.es_field_name)

        for field in agg_fields:
            for es_agg in es_aggs:
                if not field.is_es_agg_compatible(es_agg):
                    continue

                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{field.es_field_name}",
                        es_agg[0],
                        field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{field.es_field_name}",
                        es_agg,
                        field.aggregatable_es_field_name,
                    )

        # Composite aggregation
        body.composite_agg_start(size=DEFAULT_PAGINATION_SIZE,
                                 name="groupby_buckets",
                                 dropna=dropna)

        def bucket_generator() -> Generator[List[str], None, List[str]]:
            """
            e.g.
            "aggregations": {
                "groupby_buckets": {
                    "after_key": {"total_quantity": 8},
                    "buckets": [
                        {
                            "key": {"total_quantity": 1},
                            "doc_count": 87,
                            "taxful_total_price_avg": {"value": 48.035978536496216},
                        }
                    ],
                }
            }
            Returns
            -------
            A generator which yields the current page of buckets.
            If an after_key is present, it is used to fetch the next page of buckets.

            """
            while True:
                res = query_compiler._client.search(
                    index=query_compiler._index_pattern,
                    size=0,
                    body=body.to_search_body(),
                )

                # Pagination Logic
                composite_buckets = res["aggregations"]["groupby_buckets"]
                if "after_key" in composite_buckets:

                    # yield the bucket which contains the result
                    yield composite_buckets["buckets"]

                    body.composite_agg_after_key(
                        name="groupby_buckets",
                        after_key=composite_buckets["after_key"],
                    )
                else:
                    return composite_buckets["buckets"]

        for buckets in bucket_generator():
            # We receive the response row-wise
            for bucket in buckets:
                # groupby columns are added to the result in the same way they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                    # Datetimes always come back as integers, convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")

                    response[by_field.column].append(bucket_key)

                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    is_dataframe_agg=is_dataframe_agg,
                )
                # Process the calculated agg values to response
                for key, value in agg_calculation.items():
                    if isinstance(value, list):
                        for pd_agg, val in zip(pd_aggs, value):
                            response[f"{key}_{pd_agg}"].append(val)
                    else:
                        response[key].append(value)

        return [field.column for field in agg_fields], response
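
The pagination in bucket_generator is the standard composite-aggregation pattern: request a page of DEFAULT_PAGINATION_SIZE buckets, then re-issue the same search with the returned after_key until no after_key comes back. A rough sketch of the request body involved (field and metric names are illustrative, following the naming scheme above):

# Hypothetical composite aggregation request built by the methods above.
composite_body = {
    "aggs": {
        "groupby_buckets": {
            "composite": {
                "size": 500,   # stand-in for DEFAULT_PAGINATION_SIZE
                "sources": [
                    {"groupby_total_quantity": {"terms": {"field": "total_quantity"}}}
                ],
            },
            "aggs": {
                "avg_taxful_total_price": {"avg": {"field": "taxful_total_price"}}
            },
        }
    }
}

# The next page repeats the same body with the previous response's after_key:
composite_body["aggs"]["groupby_buckets"]["composite"]["after"] = {"groupby_total_quantity": 8}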
Example #13
    def _metric_aggs(
        self,
        query_compiler: "QueryCompiler",
        pd_aggs: List[str],
        numeric_only: Optional[bool] = None,
        is_dataframe_agg: bool = False,
    ) -> Dict[str, Any]:
        """
        Used to calculate metric aggregations
        https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics.html

        Parameters
        ----------
        query_compiler:
            Query Compiler object
        pd_aggs:
            aggregations that are to be performed on dataframe or series
        numeric_only:
            return either all numeric values or NaN/NaT
        is_dataframe_agg:
            Whether this method is called from a single-agg or an aggregation method

        Returns
        -------
            A dictionary which contains all aggregations calculated.
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        fields = query_compiler._mappings.all_source_fields()
        if numeric_only:
            # Consider if field is Int/Float/Bool
            fields = [
                field for field in fields
                if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        for field in fields:
            for es_agg in es_aggs:
                # NaN/NaT fields are ignored
                if not field.is_es_agg_compatible(es_agg):
                    continue

                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{field.es_field_name}",
                        es_agg[0],
                        field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{field.es_field_name}",
                        es_agg,
                        field.aggregatable_es_field_name,
                    )

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        """
        Results are like (for 'sum', 'min')

             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
        min    1.000205e+02        0.000000e+00   0.000000e+00               0
        """

        return self._unpack_metric_aggs(
            fields=fields,
            es_aggs=es_aggs,
            pd_aggs=pd_aggs,
            response=response,
            numeric_only=numeric_only,
            is_dataframe_agg=is_dataframe_agg,
        )
Example #14
    def aggs_groupby(
        self,
        query_compiler: "QueryCompiler",
        by: List[str],
        pd_aggs: List[str],
        dropna: bool = True,
        is_dataframe_agg: bool = False,
        numeric_only: Optional[bool] = True,
    ) -> pd.DataFrame:
        """
        This method is used to construct groupby aggregation dataframe

        Parameters
        ----------
        query_compiler:
            A Query compiler
        by:
            a list of columns on which groupby operations have to be performed
        pd_aggs:
            a list of aggregations to be performed
        dropna:
            Drop None values if True.
            TODO Not yet implemented
        is_dataframe_agg:
            Whether groupby with aggregation or a single agg is called.
        numeric_only:
            return either numeric values or NaN/NaT

        Returns
        -------
            A dataframe containing the groupby data
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}"
            )

        by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)

        # Used defaultdict to avoid initialization of columns with lists
        results: Dict[str, List[Any]] = defaultdict(list)

        if numeric_only:
            agg_fields = [
                field for field in agg_fields if (field.is_numeric or field.is_bool)
            ]

        body = Query(query_params.query)

        # To return for creating multi-index on columns
        headers = [agg_field.column for agg_field in agg_fields]

        # Convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        # Construct Query
        for by_field in by_fields:
            if by_field.aggregatable_es_field_name is None:
                raise ValueError(
                    f"Cannot use {by_field.column!r} with groupby() because "
                    f"it has no aggregatable fields in Elasticsearch"
                )
            # groupby fields will be term aggregations
            body.composite_agg_bucket_terms(
                name=f"groupby_{by_field.column}",
                field=by_field.aggregatable_es_field_name,
            )

        for agg_field in agg_fields:
            for es_agg in es_aggs:
                # Skip if the field isn't compatible or if the agg is
                # 'value_count' as this value is pulled from bucket.doc_count.
                if not agg_field.is_es_agg_compatible(es_agg):
                    continue

                # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
                if isinstance(es_agg, tuple):
                    body.metric_aggs(
                        f"{es_agg[0]}_{agg_field.es_field_name}",
                        es_agg[0],
                        agg_field.aggregatable_es_field_name,
                    )
                else:
                    body.metric_aggs(
                        f"{es_agg}_{agg_field.es_field_name}",
                        es_agg,
                        agg_field.aggregatable_es_field_name,
                    )

        # Composite aggregation
        body.composite_agg_start(
            size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
        )

        for buckets in self.bucket_generator(query_compiler, body):
            # We receive the response row-wise
            for bucket in buckets:
                # groupby columns are added to the result in the same way they are returned
                for by_field in by_fields:
                    bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                    # Datetimes always come back as integers, convert to pd.Timestamp()
                    if by_field.is_timestamp and isinstance(bucket_key, int):
                        bucket_key = pd.to_datetime(bucket_key, unit="ms")

                    results[by_field.column].append(bucket_key)

                agg_calculation = self._unpack_metric_aggs(
                    fields=agg_fields,
                    es_aggs=es_aggs,
                    pd_aggs=pd_aggs,
                    response={"aggregations": bucket},
                    numeric_only=numeric_only,
                    # We set 'True' here because we want the value
                    # unpacking to always be in 'dataframe' mode.
                    is_dataframe_agg=True,
                )

                # Process the calculated agg values to response
                for key, value in agg_calculation.items():
                    if not isinstance(value, list):
                        results[key].append(value)
                        continue
                    for pd_agg, val in zip(pd_aggs, value):
                        results[f"{key}_{pd_agg}"].append(val)

        agg_df = pd.DataFrame(results).set_index(by)

        if is_dataframe_agg:
            # Convert header columns to MultiIndex
            agg_df.columns = pd.MultiIndex.from_product([headers, pd_aggs])
        else:
            # Convert header columns to Index
            agg_df.columns = pd.Index(headers)

        return agg_df
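
The final reshaping is plain pandas: the flat results dict is turned into a frame indexed by the groupby columns, and for a dataframe-style aggregation the f"{field}_{agg}" columns are relabelled with a (field, agg) MultiIndex. A small self-contained sketch of that last step with made-up values:

# Hypothetical miniature of the MultiIndex step above.
import pandas as pd

results = {
    "Carrier": ["A", "B"],                    # the 'by' column
    "AvgTicketPrice_mean": [500.0, 640.0],    # f"{field}_{pd_agg}" keys
    "AvgTicketPrice_max": [980.0, 1190.0],
}
agg_df = pd.DataFrame(results).set_index("Carrier")
agg_df.columns = pd.MultiIndex.from_product([["AvgTicketPrice"], ["mean", "max"]])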