Example #1
    def _metric_agg_series(
        self,
        query_compiler: "QueryCompiler",
        agg: List,
        numeric_only: Optional[bool] = None,
    ) -> pd.Series:
        results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)
        if numeric_only:
            return build_pd_series(results, index=results.keys(), dtype=np.float64)
        else:
            # If all results are floats, use float64
            if all(isinstance(i, float) for i in results.values()):
                dtype = np.float64
            # If all results are ints, use int64
            elif all(isinstance(i, int) for i in results.values()):
                dtype = np.int64
            # With a single result, let pandas infer the dtype instead of forcing object
            elif len(results) <= 1:
                dtype = None
            else:
                dtype = "object"
            return build_pd_series(results, index=results.keys(), dtype=dtype)
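For reference, the dtype-selection branches above can be exercised in isolation with plain pandas; the helper name and the sample results dicts below are invented for illustration.

import numpy as np
import pandas as pd

def infer_results_series(results: dict) -> pd.Series:
    # Mirror of the branching above: homogeneous floats -> float64,
    # homogeneous ints -> int64, a single value -> let pandas infer,
    # anything else -> object.
    if all(isinstance(v, float) for v in results.values()):
        dtype = np.float64
    elif all(isinstance(v, int) for v in results.values()):
        dtype = np.int64
    elif len(results) <= 1:
        dtype = None
    else:
        dtype = "object"
    return pd.Series(results, dtype=dtype)

print(infer_results_series({"a": 1.5, "b": 2.0}).dtype)  # float64
print(infer_results_series({"a": 1, "b": 2}).dtype)      # int64
print(infer_results_series({"a": "x", "b": 1}).dtype)    # object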
Example #2
def test_empty_series_dtypes():
    with warnings.catch_warnings(record=True) as w:
        s = build_pd_series({})
    assert s.dtype == EMPTY_SERIES_DTYPE
    assert w == []

    # Ensure that a passed-in dtype isn't ignored
    # even if the result is empty.
    with warnings.catch_warnings(record=True) as w:
        s = build_pd_series({}, dtype=np.int32)
    assert np.int32 != EMPTY_SERIES_DTYPE
    assert s.dtype == np.int32
    assert w == []
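The test implies that build_pd_series mainly exists to avoid pandas' warning about constructing an empty Series with no explicit dtype. A minimal sketch of such a helper, assuming EMPTY_SERIES_DTYPE is simply the fallback dtype for empty data (the actual eland implementation may differ):

import numpy as np
import pandas as pd

# Assumed fallback dtype for empty results; newer pandas warns when an empty
# Series is built without an explicit dtype.
EMPTY_SERIES_DTYPE = np.float64

def build_pd_series(data, dtype=None, **kwargs):
    # Only force a dtype when the caller supplied one or the data is empty,
    # so the "empty Series without dtype" warning never fires.
    if dtype is None and not data:
        dtype = EMPTY_SERIES_DTYPE
    if dtype is not None:
        kwargs["dtype"] = dtype
    return pd.Series(data, **kwargs)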
Example #3
    def count(self, query_compiler):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        # Elasticsearch _count is very efficient, so it is used to return results here. This means
        # that data frames with restricted size or sort params will not return valid results
        # (_count doesn't support size).
        # Longer term we may fall back to pandas, but that may mean loading the entire index
        # into memory.
        if self._size(query_params, post_processing) is not None:
            raise NotImplementedError(
                f"Requesting count with additional query and processing parameters "
                f"not supported {query_params} {post_processing}")

        # Only return requested field_names
        fields = query_compiler.get_field_names(include_scripted_fields=False)

        counts = {}
        for field in fields:
            body = Query(query_params.query)
            body.exists(field, must=True)

            field_exists_count = query_compiler._client.count(
                index=query_compiler._index_pattern,
                body=body.to_count_body())["count"]
            counts[field] = field_exists_count

        return build_pd_series(data=counts, index=fields)
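Each loop iteration above amounts to a plain _count request with an exists filter. A rough equivalent against the low-level client (index and field names are invented):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# Count documents in which the field is present; missing/null values are
# excluded, which is why the per-field counts can differ.
resp = es.count(
    index="flights",
    body={"query": {"bool": {"must": [{"exists": {"field": "Carrier"}}]}}},
)
print(resp["count"])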
Example #4
    def _terms_aggs(
        self, query_compiler: "QueryCompiler", func: str, es_size: int
    ) -> pd.Series:
        """
        Parameters
        ----------
        es_size: int
            Parameter used by Series.value_counts()

        Returns
        -------
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}"
            )

        # Get just aggregatable field_names
        aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

        body = Query(query_params.query)

        for field in aggregatable_field_names.keys():
            body.terms_aggs(field, func, field, es_size=es_size)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern, size=0, body=body.to_search_body()
        )

        results = {}

        for key in aggregatable_field_names.keys():
            # key is aggregatable field, value is label
            # e.g. key=category.keyword, value=category
            for bucket in response["aggregations"][key]["buckets"]:
                results[bucket["key"]] = bucket["doc_count"]

        try:
            # get first value in dict (key is .keyword)
            name = list(aggregatable_field_names.values())[0]
        except IndexError:
            name = None

        return build_pd_series(results, name=name)
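The search body assembled by Query.terms_aggs above presumably boils down to one terms aggregation per aggregatable field, issued with size=0 so no hits are returned. A hand-written equivalent for a single field ("category.keyword" and the bucket size are invented examples):

search_body = {
    "aggs": {
        "category.keyword": {
            "terms": {"field": "category.keyword", "size": 10}  # size = es_size
        }
    }
}
# query_compiler._client.search(index=..., size=0, body=search_body) would then
# expose response["aggregations"]["category.keyword"]["buckets"], where each
# bucket carries the "key" and "doc_count" consumed in the loop above.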
Example #5
    def mad(self, query_compiler, numeric_only=True):
        results = self._metric_aggs(query_compiler, ["mad"], numeric_only=numeric_only)
        return build_pd_series(results, index=results.keys())
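"mad" here is the median absolute deviation; on the Elasticsearch side it presumably maps to the median_absolute_deviation metric aggregation. A raw-body sketch for a single numeric field (the field name and aggregation key are invented):

agg_body = {
    "size": 0,
    "aggs": {
        "AvgTicketPrice_mad": {
            "median_absolute_deviation": {"field": "AvgTicketPrice"}
        }
    },
}
# A search with this body returns the metric under
# response["aggregations"]["AvgTicketPrice_mad"]["value"].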
Example #6
    def nunique(self, query_compiler):
        results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
        return build_pd_series(results, index=results.keys())
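nunique is forced to numeric_only=False because distinct counts make sense for non-numeric fields too; on the Elasticsearch side it presumably uses the (approximate) cardinality aggregation. A raw-body sketch (field name and aggregation key are invented):

agg_body = {
    "size": 0,
    "aggs": {
        "Carrier_nunique": {"cardinality": {"field": "Carrier"}}
    },
}
# response["aggregations"]["Carrier_nunique"]["value"] holds the approximate
# number of distinct values for the field.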