def _metric_agg_series(
    self,
    query_compiler: "QueryCompiler",
    agg: List,
    numeric_only: Optional[bool] = None,
) -> pd.Series:
    """Run metric aggregations and wrap the results in a pandas Series.

    Parameters
    ----------
    query_compiler:
        Query compiler describing the index/tasks to aggregate over.
    agg:
        List of aggregation names to apply.
    numeric_only:
        When truthy, every result is coerced to float64.

    Returns
    -------
    pandas.Series
        Aggregation results keyed by field name.
    """
    results = self._metric_aggs(query_compiler, agg, numeric_only=numeric_only)

    if numeric_only:
        return build_pd_series(results, index=results.keys(), dtype=np.float64)

    # Pick the narrowest dtype that can represent every returned value.
    values = results.values()
    if all(isinstance(v, float) for v in values):
        dtype = np.float64
    elif all(isinstance(v, int) for v in values):
        dtype = np.int64
    elif len(results) <= 1:
        # A single non-numeric result keeps its natural dtype
        # rather than being forced to object.
        dtype = None
    else:
        dtype = "object"
    return build_pd_series(results, index=results.keys(), dtype=dtype)
def test_empty_series_dtypes():
    """Building a Series from an empty mapping is warning-free and dtype-correct."""
    # With no dtype given, an empty input yields the canonical empty dtype.
    with warnings.catch_warnings(record=True) as caught:
        series = build_pd_series({})
        assert series.dtype == EMPTY_SERIES_DTYPE
        assert caught == []

    # An explicitly requested dtype must be honored ("isn't ignored")
    # even when the resulting Series is empty.
    with warnings.catch_warnings(record=True) as caught:
        series = build_pd_series({}, dtype=np.int32)
        assert np.int32 != EMPTY_SERIES_DTYPE
        assert series.dtype == np.int32
        assert caught == []
def count(self, query_compiler):
    """Return a per-field count of documents in which the field exists.

    Elasticsearch ``_count`` is very efficient, so it is used to produce the
    results here. Because ``_count`` does not support ``size``, data frames
    that carry size-restricting or sort parameters cannot be answered
    correctly and raise instead. Longer term we may fall back to pandas,
    though that could mean loading the whole index into memory.
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)

    if self._size(query_params, post_processing) is not None:
        raise NotImplementedError(
            f"Requesting count with additional query and processing parameters "
            f"not supported {query_params} {post_processing}")

    # Only count the field names the caller actually requested.
    fields = query_compiler.get_field_names(include_scripted_fields=False)

    counts = {}
    for field_name in fields:
        exists_query = Query(query_params.query)
        exists_query.exists(field_name, must=True)
        response = query_compiler._client.count(
            index=query_compiler._index_pattern,
            body=exists_query.to_count_body(),
        )
        counts[field_name] = response["count"]

    return build_pd_series(data=counts, index=fields)
def _terms_aggs(
    self, query_compiler: "QueryCompiler", func: str, es_size: int
) -> pd.Series:
    """
    Parameters
    ----------
    es_size: int, default None
        Parameter used by Series.value_counts()

    Returns
    -------
    pandas.Series
        Series containing results of `func` applied to the field_name(s)
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)

    # A size-restricted frame cannot be answered via terms aggregations.
    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Can not count field matches if size is set {size}"
        )

    # Only aggregatable fields can participate in a terms aggregation.
    aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

    body = Query(query_params.query)
    for agg_field in aggregatable_field_names:
        body.terms_aggs(agg_field, func, agg_field, es_size=es_size)

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    # Mapping keys are the aggregatable names (e.g. "category.keyword");
    # mapping values are their display labels (e.g. "category").
    results = {}
    for agg_field in aggregatable_field_names:
        for bucket in response["aggregations"][agg_field]["buckets"]:
            results[bucket["key"]] = bucket["doc_count"]

    # Name the Series after the first field's label; None when there
    # are no aggregatable fields at all.
    name = next(iter(aggregatable_field_names.values()), None)

    return build_pd_series(results, name=name)
def mad(self, query_compiler, numeric_only=True):
    """Return the median absolute deviation of each field as a Series."""
    agg_results = self._metric_aggs(
        query_compiler, ["mad"], numeric_only=numeric_only
    )
    return build_pd_series(agg_results, index=agg_results.keys())
def nunique(self, query_compiler):
    """Return the number of unique values per field as a Series."""
    agg_results = self._metric_aggs(query_compiler, ["nunique"], numeric_only=False)
    return build_pd_series(agg_results, index=agg_results.keys())