def count(self, query_compiler):
    query_params, post_processing = self._resolve_tasks(query_compiler)

    # Elasticsearch _count is very efficient and so is used to return results here. This means
    # that data frames with restricted size or sort params will not return valid results
    # (_count doesn't support size).
    # Longer term we may fall back to pandas, but this may result in loading the entire
    # index into memory.
    if self._size(query_params, post_processing) is not None:
        raise NotImplementedError(
            f"Requesting count with additional query and processing parameters "
            f"not supported {query_params} {post_processing}"
        )

    # Only return requested field_names
    fields = query_compiler.get_field_names(include_scripted_fields=False)

    counts = {}
    for field in fields:
        body = Query(query_params.query)
        body.exists(field, must=True)

        field_exists_count = query_compiler._client.count(
            index=query_compiler._index_pattern, body=body.to_count_body()
        )["count"]
        counts[field] = field_exists_count

    return build_pd_series(data=counts, index=fields)
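# Usage sketch (illustrative, not part of this module): the per-field count above is
# equivalent to issuing one _count request per field with an `exists` filter. The
# client `es` and the "flights" index/"Carrier" field are assumptions for the example.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
resp = es.count(
    index="flights",
    body={"query": {"bool": {"must": [{"exists": {"field": "Carrier"}}]}}},
)
print(resp["count"])  # number of documents in which 'Carrier' exists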
def _terms_aggs(
    self, query_compiler: "QueryCompiler", func: str, es_size: int
) -> pd.Series:
    """
    Parameters
    ----------
    es_size: int
        Parameter used by Series.value_counts()

    Returns
    -------
    pandas.Series
        Series containing results of `func` applied to the field_name(s)
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    # Get just aggregatable field_names
    aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

    body = Query(query_params.query)

    for field in aggregatable_field_names.keys():
        body.terms_aggs(field, func, field, es_size=es_size)

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    results = {}
    for key in aggregatable_field_names.keys():
        # key is the aggregatable field, value is its label
        # e.g. key=category.keyword, value=category
        for bucket in response["aggregations"][key]["buckets"]:
            results[bucket["key"]] = bucket["doc_count"]

    try:
        # get the first value in the dict (the key is the .keyword field)
        name = list(aggregatable_field_names.values())[0]
    except IndexError:
        name = None

    return build_pd_series(results, name=name)
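# Sketch of the kind of request this builds for Series.value_counts() (assumed
# example; the index name, field name, and size are illustrative):
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
resp = es.search(
    index="flights",
    size=0,
    body={"aggs": {"Carrier": {"terms": {"field": "Carrier", "size": 10}}}},
)
for bucket in resp["aggregations"]["Carrier"]["buckets"]:
    print(bucket["key"], bucket["doc_count"])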
def __init__(self):
    self.query = Query()
    self.sort_field: Optional[str] = None
    self.sort_order: Optional[SortOrder] = None
    self.size: Optional[int] = None
    self.fields: Optional[List[str]] = None
    self.script_fields: Optional[Dict[str, Dict[str, Any]]] = None
def index_count(self, query_compiler, field):
    # field is the index field, so count its values
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)

    # Size is dictated by operations
    if size is not None:
        # TODO - this is not necessarily valid as the field may not exist in ALL these docs
        return size

    body = Query(query_params.query)
    body.exists(field, must=True)

    return query_compiler._client.count(
        index=query_compiler._index_pattern, body=body.to_count_body()
    )["count"]
def es_info(self, query_compiler, buf):
    buf.write("Operations:\n")
    buf.write(f" tasks: {self._tasks}\n")

    query_params, post_processing = self._resolve_tasks(query_compiler)
    size, sort_params = Operations._query_params_to_size_and_sort(query_params)
    _source = query_compiler._mappings.get_field_names()

    script_fields = query_params.script_fields
    query = Query(query_params.query)
    body = query.to_search_body()
    if script_fields is not None:
        body["script_fields"] = script_fields

    buf.write(f" size: {size}\n")
    buf.write(f" sort_params: {sort_params}\n")
    buf.write(f" _source: {_source}\n")
    buf.write(f" body: {body}\n")
    buf.write(f" post_processing: {post_processing}\n")
def describe(self, query_compiler):
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    numeric_source_fields = query_compiler._mappings.numeric_source_fields()

    # for each field we compute:
    # count, mean, std, min, 25%, 50%, 75%, max
    body = Query(query_params.query)

    for field in numeric_source_fields:
        body.metric_aggs("extended_stats_" + field, "extended_stats", field)
        body.metric_aggs("percentiles_" + field, "percentiles", field)

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    results = {}

    for field in numeric_source_fields:
        values = list()
        values.append(response["aggregations"]["extended_stats_" + field]["count"])
        values.append(response["aggregations"]["extended_stats_" + field]["avg"])
        values.append(response["aggregations"]["extended_stats_" + field]["std_deviation"])
        values.append(response["aggregations"]["extended_stats_" + field]["min"])
        values.append(response["aggregations"]["percentiles_" + field]["values"]["25.0"])
        values.append(response["aggregations"]["percentiles_" + field]["values"]["50.0"])
        values.append(response["aggregations"]["percentiles_" + field]["values"]["75.0"])
        values.append(response["aggregations"]["extended_stats_" + field]["max"])

        # only include the field if it has at least one non-None value
        if values.count(None) < len(values):
            results[field] = values

    df = pd.DataFrame(
        data=results,
        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )

    return df
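# Shape of the aggregation request describe() builds per numeric field (an
# illustrative sketch; "AvgTicketPrice" is an assumed field name). Note that the
# Elasticsearch percentiles agg includes 25/50/75 in its default percents, which
# is why the "25.0"/"50.0"/"75.0" keys above exist without being requested:
body = {
    "size": 0,
    "aggs": {
        "extended_stats_AvgTicketPrice": {
            "extended_stats": {"field": "AvgTicketPrice"}
        },
        "percentiles_AvgTicketPrice": {"percentiles": {"field": "AvgTicketPrice"}},
    },
}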
def index_matches_count(self, query_compiler, field, items):
    query_params, post_processing = self._validate_index_operation(
        query_compiler, items
    )

    body = Query(query_params.query)

    if field == Index.ID_INDEX_FIELD:
        body.ids(items, must=True)
    else:
        body.terms(field, items, must=True)

    return query_compiler._client.count(
        index=query_compiler._index_pattern, body=body.to_count_body()
    )["count"]
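# Sketch of the ids-based count issued when `field` is the document '_id'
# (illustrative; index name and id values are assumptions):
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")
resp = es.count(
    index="flights",
    body={"query": {"bool": {"must": [{"ids": {"values": ["0", "1", "2"]}}]}}},
)
print(resp["count"])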
def _es_results(self, query_compiler, collector):
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size, sort_params = Operations._query_params_to_size_and_sort(query_params)

    script_fields = query_params.script_fields
    query = Query(query_params.query)

    body = query.to_search_body()
    if script_fields is not None:
        body["script_fields"] = script_fields

    # Only return requested field_names
    _source = query_compiler.get_field_names(include_scripted_fields=False)
    if _source:
        # For query_compiler._client.search we could add _source
        # as a parameter, or add this value in body.
        #
        # If _source is a parameter it is encoded into the URL.
        #
        # If _source is a large number of fields (1000+) then this can result in an
        # extremely long URL and a `too_long_frame_exception`. Therefore, add
        # _source to the body rather than as a _source parameter
        body["_source"] = _source
    else:
        body["_source"] = False

    es_results = None

    # If size=None, use scan rather than search - then post-sort the results in the df.
    # If size > DEFAULT_ES_MAX_RESULT_WINDOW (10000), also use scan.
    is_scan = False
    if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
        if size > 0:
            try:
                es_results = query_compiler._client.search(
                    index=query_compiler._index_pattern,
                    size=size,
                    sort=sort_params,
                    body=body,
                )
            except Exception:
                # Catch all ES errors and print debug (currently to stdout)
                error = {
                    "index": query_compiler._index_pattern,
                    "size": size,
                    "sort": sort_params,
                    "body": body,
                }
                print("Elasticsearch error:", error)
                raise
    else:
        is_scan = True
        es_results = scan(
            client=query_compiler._client,
            index=query_compiler._index_pattern,
            query=body,
        )
        # scan results are unsorted, so create a post-processing sort
        if sort_params is not None:
            post_processing.append(SortFieldAction(sort_params))

    if is_scan:
        while True:
            partial_result, df = query_compiler._es_results_to_pandas(
                es_results, collector.batch_size(), collector.show_progress
            )
            df = self._apply_df_post_processing(df, post_processing)
            collector.collect(df)
            if not partial_result:
                break
    else:
        partial_result, df = query_compiler._es_results_to_pandas(es_results)
        df = self._apply_df_post_processing(df, post_processing)
        collector.collect(df)
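# Minimal sketch of the scan path above (assumed setup; elasticsearch.helpers.scan
# pages through results with the scroll API, which is why scan results arrive
# unsorted and must be sorted client-side afterwards):
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch("http://localhost:9200")
for hit in scan(client=es, index="flights", query={"query": {"match_all": {}}}):
    print(hit["_source"])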
def _hist_aggs(self, query_compiler, num_bins):
    # Get histogram bins and weights for numeric field_names
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    numeric_source_fields = query_compiler._mappings.numeric_source_fields()

    body = Query(query_params.query)

    results = self._metric_aggs(query_compiler, ["min", "max"], numeric_only=True)

    min_aggs = {}
    max_aggs = {}
    for field, (min_agg, max_agg) in results.items():
        min_aggs[field] = min_agg
        max_aggs[field] = max_agg

    for field in numeric_source_fields:
        body.hist_aggs(field, field, min_aggs[field], max_aggs[field], num_bins)

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    # results are like
    # "aggregations" : {
    #   "DistanceKilometers" : {
    #     "buckets" : [
    #       {
    #         "key" : 0.0,
    #         "doc_count" : 2956
    #       },
    #       {
    #         "key" : 1988.1482421875,
    #         "doc_count" : 768
    #       },
    #       ...

    bins = {}
    weights = {}

    # There is one more bin than weights:
    # len(bins) == len(weights) + 1
    #
    # bins = [0. 36. 72. 108. 144. 180. 216. 252. 288. 324. 360.]
    # len(bins) == 11
    # weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 394.]
    # len(weights) == 10
    #
    # Elasticsearch returns
    # weights = [10066., 263., 386., 264., 273., 390., 324., 438., 261., 252., 142.]
    # so we sum the last 2 buckets
    for field in numeric_source_fields:
        # in the Series case, let plotting.ed_hist_series throw an exception
        if not response.get("aggregations"):
            continue

        # in the DataFrame case, warn that the field is excluded
        if not response["aggregations"].get(field):
            warnings.warn(
                f"{field} has no meaningful histogram interval and will be excluded. "
                f"All values 0.",
                UserWarning,
            )
            continue

        buckets = response["aggregations"][field]["buckets"]

        bins[field] = []
        weights[field] = []

        for bucket in buckets:
            bins[field].append(bucket["key"])

            if bucket == buckets[-1]:
                weights[field][-1] += bucket["doc_count"]
            else:
                weights[field].append(bucket["doc_count"])

    df_bins = pd.DataFrame(data=bins)
    df_weights = pd.DataFrame(data=weights)
    return df_bins, df_weights
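# Worked check of the bins/weights convention above against numpy (illustrative
# data; not part of this module). np.histogram uses the same shape contract that
# the last-two-bucket merge reproduces:
import numpy as np

data = np.array([1.0, 2.0, 2.5, 3.0, 9.0])
weights, bins = np.histogram(data, bins=4)
# numpy also returns one more bin edge than weights
assert len(bins) == len(weights) + 1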
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    results = {}

    fields = query_compiler._mappings.all_source_fields()
    if numeric_only:
        fields = [field for field in fields if (field.is_numeric or field.is_bool)]

    body = Query(query_params.query)

    # Convert pandas aggs to ES equivalent
    es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

    for field in fields:
        for es_agg in es_aggs:
            if not field.is_es_agg_compatible(es_agg):
                continue

            # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
            if isinstance(es_agg, tuple):
                body.metric_aggs(
                    f"{es_agg[0]}_{field.es_field_name}",
                    es_agg[0],
                    field.aggregatable_es_field_name,
                )
            else:
                body.metric_aggs(
                    f"{es_agg}_{field.es_field_name}",
                    es_agg,
                    field.aggregatable_es_field_name,
                )

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    """
    Results are like (for 'sum', 'min')

         AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
    sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
    min    1.000205e+02        0.000000e+00   0.000000e+00               0
    """
    for field in fields:
        values = []
        for es_agg, pd_agg in zip(es_aggs, pd_aggs):
            # If the field and agg aren't compatible we add a NaN
            if not field.is_es_agg_compatible(es_agg):
                values.append(np.float64(np.NaN))
                continue

            if isinstance(es_agg, tuple):
                agg_value = response["aggregations"][
                    f"{es_agg[0]}_{field.es_field_name}"
                ]

                # Pull multiple values from 'percentiles' result.
                if es_agg[0] == "percentiles":
                    agg_value = agg_value["values"]

                agg_value = agg_value[es_agg[1]]

                # Need to convert 'population' stddev and variance
                # from Elasticsearch into 'sample' stddev and variance
                # which is what pandas uses.
                if es_agg[1] in ("std_deviation", "variance"):
                    # Neither transformation works with count <= 1
                    count = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"
                    ]["count"]

                    # All of the below calculations result in NaN if count <= 1
                    if count <= 1:
                        agg_value = np.float64(np.NaN)

                    elif es_agg[1] == "std_deviation":
                        # sample_std = \sqrt{\frac{N}{N-1}} * population_std
                        agg_value *= np.sqrt(count / (count - 1.0))

                    else:  # es_agg[1] == "variance"
                        # sample_var = \frac{1}{N-1} \sum_{i=1}^N (x_i - \bar{x})^2
                        # population_var = \frac{1}{N} \sum_{i=1}^N (x_i - \bar{x})^2
                        # sample_var = \frac{N}{N-1} * population_var
                        agg_value *= count / (count - 1.0)
            else:
                agg_value = response["aggregations"][
                    f"{es_agg}_{field.es_field_name}"
                ]
                if "value_as_string" in agg_value and field.is_timestamp:
                    agg_value = elasticsearch_date_to_pandas_date(
                        agg_value["value_as_string"], field.es_date_format
                    )
                else:
                    agg_value = agg_value["value"]

            # These aggregations maintain the column datatype
            if pd_agg in ("max", "min"):
                agg_value = field.np_dtype.type(agg_value)

            # Null usually means there were no results.
            if agg_value is None:
                agg_value = np.float64(np.NaN)

            values.append(agg_value)
        results[field.index] = values if len(values) > 1 else values[0]

    return results
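# Worked check of the population -> sample conversion above (illustrative data):
# Elasticsearch extended_stats reports population (ddof=0) statistics, while
# pandas .std()/.var() report sample (ddof=1) statistics.
import numpy as np

x = np.array([2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0])
n = len(x)
assert np.isclose(x.var(ddof=0) * n / (n - 1.0), x.var(ddof=1))
assert np.isclose(x.std(ddof=0) * np.sqrt(n / (n - 1.0)), x.std(ddof=1))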
def test_copy(self):
    q = Query()
    q.exists("field_a")
    q.exists("field_b", must=False)
    print(q.to_search_body())

    # Copy the query, then modify the original and the copy independently
    q1 = Query(q)
    q.exists("field_c", must=False)
    q1.exists("field_c1", must=False)
    print(q.to_search_body())
    print(q1.to_search_body())
def _groupby_aggs(
    self,
    query_compiler: "QueryCompiler",
    by: List[str],
    pd_aggs: List[str],
    dropna: bool = True,
    is_dataframe_agg: bool = False,
    numeric_only: bool = True,
) -> Tuple[List[str], Dict[str, Any]]:
    """
    This method is used to calculate groupby aggregations

    Parameters
    ----------
    query_compiler:
        A Query compiler
    by:
        a list of columns on which groupby operations have to be performed
    pd_aggs:
        a list of aggregations to be performed
    dropna:
        Drop None values if True.
        TODO Not yet implemented
    is_dataframe_agg:
        Know if multi aggregation or single agg is called.
    numeric_only:
        return either numeric values or NaN/NaT

    Returns
    -------
    headers: columns on which MultiIndex has to be applied
    response: dictionary of groupby aggregated values
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)
    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)

    # Used defaultdict to avoid initialization of columns with lists
    response: Dict[str, List[Any]] = defaultdict(list)

    if numeric_only:
        agg_fields = [
            field for field in agg_fields if (field.is_numeric or field.is_bool)
        ]

    body = Query(query_params.query)

    # Convert pandas aggs to ES equivalent
    es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

    # Construct Query
    for by_field in by_fields:
        # groupby fields will be term aggregations
        body.composite_agg_bucket_terms(
            name=f"groupby_{by_field.column}", field=by_field.es_field_name
        )

    for field in agg_fields:
        for es_agg in es_aggs:
            if not field.is_es_agg_compatible(es_agg):
                continue

            # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
            if isinstance(es_agg, tuple):
                body.metric_aggs(
                    f"{es_agg[0]}_{field.es_field_name}",
                    es_agg[0],
                    field.aggregatable_es_field_name,
                )
            else:
                body.metric_aggs(
                    f"{es_agg}_{field.es_field_name}",
                    es_agg,
                    field.aggregatable_es_field_name,
                )

    # Composite aggregation
    body.composite_agg_start(
        size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
    )

    def bucket_generator() -> Generator[List[str], None, List[str]]:
        """
        e.g.
        "aggregations": {
            "groupby_buckets": {
                "after_key": {"total_quantity": 8},
                "buckets": [
                    {
                        "key": {"total_quantity": 1},
                        "doc_count": 87,
                        "taxful_total_price_avg": {"value": 48.035978536496216},
                    }
                ],
            }
        }

        Returns
        -------
        A generator which initially yields the bucket.
        If after_key is found, use it to fetch the next set of buckets.
        """
        while True:
            res = query_compiler._client.search(
                index=query_compiler._index_pattern,
                size=0,
                body=body.to_search_body(),
            )

            # Pagination Logic
            composite_buckets = res["aggregations"]["groupby_buckets"]
            if "after_key" in composite_buckets:
                # yield the bucket which contains the result
                yield composite_buckets["buckets"]
                body.composite_agg_after_key(
                    name="groupby_buckets",
                    after_key=composite_buckets["after_key"],
                )
            else:
                return composite_buckets["buckets"]

    for buckets in bucket_generator():
        # We receive the response row-wise
        for bucket in buckets:
            # groupby columns are added to the result the same way they are returned
            for by_field in by_fields:
                bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                # Datetimes always come back as integers, convert to pd.Timestamp()
                if by_field.is_timestamp and isinstance(bucket_key, int):
                    bucket_key = pd.to_datetime(bucket_key, unit="ms")

                response[by_field.column].append(bucket_key)

            agg_calculation = self._unpack_metric_aggs(
                fields=agg_fields,
                es_aggs=es_aggs,
                pd_aggs=pd_aggs,
                response={"aggregations": bucket},
                numeric_only=numeric_only,
                is_dataframe_agg=is_dataframe_agg,
            )
            # Process the calculated agg values to response
            for key, value in agg_calculation.items():
                if isinstance(value, list):
                    for pd_agg, val in zip(pd_aggs, value):
                        response[f"{key}_{pd_agg}"].append(val)
                else:
                    response[key].append(value)

    return [field.column for field in agg_fields], response
def _metric_aggs(
    self,
    query_compiler: "QueryCompiler",
    pd_aggs: List[str],
    numeric_only: Optional[bool] = None,
    is_dataframe_agg: bool = False,
) -> Dict[str, Any]:
    """
    Used to calculate metric aggregations
    https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-metrics.html

    Parameters
    ----------
    query_compiler:
        Query Compiler object
    pd_aggs:
        aggregations that are to be performed on dataframe or series
    numeric_only:
        return either all numeric values or NaN/NaT
    is_dataframe_agg:
        know if this method is called from a single-agg or an aggregation method

    Returns
    -------
    A dictionary which contains all aggregations calculated.
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    fields = query_compiler._mappings.all_source_fields()
    if numeric_only:
        # Consider if field is Int/Float/Bool
        fields = [field for field in fields if (field.is_numeric or field.is_bool)]

    body = Query(query_params.query)

    # Convert pandas aggs to ES equivalent
    es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

    for field in fields:
        for es_agg in es_aggs:
            # NaN/NaT fields are ignored
            if not field.is_es_agg_compatible(es_agg):
                continue

            # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
            if isinstance(es_agg, tuple):
                body.metric_aggs(
                    f"{es_agg[0]}_{field.es_field_name}",
                    es_agg[0],
                    field.aggregatable_es_field_name,
                )
            else:
                body.metric_aggs(
                    f"{es_agg}_{field.es_field_name}",
                    es_agg,
                    field.aggregatable_es_field_name,
                )

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    """
    Results are like (for 'sum', 'min')

         AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
    sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
    min    1.000205e+02        0.000000e+00   0.000000e+00               0
    """
    return self._unpack_metric_aggs(
        fields=fields,
        es_aggs=es_aggs,
        pd_aggs=pd_aggs,
        response=response,
        numeric_only=numeric_only,
        is_dataframe_agg=is_dataframe_agg,
    )
def aggs_groupby(
    self,
    query_compiler: "QueryCompiler",
    by: List[str],
    pd_aggs: List[str],
    dropna: bool = True,
    is_dataframe_agg: bool = False,
    numeric_only: Optional[bool] = True,
) -> pd.DataFrame:
    """
    This method is used to construct a groupby aggregation dataframe

    Parameters
    ----------
    query_compiler:
        A Query compiler
    by:
        a list of columns on which groupby operations have to be performed
    pd_aggs:
        a list of aggregations to be performed
    dropna:
        Drop None values if True.
        TODO Not yet implemented
    is_dataframe_agg:
        Know if groupby with aggregation or single agg is called.
    numeric_only:
        return either numeric values or NaN/NaT

    Returns
    -------
    A dataframe which consists of groupby data
    """
    query_params, post_processing = self._resolve_tasks(query_compiler)
    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    by_fields, agg_fields = query_compiler._mappings.groupby_source_fields(by=by)

    # Used defaultdict to avoid initialization of columns with lists
    results: Dict[str, List[Any]] = defaultdict(list)

    if numeric_only:
        agg_fields = [
            field for field in agg_fields if (field.is_numeric or field.is_bool)
        ]

    body = Query(query_params.query)

    # To return for creating multi-index on columns
    headers = [agg_field.column for agg_field in agg_fields]

    # Convert pandas aggs to ES equivalent
    es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

    # Construct Query
    for by_field in by_fields:
        if by_field.aggregatable_es_field_name is None:
            raise ValueError(
                f"Cannot use {by_field.column!r} with groupby() because "
                f"it has no aggregatable fields in Elasticsearch"
            )
        # groupby fields will be term aggregations
        body.composite_agg_bucket_terms(
            name=f"groupby_{by_field.column}",
            field=by_field.aggregatable_es_field_name,
        )

    for agg_field in agg_fields:
        for es_agg in es_aggs:
            # Skip if the field isn't compatible or if the agg is
            # 'value_count' as this value is pulled from bucket.doc_count.
            if not agg_field.is_es_agg_compatible(es_agg):
                continue

            # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
            if isinstance(es_agg, tuple):
                body.metric_aggs(
                    f"{es_agg[0]}_{agg_field.es_field_name}",
                    es_agg[0],
                    agg_field.aggregatable_es_field_name,
                )
            else:
                body.metric_aggs(
                    f"{es_agg}_{agg_field.es_field_name}",
                    es_agg,
                    agg_field.aggregatable_es_field_name,
                )

    # Composite aggregation
    body.composite_agg_start(
        size=DEFAULT_PAGINATION_SIZE, name="groupby_buckets", dropna=dropna
    )

    for buckets in self.bucket_generator(query_compiler, body):
        # We receive the response row-wise
        for bucket in buckets:
            # groupby columns are added to the result the same way they are returned
            for by_field in by_fields:
                bucket_key = bucket["key"][f"groupby_{by_field.column}"]

                # Datetimes always come back as integers, convert to pd.Timestamp()
                if by_field.is_timestamp and isinstance(bucket_key, int):
                    bucket_key = pd.to_datetime(bucket_key, unit="ms")

                results[by_field.column].append(bucket_key)

            agg_calculation = self._unpack_metric_aggs(
                fields=agg_fields,
                es_aggs=es_aggs,
                pd_aggs=pd_aggs,
                response={"aggregations": bucket},
                numeric_only=numeric_only,
                # We set 'True' here because we want the value
                # unpacking to always be in 'dataframe' mode.
                is_dataframe_agg=True,
            )
            # Process the calculated agg values to response
            for key, value in agg_calculation.items():
                if not isinstance(value, list):
                    results[key].append(value)
                    continue
                for pd_agg, val in zip(pd_aggs, value):
                    results[f"{key}_{pd_agg}"].append(val)

    agg_df = pd.DataFrame(results).set_index(by)

    if is_dataframe_agg:
        # Convert header columns to MultiIndex
        agg_df.columns = pd.MultiIndex.from_product([headers, pd_aggs])
    else:
        # Convert header columns to Index
        agg_df.columns = pd.Index(headers)

    return agg_df
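# Small pandas check of the MultiIndex column shape produced above (illustrative
# header/agg names, not taken from a live index):
import pandas as pd

headers = ["taxful_total_price", "total_quantity"]
pd_aggs = ["mean", "max"]
cols = pd.MultiIndex.from_product([headers, pd_aggs])
# -> [('taxful_total_price', 'mean'), ('taxful_total_price', 'max'),
#     ('total_quantity', 'mean'), ('total_quantity', 'max')]
print(list(cols))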