def flatten(x, name=""): # We flatten into source fields e.g. if type=geo_point # location: {lat=52.38, lon=4.90} if name == "": is_source_field = False pd_dtype = "object" else: try: pd_dtype = field_mapping_cache.field_name_pd_dtype( name[:-1]) is_source_field = True except KeyError: is_source_field = False pd_dtype = "object" if not is_source_field and type(x) is dict: for a in x: flatten(x[a], name + a + ".") elif not is_source_field and type(x) is list: for a in x: flatten(a, name) elif is_source_field: # only print source fields from mappings # (TODO - not so efficient for large number of fields and filtered mapping) field_name = name[:-1] # Coerce types - for now just datetime if pd_dtype == "datetime64[ns]": x = elasticsearch_date_to_pandas_date( x, field_mapping_cache.date_field_format(field_name)) # Elasticsearch can have multiple values for a field. These are represented as lists, so # create lists for this pivot (see notes above) if field_name in out: if type(out[field_name]) is not list: field_as_list = [out[field_name]] out[field_name] = field_as_list out[field_name].append(x) else: out[field_name] = x else: # Script fields end up here # Elasticsearch returns 'Infinity' as a string for np.inf values. # Map this to a numeric value to avoid this whole Series being classed as an object # TODO - create a lookup for script fields and dtypes to only map 'Infinity' # if the field is numeric. This implementation will currently map # any script field with "Infinity" as a string to np.inf if x == "Infinity": out[name[:-1]] = np.inf else: out[name[:-1]] = x
def _metric_aggs(self, query_compiler: "QueryCompiler", pd_aggs, numeric_only=True):
    query_params, post_processing = self._resolve_tasks(query_compiler)

    size = self._size(query_params, post_processing)
    if size is not None:
        raise NotImplementedError(
            f"Cannot count field matches if size is set {size}"
        )

    results = {}

    fields = query_compiler._mappings.all_source_fields()
    if numeric_only:
        fields = [field for field in fields if (field.is_numeric or field.is_bool)]

    body = Query(query_params.query)

    # Convert pandas aggs to ES equivalents
    es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

    for field in fields:
        for es_agg in es_aggs:
            if not field.is_es_agg_compatible(es_agg):
                continue

            # If we have multiple 'extended_stats' etc. here we simply NOOP on 2nd call
            if isinstance(es_agg, tuple):
                body.metric_aggs(
                    f"{es_agg[0]}_{field.es_field_name}",
                    es_agg[0],
                    field.aggregatable_es_field_name,
                )
            else:
                body.metric_aggs(
                    f"{es_agg}_{field.es_field_name}",
                    es_agg,
                    field.aggregatable_es_field_name,
                )

    response = query_compiler._client.search(
        index=query_compiler._index_pattern, size=0, body=body.to_search_body()
    )

    """
    Results are like (for 'sum', 'min'):

         AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
    sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
    min    1.000205e+02        0.000000e+00   0.000000e+00               0
    """
    for field in fields:
        values = []
        for es_agg, pd_agg in zip(es_aggs, pd_aggs):
            # If the field and agg aren't compatible we add a NaN
            if not field.is_es_agg_compatible(es_agg):
                values.append(np.float64(np.NaN))
                continue

            if isinstance(es_agg, tuple):
                agg_value = response["aggregations"][
                    f"{es_agg[0]}_{field.es_field_name}"
                ]

                # Pull multiple values from 'percentiles' result.
                if es_agg[0] == "percentiles":
                    agg_value = agg_value["values"]

                agg_value = agg_value[es_agg[1]]

                # Need to convert 'Population' stddev and variance
                # from Elasticsearch into 'Sample' stddev and variance
                # which is what pandas uses:
                #   sample_std     = \sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
                #   population_std = \sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
                #   sample_std     = \sqrt{\frac{N}{N-1}} \cdot population_std
                if es_agg[1] in ("std_deviation", "variance"):
                    # Neither transformation works with count <= 1
                    count = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"
                    ]["count"]

                    # All of the below calculations result in NaN if count <= 1
                    if count <= 1:
                        agg_value = np.float64(np.NaN)
                    elif es_agg[1] == "std_deviation":
                        # sample_std = \sqrt{\frac{N}{N-1}} * population_std
                        agg_value = np.sqrt(
                            (count / (count - 1.0)) * agg_value * agg_value
                        )
                    else:  # es_agg[1] == "variance"
                        # sample_var = \frac{N}{N-1} * population_var
                        agg_value *= count / (count - 1.0)
            else:
                agg_value = response["aggregations"][
                    f"{es_agg}_{field.es_field_name}"
                ]
                if "value_as_string" in agg_value and field.is_timestamp:
                    agg_value = elasticsearch_date_to_pandas_date(
                        agg_value["value_as_string"], field.es_date_format
                    )
                else:
                    agg_value = agg_value["value"]

            # These aggregations maintain the column datatype
            if pd_agg in ("max", "min"):
                agg_value = field.np_dtype.type(agg_value)

            # Null usually means there were no results.
            if agg_value is None:
                agg_value = np.float64(np.NaN)

            values.append(agg_value)

        results[field.index] = values if len(values) > 1 else values[0]

    return results
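# Illustrative sketch of the round trip (field names and values assumed): for
# pd_aggs=["sum", "min"], _map_pd_aggs_to_es_aggs gives es_aggs=["sum", "min"]
# and the search body built above contains roughly
#
#     {"aggs": {
#         "sum_AvgTicketPrice": {"sum": {"field": "AvgTicketPrice"}},
#         "min_AvgTicketPrice": {"min": {"field": "AvgTicketPrice"}},
#         ...one entry per compatible field/agg pair...
#     }}
#
# so each value is read back as e.g.
#     response["aggregations"]["sum_AvgTicketPrice"]["value"]  # -> 8204365.0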
def _unpack_metric_aggs(
    self,
    fields: List["Field"],
    es_aggs: Union[List[str], List[Tuple[str, str]]],
    pd_aggs: List[str],
    response: Dict[str, Any],
    numeric_only: Optional[bool],
    is_dataframe_agg: bool = False,
):
    """
    This method unpacks metric aggregations from a JSON response.
    This can be called either directly on an aggs query
    or on an individual bucket within a composite aggregation.

    Parameters
    ----------
    fields: a list of Field mappings
    es_aggs: Elasticsearch equivalents of pd_aggs
    pd_aggs: a list of pandas aggs
    response: a dict containing the response from Elasticsearch
    numeric_only: return either numeric values or NaN/NaT
    is_dataframe_agg: True when called via agg(), False when called via a
        single aggregation such as .mean()

    Returns
    -------
    a dictionary mapping each column to its aggregation values.
    """
    results: Dict[str, Any] = {}

    for field in fields:
        values = []
        for es_agg, pd_agg in zip(es_aggs, pd_aggs):
            # is_dataframe_agg differentiates agg() from an aggregation called
            # directly, e.g. .mean(). If the field and agg aren't compatible we
            # add a NaN/NaT for agg(); for a direct aggregation we only do so
            # when numeric_only is False.
            if not field.is_es_agg_compatible(es_agg):
                if is_dataframe_agg and not numeric_only:
                    values.append(field.nan_value)
                elif not is_dataframe_agg and numeric_only is False:
                    values.append(field.nan_value)
                # Explicit condition for mad to add NaN because it doesn't support bool
                elif is_dataframe_agg and numeric_only:
                    if pd_agg == "mad":
                        values.append(field.nan_value)
                continue

            if isinstance(es_agg, tuple):
                agg_value = response["aggregations"][
                    f"{es_agg[0]}_{field.es_field_name}"
                ]

                # Pull multiple values from 'percentiles' result.
                if es_agg[0] == "percentiles":
                    agg_value = agg_value["values"]

                agg_value = agg_value[es_agg[1]]

                # Need to convert 'Population' stddev and variance
                # from Elasticsearch into 'Sample' stddev and variance
                # which is what pandas uses:
                #   sample_std     = \sqrt{\frac{1}{N-1}\sum_{i=1}^N(x_i-\bar{x})^2}
                #   population_std = \sqrt{\frac{1}{N}\sum_{i=1}^N(x_i-\bar{x})^2}
                #   sample_std     = \sqrt{\frac{N}{N-1}} \cdot population_std
                if es_agg[1] in ("std_deviation", "variance"):
                    # Neither transformation works with count <= 1
                    count = response["aggregations"][
                        f"{es_agg[0]}_{field.es_field_name}"
                    ]["count"]

                    # All of the below calculations result in NaN if count <= 1
                    if count <= 1:
                        agg_value = np.NaN
                    elif es_agg[1] == "std_deviation":
                        # sample_std = \sqrt{\frac{N}{N-1}} * population_std
                        agg_value = np.sqrt(
                            (count / (count - 1.0)) * agg_value * agg_value
                        )
                    else:  # es_agg[1] == "variance"
                        # sample_var = \frac{N}{N-1} * population_var
                        agg_value *= count / (count - 1.0)
            else:
                agg_value = response["aggregations"][
                    f"{es_agg}_{field.es_field_name}"
                ]["value"]

            # Null usually means there were no results.
            if agg_value is None or np.isnan(agg_value):
                if is_dataframe_agg and not numeric_only:
                    agg_value = np.NaN
                elif not is_dataframe_agg and numeric_only is False:
                    agg_value = np.NaN
            # Cardinality is always either NaN or an integer.
            elif pd_agg == "nunique":
                agg_value = int(agg_value)
            # If this is a non-null timestamp field convert to a pd.Timestamp()
            elif field.is_timestamp:
                agg_value = elasticsearch_date_to_pandas_date(
                    agg_value, field.es_date_format
                )
            # If numeric_only is False or None then maintain the column datatype
            elif not numeric_only:
                # We only convert back to bool for lossless aggs like min, max, and median.
                if pd_agg in {"max", "min", "median", "sum"}:
                    # 'sum' isn't representable with bool, use int64
                    if pd_agg == "sum" and field.is_bool:
                        agg_value = np.int64(agg_value)
                    else:
                        agg_value = field.np_dtype.type(agg_value)

            values.append(agg_value)

        # If numeric_only is True and we only have NaN-typed fields
        # then values may be empty, so check before assigning.
        if values:
            results[field.column] = values if len(values) > 1 else values[0]

    return results
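# Illustrative sketch (names and response assumed): unpacking pd_aggs=["nunique"],
# which maps to es_aggs=["cardinality"], for a keyword field "Carrier" with
# numeric_only=False from
#
#     response["aggregations"] == {"cardinality_Carrier": {"value": 4}}
#
# appends int(4) via the nunique branch and, with a single agg, returns
# {"Carrier": 4} rather than {"Carrier": [4]}.
#
# Worked check of the population -> sample conversion above, for x = [1, 2, 3]
# (N = 3, mean = 2, sum of squared deviations = 2):
#   population_var = 2/3        -> sample_var = (3/2) * (2/3) = 1.0
#   population_std = sqrt(2/3)  -> sample_std = sqrt((3/2) * (2/3)) = 1.0
# matching pandas' ddof=1 results, e.g. pd.Series([1, 2, 3]).var() == 1.0.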