def exec_post_processing(self, df: DataFrame) -> DataFrame:
    """
    Perform post processing operations on DataFrame.

    :param df: DataFrame returned from database model.
    :return: new DataFrame to which all post processing operations have been
             applied
    :raises InvalidPostProcessingError: If the post processing operation
             is incorrect
    """
    logger.debug("post_processing: \n %s", pformat(self.post_processing))
    for post_process in self.post_processing:
        operation = post_process.get("operation")
        if not operation:
            raise InvalidPostProcessingError(
                _("`operation` property of post processing object undefined")
            )
        if not hasattr(pandas_postprocessing, operation):
            raise InvalidPostProcessingError(
                _(
                    "Unsupported post processing operation: %(operation)s",
                    operation=operation,
                )
            )
        options = post_process.get("options", {})
        df = getattr(pandas_postprocessing, operation)(df, **options)
    return df
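# Illustrative sketch (not part of the original code): the shape of a
# `post_processing` payload as consumed by exec_post_processing above. Each entry
# names a function in `pandas_postprocessing` plus its keyword options; the
# specific operation and option values below are hypothetical.
_example_post_processing_payload: List[Dict[str, Any]] = [
    {
        "operation": "rolling",
        "options": {"rolling_type": "mean", "columns": {"y": "y"}, "window": 7},
    },
]
# The entry above would resolve to
# `pandas_postprocessing.rolling(df, rolling_type="mean", columns={"y": "y"}, window=7)`.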
def resample(
    df: pd.DataFrame,
    rule: str,
    method: str,
    fill_value: Optional[Union[float, int]] = None,
) -> pd.DataFrame:
    """
    Support upsampling in resample.

    :param df: DataFrame to resample.
    :param rule: The offset string representing target conversion.
    :param method: How to fill the NaN values after resampling.
    :param fill_value: What value to use for missing values.
    :return: DataFrame after resample
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise InvalidPostProcessingError(_("Resample operation requires DatetimeIndex"))
    if method not in RESAMPLE_METHOD:
        raise InvalidPostProcessingError(
            _("Resample method should be in ") + ", ".join(RESAMPLE_METHOD) + "."
        )

    if method == "asfreq" and fill_value is not None:
        _df = df.resample(rule).asfreq(fill_value=fill_value)
    elif method == "linear":
        _df = df.resample(rule).interpolate()
    else:
        _df = getattr(df.resample(rule), method)()
    return _df
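# Illustrative usage sketch for `resample` (hypothetical data and column name;
# assumes "asfreq" is listed in the module-level RESAMPLE_METHOD constant).
def _example_resample_usage() -> pd.DataFrame:
    """Upsample a sparse daily series, filling the gap with zero."""
    example_df = pd.DataFrame(
        {"y": [1.0, 2.0]},
        index=pd.DatetimeIndex(["2021-01-01", "2021-01-03"]),
    )
    # "asfreq" with a fill_value inserts the missing 2021-01-02 row with y == 0.
    return resample(example_df, rule="1D", method="asfreq", fill_value=0)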
def compare(  # pylint: disable=too-many-arguments
    df: DataFrame,
    source_columns: List[str],
    compare_columns: List[str],
    compare_type: PandasPostprocessingCompare,
    drop_original_columns: Optional[bool] = False,
    precision: Optional[int] = 4,
) -> DataFrame:
    """
    Calculate column-by-column change for selected columns.

    :param df: DataFrame on which the compare will be based.
    :param source_columns: Main query columns
    :param compare_columns: Columns being compared
    :param compare_type: Type of compare. Choice of `difference`, `percentage`
           or `ratio`
    :param drop_original_columns: Whether to remove the source columns and
           compare columns.
    :param precision: Round a change rate to a variable number of decimal places.
    :return: DataFrame with compared columns.
    :raises InvalidPostProcessingError: If the request is incorrect.
    """
    if len(source_columns) != len(compare_columns):
        raise InvalidPostProcessingError(
            _("`compare_columns` must have the same length as `source_columns`.")
        )
    if compare_type not in tuple(PandasPostprocessingCompare):
        raise InvalidPostProcessingError(
            _("`compare_type` must be `difference`, `percentage` or `ratio`")
        )
    if len(source_columns) == 0:
        return df

    for s_col, c_col in zip(source_columns, compare_columns):
        s_df = df.loc[:, [s_col]]
        s_df.rename(columns={s_col: "__intermediate"}, inplace=True)
        c_df = df.loc[:, [c_col]]
        c_df.rename(columns={c_col: "__intermediate"}, inplace=True)
        if compare_type == PandasPostprocessingCompare.DIFF:
            diff_df = s_df - c_df
        elif compare_type == PandasPostprocessingCompare.PCT:
            diff_df = ((s_df - c_df) / c_df).astype(float).round(precision)
        else:
            # compare_type == "ratio"
            diff_df = (s_df / c_df).astype(float).round(precision)
        diff_df.rename(
            columns={
                "__intermediate": TIME_COMPARISON.join([compare_type, s_col, c_col])
            },
            inplace=True,
        )
        df = pd.concat([df, diff_df], axis=1)

    if drop_original_columns:
        df = df.drop(source_columns + compare_columns, axis=1)
    return df
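# Illustrative usage sketch for `compare` (hypothetical columns; the name of the
# resulting comparison column is built with the module's TIME_COMPARISON separator).
def _example_compare_usage() -> DataFrame:
    """Add a difference column between two metrics."""
    example_df = DataFrame({"this_week": [10, 20], "last_week": [8, 25]})
    return compare(
        example_df,
        source_columns=["this_week"],
        compare_columns=["last_week"],
        compare_type=PandasPostprocessingCompare.DIFF,
    )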
def contribution(
    df: DataFrame,
    orientation: Optional[
        PostProcessingContributionOrientation
    ] = PostProcessingContributionOrientation.COLUMN,
    columns: Optional[List[str]] = None,
    rename_columns: Optional[List[str]] = None,
) -> DataFrame:
    """
    Calculate cell contribution to row/column total for numeric columns.
    Non-numeric columns will be kept untouched.

    If `columns` are specified, only calculate contributions on selected columns.

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param columns: Columns to calculate values from.
    :param rename_columns: The new labels for the calculated contribution columns.
                           The original columns will not be removed.
    :param orientation: calculate by dividing cell with row/column total
    :return: DataFrame with contributions.
    """
    contribution_df = df.copy()
    numeric_df = contribution_df.select_dtypes(include=["number", Decimal])
    # TODO: copy needed due to following regression in 1.4, remove if not needed:
    # https://github.com/pandas-dev/pandas/issues/48090
    numeric_df = numeric_df.copy()
    numeric_df.fillna(0, inplace=True)
    # verify column selections
    if columns:
        numeric_columns = numeric_df.columns.tolist()
        for col in columns:
            if col not in numeric_columns:
                raise InvalidPostProcessingError(
                    _(
                        'Column "%(column)s" is not numeric or does not '
                        "exist in the query results.",
                        column=col,
                    )
                )
    columns = columns or numeric_df.columns
    rename_columns = rename_columns or columns
    if len(rename_columns) != len(columns):
        raise InvalidPostProcessingError(
            _("`rename_columns` must have the same length as `columns`.")
        )
    # limit to selected columns
    numeric_df = numeric_df[columns]
    axis = 0 if orientation == PostProcessingContributionOrientation.COLUMN else 1
    numeric_df = numeric_df / numeric_df.values.sum(axis=axis, keepdims=True)
    contribution_df[rename_columns] = numeric_df
    return contribution_df
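# Illustrative usage sketch for `contribution` (hypothetical columns).
def _example_contribution_usage() -> DataFrame:
    """Express each cell as a share of its column total."""
    example_df = DataFrame({"a": [1, 3], "b": [1, 9]})
    # Intended result: a -> [0.25, 0.75] and b -> [0.1, 0.9], written to the
    # renamed columns while the originals are kept.
    return contribution(
        example_df,
        orientation=PostProcessingContributionOrientation.COLUMN,
        columns=["a", "b"],
        rename_columns=["a_share", "b_share"],
    )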
def cum(
    df: DataFrame,
    operator: str,
    columns: Dict[str, str],
) -> DataFrame:
    """
    Calculate cumulative sum/product/min/max for select columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param columns: columns on which to perform a cumulative operation, mapping
           source column to target column. For instance, `{'y': 'y'}` will replace
           the column `y` with the cumulative value in `y`, while `{'y': 'y2'}`
           will add a column `y2` based on cumulative values calculated from `y`,
           leaving the original column `y` unchanged.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`
    :return: DataFrame with cumulated columns
    """
    columns = columns or {}
    df_cum = df.loc[:, columns.keys()]
    operation = "cum" + operator
    if operation not in ALLOWLIST_CUMULATIVE_FUNCTIONS or not hasattr(
        df_cum, operation
    ):
        raise InvalidPostProcessingError(
            _("Invalid cumulative operator: %(operator)s", operator=operator)
        )
    df_cum = _append_columns(df, getattr(df_cum, operation)(), columns)
    return df_cum
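# Illustrative usage sketch for `cum` (hypothetical columns; assumes "cumsum" is
# part of ALLOWLIST_CUMULATIVE_FUNCTIONS).
def _example_cum_usage() -> DataFrame:
    """Add a running total while keeping the original column."""
    example_df = DataFrame({"y": [1, 2, 3]})
    # Writes the cumulative sum of `y` to a new `y_cumulative` column.
    return cum(example_df, operator="sum", columns={"y": "y_cumulative"})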
def geohash_encode(
    df: DataFrame,
    geohash: str,
    longitude: str,
    latitude: str,
) -> DataFrame:
    """
    Encode longitude and latitude into geohash

    :param df: DataFrame containing longitude and latitude data
    :param geohash: Name of new column to be created containing geohash location.
    :param longitude: Name of source column containing longitude.
    :param latitude: Name of source column containing latitude.
    :return: DataFrame with the encoded geohash column appended
    """
    try:
        encode_df = df[[latitude, longitude]]
        encode_df.columns = ["latitude", "longitude"]
        encode_df["geohash"] = encode_df.apply(
            lambda row: geohash_lib.encode(row["latitude"], row["longitude"]),
            axis=1,
        )
        return _append_columns(df, encode_df, {"geohash": geohash})
    except ValueError as ex:
        raise InvalidPostProcessingError(_("Invalid longitude/latitude")) from ex
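# Illustrative usage sketch for `geohash_encode` (hypothetical coordinates and
# column names).
def _example_geohash_encode_usage() -> DataFrame:
    """Derive a geohash column from latitude/longitude columns."""
    example_df = DataFrame({"lon": [12.3456], "lat": [56.7890]})
    return geohash_encode(
        example_df, geohash="geohash", longitude="lon", latitude="lat"
    )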
def _get_aggregate_funcs(
    df: DataFrame,
    aggregates: Dict[str, Dict[str, Any]],
) -> Dict[str, NamedAgg]:
    """
    Converts a set of aggregate config objects into functions that pandas can use as
    aggregators. Currently only numpy aggregators are supported.

    :param df: DataFrame on which to perform aggregate operation.
    :param aggregates: Mapping from column name to aggregate config.
    :return: Mapping from metric name to function that takes a single input argument.
    """
    agg_funcs: Dict[str, NamedAgg] = {}
    for name, agg_obj in aggregates.items():
        column = agg_obj.get("column", name)
        if column not in df:
            raise InvalidPostProcessingError(
                _(
                    "Column referenced by aggregate is undefined: %(column)s",
                    column=column,
                )
            )
        if "operator" not in agg_obj:
            raise InvalidPostProcessingError(
                _(
                    "Operator undefined for aggregator: %(name)s",
                    name=name,
                )
            )
        operator = agg_obj["operator"]
        if callable(operator):
            aggfunc = operator
        else:
            func = NUMPY_FUNCTIONS.get(operator)
            if not func:
                raise InvalidPostProcessingError(
                    _(
                        "Invalid numpy function: %(operator)s",
                        operator=operator,
                    )
                )
            options = agg_obj.get("options", {})
            aggfunc = partial(func, **options)
        agg_funcs[name] = NamedAgg(column=column, aggfunc=aggfunc)
    return agg_funcs
def wrapped(df: DataFrame, **options: Any) -> Any:
    # Validate that every column referenced in the listed options exists in the
    # DataFrame before delegating to the wrapped post-processing function.
    if _is_multi_index_on_columns(df):
        # MultiIndex columns: validate against the first level only
        columns = df.columns.get_level_values(0)
    else:
        columns = df.columns.tolist()
    for name in argnames:
        if name in options and not all(
            elem in columns for elem in options.get(name) or []
        ):
            raise InvalidPostProcessingError(
                _("Referenced columns not available in DataFrame.")
            )
    return func(df, **options)
def geohash_decode(
    df: DataFrame, geohash: str, longitude: str, latitude: str
) -> DataFrame:
    """
    Decode a geohash column into longitude and latitude

    :param df: DataFrame containing geohash data
    :param geohash: Name of source column containing geohash location.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    """
    try:
        lonlat_df = DataFrame()
        lonlat_df["latitude"], lonlat_df["longitude"] = zip(
            *df[geohash].apply(geohash_lib.decode)
        )
        return _append_columns(
            df, lonlat_df, {"latitude": latitude, "longitude": longitude}
        )
    except ValueError as ex:
        raise InvalidPostProcessingError(_("Invalid geohash string")) from ex
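# Illustrative usage sketch for `geohash_decode` (hypothetical geohash value and
# column names).
def _example_geohash_decode_usage() -> DataFrame:
    """Split a geohash column into latitude and longitude columns."""
    example_df = DataFrame({"position": ["u4pruydqqvj"]})
    return geohash_decode(
        example_df, geohash="position", longitude="lon", latitude="lat"
    )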
def geodetic_parse(
    df: DataFrame,
    geodetic: str,
    longitude: str,
    latitude: str,
    altitude: Optional[str] = None,
) -> DataFrame:
    """
    Parse a column containing a geodetic point string
    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).

    :param df: DataFrame containing geodetic point data
    :param geodetic: Name of source column containing geodetic point string.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :param altitude: Name of new column to be created containing altitude.
    :return: DataFrame with decoded longitudes and latitudes
    """

    def _parse_location(location: str) -> Tuple[float, float, float]:
        """
        Parse a string containing a geodetic point and return latitude, longitude
        and altitude
        """
        point = Point(location)
        return point[0], point[1], point[2]

    try:
        geodetic_df = DataFrame()
        (
            geodetic_df["latitude"],
            geodetic_df["longitude"],
            geodetic_df["altitude"],
        ) = zip(*df[geodetic].apply(_parse_location))
        columns = {"latitude": latitude, "longitude": longitude}
        if altitude:
            columns["altitude"] = altitude
        return _append_columns(df, geodetic_df, columns)
    except ValueError as ex:
        raise InvalidPostProcessingError(_("Invalid geodetic string")) from ex
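# Illustrative usage sketch for `geodetic_parse` (hypothetical point string and
# column names; any format accepted by geopy's Point will do).
def _example_geodetic_parse_usage() -> DataFrame:
    """Parse a "lat, lon" style point string into separate columns."""
    example_df = DataFrame({"point": ["41.8781, -87.6298"]})
    return geodetic_parse(
        example_df, geodetic="point", longitude="lon", latitude="lat"
    )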
def _prophet_fit_and_predict(  # pylint: disable=too-many-arguments
    df: DataFrame,
    confidence_interval: float,
    yearly_seasonality: Union[bool, str, int],
    weekly_seasonality: Union[bool, str, int],
    daily_seasonality: Union[bool, str, int],
    periods: int,
    freq: str,
) -> DataFrame:
    """
    Fit a prophet model and return a DataFrame with predicted results.
    """
    try:
        # pylint: disable=import-error,import-outside-toplevel
        from prophet import Prophet

        prophet_logger = logging.getLogger("prophet.plot")
        prophet_logger.setLevel(logging.CRITICAL)
        prophet_logger.setLevel(logging.NOTSET)
    except ModuleNotFoundError as ex:
        raise InvalidPostProcessingError(_("`prophet` package not installed")) from ex
    model = Prophet(
        interval_width=confidence_interval,
        yearly_seasonality=yearly_seasonality,
        weekly_seasonality=weekly_seasonality,
        daily_seasonality=daily_seasonality,
    )
    if df["ds"].dt.tz:
        df["ds"] = df["ds"].dt.tz_convert(None)
    model.fit(df)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)[["ds", "yhat", "yhat_lower", "yhat_upper"]]
    return forecast.join(df.set_index("ds"), on="ds").set_index(["ds"])
def pivot(  # pylint: disable=too-many-arguments,too-many-locals
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = NULL_STRING,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
    reset_index: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with. By default
           replaces missing values with "<NULL>". Set to `None` to remove columns
           with missing values.
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param aggregates: A mapping from aggregate column name to the aggregate config.
    :param marginal_distributions: Add totals for row/column. Default to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Default to 'All'.
    :param flatten_columns: Convert column names to strings
    :param reset_index: Convert index to column
    :return: A pivot table
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    if not index:
        raise InvalidPostProcessingError(
            _("Pivot operation requires at least one index")
        )
    if not aggregates:
        raise InvalidPostProcessingError(
            _("Pivot operation must include at least one aggregate")
        )

    if columns and column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    # When dropna = False, the pivot_table function will calculate cartesian-product
    # for MultiIndex.
    # https://github.com/apache/superset/issues/15956
    # https://github.com/pandas-dev/pandas/issues/18030
    series_set = set()
    if not drop_missing_columns and columns:
        for row in df[columns].itertuples():
            for metric in aggfunc.keys():
                series_set.add(str(tuple([metric]) + tuple(row[1:])))

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if not drop_missing_columns and len(series_set) > 0 and not df.empty:
        for col in df.columns:
            series = str(col)
            if series not in series_set:
                df = df.drop(col, axis=PandasAxis.COLUMN)

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # flatten column names to plain strings
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    if reset_index:
        df.reset_index(level=0, inplace=True)

    return df
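# Illustrative usage sketch for `pivot` (hypothetical columns; assumes "sum" is one
# of the operators exposed via NUMPY_FUNCTIONS).
def _example_pivot_usage() -> DataFrame:
    """Pivot sales by country (rows) and gender (columns), summing the metric."""
    example_df = DataFrame(
        {
            "country": ["SE", "SE", "FI"],
            "gender": ["m", "f", "m"],
            "sales": [10, 20, 30],
        }
    )
    return pivot(
        example_df,
        index=["country"],
        columns=["gender"],
        aggregates={"sales": {"operator": "sum"}},
    )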
def boxplot(
    df: DataFrame,
    groupby: List[str],
    metrics: List[str],
    whisker_type: PostProcessingBoxplotWhiskerType,
    percentiles: Optional[
        Union[List[Union[int, float]], Tuple[Union[int, float], Union[int, float]]]
    ] = None,
) -> DataFrame:
    """
    Calculate boxplot statistics. For each metric, the operation creates eight
    new columns with the column name suffixed with the following values:

    - `__mean`: the mean
    - `__median`: the median
    - `__max`: the maximum value excluding outliers (see whisker type)
    - `__min`: the minimum value excluding outliers (see whisker type)
    - `__q1`: the first quartile (25th percentile)
    - `__q3`: the third quartile (75th percentile)
    - `__count`: count of observations
    - `__outliers`: the values that fall outside the minimum/maximum value
                    (see whisker type)

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param groupby: The categories to group by (x-axis)
    :param metrics: The metrics for which to calculate the distribution
    :param whisker_type: The confidence level type
    :return: DataFrame with boxplot statistics per groupby
    """

    def quartile1(series: Series) -> float:
        return np.nanpercentile(series, 25, interpolation="midpoint")

    def quartile3(series: Series) -> float:
        return np.nanpercentile(series, 75, interpolation="midpoint")

    if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

        def whisker_high(series: Series) -> float:
            upper_outer_lim = quartile3(series) + 1.5 * (
                quartile3(series) - quartile1(series)
            )
            return series[series <= upper_outer_lim].max()

        def whisker_low(series: Series) -> float:
            lower_outer_lim = quartile1(series) - 1.5 * (
                quartile3(series) - quartile1(series)
            )
            return series[series >= lower_outer_lim].min()

    elif whisker_type == PostProcessingBoxplotWhiskerType.PERCENTILE:
        if (
            not isinstance(percentiles, (list, tuple))
            or len(percentiles) != 2
            or not isinstance(percentiles[0], (int, float))
            or not isinstance(percentiles[1], (int, float))
            or percentiles[0] >= percentiles[1]
        ):
            raise InvalidPostProcessingError(
                _(
                    "percentiles must be a list or tuple with two numeric values, "
                    "of which the first is lower than the second value"
                )
            )
        low, high = percentiles[0], percentiles[1]

        def whisker_high(series: Series) -> float:
            return np.nanpercentile(series, high)

        def whisker_low(series: Series) -> float:
            return np.nanpercentile(series, low)

    else:
        whisker_high = np.max
        whisker_low = np.min

    def outliers(series: Series) -> Set[float]:
        above = series[series > whisker_high(series)]
        below = series[series < whisker_low(series)]
        return above.tolist() + below.tolist()

    operators: Dict[str, Callable[[Any], Any]] = {
        "mean": np.mean,
        "median": np.median,
        "max": whisker_high,
        "min": whisker_low,
        "q1": quartile1,
        "q3": quartile3,
        "count": np.ma.count,
        "outliers": outliers,
    }
    aggregates: Dict[str, Dict[str, Union[str, Callable[..., Any]]]] = {
        f"{metric}__{operator_name}": {"column": metric, "operator": operator}
        for operator_name, operator in operators.items()
        for metric in metrics
    }

    # nanpercentile needs numeric values, otherwise the isnan function
    # that's used in the underlying function will fail
    for column in metrics:
        if df.dtypes[column] == object:
            df[column] = to_numeric(df[column], errors="coerce")

    return aggregate(df, groupby=groupby, aggregates=aggregates)
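# Illustrative usage sketch for `boxplot` (hypothetical data; Tukey whiskers need
# no `percentiles` argument).
def _example_boxplot_usage() -> DataFrame:
    """Compute per-group boxplot statistics for a single metric."""
    example_df = DataFrame(
        {
            "region": ["a"] * 5 + ["b"] * 5,
            "cost": [1, 2, 3, 4, 100, 5, 6, 7, 8, 9],
        }
    )
    return boxplot(
        example_df,
        groupby=["region"],
        metrics=["cost"],
        whisker_type=PostProcessingBoxplotWhiskerType.TUKEY,
    )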
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
    index: Optional[str] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param index: the name of the column containing the x-axis data
    :return: DataFrame with contributions, with temporal column at beginning if present
    """
    index = index or DTTM_ALIAS
    # validate inputs
    if not time_grain:
        raise InvalidPostProcessingError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise InvalidPostProcessingError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            )
        )
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not isinstance(periods, int) or periods < 0:
        raise InvalidPostProcessingError(_("Periods must be a whole number"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise InvalidPostProcessingError(
            _("Confidence interval must be between 0 and 1 (exclusive)")
        )
    if index not in df.columns:
        raise InvalidPostProcessingError(_("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise InvalidPostProcessingError(
            _("DataFrame must include at least one series")
        )

    target_df = DataFrame()
    for column in [column for column in df.columns if column != index]:
        fit_df = _prophet_fit_and_predict(
            df=df[[index, column]].rename(columns={index: "ds", column: "y"}),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(**{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": index})
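# Illustrative usage sketch for `prophet` (hypothetical data; assumes the optional
# `prophet` package is installed and that "P1D" is a key of PROPHET_TIME_GRAIN_MAP).
def _example_prophet_usage() -> DataFrame:
    """Forecast one series a week ahead at daily granularity."""
    example_df = DataFrame(
        {
            DTTM_ALIAS: pd.date_range("2021-01-01", periods=30, freq="D"),
            "y": list(range(30)),
        }
    )
    return prophet(
        example_df,
        time_grain="P1D",
        periods=7,
        confidence_interval=0.8,
    )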
def rolling(  # pylint: disable=too-many-arguments
    df: DataFrame,
    rolling_type: str,
    columns: Dict[str, str],
    window: Optional[int] = None,
    rolling_type_options: Optional[Dict[str, Any]] = None,
    center: bool = False,
    win_type: Optional[str] = None,
    min_periods: Optional[int] = None,
) -> DataFrame:
    """
    Apply a rolling window on the dataset. See the Pandas docs for further details:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

    :param df: DataFrame on which the rolling period will be based.
    :param columns: columns on which to perform rolling, mapping source column to
           target column. For instance, `{'y': 'y'}` will replace the column `y` with
           the rolling value in `y`, while `{'y': 'y2'}` will add a column `y2` based
           on rolling values calculated from `y`, leaving the original column `y`
           unchanged.
    :param rolling_type: Type of rolling window. Any numpy function will work.
    :param window: Size of the window.
    :param rolling_type_options: Optional options to pass to rolling method.
           Needed for e.g. quantile operation.
    :param center: Should the label be at the center of the window.
    :param win_type: Type of window function.
    :param min_periods: The minimum number of periods required for a row to be
           included in the result set.
    :return: DataFrame with the rolling columns
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    rolling_type_options = rolling_type_options or {}
    df_rolling = df.loc[:, columns.keys()]
    kwargs: Dict[str, Union[str, int]] = {}
    if window is None:
        raise InvalidPostProcessingError(_("Undefined window for rolling operation"))
    if window == 0:
        raise InvalidPostProcessingError(_("Window must be > 0"))

    kwargs["window"] = window
    if min_periods is not None:
        kwargs["min_periods"] = min_periods
    if center is not None:
        kwargs["center"] = center
    if win_type is not None:
        kwargs["win_type"] = win_type

    df_rolling = df_rolling.rolling(**kwargs)
    if rolling_type not in DENYLIST_ROLLING_FUNCTIONS or not hasattr(
        df_rolling, rolling_type
    ):
        raise InvalidPostProcessingError(
            _("Invalid rolling_type: %(type)s", type=rolling_type)
        )
    try:
        df_rolling = getattr(df_rolling, rolling_type)(**rolling_type_options)
    except TypeError as ex:
        raise InvalidPostProcessingError(
            _(
                "Invalid options for %(rolling_type)s: %(options)s",
                rolling_type=rolling_type,
                options=rolling_type_options,
            )
        ) from ex

    df_rolling = _append_columns(df, df_rolling, columns)
    if min_periods:
        df_rolling = df_rolling[min_periods:]
    return df_rolling
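# Illustrative usage sketch for `rolling` (hypothetical columns; assumes "mean" is
# one of the permitted rolling functions checked above).
def _example_rolling_usage() -> DataFrame:
    """Write a 3-row rolling mean of `y` to a new column, keeping `y` intact."""
    example_df = DataFrame({"y": list(range(10))})
    return rolling(
        example_df,
        rolling_type="mean",
        columns={"y": "y_mean_3"},
        window=3,
    )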