def exec_post_processing(self, df: DataFrame) -> DataFrame:
        """
        Perform post processing operations on DataFrame.

        :param df: DataFrame returned from database model.
        :return: new DataFrame to which all post processing operations have been
                 applied
        :raises InvalidPostProcessingError: If the post processing operation
                 is incorrect
        """
        logger.debug("post_processing: \n %s", pformat(self.post_processing))
        for post_process in self.post_processing:
            operation = post_process.get("operation")
            if not operation:
                raise InvalidPostProcessingError(
                    _("`operation` property of post processing object undefined")
                )
            if not hasattr(pandas_postprocessing, operation):
                raise InvalidPostProcessingError(
                    _(
                        "Unsupported post processing operation: %(operation)s",
                        operation=operation,
                    )
                )
            options = post_process.get("options", {})
            df = getattr(pandas_postprocessing, operation)(df, **options)
        return df
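Usage sketch (illustrative, not from the source): a `post_processing` payload is a list of objects, each naming a `pandas_postprocessing` function plus the keyword options it will receive, applied in order.

post_processing = [
    {
        "operation": "rolling",
        "options": {"rolling_type": "mean", "window": 7, "columns": {"sales": "sales"}},
    },
    {"operation": "contribution", "options": {}},
]
# exec_post_processing resolves each name via getattr(pandas_postprocessing, operation)
# and threads the running DataFrame through the operations one by one.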
Example #2
def resample(
    df: pd.DataFrame,
    rule: str,
    method: str,
    fill_value: Optional[Union[float, int]] = None,
) -> pd.DataFrame:
    """
    Resample a DataFrame, with support for upsampling.

    :param df: DataFrame to resample.
    :param rule: The offset string representing the target conversion.
    :param method: How to fill NaN values introduced by the resample.
    :param fill_value: Value used to fill missing values.
    :return: DataFrame after resampling
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    if not isinstance(df.index, pd.DatetimeIndex):
        raise InvalidPostProcessingError(_("Resample operation requires DatetimeIndex"))
    if method not in RESAMPLE_METHOD:
        raise InvalidPostProcessingError(
            _("Resample method should in ") + ", ".join(RESAMPLE_METHOD) + "."
        )

    if method == "asfreq" and fill_value is not None:
        _df = df.resample(rule).asfreq(fill_value=fill_value)
    elif method == "linear":
        _df = df.resample(rule).interpolate()
    else:
        _df = getattr(df.resample(rule), method)()
    return _df
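A hedged usage sketch, assuming `resample` above is in scope and `"asfreq"` is listed in `RESAMPLE_METHOD`:

import pandas as pd

# Daily series upsampled to 12-hour buckets; the rows introduced by the
# upsample are filled with 0 via the "asfreq" branch above.
daily = pd.DataFrame(
    {"y": [1.0, 2.0, 3.0]},
    index=pd.date_range("2024-01-01", periods=3, freq="D"),
)
upsampled = resample(daily, rule="12H", method="asfreq", fill_value=0)
# upsampled has 5 rows: the original midnights plus two 12:00 rows holding 0.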
Example #3
def compare(  # pylint: disable=too-many-arguments
    df: DataFrame,
    source_columns: List[str],
    compare_columns: List[str],
    compare_type: PandasPostprocessingCompare,
    drop_original_columns: Optional[bool] = False,
    precision: Optional[int] = 4,
) -> DataFrame:
    """
    Calculate column-by-column changes for selected columns.

    :param df: DataFrame on which the compare will be based.
    :param source_columns: Main query columns
    :param compare_columns: Columns being compared
    :param compare_type: Type of compare. Choice of `difference`, `percentage` or `ratio`
    :param drop_original_columns: Whether to remove the source columns and
           compare columns.
    :param precision: Round a change rate to a variable number of decimal places.
    :return: DataFrame with compared columns.
    :raises InvalidPostProcessingError: If the request is incorrect.
    """
    if len(source_columns) != len(compare_columns):
        raise InvalidPostProcessingError(
            _("`compare_columns` must have the same length as `source_columns`."
              ))
    if compare_type not in tuple(PandasPostprocessingCompare):
        raise InvalidPostProcessingError(
            _("`compare_type` must be `difference`, `percentage` or `ratio`"))
    if len(source_columns) == 0:
        return df

    for s_col, c_col in zip(source_columns, compare_columns):
        s_df = df.loc[:, [s_col]]
        s_df.rename(columns={s_col: "__intermediate"}, inplace=True)
        c_df = df.loc[:, [c_col]]
        c_df.rename(columns={c_col: "__intermediate"}, inplace=True)
        if compare_type == PandasPostprocessingCompare.DIFF:
            diff_df = s_df - c_df
        elif compare_type == PandasPostprocessingCompare.PCT:
            diff_df = ((s_df - c_df) / c_df).astype(float).round(precision)
        else:
            # compare_type == "ratio"
            diff_df = (s_df / c_df).astype(float).round(precision)

        diff_df.rename(
            columns={
                "__intermediate":
                TIME_COMPARISON.join([compare_type, s_col, c_col])
            },
            inplace=True,
        )
        df = pd.concat([df, diff_df], axis=1)

    if drop_original_columns:
        df = df.drop(source_columns + compare_columns, axis=1)
    return df
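Illustrative call, assuming the enum member names shown in the branches above (here `DIFF`):

import pandas as pd

# Compare current sales against a previous-period column; DIFF yields s_col - c_col.
df = pd.DataFrame({"sales": [100, 120], "sales_prev": [90, 130]})
out = compare(
    df,
    source_columns=["sales"],
    compare_columns=["sales_prev"],
    compare_type=PandasPostprocessingCompare.DIFF,
)
# out gains one column named by joining the compare type and both column names
# with TIME_COMPARISON, holding [10, -10].

Example #4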
def contribution(
    df: DataFrame,
    orientation: Optional[
        PostProcessingContributionOrientation
    ] = PostProcessingContributionOrientation.COLUMN,
    columns: Optional[List[str]] = None,
    rename_columns: Optional[List[str]] = None,
) -> DataFrame:
    """
    Calculate cell contribution to row/column total for numeric columns.
    Non-numeric columns will be kept untouched.

    If `columns` are specified, only calculate contributions on selected columns.

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param columns: Columns to calculate values from.
    :param rename_columns: The new labels for the calculated contribution columns.
                           The original columns will not be removed.
    :param orientation: calculate by dividing each cell by its row or column total
    :return: DataFrame with contributions.
    """
    contribution_df = df.copy()
    numeric_df = contribution_df.select_dtypes(include=["number", Decimal])
    # TODO: copy needed due to following regression in 1.4, remove if not needed:
    # https://github.com/pandas-dev/pandas/issues/48090
    numeric_df = numeric_df.copy()
    numeric_df.fillna(0, inplace=True)
    # verify column selections
    if columns:
        numeric_columns = numeric_df.columns.tolist()
        for col in columns:
            if col not in numeric_columns:
                raise InvalidPostProcessingError(
                    _(
                        'Column "%(column)s" is not numeric or does not '
                        "exists in the query results.",
                        column=col,
                    )
                )
    columns = columns or numeric_df.columns
    rename_columns = rename_columns or columns
    if len(rename_columns) != len(columns):
        raise InvalidPostProcessingError(
            _("`rename_columns` must have the same length as `columns`.")
        )
    # limit to selected columns
    numeric_df = numeric_df[columns]
    axis = 0 if orientation == PostProcessingContributionOrientation.COLUMN else 1
    numeric_df = numeric_df / numeric_df.values.sum(axis=axis, keepdims=True)
    contribution_df[rename_columns] = numeric_df
    return contribution_df
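Illustrative call with the default column orientation, so every numeric column of the result sums to 1:

import pandas as pd

df = pd.DataFrame({"a": [1.0, 3.0], "b": [2.0, 2.0]})
out = contribution(df)
# With no rename_columns the originals are overwritten in place:
# out["a"] -> [0.25, 0.75], out["b"] -> [0.5, 0.5]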
Example #5
def cum(
    df: DataFrame,
    operator: str,
    columns: Dict[str, str],
) -> DataFrame:
    """
    Calculate cumulative sum/product/min/max for select columns.

    :param df: DataFrame on which the cumulative operation will be based.
    :param columns: columns on which to perform a cumulative operation, mapping source
           column to target column. For instance, `{'y': 'y'}` will replace the column
           `y` with the cumulative value in `y`, while `{'y': 'y2'}` will add a column
           `y2` based on cumulative values calculated from `y`, leaving the original
           column `y` unchanged.
    :param operator: cumulative operator, e.g. `sum`, `prod`, `min`, `max`
    :return: DataFrame with cumulated columns
    """
    columns = columns or {}
    df_cum = df.loc[:, columns.keys()]
    operation = "cum" + operator
    if operation not in ALLOWLIST_CUMULATIVE_FUNCTIONS or not hasattr(
            df_cum, operation):
        raise InvalidPostProcessingError(
            _("Invalid cumulative operator: %(operator)s", operator=operator))
    df_cum = _append_columns(df, getattr(df_cum, operation)(), columns)
    return df_cum
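Illustrative call, assuming `"cumsum"` is present in `ALLOWLIST_CUMULATIVE_FUNCTIONS`:

import pandas as pd

df = pd.DataFrame({"y": [1, 2, 3]})
out = cum(df, operator="sum", columns={"y": "y_cumsum"})
# The original column is kept and a running total is appended:
# out["y_cumsum"] -> [1, 3, 6]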
Example #6
def geohash_encode(
    df: DataFrame,
    geohash: str,
    longitude: str,
    latitude: str,
) -> DataFrame:
    """
    Encode longitude and latitude into geohash

    :param df: DataFrame containing longitude and latitude data
    :param geohash: Name of new column to be created containing geohash location.
    :param longitude: Name of source column containing longitude.
    :param latitude: Name of source column containing latitude.
    :return: DataFrame with a new column containing the encoded geohash
    """
    try:
        encode_df = df[[latitude, longitude]]
        encode_df.columns = ["latitude", "longitude"]
        encode_df["geohash"] = encode_df.apply(
            lambda row: geohash_lib.encode(row["latitude"], row["longitude"]),
            axis=1,
        )
        return _append_columns(df, encode_df, {"geohash": geohash})
    except ValueError as ex:
        raise InvalidPostProcessingError(
            _("Invalid longitude/latitude")) from ex
Example #7
def _get_aggregate_funcs(
    df: DataFrame,
    aggregates: Dict[str, Dict[str, Any]],
) -> Dict[str, NamedAgg]:
    """
    Converts a set of aggregate config objects into functions that pandas can use as
    aggregators. Currently only numpy aggregators are supported.

    :param df: DataFrame on which to perform aggregate operation.
    :param aggregates: Mapping from column name to aggregate config.
    :return: Mapping from metric name to function that takes a single input argument.
    """
    agg_funcs: Dict[str, NamedAgg] = {}
    for name, agg_obj in aggregates.items():
        column = agg_obj.get("column", name)
        if column not in df:
            raise InvalidPostProcessingError(
                _(
                    "Column referenced by aggregate is undefined: %(column)s",
                    column=column,
                )
            )
        if "operator" not in agg_obj:
            raise InvalidPostProcessingError(
                _(
                    "Operator undefined for aggregator: %(name)s",
                    name=name,
                )
            )
        operator = agg_obj["operator"]
        if callable(operator):
            aggfunc = operator
        else:
            func = NUMPY_FUNCTIONS.get(operator)
            if not func:
                raise InvalidPostProcessingError(
                    _(
                        "Invalid numpy function: %(operator)s",
                        operator=operator,
                    )
                )
            options = agg_obj.get("options", {})
            aggfunc = partial(func, **options)
        agg_funcs[name] = NamedAgg(column=column, aggfunc=aggfunc)

    return agg_funcs
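Illustrative aggregate config, assuming `"sum"` is a key of `NUMPY_FUNCTIONS`:

import pandas as pd

df = pd.DataFrame({"sales": [1, 2, 3]})
aggregates = {"sum__sales": {"column": "sales", "operator": "sum"}}
agg_funcs = _get_aggregate_funcs(df, aggregates)
# agg_funcs["sum__sales"] is a NamedAgg targeting the "sales" column with numpy's
# sum wrapped in functools.partial, ready to pass to DataFrame.agg or pivot_table.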
Example #8
def wrapped(df: DataFrame, **options: Any) -> Any:
    if _is_multi_index_on_columns(df):
        # for a MultiIndex on columns, validate against the first level only
        columns = df.columns.get_level_values(0)
    else:
        columns = df.columns.tolist()
    for name in argnames:
        if name in options and not all(
            elem in columns for elem in options.get(name) or []
        ):
            raise InvalidPostProcessingError(
                _("Referenced columns not available in DataFrame.")
            )
    return func(df, **options)
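`wrapped` is the inner closure of a column-validating decorator; `func` and `argnames` are free variables captured from the enclosing factory. A hedged sketch of how such a decorator would be applied (the factory name `validate_column_args` is an assumption):

from typing import List, Optional

from pandas import DataFrame

# Hypothetical application: the named options must all refer to existing
# DataFrame columns, otherwise wrapped() raises before the function body runs.
@validate_column_args("columns")
def drop_columns(df: DataFrame, columns: Optional[List[str]] = None) -> DataFrame:
    return df.drop(columns=columns or [])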
Example #9
def geohash_decode(df: DataFrame, geohash: str, longitude: str,
                   latitude: str) -> DataFrame:
    """
    Decode a geohash column into longitude and latitude

    :param df: DataFrame containing geohash data
    :param geohash: Name of source column containing geohash location.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :return: DataFrame with decoded longitudes and latitudes
    """
    try:
        lonlat_df = DataFrame()
        lonlat_df["latitude"], lonlat_df["longitude"] = zip(
            *df[geohash].apply(geohash_lib.decode))
        return _append_columns(df, lonlat_df, {
            "latitude": latitude,
            "longitude": longitude
        })
    except ValueError as ex:
        raise InvalidPostProcessingError(_("Invalid geohash string")) from ex
Example #10
def geodetic_parse(
    df: DataFrame,
    geodetic: str,
    longitude: str,
    latitude: str,
    altitude: Optional[str] = None,
) -> DataFrame:
    """
    Parse a column containing a geodetic point string
    [Geopy](https://geopy.readthedocs.io/en/stable/#geopy.point.Point).

    :param df: DataFrame containing geodetic point data
    :param geodetic: Name of source column containing geodetic point string.
    :param longitude: Name of new column to be created containing longitude.
    :param latitude: Name of new column to be created containing latitude.
    :param altitude: Name of new column to be created containing altitude.
    :return: DataFrame with decoded longitudes and latitudes
    """
    def _parse_location(location: str) -> Tuple[float, float, float]:
        """
        Parse a string containing a geodetic point and return latitude, longitude
        and altitude
        """
        point = Point(location)
        return point[0], point[1], point[2]

    try:
        geodetic_df = DataFrame()
        (
            geodetic_df["latitude"],
            geodetic_df["longitude"],
            geodetic_df["altitude"],
        ) = zip(*df[geodetic].apply(_parse_location))
        columns = {"latitude": latitude, "longitude": longitude}
        if altitude:
            columns["altitude"] = altitude
        return _append_columns(df, geodetic_df, columns)
    except ValueError as ex:
        raise InvalidPostProcessingError(_("Invalid geodetic string")) from ex
Example #11
def _prophet_fit_and_predict(  # pylint: disable=too-many-arguments
    df: DataFrame,
    confidence_interval: float,
    yearly_seasonality: Union[bool, str, int],
    weekly_seasonality: Union[bool, str, int],
    daily_seasonality: Union[bool, str, int],
    periods: int,
    freq: str,
) -> DataFrame:
    """
    Fit a prophet model and return a DataFrame with predicted results.
    """
    try:
        # pylint: disable=import-error,import-outside-toplevel
        from prophet import Prophet

        prophet_logger = logging.getLogger("prophet.plot")
        prophet_logger.setLevel(logging.CRITICAL)
        prophet_logger.setLevel(logging.NOTSET)
    except ModuleNotFoundError as ex:
        raise InvalidPostProcessingError(
            _("`prophet` package not installed")) from ex
    model = Prophet(
        interval_width=confidence_interval,
        yearly_seasonality=yearly_seasonality,
        weekly_seasonality=weekly_seasonality,
        daily_seasonality=daily_seasonality,
    )
    if df["ds"].dt.tz:
        df["ds"] = df["ds"].dt.tz_convert(None)
    model.fit(df)
    future = model.make_future_dataframe(periods=periods, freq=freq)
    forecast = model.predict(future)[[
        "ds", "yhat", "yhat_lower", "yhat_upper"
    ]]
    return forecast.join(df.set_index("ds"), on="ds").set_index(["ds"])
Example #12
def pivot(  # pylint: disable=too-many-arguments,too-many-locals
    df: DataFrame,
    index: List[str],
    aggregates: Dict[str, Dict[str, Any]],
    columns: Optional[List[str]] = None,
    metric_fill_value: Optional[Any] = None,
    column_fill_value: Optional[str] = NULL_STRING,
    drop_missing_columns: Optional[bool] = True,
    combine_value_with_metric: bool = False,
    marginal_distributions: Optional[bool] = None,
    marginal_distribution_name: Optional[str] = None,
    flatten_columns: bool = True,
    reset_index: bool = True,
) -> DataFrame:
    """
    Perform a pivot operation on a DataFrame.

    :param df: Object on which pivot operation will be performed
    :param index: Columns to group by on the table index (=rows)
    :param columns: Columns to group by on the table columns
    :param metric_fill_value: Value to replace missing values with
    :param column_fill_value: Value to replace missing pivot columns with. By default
           replaces missing values with "<NULL>". Set to `None` to remove columns
           with missing values.
    :param drop_missing_columns: Do not include columns whose entries are all missing
    :param combine_value_with_metric: Display metrics side by side within each column,
           as opposed to each column being displayed side by side for each metric.
    :param aggregates: A mapping from aggregate column name to the aggregate
           config.
    :param marginal_distributions: Add totals for row/column. Defaults to False
    :param marginal_distribution_name: Name of row/column with marginal distribution.
           Defaults to 'All'.
    :param flatten_columns: Convert column names to strings
    :param reset_index: Convert index to column
    :return: A pivot table
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    if not index:
        raise InvalidPostProcessingError(
            _("Pivot operation requires at least one index"))
    if not aggregates:
        raise InvalidPostProcessingError(
            _("Pivot operation must include at least one aggregate"))

    if columns and column_fill_value:
        df[columns] = df[columns].fillna(value=column_fill_value)

    aggregate_funcs = _get_aggregate_funcs(df, aggregates)

    # TODO (villebro): Pandas 1.0.3 doesn't yet support NamedAgg in pivot_table.
    #  Remove once/if support is added.
    aggfunc = {na.column: na.aggfunc for na in aggregate_funcs.values()}

    # When dropna = False, the pivot_table function will calculate cartesian-product
    # for MultiIndex.
    # https://github.com/apache/superset/issues/15956
    # https://github.com/pandas-dev/pandas/issues/18030
    series_set = set()
    if not drop_missing_columns and columns:
        for row in df[columns].itertuples():
            for metric in aggfunc.keys():
                series_set.add(str(tuple([metric]) + tuple(row[1:])))

    df = df.pivot_table(
        values=aggfunc.keys(),
        index=index,
        columns=columns,
        aggfunc=aggfunc,
        fill_value=metric_fill_value,
        dropna=drop_missing_columns,
        margins=marginal_distributions,
        margins_name=marginal_distribution_name,
    )

    if not drop_missing_columns and len(series_set) > 0 and not df.empty:
        for col in df.columns:
            series = str(col)
            if series not in series_set:
                df = df.drop(col, axis=PandasAxis.COLUMN)

    if combine_value_with_metric:
        df = df.stack(0).unstack()

    # Make index regular column
    if flatten_columns:
        df.columns = [
            _flatten_column_after_pivot(col, aggregates) for col in df.columns
        ]
    # return index as regular column
    if reset_index:
        df.reset_index(level=0, inplace=True)
    return df
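Illustrative call: one index column, one pivot column and a single summed metric, assuming `"sum"` is a valid operator in `NUMPY_FUNCTIONS`:

import pandas as pd

df = pd.DataFrame(
    {
        "country": ["UK", "UK", "FR"],
        "gender": ["m", "f", "m"],
        "sales": [10, 20, 30],
    }
)
out = pivot(
    df,
    index=["country"],
    columns=["gender"],
    aggregates={"sales": {"operator": "sum"}},
)
# With the default flatten_columns/reset_index the result is a flat table with
# one row per country and one flattened sales column per gender value.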
Example #13
def boxplot(
    df: DataFrame,
    groupby: List[str],
    metrics: List[str],
    whisker_type: PostProcessingBoxplotWhiskerType,
    percentiles: Optional[Union[List[Union[int, float]],
                                Tuple[Union[int, float],
                                      Union[int, float]]]] = None,
) -> DataFrame:
    """
    Calculate boxplot statistics. For each metric, the operation creates eight
    new columns with the column name suffixed with the following values:

    - `__mean`: the mean
    - `__median`: the median
    - `__max`: the maximum value excluding outliers (see whisker type)
    - `__min`: the minimum value excluding outliers (see whisker type)
    - `__q1`: the first quartile (25th percentile)
    - `__q3`: the third quartile (75th percentile)
    - `__count`: count of observations
    - `__outliers`: the values that fall outside the minimum/maximum value
                    (see whisker type)

    :param df: DataFrame containing all-numeric data (temporal column ignored)
    :param groupby: The categories to group by (x-axis)
    :param metrics: The metrics for which to calculate the distribution
    :param whisker_type: The whisker type: Tukey, min/max or percentile
    :param percentiles: Low and high percentiles used as whiskers when
           `whisker_type` is percentile
    :return: DataFrame with boxplot statistics per groupby
    """
    def quartile1(series: Series) -> float:
        return np.nanpercentile(series, 25, interpolation="midpoint")

    def quartile3(series: Series) -> float:
        return np.nanpercentile(series, 75, interpolation="midpoint")

    if whisker_type == PostProcessingBoxplotWhiskerType.TUKEY:

        def whisker_high(series: Series) -> float:
            upper_outer_lim = quartile3(series) + 1.5 * (quartile3(series) -
                                                         quartile1(series))
            return series[series <= upper_outer_lim].max()

        def whisker_low(series: Series) -> float:
            lower_outer_lim = quartile1(series) - 1.5 * (quartile3(series) -
                                                         quartile1(series))
            return series[series >= lower_outer_lim].min()

    elif whisker_type == PostProcessingBoxplotWhiskerType.PERCENTILE:
        if (not isinstance(percentiles, (list, tuple)) or len(percentiles) != 2
                or not isinstance(percentiles[0], (int, float))
                or not isinstance(percentiles[1], (int, float))
                or percentiles[0] >= percentiles[1]):
            raise InvalidPostProcessingError(
                _("percentiles must be a list or tuple with two numeric values, "
                  "of which the first is lower than the second value"))
        low, high = percentiles[0], percentiles[1]

        def whisker_high(series: Series) -> float:
            return np.nanpercentile(series, high)

        def whisker_low(series: Series) -> float:
            return np.nanpercentile(series, low)

    else:
        whisker_high = np.max
        whisker_low = np.min

    def outliers(series: Series) -> Set[float]:
        above = series[series > whisker_high(series)]
        below = series[series < whisker_low(series)]
        return above.tolist() + below.tolist()

    operators: Dict[str, Callable[[Any], Any]] = {
        "mean": np.mean,
        "median": np.median,
        "max": whisker_high,
        "min": whisker_low,
        "q1": quartile1,
        "q3": quartile3,
        "count": np.ma.count,
        "outliers": outliers,
    }
    aggregates: Dict[str, Dict[str, Union[str, Callable[..., Any]]]] = {
        f"{metric}__{operator_name}": {
            "column": metric,
            "operator": operator
        }
        for operator_name, operator in operators.items() for metric in metrics
    }

    # nanpercentile needs numeric values, otherwise the isnan function
    # that's used in the underlying function will fail
    for column in metrics:
        if df.dtypes[column] == object:
            df[column] = to_numeric(df[column], errors="coerce")

    return aggregate(df, groupby=groupby, aggregates=aggregates)
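Illustrative call using the Tukey whisker type handled in the first branch above:

import pandas as pd

df = pd.DataFrame(
    {"region": ["a", "a", "a", "b", "b", "b"], "sales": [1.0, 2.0, 3.0, 2.0, 4.0, 9.0]}
)
out = boxplot(
    df,
    groupby=["region"],
    metrics=["sales"],
    whisker_type=PostProcessingBoxplotWhiskerType.TUKEY,
)
# One row per region with sales__mean, sales__median, sales__min, sales__max,
# sales__q1, sales__q3, sales__count and sales__outliers columns.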
Example #14
def prophet(  # pylint: disable=too-many-arguments
    df: DataFrame,
    time_grain: str,
    periods: int,
    confidence_interval: float,
    yearly_seasonality: Optional[Union[bool, int]] = None,
    weekly_seasonality: Optional[Union[bool, int]] = None,
    daily_seasonality: Optional[Union[bool, int]] = None,
    index: Optional[str] = None,
) -> DataFrame:
    """
    Add forecasts to each series in a timeseries dataframe, along with confidence
    intervals for the prediction. For each series, the operation creates three
    new columns with the column name suffixed with the following values:

    - `__yhat`: the forecast for the given date
    - `__yhat_lower`: the lower bound of the forecast for the given date
    - `__yhat_upper`: the upper bound of the forecast for the given date


    :param df: DataFrame containing the timeseries data, including the temporal column
    :param time_grain: Time grain used to specify time period increments in prediction
    :param periods: Time periods (in units of `time_grain`) to predict into the future
    :param confidence_interval: Width of predicted confidence interval
    :param yearly_seasonality: Should yearly seasonality be applied.
           An integer value will specify Fourier order of seasonality.
    :param weekly_seasonality: Should weekly seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param daily_seasonality: Should daily seasonality be applied.
           An integer value will specify Fourier order of seasonality, `None` will
           automatically detect seasonality.
    :param index: the name of the column containing the x-axis data
    :return: DataFrame with forecasts, with temporal column at beginning if present
    """
    index = index or DTTM_ALIAS
    # validate inputs
    if not time_grain:
        raise InvalidPostProcessingError(_("Time grain missing"))
    if time_grain not in PROPHET_TIME_GRAIN_MAP:
        raise InvalidPostProcessingError(
            _(
                "Unsupported time grain: %(time_grain)s",
                time_grain=time_grain,
            ))
    freq = PROPHET_TIME_GRAIN_MAP[time_grain]
    # check type at runtime due to marshmallow schema not being able to handle
    # union types
    if not isinstance(periods, int) or periods < 0:
        raise InvalidPostProcessingError(_("Periods must be a whole number"))
    if not confidence_interval or confidence_interval <= 0 or confidence_interval >= 1:
        raise InvalidPostProcessingError(
            _("Confidence interval must be between 0 and 1 (exclusive)"))
    if index not in df.columns:
        raise InvalidPostProcessingError(
            _("DataFrame must include temporal column"))
    if len(df.columns) < 2:
        raise InvalidPostProcessingError(
            _("DataFrame include at least one series"))

    target_df = DataFrame()
    for column in [column for column in df.columns if column != index]:
        fit_df = _prophet_fit_and_predict(
            df=df[[index, column]].rename(columns={
                index: "ds",
                column: "y"
            }),
            confidence_interval=confidence_interval,
            yearly_seasonality=_prophet_parse_seasonality(yearly_seasonality),
            weekly_seasonality=_prophet_parse_seasonality(weekly_seasonality),
            daily_seasonality=_prophet_parse_seasonality(daily_seasonality),
            periods=periods,
            freq=freq,
        )
        new_columns = [
            f"{column}__yhat",
            f"{column}__yhat_lower",
            f"{column}__yhat_upper",
            f"{column}",
        ]
        fit_df.columns = new_columns
        if target_df.empty:
            target_df = fit_df
        else:
            for new_column in new_columns:
                target_df = target_df.assign(
                    **{new_column: fit_df[new_column]})
    target_df.reset_index(level=0, inplace=True)
    return target_df.rename(columns={"ds": index})
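A hedged sketch only: it needs the optional `prophet` package installed, and assumes `"P1D"` (daily) is a key of `PROPHET_TIME_GRAIN_MAP`:

import pandas as pd

df = pd.DataFrame(
    {
        "__timestamp": pd.date_range("2023-01-01", periods=90, freq="D"),
        "sales": range(90),
    }
)
forecast = prophet(
    df,
    time_grain="P1D",
    periods=7,
    confidence_interval=0.8,
    index="__timestamp",
)
# forecast keeps "sales" and adds sales__yhat, sales__yhat_lower and
# sales__yhat_upper columns, extended seven daily periods past the observed data.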
Example #15
def rolling(  # pylint: disable=too-many-arguments
    df: DataFrame,
    rolling_type: str,
    columns: Dict[str, str],
    window: Optional[int] = None,
    rolling_type_options: Optional[Dict[str, Any]] = None,
    center: bool = False,
    win_type: Optional[str] = None,
    min_periods: Optional[int] = None,
) -> DataFrame:
    """
    Apply a rolling window on the dataset. See the Pandas docs for further details:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

    :param df: DataFrame on which the rolling period will be based.
    :param columns: columns on which to perform rolling, mapping source column to
           target column. For instance, `{'y': 'y'}` will replace the column `y` with
           the rolling value in `y`, while `{'y': 'y2'}` will add a column `y2` based
           on rolling values calculated from `y`, leaving the original column `y`
           unchanged.
    :param rolling_type: Type of rolling window aggregation, e.g. `mean`, `sum` or `std`.
    :param window: Size of the window.
    :param rolling_type_options: Optional options to pass to rolling method. Needed
           for e.g. quantile operation.
    :param center: Should the label be at the center of the window.
    :param win_type: Type of window function.
    :param min_periods: The minimum amount of periods required for a row to be included
                        in the result set.
    :return: DataFrame with the rolling columns
    :raises InvalidPostProcessingError: If the request is incorrect
    """
    rolling_type_options = rolling_type_options or {}
    df_rolling = df.loc[:, columns.keys()]

    kwargs: Dict[str, Union[str, int]] = {}
    if window is None:
        raise InvalidPostProcessingError(_("Undefined window for rolling operation"))
    if window == 0:
        raise InvalidPostProcessingError(_("Window must be > 0"))

    kwargs["window"] = window
    if min_periods is not None:
        kwargs["min_periods"] = min_periods
    if center is not None:
        kwargs["center"] = center
    if win_type is not None:
        kwargs["win_type"] = win_type

    df_rolling = df_rolling.rolling(**kwargs)
    if rolling_type not in DENYLIST_ROLLING_FUNCTIONS or not hasattr(
        df_rolling, rolling_type
    ):
        raise InvalidPostProcessingError(
            _("Invalid rolling_type: %(type)s", type=rolling_type)
        )
    try:
        df_rolling = getattr(df_rolling, rolling_type)(**rolling_type_options)
    except TypeError as ex:
        raise InvalidPostProcessingError(
            _(
                "Invalid options for %(rolling_type)s: %(options)s",
                rolling_type=rolling_type,
                options=rolling_type_options,
            )
        ) from ex

    df_rolling = _append_columns(df, df_rolling, columns)

    if min_periods:
        df_rolling = df_rolling[min_periods:]
    return df_rolling
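Illustrative call, assuming `"mean"` is among the rolling methods permitted by the check above:

import pandas as pd

df = pd.DataFrame({"y": [1.0, 2.0, 3.0, 4.0]})
out = rolling(df, rolling_type="mean", columns={"y": "y_mean"}, window=2)
# out keeps "y" and adds "y_mean" with the 2-row rolling mean:
# [NaN, 1.5, 2.5, 3.5]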