def test_get_integer():
    """Checks ``get_integer`` defaults, float coercion warning, and error cases."""
    bounds = {"min_value": 10, "default_value": 20}

    with pytest.warns(Warning) as record:
        # None falls back to the default value
        assert get_integer(None, "val", **bounds) == 20
        # an integer within bounds is returned as is
        assert get_integer(11, "val", **bounds) == 11
        # a float is truncated, and a warning is recorded
        assert get_integer(10.5, "val", **bounds) == 10
        first_warning_text = record[0].message.args[0]
        assert "val converted to integer 10 from 10.5" in first_warning_text

    # (expected error pattern, positional args, keyword args)
    failing_calls = [
        ("val must be an integer", ("q", "val"), {}),
        ("val must be >= 1", (0, "val"), {"min_value": 1}),
        ("val must be >= 1", (None, "val"), {"min_value": 1, "default_value": 0}),
    ]
    for expected_pattern, args, kwargs in failing_calls:
        with pytest.raises(ValueError, match=expected_pattern):
            get_integer(*args, **kwargs)
def pipeline_wrapper(
        # This signature must stay identical to that of the forecast_pipeline() function.
        # Parameters are listed explicitly rather than through **kwargs
        # because that makes it easier to check each one directly.
        # input
        df: pd.DataFrame,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=None,
        tz=None,
        freq=None,
        train_end_date=None,
        anomaly_info=None,
        # model
        pipeline=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        estimator=SimpleSilverkiteEstimator(),
        hyperparameter_grid=None,
        hyperparameter_budget=None,
        n_jobs=COMPUTATION_N_JOBS,
        verbose=1,
        # forecast
        forecast_horizon=None,
        coverage=0.95,
        test_horizon=None,
        periods_between_train_test=None,
        agg_periods=None,
        agg_func=None,
        # evaluation
        score_func=EvaluationMetricEnum.MeanAbsolutePercentError.name,
        score_func_greater_is_better=False,
        cv_report_metrics=None,
        null_model_params=None,
        relative_error_tolerance=None,
        # CV
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_use_most_recent_splits=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=0,
        cv_max_splits=3):
    """Validates the pipeline inputs, fills in time-related defaults,
    then calls ``pipeline_function`` with the resulting values.

    ``pipeline_function`` is not defined in this function; it is resolved
    from the enclosing scope.

    The horizon and gap parameters (``forecast_horizon``, ``test_horizon``,
    ``periods_between_train_test``, ``cv_horizon``, ``cv_min_train_periods``,
    ``cv_periods_between_train_test``) are replaced by the values returned
    from ``get_default_time_parameters``. ``hyperparameter_budget``, if not
    None, is passed through ``get_integer`` with ``min_value=1``. All other
    parameters are forwarded unchanged.

    Returns
    -------
    The return value of ``pipeline_function``.

    Raises
    ------
    ValueError
        If ``coverage`` is not None and is < 0 or > 1.
        If ``relative_error_tolerance`` is not None and is < 0.
        If ``test_horizon`` is 0 and either ``cv_horizon`` or
        ``cv_max_splits`` is 0 (neither backtest nor CV is enabled).
        If ``test_horizon`` exceeds the number of rows in ``df``.

    Warns
    -----
    Emits a warning via ``warnings.warn`` when ``test_horizon`` is 0,
    when ``df`` has fewer rows than twice ``forecast_horizon``, when
    ``test_horizon`` exceeds ``forecast_horizon``, and when ``test_horizon``
    exceeds one third of the rows in ``df``.
    """
    # range checks on the parameters that are not time related
    if coverage is not None:
        if coverage < 0 or coverage > 1:
            raise ValueError(f"coverage must be between 0 and 1, found {coverage}")
    if relative_error_tolerance is not None:
        if relative_error_tolerance < 0:
            raise ValueError(
                f"relative_error_tolerance must non-negative, found {relative_error_tolerance}")

    # resolves defaults for the forecast horizon, test, and cross-validation parameters
    period = min_gap_in_seconds(df=df, time_col=time_col)
    num_observations = df.shape[0]
    time_defaults = get_default_time_parameters(
        period=period,
        num_observations=num_observations,
        forecast_horizon=forecast_horizon,
        test_horizon=test_horizon,
        periods_between_train_test=periods_between_train_test,
        cv_horizon=cv_horizon,
        cv_min_train_periods=cv_min_train_periods,
        cv_periods_between_train_test=cv_periods_between_train_test)
    resolved_names = (
        "forecast_horizon",
        "test_horizon",
        "periods_between_train_test",
        "cv_horizon",
        "cv_min_train_periods",
        "cv_periods_between_train_test",
    )
    (forecast_horizon,
     test_horizon,
     periods_between_train_test,
     cv_horizon,
     cv_min_train_periods,
     cv_periods_between_train_test) = map(time_defaults.get, resolved_names)

    # ensures the budget is an integer in the proper domain
    if hyperparameter_budget is not None:
        hyperparameter_budget = get_integer(
            hyperparameter_budget,
            "hyperparameter_budget",
            min_value=1)

    # at least one of CV / backtest must be active
    backtest_disabled = test_horizon == 0
    cv_disabled = cv_horizon == 0 or cv_max_splits == 0
    if cv_disabled and backtest_disabled:
        raise ValueError(
            "Either CV or backtest must be enabled."
            " Set cv_horizon and cv_max_splits to nonzero values to enable CV."
            " Set test_horizon to nonzero value to enable backtest."
            " It's important to check model performance on historical data.")
    if backtest_disabled:
        warnings.warn(
            "No data selected for test (test_horizon=0). "
            "It is important to check out of sample performance")

    # compares the horizons against the amount of data available
    if num_observations < forecast_horizon * 2:
        warnings.warn(
            "Not enough training data to forecast the full forecast_horizon."
            " Exercise extra caution with"
            f" forecasted values after {num_observations // 2} periods.")

    if test_horizon > num_observations:
        raise ValueError(
            f"test_horizon ({test_horizon}) is too large."
            " Must be less than the number "
            f"of input data points: {num_observations})")

    if test_horizon > forecast_horizon:
        warnings.warn("test_horizon should never be larger than forecast_horizon.")

    one_third_of_data = num_observations // 3
    if test_horizon > one_third_of_data:
        warnings.warn(
            "test_horizon should be <= than 1/3 of the data set size to allow enough data to train"
            f" a backtest model. Consider reducing to {one_third_of_data}. If this is smaller"
            " than the forecast_horizon, you will need to make a trade-off between setting"
            " test_horizon=forecast_horizon and having enough data left over to properly"
            " train a realistic backtest model.")

    # records the horizons that will actually be used
    for label, horizon in (
            ("forecast_horizon", forecast_horizon),
            ("test_horizon", test_horizon),
            ("cv_horizon", cv_horizon)):
        log_message(f"{label}: {horizon}", LoggingLevelEnum.INFO)

    return pipeline_function(
        df,
        time_col=time_col,
        value_col=value_col,
        date_format=date_format,
        tz=tz,
        freq=freq,
        train_end_date=train_end_date,
        anomaly_info=anomaly_info,
        pipeline=pipeline,
        regressor_cols=regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols,
        estimator=estimator,
        hyperparameter_grid=hyperparameter_grid,
        hyperparameter_budget=hyperparameter_budget,
        n_jobs=n_jobs,
        verbose=verbose,
        forecast_horizon=forecast_horizon,
        coverage=coverage,
        test_horizon=test_horizon,
        periods_between_train_test=periods_between_train_test,
        agg_periods=agg_periods,
        agg_func=agg_func,
        score_func=score_func,
        score_func_greater_is_better=score_func_greater_is_better,
        cv_report_metrics=cv_report_metrics,
        null_model_params=null_model_params,
        relative_error_tolerance=relative_error_tolerance,
        cv_horizon=cv_horizon,
        cv_min_train_periods=cv_min_train_periods,
        cv_expanding_window=cv_expanding_window,
        cv_use_most_recent_splits=cv_use_most_recent_splits,
        cv_periods_between_splits=cv_periods_between_splits,
        cv_periods_between_train_test=cv_periods_between_train_test,
        cv_max_splits=cv_max_splits
    )
def get_default_time_parameters(
        period,
        num_observations,
        forecast_horizon=None,
        test_horizon=None,
        periods_between_train_test=None,
        cv_horizon=None,
        cv_min_train_periods=None,
        cv_expanding_window=False,
        cv_periods_between_splits=None,
        cv_periods_between_train_test=None,
        cv_max_splits=3):
    """Resolves the forecast horizon, backtest, and cross-validation
    time parameters from the user requested values and the input data size.

    Each requested value is validated with ``get_integer``. A value of None
    is replaced by the default described below.

    Parameters
    ----------
    period: `float`
        Period of each observation, in seconds. Only used to pick a
        default ``forecast_horizon`` when none is requested.
    num_observations: `int`
        Number of observations in the input data.
    forecast_horizon: `int` or None, default None
        Number of periods to forecast into the future. Must be >= 1.
        If None, the value comes from ``get_default_horizon_from_period``.
    test_horizon: `int` or None, default None
        Number of periods held back for test. Must be >= 0.
        If None, default is ``forecast_horizon``.
        If the value is >= ``num_observations``, it is replaced by
        ``floor(0.2 * num_observations)``.
    periods_between_train_test : `int` or None, default None
        Number of periods gap between train and test. Must be >= 0.
        If None, default is 0.
    cv_horizon: `int` or None, default None
        Number of periods in each CV test set. Must be >= 0.
        If None, default is ``forecast_horizon``.
        If the value is >= ``num_observations``, it is replaced by
        ``floor(0.2 * num_observations)``.
    cv_min_train_periods: `int` or None, default None
        Minimum number of periods for training each CV fold.
        Returned unchanged by this function (no default is applied here).
    cv_expanding_window: `bool`, default False
        Not read by this function.
    cv_periods_between_splits: `int` or None, default None
        Not read by this function.
    cv_periods_between_train_test: `int` or None, default None
        Number of periods gap between train and test in a CV split.
        Must be >= 0. If None, default is the resolved
        ``periods_between_train_test``.
    cv_max_splits: `int` or None, default 3
        Not read by this function.

    Returns
    -------
    time_params : `dict`
        Keys are "forecast_horizon", "test_horizon",
        "periods_between_train_test", "cv_horizon",
        "cv_min_train_periods", "cv_periods_between_train_test".
        Values are the resolved parameter values.
    """
    def _shrink_if_too_long(horizon):
        # A horizon that spans all of the data leaves nothing to train on,
        # so it is reduced to a default 80/20 split. For `cv_horizon`, this
        # also avoids the case cv_horizon == num_observations.
        if horizon >= num_observations:
            return math.floor(num_observations * 0.2)
        return horizon

    if forecast_horizon is None:
        forecast_horizon = get_default_horizon_from_period(
            period=period,
            num_observations=num_observations)
    forecast_horizon = get_integer(
        val=forecast_horizon,
        name="forecast_horizon",
        min_value=1)

    requested_test_horizon = get_integer(
        val=test_horizon,
        name="test_horizon",
        min_value=0,
        default_value=forecast_horizon)
    test_horizon = _shrink_if_too_long(requested_test_horizon)

    requested_cv_horizon = get_integer(
        val=cv_horizon,
        name="cv_horizon",
        min_value=0,
        default_value=forecast_horizon)
    cv_horizon = _shrink_if_too_long(requested_cv_horizon)

    periods_between_train_test = get_integer(
        val=periods_between_train_test,
        name="periods_between_train_test",
        min_value=0,
        default_value=0)

    # the CV gap follows the train/test gap unless set explicitly
    cv_periods_between_train_test = get_integer(
        val=cv_periods_between_train_test,
        name="cv_periods_between_train_test",
        min_value=0,
        default_value=periods_between_train_test)

    time_params = {
        "forecast_horizon": forecast_horizon,
        "test_horizon": test_horizon,
        "periods_between_train_test": periods_between_train_test,
        "cv_horizon": cv_horizon,
        "cv_min_train_periods": cv_min_train_periods,
        "cv_periods_between_train_test": cv_periods_between_train_test
    }
    return time_params
def __init__(
        self,
        forecast_horizon,
        min_train_periods=None,
        expanding_window=False,
        use_most_recent_splits=False,
        periods_between_splits=None,
        periods_between_train_test=0,
        max_splits=3):
    """Initializes attributes of RollingTimeSeriesSplit

    Parameters
    ----------
    forecast_horizon : `int`
        How many periods in each CV test set. Must be >= 1.
    min_train_periods : `int` or None, optional
        Minimum number of periods for training. Must be >= 1.
        If ``expanding_window`` is False, every training period has this size.
        If None, default is 2 * ``forecast_horizon``.
    expanding_window : `bool`, default False
        If True, training window for each CV split is fixed to the first available date.
        Otherwise, train start date is sliding, determined by ``min_train_periods``.
    use_most_recent_splits: `bool`, default False
        If True, splits from the end of the dataset are used.
        Else a sampling strategy is applied. Check
        `~greykite.sklearn.cross_validation.RollingTimeSeriesSplit._sample_splits`
        for details.
    periods_between_splits : `int` or None
        Number of periods to slide the test window. Must be >= 1.
        If None, default is ``forecast_horizon``.
    periods_between_train_test : `int`
        Number of periods gap between train and test within a CV split.
        Must be >= 0.
    max_splits : `int` or None
        Maximum number of CV splits. Given the above configuration, samples up to
        max_splits train/test splits, preferring splits toward the end of available data.
        If None, uses all splits. Stored as provided, without validation.

    Warns
    -----
    Emits a warning via ``warnings.warn`` if ``min_train_periods`` is
    smaller than 2 * ``forecast_horizon``.
    """
    super().__init__()
    self.forecast_horizon = get_integer(
        forecast_horizon,
        name="forecast_horizon",
        min_value=1)

    # by default, use at least twice the forecast horizon for training
    self.min_train_periods = get_integer(
        min_train_periods,
        name="min_train_periods",
        min_value=1,
        default_value=2 * self.forecast_horizon)

    # by default, use fixed size training window
    self.expanding_window = expanding_window

    # by default, does not force most recent splits
    self.use_most_recent_splits = use_most_recent_splits

    # by default, use non-overlapping test sets
    self.periods_between_splits = get_integer(
        periods_between_splits,
        name="periods_between_splits",
        min_value=1,
        default_value=self.forecast_horizon)

    # by default, use test set immediately following train set
    self.periods_between_train_test = get_integer(
        periods_between_train_test,
        name="periods_between_train_test",
        min_value=0,
        default_value=0)

    # The recommended minimum is computed from the validated integer
    # `self.forecast_horizon`, not the raw argument, so that the number in
    # the message always matches the threshold used in the comparison
    # (the raw argument may be a float or another value that `get_integer`
    # converted).
    recommended_min_train_periods = 2 * self.forecast_horizon
    if self.min_train_periods < recommended_min_train_periods:
        warnings.warn(
            f"`min_train_periods` is too small for your `forecast_horizon`. Should be at least"
            f" {recommended_min_train_periods}=2*`forecast_horizon`.")

    self.max_splits = max_splits
    self.min_splits = 1  # CV ensures there is always at least one split

    # test end index for the first CV split, before applying offset to ensure last data point in X is used
    self.__starting_test_index = (
        self.forecast_horizon
        + self.min_train_periods
        + self.periods_between_train_test
        - 1)