def _set_y_X(self, y, X=None):
    """Store the initial training data and initialise the cutoff.

    Parameters
    ----------
    y : pd.Series
        Endogenous time series
    X : pd.DataFrame, optional (default=None)
        Exogenous time series
    """
    # run the input check first (called with allow_empty=False) and keep
    # whatever it returns as the stored training data
    checked_y, checked_X = check_y_X(y, X, allow_empty=False)
    self._y = checked_y
    self._X = checked_X

    # the initial cutoff is the last index value of the series passed in
    last_index = y.index[-1]
    self._set_cutoff(last_index)
def _fit(self, y, X=None, fh=None, **fit_params):
    """Fit to training data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables. Each column is registered as a regressor.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.
        Not used by this method.
    **fit_params
        Forwarded unchanged to the wrapped forecaster's ``fit`` call.

    Returns
    -------
    self : returns an instance of self.
    """
    self._instantiate_model()
    self._check_changepoints()
    y, X = check_y_X(y, X, enforce_index_type=pd.DatetimeIndex)

    # We have to bring the data into the required format for fbprophet:
    # target values in column "y", time stamps in column "ds".
    df = pd.DataFrame({"y": y, "ds": y.index})

    # Add seasonality/seasonalities: either a single kwargs mapping or a
    # list of kwargs mappings. isinstance is used instead of an exact type
    # comparison so that dict/list subclasses are not silently skipped.
    if self.add_seasonality:
        if isinstance(self.add_seasonality, dict):
            self._forecaster.add_seasonality(**self.add_seasonality)
        elif isinstance(self.add_seasonality, list):
            for seasonality in self.add_seasonality:
                self._forecaster.add_seasonality(**seasonality)

    # Add country holidays
    if self.add_country_holidays:
        self._forecaster.add_country_holidays(**self.add_country_holidays)

    # Add regressor (multivariate)
    if X is not None:
        df, X = _merge_X(df, X)
        for col in X.columns:
            self._forecaster.add_regressor(col)

    if self.verbose:
        self._forecaster.fit(df=df, **fit_params)
    else:
        # same fit call, wrapped in the output-suppressing context manager
        with _suppress_stdout_stderr():
            self._forecaster.fit(df=df, **fit_params)

    return self
def fit(self, y, X=None, fh=None):
    """Fit forecaster to training data.

    Public entry point: performs the input checks and bookkeeping, then
    delegates the actual fitting to ``_fit``.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    fh : int, list, np.array or ForecastingHorizon, optional (default=None)
        The forecasting horizon with the steps ahead to predict.
    X : pd.DataFrame, optional (default=None)
        Exogenous data

    Returns
    -------
    self : reference to self.

    State change
    ------------
    stores data in self._X and self._y
    passes fh to self._set_fh
    updates the cutoff to the last index value of y
    calls self._fit with the checked data
    sets the _is_fitted flag to True
    """
    # a fresh call to fit resets the fitted state until fitting completes
    self._is_fitted = False

    self._set_fh(fh)

    # keep the checked data on the instance
    y, X = check_y_X(y, X)
    self._X, self._y = X, y

    # the cutoff is the last index value of the training series
    cutoff = y.index[-1]
    self._set_cutoff(cutoff)

    self._fit(y=y, X=X, fh=fh)

    # flip the flag only after everything above has succeeded
    self._is_fitted = True
    return self
def _update_y_X(self, y, X=None):
    """Merge new observations into the stored training data.

    Nothing is changed when ``y`` is empty.

    Parameters
    ----------
    y : pd.Series
        Endogenous time series
    X : pd.DataFrame, optional (default=None)
        Exogenous time series
    """
    y, X = check_y_X(y, X=X, allow_empty=True)

    # update only for non-empty data
    if len(y) == 0:
        return

    # values from the new data take precedence; stored values fill the rest
    self._y = y.combine_first(self._y)

    # move the cutoff to the end of the new observation horizon
    new_cutoff = y.index[-1]
    self._set_cutoff(new_cutoff)

    # update X if given
    if X is not None:
        self._X = X.combine_first(self._X)
def fit(self, y, X=None, fh=None, **fit_params):
    """Fit to training data.

    Scores every candidate parameter set with ``_fit_and_score``, stores
    the formatted results and, if ``self.refit`` is set, refits the best
    candidate on the full data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.
        Only used for the final refit.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables; forwarded to ``_fit_and_score`` and to the
        final refit.
    **fit_params
        Forwarded to ``_fit_and_score`` as ``fit_params``.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If a call to the candidate evaluator produces no results.
    """
    y, X = check_y_X(y, X)

    # validate cross-validator
    cv = check_cv(self.cv)

    base_forecaster = clone(self.forecaster)

    scoring = check_scoring(self.scoring)
    scorers = {scoring.name: scoring}
    refit_metric = scoring.name

    fit_and_score_kwargs = dict(
        scorer=scorers,
        fit_params=fit_params,
        return_train_score=self.return_train_score,
        return_times=True,
        return_parameters=False,
        error_score=self.error_score,
        verbose=self.verbose,
    )

    # Accumulated across calls to evaluate_candidates, so that a search
    # strategy may evaluate candidates in several batches.
    results = {}
    all_candidate_params = []
    all_out = []

    def evaluate_candidates(candidate_params):
        nonlocal results

        candidate_params = list(candidate_params)
        n_candidates = len(candidate_params)

        if self.verbose > 0:
            n_splits = cv.get_n_splits(y)
            print(  # noqa
                "Fitting {0} folds for each of {1} candidates,"
                " totalling {2} fits".format(
                    n_splits, n_candidates, n_candidates * n_splits
                )
            )

        # one result per candidate; each call gets a fresh clone
        out = [
            _fit_and_score(
                clone(base_forecaster),
                cv,
                y,
                X,
                parameters=parameters,
                **fit_and_score_kwargs,
            )
            for parameters in candidate_params
        ]

        if len(out) < 1:
            raise ValueError(
                "No fits were performed. "
                "Was the CV iterator empty? "
                "Were there no candidates?"
            )

        all_candidate_params.extend(candidate_params)
        all_out.extend(out)

        results = self._format_results(all_candidate_params, scorers, all_out)
        return results

    self._run_search(evaluate_candidates)

    self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
    self.best_score_ = results["mean_test_%s" % refit_metric][self.best_index_]
    self.best_params_ = results["params"][self.best_index_]

    self.best_forecaster_ = clone(base_forecaster).set_params(**self.best_params_)

    if self.refit:
        refit_start_time = time.time()
        self.best_forecaster_.fit(y, X, fh)
        self.refit_time_ = time.time() - refit_start_time

    # Store the only scorer not as a dict for single metric evaluation
    self.scorer_ = scorers[scoring.name]

    self.cv_results_ = results
    self.n_splits_ = cv.get_n_splits(y)

    self._is_fitted = True
    return self
def evaluate(
    forecaster,
    cv,
    y,
    X=None,
    strategy="refit",
    scoring=None,
    fit_params=None,
    return_data=False,
):
    """Evaluate forecaster using cross-validation.

    Parameters
    ----------
    forecaster : sktime.forecaster
        Any forecaster
    cv : Temporal cross-validation splitter
        Splitter of how to split the data into test data and train data
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables
    strategy : str, optional (default="refit")
        Must be "refit" or "update". The strategy defines whether the
        `forecaster` is only fitted on the first train window data and
        then updated, or always refitted.
    scoring : object of class MetricFunctionWrapper from
        sktime.performance_metrics, optional. Example scoring=sMAPE().
        Used to get a score function that takes y_pred and y_test as
        arguments, by default None (if None, uses sMAPE)
    fit_params : dict, optional (default=None)
        Parameters passed to the `fit` call of the forecaster.
    return_data : bool, optional
        Returns three additional columns in the DataFrame, by default
        False. The cells of the columns contain each a pd.Series for
        y_train, y_pred, y_test.

    Returns
    -------
    pd.DataFrame
        One row per split with the columns "test_<scoring name>",
        "fit_time", "pred_time", "len_train_window" and "cutoff", plus
        "y_train", "y_test" and "y_pred" if `return_data` is True. If the
        splitter yields no splits, an empty frame with these columns is
        returned.

    Examples
    --------
    >>> from sktime.datasets import load_airline
    >>> from sktime.forecasting.model_evaluation import evaluate
    >>> from sktime.forecasting.model_selection import ExpandingWindowSplitter
    >>> from sktime.forecasting.naive import NaiveForecaster
    >>> y = load_airline()
    >>> forecaster = NaiveForecaster(strategy="mean", sp=12)
    >>> cv = ExpandingWindowSplitter(initial_window=24, step_length=12,
    ...                              fh=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    >>> results = evaluate(forecaster=forecaster, y=y, cv=cv)
    """
    _check_strategy(strategy)
    cv = check_cv(cv, enforce_start_with_window=True)
    scoring = check_scoring(scoring)
    y, X = check_y_X(y, X)
    fit_params = {} if fit_params is None else fit_params

    # Define score name.
    score_name = "test_" + scoring.name

    # Fixed column layout, so the result has the same shape even when the
    # splitter yields nothing.
    columns = [score_name, "fit_time", "pred_time", "len_train_window", "cutoff"]
    if return_data:
        columns += ["y_train", "y_test", "y_pred"]

    # Rows are collected in a list and the frame is built once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every iteration.
    rows = []

    # Run temporal cross-validation.
    for i, (train, test) in enumerate(cv.split(y)):
        # split data
        y_train, y_test, X_train, X_test = _split(y, X, train, test, cv.fh)

        # create forecasting horizon
        fh = ForecastingHorizon(y_test.index, is_relative=False)

        # fit/update: always fit on the first window, afterwards refit or
        # update depending on the strategy
        start_fit = time.time()
        if i == 0 or strategy == "refit":
            forecaster.fit(y_train, X_train, fh=fh, **fit_params)
        else:  # if strategy == "update":
            forecaster.update(y_train, X_train)
        fit_time = time.time() - start_fit

        # predict
        start_pred = time.time()
        y_pred = forecaster.predict(fh, X=X_test)
        pred_time = time.time() - start_pred

        # score
        score = scoring(y_pred, y_test)

        # save results
        row = {
            score_name: score,
            "fit_time": fit_time,
            "pred_time": pred_time,
            "len_train_window": len(y_train),
            "cutoff": forecaster.cutoff,
        }
        if return_data:
            row["y_train"] = y_train
            row["y_test"] = y_test
            row["y_pred"] = y_pred
        rows.append(row)

    # post-processing of results
    results = pd.DataFrame(rows, columns=columns)
    results["len_train_window"] = results["len_train_window"].astype(int)

    return results
def fit(self, y, X=None, fh=None, **fit_params):
    """Fit to training data.

    Evaluates every candidate parameter set with ``evaluate``, ranks the
    candidates by their mean test score and, if ``self.refit`` is set,
    refits the best candidate on the full data.

    Parameters
    ----------
    y : pd.Series
        Target time series to which to fit the forecaster.
    fh : int, list or np.array, optional (default=None)
        The forecasting horizon with the steps ahead to predict.
        Only used for the final refit.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables; forwarded to ``evaluate`` and to the final
        refit.
    **fit_params
        Forwarded to ``evaluate`` as ``fit_params``.

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If a call to the candidate evaluator produces no results.
    """
    y, X = check_y_X(y, X)
    cv = check_cv(self.cv)
    scoring = check_scoring(self.scoring)
    scoring_name = f"test_{scoring.name}"

    parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

    def _fit_and_score(params):
        # Clone forecaster.
        forecaster = clone(self.forecaster)

        # Set parameters.
        forecaster.set_params(**params)

        # Evaluate.
        out = evaluate(
            forecaster,
            cv,
            y,
            X,
            strategy=self.strategy,
            scoring=scoring,
            fit_params=fit_params,
        )

        # Filter columns.
        out = out.filter(items=[scoring_name, "fit_time", "pred_time"], axis=1)

        # Aggregate results.
        out = out.mean()
        out = out.add_prefix("mean_")

        # Add parameters to output table.
        out["params"] = params

        return out

    def evaluate_candidates(candidate_params):
        candidate_params = list(candidate_params)

        if self.verbose > 0:
            n_candidates = len(candidate_params)
            n_splits = cv.get_n_splits(y)
            print(  # noqa
                "Fitting {0} folds for each of {1} candidates,"
                " totalling {2} fits".format(
                    n_splits, n_candidates, n_candidates * n_splits
                )
            )

        out = parallel(
            delayed(_fit_and_score)(params) for params in candidate_params
        )

        if len(out) < 1:
            raise ValueError(
                "No fits were performed. "
                "Was the CV iterator empty? "
                "Were there no candidates?"
            )

        return out

    # Run grid-search cross-validation.
    results = self._run_search(evaluate_candidates)

    results = pd.DataFrame(results)

    # Rank results, according to whether greater is better for the given
    # scoring. Logical `not` is required here: bitwise `~` on a Python bool
    # gives -2 or -1, both truthy, which would always rank in ascending order.
    results[f"rank_{scoring_name}"] = results.loc[:, f"mean_{scoring_name}"].rank(
        ascending=not scoring.greater_is_better
    )

    self.cv_results_ = results

    # Select best parameters.
    self.best_index_ = results.loc[:, f"rank_{scoring_name}"].argmin()
    self.best_score_ = results.loc[self.best_index_, f"mean_{scoring_name}"]
    self.best_params_ = results.loc[self.best_index_, "params"]
    self.best_forecaster_ = clone(self.forecaster).set_params(**self.best_params_)

    # Refit model with best parameters.
    if self.refit:
        self.best_forecaster_.fit(y, X, fh)

    self._is_fitted = True
    return self