Beispiel #1
0
    def _set_y_X(self, y, X=None):
        """Store the initial training data and initialise the cutoff.

        Parameters
        ----------
        y : pd.Series
            Endogenous time series
        X : pd.DataFrame, optional (default=None)
            Exogenous time series
        """
        # validation is delegated to check_y_X (called with allow_empty=False);
        # the checked objects are what gets stored on the instance
        checked_y, checked_X = check_y_X(y, X, allow_empty=False)
        self._y = checked_y
        self._X = checked_X

        # the initial cutoff is the last index value of the series passed in
        last_index = y.index[-1]
        self._set_cutoff(last_index)
Beispiel #2
0
    def _fit(self, y, X=None, fh=None, **fit_params):
        """Fit to training data.

        Parameters
        ----------
        y : pd.Series
            Target time series to which to fit the forecaster.
        X : pd.DataFrame, optional (default=None)
            Exogenous variables. Each column is registered as a regressor
            on the wrapped forecaster.
        fh : int, list or np.array, optional (default=None)
            The forecasting horizon with the steps ahead to predict.
            Not referenced in the body of this method.
        **fit_params
            Additional keyword arguments forwarded to the ``fit`` call of
            the wrapped forecaster (``self._forecaster``).

        Returns
        -------
        self : returns an instance of self.
        """
        self._instantiate_model()
        self._check_changepoints()
        y, X = check_y_X(y, X, enforce_index_type=pd.DatetimeIndex)

        # We have to bring the data into the required format for fbprophet:
        # a frame with the values in column "y" and the time index in "ds".
        df = pd.DataFrame({"y": y, "ds": y.index})

        # Add seasonality/seasonalities. isinstance (rather than an exact
        # type comparison) so that dict/list subclasses and tuples are not
        # silently skipped.
        seasonality_spec = self.add_seasonality
        if seasonality_spec:
            if isinstance(seasonality_spec, dict):
                self._forecaster.add_seasonality(**seasonality_spec)
            elif isinstance(seasonality_spec, (list, tuple)):
                for seasonality in seasonality_spec:
                    self._forecaster.add_seasonality(**seasonality)

        # Add country holidays
        if self.add_country_holidays:
            self._forecaster.add_country_holidays(**self.add_country_holidays)

        # Add regressor (multivariate)
        if X is not None:
            df, X = _merge_X(df, X)
            for col in X.columns:
                self._forecaster.add_regressor(col)

        # When not verbose, run the fit inside _suppress_stdout_stderr().
        if self.verbose:
            self._forecaster.fit(df=df, **fit_params)
        else:
            with _suppress_stdout_stderr():
                self._forecaster.fit(df=df, **fit_params)

        return self
Beispiel #3
0
    def fit(self, y, X=None, fh=None):
        """Fit forecaster to training data.

        Public entry point: performs the input checks and bookkeeping, then
        hands the actual fitting over to ``_fit``.

        Parameters
        ----------
        y : pd.Series
            Target time series to which to fit the forecaster.
        fh : int, list, np.array or ForecastingHorizon, optional (default=None)
            The forecasting horizon with the steps ahead to predict.
        X : pd.DataFrame, optional (default=None)
            Exogenous data

        Returns
        -------
        self : reference to self.

        State change
        ------------
        resets the is_fitted flag to False on entry
        passes fh to self._set_fh
        stores the checked data in self._X and self._y
        sets the cutoff to the last index value of y
        delegates model fitting to self._fit
        sets the is_fitted flag to True once everything else has succeeded
        """
        # calling fit always invalidates any previously fitted state
        self._is_fitted = False

        self._set_fh(fh)

        checked_y, checked_X = check_y_X(y, X)
        self._X, self._y = checked_X, checked_y

        last_index = checked_y.index[-1]
        self._set_cutoff(last_index)

        self._fit(y=checked_y, X=checked_X, fh=fh)

        # flipped last, so that a failure above leaves the flag False
        self._is_fitted = True

        return self
Beispiel #4
0
    def _update_y_X(self, y, X=None):
        """Update training data.

        New observations take precedence over stored ones: the stored data
        is only used to fill index positions that the new data does not
        provide (``combine_first`` semantics).

        Parameters
        ----------
        y : pd.Series
            Endogenous time series
        X : pd.DataFrame, optional (default=None)
            Exogenous time series
        """
        y, X = check_y_X(y, X=X, allow_empty=True)

        # update only for non-empty data
        if len(y) == 0:
            return

        self._y = y.combine_first(self._y)

        # set cutoff to the end of the observation horizon
        self._set_cutoff(y.index[-1])

        # update X if given
        if X is not None:
            if self._X is None:
                # No exogenous data stored so far (the initial data may have
                # been set with X=None); there is nothing to combine with.
                self._X = X
            else:
                self._X = X.combine_first(self._X)
Beispiel #5
0
    def fit(self, y, X=None, fh=None, **fit_params):
        """Fit to training data.

        Evaluates every candidate parameter set supplied by
        ``self._run_search`` with ``_fit_and_score``, stores the formatted
        results and, if ``self.refit`` is set, refits the best forecaster.

        Parameters
        ----------
        y : pd.Series
            Target time series to which to fit the forecaster.
        fh : int, list or np.array, optional (default=None)
            The forecasting horizon with the steps ahead to predict.
            Only used when refitting the best forecaster.
        X : pd.DataFrame, optional (default=None)
            Exogenous variables; passed on to ``_fit_and_score`` and to the
            refit of the best forecaster.
        **fit_params
            Passed to ``_fit_and_score`` as ``fit_params``. They are not
            passed to the final refit call.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If a call to the candidate evaluation produced no fits.
        """
        y, X = check_y_X(y, X)

        # validate cross-validator
        cv = check_cv(self.cv)
        base_forecaster = clone(self.forecaster)

        scoring = check_scoring(self.scoring)
        scorers = {scoring.name: scoring}
        refit_metric = scoring.name

        fit_and_score_kwargs = dict(
            scorer=scorers,
            fit_params=fit_params,
            return_train_score=self.return_train_score,
            return_times=True,
            return_parameters=False,
            error_score=self.error_score,
            verbose=self.verbose,
        )

        # Accumulated across all calls of evaluate_candidates, so that
        # `results` always reflects every candidate evaluated so far.
        results = {}
        all_candidate_params = []
        all_out = []

        def evaluate_candidates(candidate_params):
            nonlocal results

            candidate_params = list(candidate_params)
            n_candidates = len(candidate_params)

            if self.verbose > 0:
                n_splits = cv.get_n_splits(y)
                print(  # noqa
                    "Fitting {0} folds for each of {1} candidates,"
                    " totalling {2} fits".format(n_splits, n_candidates,
                                                 n_candidates * n_splits))

            # each candidate is scored on a fresh clone of the base forecaster
            out = [
                _fit_and_score(clone(base_forecaster),
                               cv,
                               y,
                               X,
                               parameters=parameters,
                               **fit_and_score_kwargs)
                for parameters in candidate_params
            ]

            if not out:
                raise ValueError("No fits were performed. "
                                 "Was the CV iterator empty? "
                                 "Were there no candidates?")

            all_candidate_params.extend(candidate_params)
            all_out.extend(out)

            results = self._format_results(all_candidate_params, scorers,
                                           all_out)
            return results

        self._run_search(evaluate_candidates)

        # the best candidate is the one with the smallest rank value
        self.best_index_ = results["rank_test_%s" % refit_metric].argmin()
        self.best_score_ = results["mean_test_%s" %
                                   refit_metric][self.best_index_]
        self.best_params_ = results["params"][self.best_index_]

        self.best_forecaster_ = clone(base_forecaster).set_params(
            **self.best_params_)

        if self.refit:
            refit_start_time = time.time()
            self.best_forecaster_.fit(y, X, fh)
            self.refit_time_ = time.time() - refit_start_time

        # Store the only scorer not as a dict for single metric evaluation
        self.scorer_ = scorers[scoring.name]

        self.cv_results_ = results
        self.n_splits_ = cv.get_n_splits(y)

        self._is_fitted = True
        return self
Beispiel #6
0
def evaluate(
    forecaster,
    cv,
    y,
    X=None,
    strategy="refit",
    scoring=None,
    fit_params=None,
    return_data=False,
):
    """Evaluate forecaster using cross-validation

    Parameters
    ----------
    forecaster : sktime.forecaster
        Any forecaster
    cv : Temporal cross-validation splitter
        Splitter of how to split the data into test data and train data
    y : pd.Series
        Target time series to which to fit the forecaster.
    X : pd.DataFrame, optional (default=None)
        Exogenous variables
    strategy : str, optional (default="refit")
        Must be "refit" or "update". The strategy defines whether the `forecaster` is
        only fitted on the first train window data and then updated, or always refitted.
    scoring : object of class MetricFunctionWrapper from
        sktime.performance_metrics, optional. Example scoring=sMAPE().
        Used to get a score function that takes y_pred and y_test as arguments,
        by default None (if None, uses sMAPE)
    fit_params : dict, optional (default=None)
        Parameters passed to the `fit` call of the forecaster.
    return_data : bool, optional
        Returns three additional columns in the DataFrame, by default False.
        The cells of the columns contain each a pd.Series for y_train,
        y_pred, y_test.

    Returns
    -------
    pd.DataFrame
        DataFrame with one row per train/test split, containing the columns
        "test_<scoring.name>", "fit_time", "pred_time", "len_train_window"
        and "cutoff", plus "y_train", "y_test" and "y_pred" if
        `return_data` is True.

    Examples
    --------
    >>> from sktime.datasets import load_airline
    >>> from sktime.forecasting.model_evaluation import evaluate
    >>> from sktime.forecasting.model_selection import ExpandingWindowSplitter
    >>> from sktime.forecasting.naive import NaiveForecaster
    >>> y = load_airline()
    >>> forecaster = NaiveForecaster(strategy="mean", sp=12)
    >>> cv = ExpandingWindowSplitter(initial_window=24, step_length=12,
    ...                              fh=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
    >>> results = evaluate(forecaster=forecaster, y=y, cv=cv)
    """
    _check_strategy(strategy)
    cv = check_cv(cv, enforce_start_with_window=True)
    scoring = check_scoring(scoring)
    y, X = check_y_X(y, X)
    fit_params = {} if fit_params is None else fit_params

    # Define score name.
    score_name = "test_" + scoring.name

    # One dict per split; the DataFrame is built once after the loop instead
    # of growing it row by row with DataFrame.append (quadratic copying, and
    # the method no longer exists in pandas >= 2.0).
    records = []

    # Run temporal cross-validation.
    for i, (train, test) in enumerate(cv.split(y)):
        # split data
        y_train, y_test, X_train, X_test = _split(y, X, train, test, cv.fh)

        # create forecasting horizon
        fh = ForecastingHorizon(y_test.index, is_relative=False)

        # fit/update: the first split is always a fit; later splits are
        # refitted or updated depending on the strategy
        start_fit = time.time()
        if i == 0 or strategy == "refit":
            forecaster.fit(y_train, X_train, fh=fh, **fit_params)

        else:  # if strategy == "update":
            forecaster.update(y_train, X_train)
        fit_time = time.time() - start_fit

        # predict
        start_pred = time.time()
        y_pred = forecaster.predict(fh, X=X_test)
        pred_time = time.time() - start_pred

        # score
        score = scoring(y_pred, y_test)

        # save results
        record = {
            score_name: score,
            "fit_time": fit_time,
            "pred_time": pred_time,
            "len_train_window": len(y_train),
            "cutoff": forecaster.cutoff,
        }
        if return_data:
            # only carry the (potentially large) series when requested
            record["y_train"] = y_train
            record["y_test"] = y_test
            record["y_pred"] = y_pred
        records.append(record)

    # post-processing of results
    results = pd.DataFrame(records)
    results["len_train_window"] = results["len_train_window"].astype(int)

    return results
Beispiel #7
0
    def fit(self, y, X=None, fh=None, **fit_params):
        """Fit to training data.

        Evaluates every candidate parameter set supplied by
        ``self._run_search`` with ``evaluate``, ranks the candidates by
        their mean test score and, if ``self.refit`` is set, refits the
        best forecaster.

        Parameters
        ----------
        y : pd.Series
            Target time series to which to fit the forecaster.
        fh : int, list or np.array, optional (default=None)
            The forecasting horizon with the steps ahead to predict.
            Only used when refitting the best forecaster.
        X : pd.DataFrame, optional (default=None)
            Exogenous variables; passed on to ``evaluate`` and to the refit
            of the best forecaster.
        **fit_params
            Passed to ``evaluate`` as ``fit_params``. They are not passed to
            the final refit call.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If a call to the candidate evaluation produced no fits.
        """
        y, X = check_y_X(y, X)
        cv = check_cv(self.cv)
        scoring = check_scoring(self.scoring)
        scoring_name = f"test_{scoring.name}"

        parallel = Parallel(n_jobs=self.n_jobs, pre_dispatch=self.pre_dispatch)

        def _fit_and_score(params):
            # Clone forecaster.
            forecaster = clone(self.forecaster)

            # Set parameters.
            forecaster.set_params(**params)

            # Evaluate.
            out = evaluate(
                forecaster,
                cv,
                y,
                X,
                strategy=self.strategy,
                scoring=scoring,
                fit_params=fit_params,
            )

            # Filter columns.
            out = out.filter(items=[scoring_name, "fit_time", "pred_time"],
                             axis=1)

            # Aggregate results.
            out = out.mean()
            out = out.add_prefix("mean_")

            # Add parameters to output table.
            out["params"] = params

            return out

        def evaluate_candidates(candidate_params):
            candidate_params = list(candidate_params)

            if self.verbose > 0:
                n_candidates = len(candidate_params)
                n_splits = cv.get_n_splits(y)
                print(  # noqa
                    "Fitting {0} folds for each of {1} candidates,"
                    " totalling {2} fits".format(n_splits, n_candidates,
                                                 n_candidates * n_splits))

            out = parallel(
                delayed(_fit_and_score)(params) for params in candidate_params)

            if len(out) < 1:
                raise ValueError("No fits were performed. "
                                 "Was the CV iterator empty? "
                                 "Were there no candidates?")

            return out

        # Run grid-search cross-validation.
        results = self._run_search(evaluate_candidates)

        results = pd.DataFrame(results)

        # Rank results, according to whether greater is better for the given
        # scoring. Logical `not` is required here: bitwise `~` on a Python
        # bool yields -2 / -1, which are both truthy, so the ranking would
        # always be ascending regardless of `greater_is_better`.
        mean_column = f"mean_{scoring_name}"
        rank_column = f"rank_{scoring_name}"
        results[rank_column] = results.loc[:, mean_column].rank(
            ascending=not scoring.greater_is_better)
        self.cv_results_ = results

        # Select best parameters: the candidate with the smallest rank value.
        self.best_index_ = results.loc[:, rank_column].argmin()
        self.best_score_ = results.loc[self.best_index_, mean_column]
        self.best_params_ = results.loc[self.best_index_, "params"]
        self.best_forecaster_ = clone(
            self.forecaster).set_params(**self.best_params_)

        # Refit model with best parameters.
        if self.refit:
            self.best_forecaster_.fit(y, X, fh)

        self._is_fitted = True
        return self