Example #1
0
    def _split(self, y: Optional[ACCEPTED_Y_TYPES]) -> SPLIT_GENERATOR_TYPE:
        """Generate the train/test windows for `y`.

        If `self.initial_window` is set, a first split built from that window
        is yielded before the splits produced by `self._split_windows`.

        Parameters
        ----------
        y : ACCEPTED_Y_TYPES
            Object to split; `y.shape[0]` is used as the number of time points
            and, for offset-based windows, `y.get_loc` is used to look up
            positions.

        Yields
        ------
        train, test
            For the initial split, `train` is an `np.arange` of positions and
            `test` is the split point shifted by the forecasting horizon.

        Raises
        ------
        ValueError
            If `initial_window` is given while `start_with_window` is falsy,
            or if `initial_window` is not greater than `window_length`.
        """
        n_obs = y.shape[0]
        checked_step = check_step_length(self.step_length)
        window_length = check_window_length(
            self.window_length, n_obs, "window_length"
        )
        initial_window = check_window_length(
            self.initial_window, n_obs, "initial_window"
        )
        fh = _check_fh(self.fh)
        _check_window_lengths(y, fh, window_length, initial_window)

        if self.initial_window is not None:
            if not self.start_with_window:
                raise ValueError(
                    "`start_with_window` must be True if `initial_window` is given"
                )
            if self.initial_window <= self.window_length:
                raise ValueError(
                    "`initial_window` must greater than `window_length`")

            # Express the initial window as a position so that it can be
            # compared against the (integer) in-sample horizon below.
            threshold = (
                y.get_loc(y[0] + self.initial_window)
                if is_timedelta_or_date_offset(x=self.initial_window)
                else self.initial_window
            )

            # With an in-sample horizon the first window is shifted to the
            # right so that the in-sample test set stays inside the data.
            shift_needed = (
                not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold
            )
            first_start = (
                (abs(fh[0]) - self.initial_window + 1) if shift_needed else 0
            )

            if is_timedelta_or_date_offset(x=initial_window):
                first_end = y.get_loc(y[first_start] + initial_window)
            else:
                first_end = first_start + initial_window

            first_train = np.arange(first_start, first_end)
            first_test = first_end + fh.to_numpy() - 1
            yield first_train, first_test

        # Remaining splits: regular windows between the first and last
        # admissible split points.
        start = self._get_start(y=y, fh=fh)
        end = _get_end(y=y, fh=fh)
        step_length = self._get_step_length(x=checked_step)
        windows = self._split_windows(
            start, end, step_length, window_length, y, fh.to_numpy()
        )
        for train, test in windows:
            yield train, test
Example #2
0
    def get_cutoffs(self, y: Optional[ACCEPTED_Y_TYPES] = None) -> np.ndarray:
        """Compute the cutoff points of the splits.

        Parameters
        ----------
        y : pd.Series or pd.Index, optional (default=None)
            Time series to split. Despite the default, a value is required.

        Returns
        -------
        cutoffs : np.ndarray
            `np.arange(start, end, step_length) - 1`, where `start` is derived
            from `initial_window` when that attribute is set and from
            `self._get_start` otherwise.

        Raises
        ------
        ValueError
            If `y` is None.
        """
        if y is None:
            raise ValueError(
                f"{self.__class__.__name__} requires `y` to compute the cutoffs."
            )
        y = _check_y(y)
        fh = _check_fh(self.fh)
        checked_step = check_step_length(self.step_length)

        # Not every splitter defines `initial_window`; a missing attribute is
        # treated like an unset one.
        initial_window = getattr(self, "initial_window", None)
        if initial_window is None:
            first = self._get_start(y=y, fh=fh)
        elif is_timedelta_or_date_offset(x=initial_window):
            first = y.get_loc(y[0] + initial_window)
        else:
            first = initial_window

        last = _get_end(y, fh)
        stride = self._get_step_length(x=checked_step)

        return np.arange(first, last, stride) - 1
Example #3
0
def _cutoffs_fh_window_length_types_are_supported(
    cutoffs: VALID_CUTOFF_TYPES,
    fh: FORECASTING_HORIZON_TYPES,
    window_length: ACCEPTED_WINDOW_LENGTH_TYPES,
) -> bool:
    """Tell whether the types of the three inputs can be combined.

    Two combinations are accepted: everything integer based, or datetime
    cutoffs together with timedelta/date-offset `fh` and `window_length`.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        cutoff points, positive and integer- or datetime-index like
    fh : int, timedelta, list or np.array of ints or timedeltas
    window_length : int or timedelta or pd.DateOffset

    Returns
    -------
    bool
        True if the inputs match one of the two accepted combinations,
        False otherwise.
    """
    integer_based = (
        array_is_int(cutoffs) and array_is_int(fh) and is_int(window_length)
    )
    date_based = (
        array_is_datetime64(cutoffs)
        and array_is_timedelta_or_date_offset(fh)
        and is_timedelta_or_date_offset(window_length)
    )
    return bool(integer_based or date_based)
Example #4
0
def _check_window_lengths(
    y: ACCEPTED_Y_TYPES,
    fh: ForecastingHorizon,
    window_length: NON_FLOAT_WINDOW_LENGTH_TYPES,
    initial_window: NON_FLOAT_WINDOW_LENGTH_TYPES,
) -> None:
    """Validate the window lengths against `y` and the forecasting horizon.

    Parameters
    ----------
    y : ACCEPTED_Y_TYPES
        Object to split; `y.shape[0]` is its number of time points.
    fh : ForecastingHorizon
        Forecasting horizon; its last element is used as the maximum.
    window_length : int or timedelta or pd.DateOffset
    initial_window : int or timedelta or pd.DateOffset or None
        Skipped entirely when None.

    Raises
    ------
    ValueError
        If a window plus the last horizon value exceeds the number of time
        points, or if one of the two windows is offset based and the other
        is not.
    """
    n_timepoints = y.shape[0]
    fh_max = fh[-1]
    window_is_offset = is_timedelta_or_date_offset(x=window_length)

    # Offset-based windows are converted to a position in `y`, clipped at the
    # last element, before being compared with the series length.
    if window_is_offset:
        window_span = y.get_loc(min(y[-1], y[0] + window_length))
    else:
        window_span = window_length
    if window_span + fh_max > n_timepoints:
        raise ValueError(
            f"The `window_length` and the forecasting horizon are incompatible "
            f"with the length of `y`. Found `window_length`={window_length}, "
            f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
            f"It is required that the window length plus maximum forecast horizon "
            f"is smaller than the length of the time series `y` itself."
        )

    if initial_window is None:
        return

    initial_is_offset = is_timedelta_or_date_offset(x=initial_window)
    if initial_is_offset:
        initial_span = y.get_loc(min(y[-1], y[0] + initial_window))
    else:
        initial_span = initial_window
    if initial_span + fh_max > n_timepoints:
        raise ValueError(
            f"The `initial_window` and the forecasting horizon are incompatible "
            f"with the length of `y`. Found `initial_window`={initial_window}, "
            f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
            f"It is required that the initial window plus maximum forecast horizon "
            f"is smaller than the length of the time series `y` itself."
        )

    # Both windows must be of the same kind (offset based or integer based).
    if initial_is_offset != window_is_offset:
        raise ValueError(
            "The `initial_window` and `window_length` types are incompatible. "
            "They should be either all timedelta or all int."
        )
Example #5
0
 def _get_train_start(start, window_length: ACCEPTED_WINDOW_LENGTH_TYPES,
                      y: ACCEPTED_Y_TYPES) -> int:
     """Return the position at which the training window begins.

     For a non-offset `window_length` this is `start - window_length`. For a
     timedelta/date-offset window, the window is subtracted from the value of
     `y` at `start` (clipped to the last element), bounded below by `min(y)`,
     and looked up with `y.get_loc`; the result is incremented by one when
     `start` lies past the end of `y`.
     """
     if not is_timedelta_or_date_offset(x=window_length):
         return start - window_length

     n_obs = len(y)
     # `start` may point one past the last element, so clip it for indexing.
     anchor = y[min(start, n_obs - 1)]
     earliest = max(anchor - window_length, min(y))
     position = y.get_loc(earliest)
     if start >= n_obs:
         position += 1
     return position
Example #6
0
    def _split_for_initial_window(self, y: pd.Index) -> SPLIT_ARRAY_TYPE:
        """Build the train/test split that belongs to the initial window.

        Parameters
        ----------
        y : pd.Index
            Index of the time series to split

        Returns
        -------
        (train, test)
            `train` as returned by `self._get_train_window`, and `test` as the
            end of the initial window shifted by the forecasting horizon.

        Raises
        ------
        ValueError
            If `start_with_window` is falsy, or if `initial_window` is not
            greater than `window_length`.
        """
        fh = _check_fh(self.fh)
        if not self.start_with_window:
            raise ValueError(
                "`start_with_window` must be True if `initial_window` is given"
            )
        if self.initial_window <= self.window_length:
            raise ValueError("`initial_window` must greater than `window_length`")

        offset_based = is_timedelta_or_date_offset(x=self.initial_window)

        # Express the initial window as a position so that it can be compared
        # with the in-sample horizon.
        threshold = (
            y.get_loc(y[0] + self.initial_window)
            if offset_based
            else self.initial_window
        )

        # With an in-sample horizon the first window is shifted so that the
        # in-sample test set stays inside the data.
        if not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold:
            first_start = abs(fh[0]) - self.initial_window + 1
        else:
            first_start = 0

        first_end = (
            y.get_loc(y[first_start] + self.initial_window)
            if offset_based
            else first_start + self.initial_window
        )

        train = self._get_train_window(
            y=y, train_start=first_start, split_point=first_end
        )
        test = first_end + fh.to_numpy() - 1
        return train, test
Example #7
0
    def _get_start(self, y: ACCEPTED_Y_TYPES, fh: ForecastingHorizon) -> int:
        """Get the first split point."""
        # By default, the first split point is the index zero, the first
        # observation in
        # the data.
        start = 0

        # If we start with a full window, the first split point depends on the window
        # length.
        if hasattr(self, "start_with_window") and self.start_with_window:

            if hasattr(self,
                       "initial_window") and self.initial_window is not None:

                if hasattr(self, "step_length"):
                    step_length = self._get_step_length(x=self.step_length)
                else:
                    step_length = 1

                if is_timedelta_or_date_offset(x=self.initial_window):
                    start = y.get_loc(y[start] +
                                      self.initial_window) + step_length
                else:
                    start += self.initial_window + step_length
            else:
                if is_timedelta_or_date_offset(x=self.window_length):
                    start = y.get_loc(y[start] + self.window_length)
                else:
                    start += self.window_length

        # For in-sample forecasting horizons, the first split must ensure that
        # in-sample test set is still within the data.
        if not fh.is_all_out_of_sample():
            fh_min = abs(fh[0])
            if fh_min >= start:
                start = fh_min + 1

        return start
Example #8
0
    def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
        """Yield a single train/test split anchored at the end of `y`.

        The split point is `_get_end(y, fh) - 1`. The training window covers
        everything before it when `window_length` is None, and otherwise the
        last `window_length` positions (or the equivalent offset) before it.
        """
        window_length = check_window_length(self.window_length, y.shape[0])
        fh = _check_fh(self.fh)

        split_point = _get_end(y, fh) - 1
        if window_length is None:
            first = 0
        elif is_timedelta_or_date_offset(x=window_length):
            # Look up where the offset window begins and start right after it.
            first = y.get_loc(y[split_point - 1] - window_length) + 1
        else:
            first = split_point - window_length

        train = np.arange(first, split_point)
        test = split_point + fh.to_numpy() - 1
        yield train, test
Example #9
0
    def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
        """Yield one train/test split per cutoff in `self.cutoffs`.

        Raises
        ------
        ValueError
            If the largest cutoff is not below `y.shape[0]`, or if the largest
            cutoff plus the largest horizon value exceeds `y.shape[0]`.
        """
        cutoffs = check_cutoffs(self.cutoffs)
        n_timepoints = y.shape[0]
        latest_cutoff = np.max(cutoffs)
        if latest_cutoff >= n_timepoints:
            raise ValueError("`cutoffs` are incompatible with given `y`.")

        fh = _check_fh(self.fh)
        if latest_cutoff + np.max(fh) > n_timepoints:
            raise ValueError("`fh` is incompatible with given `cutoffs` and `y`.")

        window_length = check_window_length(self.window_length, n_timepoints)
        offset_based = is_timedelta_or_date_offset(x=window_length)

        for cutoff in cutoffs:
            if offset_based:
                # Never reach back further than the first element of `y`.
                first = y.get_loc(max(y[0], y[cutoff] - window_length))
            else:
                first = cutoff - window_length
            # The `+ 1` shifts the range so that it ends at the cutoff itself.
            yield np.arange(first, cutoff) + 1, cutoff + fh
Example #10
0
    def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
        """Yield one train/test split per cutoff in `self.cutoffs`.

        Integer cutoffs are combined with an integer `window_length`, datetime
        cutoffs with a timedelta/date-offset `window_length`.

        Raises
        ------
        TypeError
            If `window_length` and a cutoff are of a combination of types that
            is not handled here.
        """
        n_timepoints = y.shape[0]
        cutoffs = check_cutoffs(cutoffs=self.cutoffs)
        fh = _check_fh(fh=self.fh)
        window_length = check_window_length(
            window_length=self.window_length, n_timepoints=n_timepoints
        )
        _check_cutoffs_fh_window_length(
            cutoffs=cutoffs, fh=fh, window_length=window_length
        )
        _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
        _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)
        max_fh = fh.max()
        max_cutoff = np.max(cutoffs)

        for cutoff in cutoffs:
            if is_int(x=window_length) and is_int(x=cutoff):
                first = cutoff - window_length
            elif is_timedelta_or_date_offset(x=window_length) and is_datetime(
                x=cutoff
            ):
                # Never reach back further than the first element of `y`.
                first = y.get_loc(max(y[0], cutoff - window_length))
            else:
                raise TypeError(f"Unsupported combination of types: "
                                f"`window_length`: {type(window_length)}, "
                                f"`cutoff`: {type(cutoff)}")

            # Non-integer cutoffs are looked up in `y` to get a position.
            last = cutoff if is_int(x=cutoff) else y.get_loc(cutoff)
            training_window = np.arange(first, last) + 1

            test_window = cutoff + fh.to_numpy()
            if is_datetime(x=max_cutoff) and is_timedelta(x=max_fh):
                # Drop timestamps before the start of `y`, then translate the
                # remaining ones into positions.
                kept = test_window[test_window >= y.min()]
                test_window = np.array([y.get_loc(stamp) for stamp in kept])
            yield training_window, test_window
Example #11
0
def _windows_are_incompatible(initial_window, window_length) -> bool:
    """Return True when exactly one of the two windows is offset based.

    A window counts as offset based when `is_timedelta_or_date_offset`
    accepts it.
    """
    initial_is_offset = is_timedelta_or_date_offset(x=initial_window)
    length_is_offset = is_timedelta_or_date_offset(x=window_length)
    # NOTE(review): assumes the helper returns a plain bool - confirm.
    return initial_is_offset != length_is_offset
Example #12
0
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values.

    Validation checks validity and also converts forecasting horizon values
    to supported pandas.Index types if possible.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict. A single timedelta
        or date offset, or an array of them, is accepted as well.

    Returns
    -------
    values : pd.Index
        Validated forecasting horizon values. Index and array inputs are
        returned sorted; a single int or single offset is wrapped in a
        one-element index and returned directly.

    Raises
    ------
    TypeError
        Raised if `values` type is not supported
    ValueError
        Raised if `values` contains duplicates
    """
    # If values are one of the supported pandas index types, nothing has to be
    # converted because the forecasting horizon directly wraps the index.
    # isinstance() does not work here because index types inherit from each
    # other, hence the check for type equality.
    if type(values) in VALID_INDEX_TYPES:
        pass

    # convert single integer to pandas index, no further checks needed
    # (`pd.Int64Index` was deprecated in pandas 1.4 and removed in 2.0;
    # `pd.Index` with an explicit int64 dtype works on old and new versions)
    elif is_int(values):
        return pd.Index([values], dtype="int64")

    # a single timedelta / date offset needs no further checks either
    elif is_timedelta_or_date_offset(values):
        return pd.Index([values])

    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        values = pd.Index(values, dtype="int64")

    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        values = pd.Index(values)

    # otherwise, raise type error
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not supported. "
            f"Please use one of {valid_types}, but found: {type(values)}")

    # check values does not contain duplicates
    if len(values) != values.nunique():
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates.")

    # return sorted values
    return values.sort_values()
Example #13
0
def _check_window_lengths(
    y: ACCEPTED_Y_TYPES,
    fh: ForecastingHorizon,
    window_length: NON_FLOAT_WINDOW_LENGTH_TYPES,
    initial_window: NON_FLOAT_WINDOW_LENGTH_TYPES,
) -> None:
    """Verify that the window lengths fit `y` and agree in type.

    Parameters
    ----------
    y : pd.Series, pd.DataFrame, np.ndarray, or pd.Index
        coerced and checked version of input y
    fh : ForecastingHorizon
        Forecasting horizon; its last element is taken as the maximum.
    window_length : int or timedelta or pd.DateOffset
    initial_window : int or timedelta or pd.DateOffset
        Window length of first window; not checked when None.

    Raises
    ------
    ValueError
        if window length plus max horizon is above the last observation in `y`,
        or if initial window plus max horizon is above the last observation in `y`
    TypeError
        if exactly one of `initial_window` and `window_length` is a timedelta
        or date offset
    """
    n_timepoints = y.shape[0]
    fh_max = fh[-1]

    def _overruns_series(window) -> bool:
        # Offset windows are turned into a position in `y`, clipped at the
        # last element; integer windows are used as they are.
        if is_timedelta_or_date_offset(x=window):
            reach = y.get_loc(min(y[-1], y[0] + window))
        else:
            reach = window
        return reach + fh_max > n_timepoints

    if _overruns_series(window_length):
        raise ValueError(
            f"The `window_length` and the forecasting horizon are incompatible "
            f"with the length of `y`. Found `window_length`={window_length}, "
            f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
            f"It is required that the window length plus maximum forecast horizon "
            f"is smaller than the length of the time series `y` itself.")

    if initial_window is None:
        return

    if _overruns_series(initial_window):
        raise ValueError(
            f"The `initial_window` and the forecasting horizon are incompatible "
            f"with the length of `y`. Found `initial_window`={initial_window}, "
            f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
            f"It is required that the initial window plus maximum forecast horizon "
            f"is smaller than the length of the time series `y` itself.")

    # Both windows have to be of the same kind.
    initial_is_offset = is_timedelta_or_date_offset(x=initial_window)
    length_is_offset = is_timedelta_or_date_offset(x=window_length)
    if initial_is_offset != length_is_offset:
        raise TypeError(
            "The `initial_window` and `window_length` types are incompatible. "
            "They should be either all timedelta or all int.")