def _split(self, y: Optional[ACCEPTED_Y_TYPES]) -> SPLIT_GENERATOR_TYPE:
    """Generate train/test windows over `y`.

    When `initial_window` is set, one extra split built from the initial
    window is yielded first; afterwards the windows produced by
    `_split_windows` are yielded unchanged.

    Parameters
    ----------
    y : index-like
        Object to split. It is accessed through `shape`, positional
        indexing and `get_loc`.

    Yields
    ------
    train : np.ndarray
        Integer positions of the training window.
    test : np.ndarray
        Integer positions of the test window.

    Raises
    ------
    ValueError
        If `initial_window` is given while `start_with_window` is falsy, or
        if `initial_window` is not greater than `window_length`.
    """
    n_obs = y.shape[0]
    checked_step = check_step_length(self.step_length)
    window_length = check_window_length(
        self.window_length, n_obs, "window_length"
    )
    initial_window = check_window_length(
        self.initial_window, n_obs, "initial_window"
    )
    fh = _check_fh(self.fh)
    _check_window_lengths(y, fh, window_length, initial_window)

    if self.initial_window is not None:
        if not self.start_with_window:
            raise ValueError(
                "`start_with_window` must be True if `initial_window` is given"
            )
        if self.initial_window <= self.window_length:
            raise ValueError("`initial_window` must greater than `window_length`")

        # Express the initial window as a number of positions so it can be
        # compared against the horizon below.
        threshold = (
            y.get_loc(y[0] + self.initial_window)
            if is_timedelta_or_date_offset(x=self.initial_window)
            else self.initial_window
        )

        # With an in-sample horizon the first window is shifted forward so
        # that the in-sample test positions still fall inside the data.
        first_train_start = 0
        if not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold:
            first_train_start = abs(fh[0]) - self.initial_window + 1

        if is_timedelta_or_date_offset(x=initial_window):
            first_train_end = y.get_loc(y[first_train_start] + initial_window)
        else:
            first_train_end = first_train_start + initial_window

        first_train = np.arange(first_train_start, first_train_end)
        first_test = first_train_end + fh.to_numpy() - 1
        yield first_train, first_test

    first_split = self._get_start(y=y, fh=fh)
    last_split = _get_end(y=y, fh=fh)
    stride = self._get_step_length(x=checked_step)
    windows = self._split_windows(
        first_split, last_split, stride, window_length, y, fh.to_numpy()
    )
    for train_idx, test_idx in windows:
        yield train_idx, test_idx
def get_cutoffs(self, y: Optional[ACCEPTED_Y_TYPES] = None) -> np.ndarray:
    """Compute the cutoff points of the splits over `y`.

    Parameters
    ----------
    y : pd.Series or pd.Index, optional (default=None)
        Time series to split. Despite the default, a value is required.

    Returns
    -------
    cutoffs : np.ndarray
        ``np.arange(start, end, step_length) - 1``, where `start` is derived
        from `initial_window` when that attribute exists and is not None,
        and from `_get_start` otherwise.

    Raises
    ------
    ValueError
        If `y` is None.
    """
    if y is None:
        raise ValueError(
            f"{self.__class__.__name__} requires `y` to compute the cutoffs."
        )

    y = _check_y(y)
    fh = _check_fh(self.fh)
    checked_step = check_step_length(self.step_length)

    # Not every splitter defines `initial_window`; a missing attribute is
    # handled like an unset one.
    initial_window = getattr(self, "initial_window", None)
    if initial_window is None:
        first = self._get_start(y=y, fh=fh)
    elif is_timedelta_or_date_offset(x=initial_window):
        first = y.get_loc(y[0] + initial_window)
    else:
        first = initial_window

    last = _get_end(y, fh)
    stride = self._get_step_length(x=checked_step)
    return np.arange(first, last, stride) - 1
def _cutoffs_fh_window_length_types_are_supported(
    cutoffs: VALID_CUTOFF_TYPES,
    fh: FORECASTING_HORIZON_TYPES,
    window_length: ACCEPTED_WINDOW_LENGTH_TYPES,
) -> bool:
    """Tell whether the types of the three inputs can be combined.

    Two combinations are accepted: all inputs integer-based, or datetime
    cutoffs together with a timedelta/date-offset `fh` and `window_length`.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        cutoff points, positive and integer- or datetime-index like
    fh : int, timedelta, list or np.array of ints or timedeltas
    window_length : int or timedelta or pd.DateOffset

    Returns
    -------
    bool
        True if the inputs form one of the two accepted combinations,
        False otherwise.
    """
    ints_only = (
        array_is_int(cutoffs) and array_is_int(fh) and is_int(window_length)
    )
    dates_only = (
        array_is_datetime64(cutoffs)
        and array_is_timedelta_or_date_offset(fh)
        and is_timedelta_or_date_offset(window_length)
    )
    return bool(ints_only or dates_only)
def _check_window_lengths(
    y: ACCEPTED_Y_TYPES,
    fh: ForecastingHorizon,
    window_length: NON_FLOAT_WINDOW_LENGTH_TYPES,
    initial_window: NON_FLOAT_WINDOW_LENGTH_TYPES,
) -> None:
    """Validate `window_length` and `initial_window` against `y` and `fh`.

    Parameters
    ----------
    y : index-like
        Object to split; accessed through `shape`, positional indexing and
        `get_loc`.
    fh : ForecastingHorizon
        Its last element is used as the maximum horizon.
    window_length : int or timedelta or pd.DateOffset
    initial_window : int or timedelta or pd.DateOffset, or None
        Skipped entirely when None.

    Raises
    ------
    ValueError
        If a window plus the last `fh` value exceeds the number of
        observations, or if exactly one of the two windows is a
        timedelta/date offset.
    """
    n_timepoints = y.shape[0]
    fh_max = fh[-1]

    def _extent(window):
        # Offset-like windows are converted to the position they reach when
        # anchored at the first observation, capped at the last observation.
        if is_timedelta_or_date_offset(x=window):
            return y.get_loc(min(y[-1], y[0] + window))
        return window

    window_length_msg = (
        f"The `window_length` and the forecasting horizon are incompatible "
        f"with the length of `y`. Found `window_length`={window_length}, "
        f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
        f"It is required that the window length plus maximum forecast horizon "
        f"is smaller than the length of the time series `y` itself."
    )
    if _extent(window_length) + fh_max > n_timepoints:
        raise ValueError(window_length_msg)

    initial_window_msg = (
        f"The `initial_window` and the forecasting horizon are incompatible "
        f"with the length of `y`. Found `initial_window`={initial_window}, "
        f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
        f"It is required that the initial window plus maximum forecast horizon "
        f"is smaller than the length of the time series `y` itself."
    )
    mixed_types_msg = (
        "The `initial_window` and `window_length` types are incompatible. "
        "They should be either all timedelta or all int."
    )

    if initial_window is None:
        return

    # The size check comes first, so it takes precedence over a type mismatch.
    if _extent(initial_window) + fh_max > n_timepoints:
        raise ValueError(initial_window_msg)

    initial_is_offset = bool(is_timedelta_or_date_offset(x=initial_window))
    window_is_offset = bool(is_timedelta_or_date_offset(x=window_length))
    if initial_is_offset != window_is_offset:
        raise ValueError(mixed_types_msg)
def _get_train_start(
    start, window_length: ACCEPTED_WINDOW_LENGTH_TYPES, y: ACCEPTED_Y_TYPES
) -> int:
    """Return the position at which the training window begins.

    Parameters
    ----------
    start : int
        Split point used as an integer position into `y`.
    window_length : int or timedelta or pd.DateOffset
        Length of the training window.
    y : index-like
        Object being split; accessed through `len`, positional indexing,
        `min` and `get_loc`.

    Returns
    -------
    int
        ``start - window_length`` for a non-offset window. For an offset
        window, the location of the value `window_length` before
        ``y[start]``, clipped to the smallest value of `y`.
    """
    if not is_timedelta_or_date_offset(x=window_length):
        return start - window_length

    n_obs = len(y)
    # `start` may lie one past the end of `y`; anchor on the last element
    # then and compensate with the +1 below.
    anchor = y[min(start, n_obs - 1)]
    earliest = max(anchor - window_length, min(y))
    position = y.get_loc(earliest)
    if start >= n_obs:
        position += 1
    return position
def _split_for_initial_window(self, y: pd.Index) -> SPLIT_ARRAY_TYPE:
    """Build the single train/test split that uses the initial window.

    Parameters
    ----------
    y : pd.Index
        Index of the time series to split

    Returns
    -------
    (np.ndarray, np.ndarray)
        Train window as returned by `_get_train_window`, and the test
        positions ``initial_end + fh.to_numpy() - 1``.

    Raises
    ------
    ValueError
        If `start_with_window` is falsy, or if `initial_window` is not
        greater than `window_length`.
    """
    fh = _check_fh(self.fh)

    if not self.start_with_window:
        raise ValueError(
            "`start_with_window` must be True if `initial_window` is given"
        )
    if self.initial_window <= self.window_length:
        raise ValueError("`initial_window` must greater than `window_length`")

    initial_window = self.initial_window
    is_offset = is_timedelta_or_date_offset(x=initial_window)

    # Express the initial window as a number of positions so it can be
    # compared against the horizon below.
    threshold = y.get_loc(y[0] + initial_window) if is_offset else initial_window

    # With an in-sample horizon the window is shifted forward so that the
    # in-sample test positions still fall inside the data.
    train_start = 0
    if not fh.is_all_out_of_sample() and abs(fh[0]) >= threshold:
        train_start = abs(fh[0]) - initial_window + 1

    if is_offset:
        split_point = y.get_loc(y[train_start] + initial_window)
    else:
        split_point = train_start + initial_window

    train = self._get_train_window(
        y=y, train_start=train_start, split_point=split_point
    )
    test = split_point + fh.to_numpy() - 1
    return train, test
def _get_start(self, y: ACCEPTED_Y_TYPES, fh: ForecastingHorizon) -> int:
    """Return the first split point.

    The split point is 0 unless the splitter starts with a full window, in
    which case it is derived from `initial_window` (plus one step) or from
    `window_length`. For horizons that are not entirely out-of-sample it is
    raised to at least ``abs(fh[0]) + 1``.
    """
    # Splitting begins at the first observation unless a leading window is
    # requested.
    start = 0

    # Attributes are looked up defensively: not every splitter defines them.
    if getattr(self, "start_with_window", False):
        initial_window = getattr(self, "initial_window", None)
        if initial_window is None:
            window_length = self.window_length
            if is_timedelta_or_date_offset(x=window_length):
                start = y.get_loc(y[start] + window_length)
            else:
                start += window_length
        else:
            if hasattr(self, "step_length"):
                stride = self._get_step_length(x=self.step_length)
            else:
                stride = 1

            if is_timedelta_or_date_offset(x=initial_window):
                start = y.get_loc(y[start] + initial_window) + stride
            else:
                start += initial_window + stride

    # With an in-sample horizon the first split must be late enough for the
    # in-sample test positions to fall inside the data.
    if not fh.is_all_out_of_sample() and abs(fh[0]) >= start:
        start = abs(fh[0]) + 1

    return start
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield one train/test split taken at the end of `y`.

    Parameters
    ----------
    y : index-like
        Object to split; accessed through `shape`, positional indexing and
        `get_loc`.

    Yields
    ------
    train : np.ndarray
        Positions from the window start up to, but excluding, the split
        point. The window starts at 0 when `window_length` is None.
    test : np.ndarray
        ``split_point + fh.to_numpy() - 1``.
    """
    window_length = check_window_length(self.window_length, y.shape[0])
    fh = _check_fh(self.fh)
    split_point = _get_end(y, fh) - 1

    if window_length is None:
        # No window length given: train on everything before the split.
        train_start = 0
    elif is_timedelta_or_date_offset(x=window_length):
        last_train_value = y[split_point - 1]
        train_start = y.get_loc(last_train_value - window_length) + 1
    else:
        train_start = split_point - window_length

    train = np.arange(train_start, split_point)
    test = split_point + fh.to_numpy() - 1
    yield train, test
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield one train/test split per cutoff.

    Parameters
    ----------
    y : index-like
        Object to split; accessed through `shape`, positional indexing and
        `get_loc`.

    Yields
    ------
    train : np.ndarray
        Positions of the training window, ending at the cutoff (inclusive).
    test : cutoff + fh
        The cutoff shifted by the forecasting horizon.

    Raises
    ------
    ValueError
        If the largest cutoff is not smaller than the number of
        observations, or if the largest cutoff plus the largest horizon
        exceeds the number of observations.
    """
    cutoffs = check_cutoffs(self.cutoffs)
    n_timepoints = y.shape[0]
    last_cutoff = np.max(cutoffs)
    if last_cutoff >= n_timepoints:
        raise ValueError("`cutoffs` are incompatible with given `y`.")

    fh = _check_fh(self.fh)
    if last_cutoff + np.max(fh) > n_timepoints:
        raise ValueError("`fh` is incompatible with given `cutoffs` and `y`.")

    window_length = check_window_length(self.window_length, n_timepoints)
    # The window type does not change between cutoffs, so test it once.
    window_is_offset = is_timedelta_or_date_offset(x=window_length)

    for cutoff in cutoffs:
        if window_is_offset:
            # Clip at the first value so the window never starts before `y`.
            train_start = y.get_loc(max(y[0], y[cutoff] - window_length))
        else:
            train_start = cutoff - window_length
        # Shift by one so the cutoff itself is the last training position.
        yield np.arange(train_start, cutoff) + 1, cutoff + fh
def _split(self, y: ACCEPTED_Y_TYPES) -> SPLIT_GENERATOR_TYPE:
    """Yield one train/test split per cutoff, for integer or datetime cutoffs.

    Parameters
    ----------
    y : index-like
        Object to split; accessed through `shape`, positional indexing,
        `min` and `get_loc`.

    Yields
    ------
    train : np.ndarray
        Positions of the training window, ending at the cutoff (inclusive).
    test : np.ndarray
        ``cutoff + fh.to_numpy()``. When the largest cutoff is a datetime
        and the largest horizon a timedelta, values earlier than ``y.min()``
        are dropped and the rest are converted to positions with
        `y.get_loc`.

    Raises
    ------
    TypeError
        If `window_length` and a cutoff are neither both integers nor an
        offset/datetime pair.
    """
    n_timepoints = y.shape[0]
    cutoffs = check_cutoffs(cutoffs=self.cutoffs)
    fh = _check_fh(fh=self.fh)
    window_length = check_window_length(
        window_length=self.window_length, n_timepoints=n_timepoints
    )
    _check_cutoffs_fh_window_length(
        cutoffs=cutoffs, fh=fh, window_length=window_length
    )
    _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
    _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)

    largest_fh = fh.max()
    largest_cutoff = np.max(cutoffs)
    # Fixed for all cutoffs: whether test windows hold timestamps that must
    # be mapped back to positions.
    test_holds_timestamps = is_datetime(x=largest_cutoff) and is_timedelta(
        x=largest_fh
    )

    for cutoff in cutoffs:
        cutoff_is_int = is_int(x=cutoff)

        if is_int(x=window_length) and cutoff_is_int:
            train_start = cutoff - window_length
        elif is_timedelta_or_date_offset(x=window_length) and is_datetime(
            x=cutoff
        ):
            # Clip at the first value so the window never starts before `y`.
            train_start = y.get_loc(max(y[0], cutoff - window_length))
        else:
            raise TypeError(
                f"Unsupported combination of types: "
                f"`window_length`: {type(window_length)}, "
                f"`cutoff`: {type(cutoff)}"
            )

        train_stop = cutoff if cutoff_is_int else y.get_loc(cutoff)
        # Shift by one so the cutoff itself is the last training position.
        training_window = np.arange(train_start, train_stop) + 1

        test_window = cutoff + fh.to_numpy()
        if test_holds_timestamps:
            within_data = test_window[test_window >= y.min()]
            test_window = np.array([y.get_loc(stamp) for stamp in within_data])

        yield training_window, test_window
def _windows_are_incompatible(initial_window, window_length) -> bool:
    """Return True when exactly one window is a timedelta or date offset.

    Parameters
    ----------
    initial_window, window_length
        The two window specifications to compare.

    Returns
    -------
    bool
        True if `is_timedelta_or_date_offset` holds for one argument but
        not for the other, False otherwise.
    """
    initial_is_offset = bool(is_timedelta_or_date_offset(x=initial_window))
    window_is_offset = bool(is_timedelta_or_date_offset(x=window_length))
    return initial_is_offset != window_is_offset
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values.

    Validation checks validity and also converts forecasting horizon values
    to supported pandas.Index types if possible.

    Integer input is converted with ``pd.Index(..., dtype="int64")`` rather
    than ``pd.Int64Index``: the latter is deprecated since pandas 1.4 and
    removed in pandas 2.0, while the former yields the same index on older
    pandas versions.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict.

    Raises
    ------
    TypeError : Raised if `values` type is not supported
    ValueError : Raised if `values` contains duplicates

    Returns
    -------
    values : pd.Index
        Validated forecasting horizon values. Index, list and array input
        is returned sorted; a scalar is wrapped in a one-element index.
    """
    # if values are one of the supported pandas index types, we don't have
    # to do anything as the forecasting horizon directly wraps the index.
    # Note that isinstance() does not work here, because index types inherit
    # from each other, hence we check for type equality.
    if type(values) in VALID_INDEX_TYPES:
        pass

    # convert single integer to pandas index, no further checks needed
    elif is_int(values):
        return pd.Index([values], dtype="int64")

    # a single timedelta / date offset likewise needs no further checks
    elif is_timedelta_or_date_offset(values):
        return pd.Index([values])

    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        values = pd.Index(values, dtype="int64")

    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        values = pd.Index(values)

    # otherwise, raise type error
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not supported. "
            f"Please use one of {valid_types}, but found: {type(values)}"
        )

    # check values does not contain duplicates
    if len(values) != values.nunique():
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates."
        )

    # return sorted values
    return values.sort_values()
def _check_window_lengths(
    y: ACCEPTED_Y_TYPES,
    fh: ForecastingHorizon,
    window_length: NON_FLOAT_WINDOW_LENGTH_TYPES,
    initial_window: NON_FLOAT_WINDOW_LENGTH_TYPES,
) -> None:
    """Verify that the window lengths fit `y` and `fh` and agree in type.

    Parameters
    ----------
    y : pd.Series, pd.DataFrame, np.ndarray, or pd.Index
        coerced and checked version of input y
    fh : int, timedelta, list or np.array of ints or timedeltas
        Its last element is used as the maximum horizon.
    window_length : int or timedelta or pd.DateOffset
    initial_window : int or timedelta or pd.DateOffset
        Window length of first window; not checked when None.

    Raises
    ------
    ValueError
        if window length plus max horizon is above the last observation in `y`,
        or if initial window plus max horizon is above the last observation
        in `y`
    TypeError
        if exactly one of `window_length` and `initial_window` is a
        timedelta or date offset
    """
    n_timepoints = y.shape[0]
    fh_max = fh[-1]

    window_length_msg = (
        f"The `window_length` and the forecasting horizon are incompatible "
        f"with the length of `y`. Found `window_length`={window_length}, "
        f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
        f"It is required that the window length plus maximum forecast horizon "
        f"is smaller than the length of the time series `y` itself."
    )
    window_is_offset = is_timedelta_or_date_offset(x=window_length)
    if window_is_offset:
        # Position reached by the window when anchored at the first
        # observation, capped at the last observation.
        window_extent = y.get_loc(min(y[-1], y[0] + window_length))
    else:
        window_extent = window_length
    if window_extent + fh_max > n_timepoints:
        raise ValueError(window_length_msg)

    initial_window_msg = (
        f"The `initial_window` and the forecasting horizon are incompatible "
        f"with the length of `y`. Found `initial_window`={initial_window}, "
        f"`max(fh)`={fh_max}, but len(y)={n_timepoints}. "
        f"It is required that the initial window plus maximum forecast horizon "
        f"is smaller than the length of the time series `y` itself."
    )
    mixed_types_msg = (
        "The `initial_window` and `window_length` types are incompatible. "
        "They should be either all timedelta or all int."
    )

    if initial_window is None:
        return

    if is_timedelta_or_date_offset(x=initial_window):
        initial_extent = y.get_loc(min(y[-1], y[0] + initial_window))
        types_clash = not window_is_offset
    else:
        initial_extent = initial_window
        types_clash = window_is_offset

    # The size check comes first, so it takes precedence over a type mismatch.
    if initial_extent + fh_max > n_timepoints:
        raise ValueError(initial_window_msg)
    if types_clash:
        raise TypeError(mixed_types_msg)