def _cutoffs_fh_window_length_types_are_supported(
    cutoffs: VALID_CUTOFF_TYPES,
    fh: FORECASTING_HORIZON_TYPES,
    window_length: ACCEPTED_WINDOW_LENGTH_TYPES,
) -> bool:
    """Tell whether the types of the three inputs can be used together.

    Exactly two combinations are accepted: everything is integer based, or
    the cutoffs are datetime-like while `fh` and `window_length` are
    timedelta / date-offset like.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        cutoff points, positive and integer- or datetime-index like
    fh : int, timedelta, list or np.array of ints or timedeltas
    window_length : int or timedelta or pd.DateOffset

    Returns
    -------
    bool
        True if the inputs form one of the two accepted combinations,
        False otherwise
    """
    # Both combinations are always evaluated, exactly one after the other.
    integer_based = (
        array_is_int(cutoffs) and array_is_int(fh) and is_int(window_length)
    )
    datetime_based = (
        array_is_datetime64(cutoffs)
        and array_is_timedelta_or_date_offset(fh)
        and is_timedelta_or_date_offset(window_length)
    )
    return bool(integer_based or datetime_based)
def _check_cutoffs_against_train_windows(cutoffs, windows, y):
    """Assert that every cutoff is the last value of its train window.

    Integer cutoffs are compared with the last element of each window;
    datetime64 cutoffs are compared with the `y.index` value found at that
    last element. Any other cutoff type raises a ValueError.
    """
    # All windows after the first one are compared in a single array check.
    if array_is_int(cutoffs):
        last_train_points = np.array([w[-1] for w in windows[1:]])
    elif array_is_datetime64(cutoffs):
        last_train_points = np.array(
            [y.index[w[-1]].to_datetime64() for w in windows[1:]],
            dtype="datetime64",
        )
    else:
        raise ValueError(
            f"Provided `cutoffs` type is not supported: {type(cutoffs[0])}")
    np.testing.assert_array_equal(last_train_points, cutoffs[1:])

    # The first window is handled on its own because it may be empty when
    # `start_with_window=False`; in that case there is nothing to compare.
    first_window = windows[0]
    if len(first_window) == 0:
        return

    if array_is_int(cutoffs):
        np.testing.assert_array_equal(first_window[-1], cutoffs[0])
    elif array_is_datetime64(cutoffs):
        first_cutoff = y.index[first_window[-1]].to_datetime64()
        np.testing.assert_array_equal(first_cutoff, cutoffs[0])
    else:
        raise ValueError(
            f"Provided `cutoffs` type is not supported: {type(cutoffs[0])}"
        )
def test_single_window_splitter_default_window_length(y, fh):
    """Test SingleWindowSplitter when no `window_length` is passed."""
    cv = SingleWindowSplitter(fh=fh)
    train_windows, test_windows, _, n_splits = _check_cv(cv, y)
    train = train_windows[0]
    test = test_windows[0]
    assert n_splits == 1

    checked_fh = check_fh(fh)
    assert test.shape[0] == len(checked_fh)

    # Expected size of the train window: all of `y` for a purely in-sample
    # horizon, otherwise everything up to the point that still leaves room
    # for the largest horizon step.
    splitter_fh = cv.get_fh()
    fh_is_int = array_is_int(checked_fh)
    if splitter_fh.is_all_in_sample():
        assert train.shape[0] == len(y)
    elif fh_is_int:
        assert train.shape[0] == len(y) - checked_fh.max()
    else:
        latest_train_time = y.index.max() - checked_fh.max()
        assert train.shape[0] == len(y[y.index <= latest_train_time])

    # Expected test window: horizon steps applied to the last train point.
    if fh_is_int:
        expected_test = train[-1] + checked_fh
    else:
        last_train_time = y.index[train[-1]]
        expected_test = np.array(
            [y.index.get_loc(last_train_time + step) for step in checked_fh]
        )
    np.testing.assert_array_equal(test, expected_test)
def _get_end(y_index: pd.Index, fh: ForecastingHorizon) -> int:
    """Return the integer position of the last training point.

    `y_index[end]` is the last index value of the training window, so for a
    series `y` indexed by `y_index`, `y.iloc[end]` and `y.loc[y_index[end]]`
    both address the final training observation.

    Parameters
    ----------
    y_index : pd.Index
        Index of time series
    fh : int, timedelta, list or np.ndarray of ints or timedeltas
        Assumed to be already ordered and checked by `_check_fh`.

    Returns
    -------
    end : int
        0-indexed integer end of the training window
    """
    n_timepoints = y_index.shape[0]
    assert isinstance(y_index, pd.Index)

    fh_is_int = array_is_int(fh)

    # A purely in-sample horizon needs no room after the training data, so the
    # offset is a zero of the matching kind. Otherwise the largest (last)
    # horizon step must still fall inside the data.
    if fh.is_all_in_sample():
        fh_offset = 0 if fh_is_int else pd.Timedelta(0)
    else:
        fh_offset = fh[-1]

    if fh_is_int:
        return n_timepoints - fh_offset - 1
    return y_index.get_loc(y_index[-1] - fh_offset)
def _check_cutoffs_and_y(cutoffs: VALID_CUTOFF_TYPES, y: ACCEPTED_Y_TYPES) -> None:
    """Check that combination of inputs is compatible.

    Integer cutoffs are compared with the number of observations in `y`.
    Datetime cutoffs are compared with the largest index value of `y`: for a
    `pd.Series` or `pd.DataFrame` that is the maximum of `y.index`, for any
    other accepted input (e.g. a `pd.Index`) it is the maximum of `y` itself.

    Parameters
    ----------
    cutoffs : np.array or pd.Index
        cutoff points, positive and integer- or datetime-index like
    y : pd.Series, pd.DataFrame, np.ndarray, or pd.Index
        coerced and checked version of input y

    Raises
    ------
    ValueError
        if max cutoff is above the last observation in `y`
    TypeError
        if `cutoffs` type is not supported
    """
    # Local import keeps this block self-contained.
    import pandas as pd

    max_cutoff = np.max(cutoffs)
    msg = ("`cutoffs` are incompatible with given `y`. "
           "Maximum cutoff is not smaller than the ")
    if array_is_int(cutoffs):
        if max_cutoff >= y.shape[0]:
            raise ValueError(msg + "number of observations.")
    elif array_is_datetime64(cutoffs):
        # Datetime cutoffs are points on the time axis, so they must be
        # compared with the index of `y`, never with its data values.
        y_index = y.index if isinstance(y, (pd.Series, pd.DataFrame)) else y
        if max_cutoff >= np.max(y_index):
            raise ValueError(msg + "maximum index value of `y`.")
    else:
        raise TypeError("Unsupported type of `cutoffs`")
def check_cutoffs(cutoffs: VALID_CUTOFF_TYPES) -> np.ndarray:
    """Validate the cutoffs and return them in sorted order.

    Parameters
    ----------
    cutoffs : np.ndarray or pd.Index

    Returns
    -------
    cutoffs (Sorted array)

    Raises
    ------
    ValueError
        If cutoffs is not a instance of np.array or pd.Index
        If cutoffs array is empty.
    AssertionError
        If the cutoffs are neither integer nor datetime64 like. This check is
        an `assert` statement, so it is skipped when Python runs with `-O`.
    """
    container_ok = isinstance(cutoffs, ACCEPTED_CUTOFF_TYPES)
    if not container_ok:
        raise ValueError(
            f"`cutoffs` must be a np.array or pd.Index, but found: {type(cutoffs)}"
        )

    values_ok = array_is_int(cutoffs) or array_is_datetime64(cutoffs)
    assert values_ok

    if len(cutoffs) == 0:
        raise ValueError("Found empty `cutoff` array")

    sorted_cutoffs = np.sort(cutoffs)
    return sorted_cutoffs
def test_single_window_splitter(y, fh, window_length):
    """Test SingleWindowSplitter with an explicit `window_length`."""
    # Unsupported type combinations must be rejected at construction time.
    if not _inputs_are_supported([fh, window_length]):
        with pytest.raises(TypeError, match="Unsupported combination of types"):
            SingleWindowSplitter(fh=fh, window_length=window_length)
        return

    cv = SingleWindowSplitter(fh=fh, window_length=window_length)
    train_windows, test_windows, _, n_splits = _check_cv(cv, y)
    train = train_windows[0]
    test = test_windows[0]

    assert n_splits == 1
    expected_train_len = _coerce_duration_to_int(
        duration=window_length, freq="D")
    assert train.shape[0] == expected_train_len

    checked_fh = check_fh(fh)
    assert test.shape[0] == len(checked_fh)

    # Expected test window: horizon steps applied to the last train point.
    if array_is_int(checked_fh):
        expected_test = train[-1] + checked_fh
    else:
        expected_test = np.array([
            y.index.get_loc(y.index[train[-1]] + step) for step in checked_fh
        ])
    np.testing.assert_array_equal(test, expected_test)
def get_cutoffs(self, y: Optional[ACCEPTED_Y_TYPES] = None) -> np.ndarray:
    """Return the cutoff points.

    This splitter produces a single train/test split, so the result is a
    one-dimensional array holding just the last train set index: the integer
    position for an integer horizon, otherwise the datetime64 value of the
    index at that position.

    Parameters
    ----------
    y : pd.Series or pd.Index, optional (default=None)
        Time series to split

    Returns
    -------
    cutoffs : np.array
        The array of cutoff points.

    Raises
    ------
    ValueError
        If `y` is None.
    """
    if y is None:
        raise ValueError(
            f"{self.__class__.__name__} requires `y` to compute the cutoffs."
        )

    fh = _check_fh(self.fh)
    y_index = get_index_for_series(y)
    end = _get_end(y_index=y_index, fh=fh)

    if array_is_int(fh):
        cutoff = end
    else:
        cutoff = y_index[end].to_datetime64()
    return np.array([cutoff])
def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
    """Yield the single train/test split of this splitter.

    Parameters
    ----------
    y : pd.Index
        Index of the time series to split

    Yields
    ------
    train : result of `self._get_train_window` for the computed bounds
    test : np.ndarray of integer positions of the test window
    """
    n_timepoints = y.shape[0]
    window_length = check_window_length(self.window_length, n_timepoints)
    fh = _check_fh(self.fh)
    end = _get_end(y_index=y, fh=fh)

    # Start of the train window: the very beginning when no window length is
    # set, a fixed number of positions back for an integer length, and the
    # first index value strictly later than `y[end] - window_length` otherwise.
    if window_length is None:
        start = 0
    elif is_int(window_length):
        start = end - window_length + 1
    else:
        earliest_excluded = y[end] - window_length
        positions_in_window = np.argwhere(y > earliest_excluded).flatten()
        start = positions_in_window[0]

    train = self._get_train_window(y=y, train_start=start, split_point=end + 1)

    # Test window: horizon steps applied to the last train point.
    if array_is_int(fh):
        test = end + fh.to_numpy()
    else:
        last_train_time = y[end]
        test = np.array(
            [y.get_loc(last_train_time + step) for step in fh.to_pandas()]
        )

    yield train, test
def _check_values(values: Union[VALID_FORECASTING_HORIZON_TYPES]) -> pd.Index:
    """Validate forecasting horizon values.

    Validation checks validity and also converts forecasting horizon values
    to supported pandas.Index types if possible.

    Parameters
    ----------
    values : int, list, array, certain pd.Index types
        Forecasting horizon with steps ahead to predict.

    Raises
    ------
    TypeError
        Raised if `values` type is not supported
    ValueError
        Raised if `values` contains duplicates

    Returns
    -------
    values : pd.Index
        Sorted and validated forecasting horizon values.
    """
    # If values are one of the supported pandas index types, nothing has to be
    # converted as the forecasting horizon directly wraps the index. Note that
    # isinstance() does not work here, because index types inherit from each
    # other, hence type equality is checked instead.
    if type(values) in VALID_INDEX_TYPES:
        pass

    # A single value becomes a one-element index; it cannot contain duplicates
    # and is trivially sorted, so it is returned right away.
    elif is_int(values):
        # `pd.Index(..., dtype="int64")` is used instead of `pd.Int64Index`,
        # which is deprecated since pandas 1.4 and removed in pandas 2.0. On
        # older pandas versions it still yields an `Int64Index`.
        return pd.Index([values], dtype="int64")

    elif is_timedelta_or_date_offset(values):
        return pd.Index([values])

    # convert np.array or list to pandas index
    elif is_array(values) and array_is_int(values):
        values = pd.Index(values, dtype="int64")

    elif is_array(values) and array_is_timedelta_or_date_offset(values):
        values = pd.Index(values)

    # otherwise, raise type error
    else:
        valid_types = (
            "int",
            "np.array",
            "list",
            *[f"pd.{index_type.__name__}" for index_type in VALID_INDEX_TYPES],
        )
        raise TypeError(
            f"Invalid `fh`. The type of the passed `fh` values is not supported. "
            f"Please use one of {valid_types}, but found: {type(values)}")

    # check values does not contain duplicates
    if len(values) != values.nunique():
        raise ValueError(
            "Invalid `fh`. The `fh` values must not contain any duplicates.")

    # return sorted values
    return values.sort_values()
def _check_cutoffs(cutoffs):
    """Assert that `cutoffs` is a non-empty 1-d array of ints or datetime64."""
    assert isinstance(cutoffs, np.ndarray)

    has_supported_values = array_is_int(cutoffs) or array_is_datetime64(cutoffs)
    assert has_supported_values

    n_dims = cutoffs.ndim
    assert n_dims == 1

    n_cutoffs = len(cutoffs)
    assert n_cutoffs > 0