def get_table_dyn(S: pd.Index, n_tot: int, max_length=100):
    """Tabulate the best score of every window of consecutive occurrences.

    A window is identified by the positions ``(ia, iz)`` (both inclusive) of
    its first and last occurrence in ``S``. For each window covering between
    3 and ``min(len(S), max_length)`` occurrences, the table keeps the
    cheaper of two options: the score of the window taken as a whole (the
    sum of the values returned by ``cycle_length``), or the sum of the scores
    of two adjacent sub-windows obtained by cutting it. Sub-windows with
    fewer than 3 occurrences are charged ``score_one`` per occurrence, with
    ``score_one = residual_length(1, n_tot, dS)``.

    Parameters
    ----------
    S: pd.Index or np.ndarray
        a Series of occurrences
    n_tot: int
        total number of occurrences in the original events
    max_length: int, default=100
        maximum number of occurrences for a cycle to cover; it is capped at
        ``len(S)``

    Returns
    -------
    scores: dict
        maps ``(ia, iz)`` to the best score found for that window
    cut_points: dict
        maps ``(ia, iz)`` to ``None`` when the window is kept whole, to the
        position ``im`` of the last occurrence of the left part when cutting
        is cheaper, or to ``-1`` for a 3-occurrence window whose score was
        replaced by ``3 * score_one``

    Notes
    -----
    NOTE(review): ``S`` presumably needs at least 3 occurrences, since a
    sliding window of size 3 is taken over it directly -- confirm with the
    callers.
    """
    n_occ = len(S)
    diffs = np.diff(S)
    dS = S.max() - S.min()

    # TODO(review): the original author questioned whether 1 is the right
    # first argument here ("1 really ?").
    score_one = residual_length(1, n_tot, dS)
    three_singles = 3 * score_one

    # Base case: windows of exactly 3 occurrences.
    base_scores = sum(
        cycle_length(
            sliding_window_view(S, 3),
            sliding_window_view(diffs, 2),
            n_occ,
            dS,
        )
    )
    change = base_scores > three_singles
    base_scores[change] = three_singles  # in-place replacement
    # -1 marks triples replaced by three singles; None marks triples kept whole.
    base_cuts = np.array([-1] * len(base_scores), dtype=object)
    base_cuts[~change] = None

    scores = {(i, i + 2): score for i, score in enumerate(base_scores)}
    cut_points = dict(zip(scores, base_cuts))

    # Longer windows reuse the scores already tabulated for shorter ones.
    max_length = min(n_occ, max_length)
    for k in range(4, max_length + 1):
        whole_scores = sum(
            cycle_length(
                sliding_window_view(S, k),
                sliding_window_view(diffs, k - 1),
                n_occ,
                dS,
            )
        )

        for ia, best_score in enumerate(whole_scores):
            iz = ia + k - 1
            cut_point = None  # stays None unless some cut beats the whole window
            for im in range(ia, iz):
                n_left = im - ia + 1
                n_right = iz - im
                # Parts shorter than 3 have no table entry: charge them per
                # occurrence instead.
                if n_left < 3:
                    score_left = score_one * n_left
                else:
                    score_left = scores[(ia, im)]
                if n_right < 3:
                    score_right = score_one * n_right
                else:
                    score_right = scores[(im + 1, iz)]

                candidate = score_left + score_right
                if candidate < best_score:
                    best_score = candidate
                    cut_point = im
            scores[(ia, iz)] = best_score
            cut_points[(ia, iz)] = cut_point

    return scores, cut_points
def _split(self, y: pd.Index) -> SPLIT_GENERATOR_TYPE:
    """Yield one ``(training_window, test_window)`` pair per cutoff.

    ``self.cutoffs``, ``self.fh`` and ``self.window_length`` are validated
    first, then each cutoff is turned into a training window (built by
    ``self._get_train_window``) and a test window (``cutoff + fh``).

    Parameters
    ----------
    y : pd.Index
        Index to split.

    Yields
    ------
    training_window
        Whatever ``self._get_train_window`` returns for the cutoff.
    test_window : np.ndarray
        ``cutoff + fh.to_numpy()``; when the largest cutoff is a datetime
        and the largest horizon a timedelta, the values not earlier than
        ``y.min()`` converted to their locations in ``y``.

    Raises
    ------
    TypeError
        If ``window_length`` and a cutoff are neither both integers nor a
        timedelta/date offset paired with a datetime.
    """
    n_obs = y.shape[0]
    cutoffs = check_cutoffs(cutoffs=self.cutoffs)
    fh = _check_fh(fh=self.fh)
    window_length = check_window_length(
        window_length=self.window_length, n_timepoints=n_obs
    )
    _check_cutoffs_and_y(cutoffs=cutoffs, y=y)
    _check_cutoffs_fh_y(cutoffs=cutoffs, fh=fh, y=y)
    largest_fh = fh.max()
    largest_cutoff = np.max(cutoffs)

    for cutoff in cutoffs:
        if is_int(x=window_length) and is_int(x=cutoff):
            start = cutoff - window_length
        elif is_timedelta_or_date_offset(x=window_length) and is_datetime(x=cutoff):
            # Never reach back further than the first entry of y.
            earliest = max(y[0], cutoff - window_length)
            start = y.get_loc(earliest)
        else:
            raise TypeError(
                f"Unsupported combination of types: "
                f"`window_length`: {type(window_length)}, "
                f"`cutoff`: {type(cutoff)}"
            )

        if is_int(x=cutoff):
            end = cutoff
        else:
            # Location of the last entry of y that is not after the cutoff.
            not_after_cutoff = y[y <= cutoff]
            end = y.get_loc(not_after_cutoff[-1])

        train = self._get_train_window(
            y=y, train_start=start + 1, split_point=end + 1
        )

        test = cutoff + fh.to_numpy()
        if is_datetime(x=largest_cutoff) and is_timedelta(x=largest_fh):
            # Drop timestamps before the start of y, then map to locations.
            kept = test[test >= y.min()]
            test = np.array([y.get_loc(stamp) for stamp in kept])
        yield train, test