def _default_groups(index):
    """Derive one group label per sample from a DatetimeIndex.

    Annual data (one sample per year) is grouped by year. Otherwise the index
    is scanned for large jumps in time, which mark the start of a new period;
    if the periods found are not all of equal length, years are used instead.
    """
    years = np.asarray(index.year)
    n_samples = len(index)
    if np.unique(years).size == n_samples:
        # annual data, no autocorrelation groups
        return years

    # find where there is a gap in time, indication of a separate period
    gapdays = np.asarray((index[1:] - index[:-1]).days)
    # equivalent to "step is more than twice the median step"
    is_gap = gapdays > (np.median(gapdays) + gapdays / 2)
    n_blocks = int(is_gap.sum()) + 1
    # length of the first period; all periods are assumed to be equally long
    block_len = int(np.argmax(is_gap)) + 1
    blocks = np.repeat(np.arange(n_blocks), block_len)
    if blocks.size != n_samples:
        # periods differ in length (or no gap found): keep years together
        return years
    return blocks


def _tercile_labels(group_sums):
    """Label each group total as high (1), normal (0) or low (-1)."""
    if isinstance(group_sums, pd.DataFrame):
        if group_sums.shape[1] != 1:
            raise ValueError('y_train must contain a single target column')
        group_sums = group_sums.iloc[:, 0]
    # Both thresholds are taken from the untouched totals; labels are written
    # to a separate array so they can never be confused with raw values.
    upper = group_sums.quantile(q=.66)
    lower = group_sums.quantile(q=.33)
    values = group_sums.to_numpy()
    labels = np.zeros(values.shape[0], dtype=int)
    labels[values > upper] = 1
    labels[values < lower] = -1
    return labels


def get_cv_accounting_for_years(y_train=pd.DataFrame, kfold: int = 5, seed: int = 1, groups=None):
    '''
    Train-test split that keeps all samples of one group in the same fold.

    Datapoints of the same year (or the same period within a year) are very
    much not i.i.d. and should not be spread over train and test sets. Groups
    are assigned to folds with a stratified K-fold on whether the group total
    of y_train is in the upper, middle or lower tercile.

    Parameters
    ----------
    y_train : pd.DataFrame or pd.Series
        Target values, single column. When groups is None the index must be
        a DatetimeIndex. The default value is only a placeholder; a real
        object has to be passed.
    kfold : int
        number of folds. The default is 5.
    seed : int, optional
        random seed. The default is 1.
    groups : array-like, optional
        Group label per sample (same length as y_train). If None, groups are
        derived from the index: the year for annual data, otherwise periods
        separated by gaps in time, falling back to the year when the periods
        are not equally long.

    Returns
    -------
    cv : sk-learn cross-validation generator
        PredefinedSplit with one test fold per sample. cv.uniqgroups lists,
        per fold, the positions of its test groups in the sorted unique
        group labels.

    Raises
    ------
    ValueError
        If groups does not have one label per sample, if a label is missing
        from the grouped totals, or if y_train has more than one column.
    '''
    n_samples = len(y_train)
    if groups is None:
        # if dealing with subseasonal data, there is a lot of autocorrelation.
        # it is best practice to keep the groups of target dates within a
        # year well separated.
        groups = _default_groups(y_train.index)
    else:
        groups = np.asarray(groups)
        if groups.shape != (n_samples,):
            raise ValueError('groups must contain one label per sample of '
                             'y_train')

    group_sums = y_train.groupby(groups).sum()
    strat_labels = _tercile_labels(group_sums)

    cv_strat = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=seed)
    # positions (into the sorted unique groups) of the test groups per fold
    test_gr = [test_pos for _, test_pos
               in cv_strat.split(X=group_sums.index, y=strat_labels)]

    fold_of_group = np.zeros(len(group_sums), dtype=int)
    for fold, test_pos in enumerate(test_gr):
        fold_of_group[test_pos] = fold

    # map every sample to the position of its group, then to that group's fold
    group_pos = group_sums.index.get_indexer(groups)
    if (group_pos < 0).any():
        raise ValueError('groups contains labels that are missing from the '
                         'grouped totals (e.g. NaN)')
    label_test = fold_of_group[group_pos]

    cv = PredefinedSplit(label_test)
    cv.uniqgroups = test_gr
    return cv