Esempio n. 1
0
def _infer_date_groups(index):
    '''
    Label every timestamp with the block of adjacent dates it belongs to.

    A new block starts wherever the step to the next date is unusually
    large. The labelling assumes all blocks have the same length as the
    first one; if that does not add up to the length of `index`, the
    calendar year of each timestamp is returned instead.
    '''
    gapdays = np.asarray((index[1:] - index[:-1]).days)
    # find where there is a gap in time, indication of separate RV period
    is_gap = gapdays > (np.median(gapdays) + gapdays / 2)
    n_gr = int(is_gap.sum()) + 1
    # position of the first gap + 1 == number of dates in the first block
    dategroupsize = int(np.argmax(is_gap)) + 1
    groups = np.repeat(np.arange(n_gr), dategroupsize)
    if groups.size != index.size:  # blocks are irregular: keep years together
        return np.asarray(index.year)
    return groups


def _tercile_classes(totals):
    '''
    Classify each group total as high (1), normal (0) or low (-1).

    Both thresholds (0.66 and 0.33 quantile) are taken from the untouched
    totals, so assigning one class can never shift the threshold of the
    other, and totals that happen to equal 1 or -1 are not mistaken for
    class labels.
    '''
    upper = totals.quantile(q=.66)
    lower = totals.quantile(q=.33)
    values = totals.to_numpy()
    classes = np.zeros(values.shape, dtype=int)
    classes[values > upper] = 1
    classes[values < lower] = -1
    return classes


def get_cv_accounting_for_years(y_train: pd.DataFrame,
                                kfold: int = 5,
                                seed: int = 1,
                                groups=None):
    '''
    Build a cross-validation split that keeps rows of the same group
    (block of adjacent dates, or calendar year) together in one test fold.

    Datapoints of the same year are very much not i.i.d. and should not be
    spread over train and test. Groups are distributed over the folds with
    a StratifiedKFold on the tercile class (high / normal / low) of the
    per-group sum of `y_train`, so every fold gets a similar mix of groups.

    Parameters
    ----------
    y_train : pd.Series or single-column pd.DataFrame
        Target values with a DatetimeIndex.
    kfold : int
        Number of folds, passed to StratifiedKFold as n_splits.
    seed : int, optional
        Random seed used for shuffling the groups. The default is 1.
    groups : array-like, optional
        Group label for every row of `y_train`. If None (default), groups
        are inferred from the index: blocks of adjacent dates when there is
        more than one row per year, otherwise the calendar year.

    Returns
    -------
    cv : sklearn.model_selection.PredefinedSplit
        Split in which each row is assigned to the test fold of its group.
        The attribute `cv.uniqgroups` holds, per fold, the positional
        indices of the (sorted) unique groups placed in that test fold.

    Raises
    ------
    ValueError
        If `groups` does not have one label per row of `y_train`, or if
        `y_train` is a DataFrame with more than one column.
    '''
    n_rows = len(y_train)

    if groups is None:
        years = y_train.index.year
        if np.unique(years).size != n_rows:
            # if dealing with subseasonal data, there is a lot of
            # autocorrelation. It is best practice to keep the groups of
            # target dates within a year well separated.
            groups = _infer_date_groups(y_train.index)
        else:
            groups = years  # annual data, no autocorrelation groups
    groups = np.asarray(groups)
    if groups.ndim != 1 or groups.size != n_rows:
        raise ValueError('groups must contain exactly one label per row '
                         'of y_train')

    totals = y_train.groupby(groups).sum()
    if isinstance(totals, pd.DataFrame):
        if totals.shape[1] != 1:
            raise ValueError('y_train must be a Series or a single-column '
                             'DataFrame')
        totals = totals.iloc[:, 0]
    classes = _tercile_classes(totals)

    # Folds may be of different size when the number of groups is not a
    # multiple of kfold.
    cv_strat = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=seed)
    test_gr = [test_idx
               for _, test_idx in cv_strat.split(X=totals.index, y=classes)]

    # test_gr holds positions within the unique groups, so first record the
    # fold of every unique group and then look that up for each row.
    fold_of_group = np.zeros(len(totals), dtype=int)
    for fold, test_idx in enumerate(test_gr):
        fold_of_group[test_idx] = fold
    group_pos = totals.index.get_indexer(groups)
    # A row whose label is missing from the grouped totals (e.g. NaN) gets
    # -1, which PredefinedSplit keeps out of every test set.
    label_test = np.where(group_pos >= 0, fold_of_group[group_pos], -1)

    cv = PredefinedSplit(label_test)
    cv.uniqgroups = test_gr
    return cv