def test_timegapsplit():
    """Check fold chronology for a zero-gap split and pin the last fold's dates.

    Every fold yielded by ``TimeGapSplit.split`` must keep its training dates
    at or before its validation dates. The date bounds of the final fold are
    then compared against fixed values as a regression guard.
    """
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=0),
    )

    def date_bounds(positions):
        # Positions index into X_train; its index labels select the rows of df.
        dates = df.loc[X_train.iloc[positions].index]["date"]
        return dates.min(), dates.max()

    def as_datetime(text):
        return datetime.datetime.strptime(text, "%Y-%m-%d")

    for fold in splitter.split(X_train, y_train):
        train_mindate, train_maxdate = date_bounds(fold[0])
        valid_mindate, valid_maxdate = date_bounds(fold[1])

        assert train_mindate <= train_maxdate <= valid_mindate <= valid_maxdate

    # Regression check: the names below still hold the values of the last fold.
    assert train_mindate == as_datetime("2018-01-16")
    assert train_maxdate == as_datetime("2018-01-20")
    assert valid_mindate == as_datetime("2018-01-21")
    assert valid_maxdate == as_datetime("2018-01-23")
# Example #2
# 0
def test_timegapsplit_using_splits():
    """Requesting ``n_splits=3`` must produce exactly three folds."""
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=1),
        n_splits=3,
    )

    folds = list(splitter.split(X_train, y_train))

    assert len(folds) == 3
# Example #3
# 0
def test_timegapsplit_too_many_splits():
    """Consuming the split with ``n_splits=7`` is expected to raise ValueError."""
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=1),
        n_splits=7,
    )

    with pytest.raises(ValueError):
        # Exhaust the iterable so an error raised lazily is still triggered.
        for _ in splitter.split(X_train, y_train):
            pass
# Example #4
# 0
def test_timegapsplit_without_train_duration():
    """Split with ``train_duration=None`` and check the resulting state.

    After the split has been consumed, three folds must have been produced
    and ``train_duration`` on the splitter must equal ten days.
    """
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=None,
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=5),
        n_splits=3,
    )

    # Materialize the folds first: train_duration is only checked afterwards.
    folds = list(splitter.split(X_train, y_train))

    assert len(folds) == 3
    assert splitter.train_duration == timedelta(days=10)
def test_timegapsplit_with_a_gap():
    """Check that each fold is chronological and honours a two-day gap.

    For every fold, the earliest validation date must be at least
    ``gap_duration`` later than the latest training date.
    """
    gap_duration = timedelta(days=2)
    splitter = TimeGapSplit(
        date_serie=df['date'],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=gap_duration,
    )

    def date_bounds(positions):
        # Positions index into X_train; its index labels select the rows of df.
        dates = df.loc[X_train.iloc[positions].index]['date']
        return dates.min(), dates.max()

    for fold in splitter.split(X_train, y_train):
        train_mindate, train_maxdate = date_bounds(fold[0])
        valid_mindate, valid_maxdate = date_bounds(fold[1])

        assert train_mindate <= train_maxdate <= valid_mindate <= valid_maxdate
        assert valid_mindate - train_maxdate >= gap_duration