def test_timegapsplit():
    """Check fold ordering with a zero-day gap and pin the last fold's dates.

    For every fold produced by the splitter, the training dates must not
    come after the validation dates. The date boundaries of the final fold
    are then compared against fixed values as a regression check.
    """
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=0),
    )

    for fold in splitter.split(X_train, y_train):
        # fold[0] / fold[1] are positional indices into X_train; map them
        # back to the matching rows of ``df`` to read the dates.
        train_dates = df.loc[X_train.iloc[fold[0]].index]["date"]
        valid_dates = df.loc[X_train.iloc[fold[1]].index]["date"]
        train_start, train_end = train_dates.min(), train_dates.max()
        valid_start, valid_end = valid_dates.min(), valid_dates.max()

        assert train_start <= train_end <= valid_start <= valid_end

    # Regression check: the loop variables still hold the last fold's values.
    day_format = "%Y-%m-%d"
    assert train_start == datetime.datetime.strptime("2018-01-16", day_format)
    assert train_end == datetime.datetime.strptime("2018-01-20", day_format)
    assert valid_start == datetime.datetime.strptime("2018-01-21", day_format)
    assert valid_end == datetime.datetime.strptime("2018-01-23", day_format)
def test_timegapsplit_using_splits():
    """Requesting ``n_splits=3`` should produce exactly three folds."""
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=1),
        n_splits=3,
    )

    folds = list(splitter.split(X_train, y_train))

    assert len(folds) == 3
def test_timegapsplit_too_many_splits():
    """Splitting with ``n_splits=7`` is expected to raise ``ValueError``."""
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=1),
        n_splits=7,
    )

    # Only the split itself is expected to fail, so the splitter is built
    # outside the ``raises`` block and the folds are consumed inside it.
    with pytest.raises(ValueError):
        fold_iter = splitter.split(X_train, y_train)
        list(fold_iter)
def test_timegapsplit_without_train_duration():
    """Split with ``train_duration=None`` and check the resulting duration.

    With three splits requested, three folds should come back, and after
    splitting the splitter's ``train_duration`` should equal ten days.
    """
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=None,
        valid_duration=timedelta(days=3),
        gap_duration=timedelta(days=5),
        n_splits=3,
    )

    # Consume the splits first; ``train_duration`` is checked only afterwards.
    folds = list(splitter.split(X_train, y_train))

    assert len(folds) == 3
    assert splitter.train_duration == timedelta(days=10)
def test_timegapsplit_with_a_gap():
    """Check that each fold keeps at least a two-day gap before validation.

    For every fold, training dates must not come after validation dates, and
    the distance between the last training date and the first validation
    date must be at least ``gap_duration``.
    """
    gap_duration = timedelta(days=2)
    splitter = TimeGapSplit(
        date_serie=df["date"],
        train_duration=timedelta(days=5),
        valid_duration=timedelta(days=3),
        gap_duration=gap_duration,
    )

    for fold in splitter.split(X_train, y_train):
        # fold[0] / fold[1] are positional indices into X_train; map them
        # back to the matching rows of ``df`` to read the dates.
        train_dates = df.loc[X_train.iloc[fold[0]].index]["date"]
        valid_dates = df.loc[X_train.iloc[fold[1]].index]["date"]
        train_start, train_end = train_dates.min(), train_dates.max()
        valid_start, valid_end = valid_dates.min(), valid_dates.max()

        assert train_start <= train_end <= valid_start <= valid_end
        assert valid_start - train_end >= gap_duration