def test_time_series_split(max_delay, gap, X_none, y_none):
    """Check that TimeSeriesSplit produces the expected train/test windows.

    Parametrized over ``max_delay``/``gap`` values and over whether X and/or y
    are passed as None.
    """
    X = pd.DataFrame({"features": range(1, 32)})
    y = pd.Series(range(1, 32))
    # The splitter does not require a date index; we attach one purely so the
    # expected train/test windows below are easy to read.
    dates = pd.date_range("2020-10-01", "2020-10-31")
    X.index = dates
    y.index = dates

    # Expected (train_index, test_index) pairs for each of the three splits.
    expected = [
        (pd.date_range("2020-10-01", f"2020-10-{10 + gap}"),
         pd.date_range(f"2020-10-{11 - max_delay}", f"2020-10-{17 + gap}")),
        (pd.date_range("2020-10-01", f"2020-10-{17 + gap}"),
         pd.date_range(f"2020-10-{18 - max_delay}", f"2020-10-{24 + gap}")),
        (pd.date_range("2020-10-01", f"2020-10-{24 + gap}"),
         pd.date_range(f"2020-10-{25 - max_delay}", "2020-10-31")),
    ]

    if X_none:
        X = None
    if y_none:
        y = None

    splitter = TimeSeriesSplit(gap=gap, max_delay=max_delay)
    for i, (train, test) in enumerate(splitter.split(X, y)):
        expected_train, expected_test = expected[i]
        if not X_none:
            pd.testing.assert_index_equal(X.iloc[train].index, expected_train)
            pd.testing.assert_index_equal(X.iloc[test].index, expected_test)
        if not y_none:
            pd.testing.assert_index_equal(y.iloc[train].index, expected_train)
            pd.testing.assert_index_equal(y.iloc[test].index, expected_test)
def test_time_series_split_n_splits_too_big():
    """TimeSeriesSplit.split raises when folds are too small for gap/max_delay."""
    splitter = TimeSeriesSplit(gap=7, n_splits=4, max_delay=3)
    X = pd.DataFrame({"features": range(15)})
    # Each split would hold 15 // 5 = 3 data points — fewer than the number of
    # data points required to honor max_delay=3 and gap=7.
    with pytest.raises(ValueError, match="Please use a smaller number of splits or collect more data."):
        list(splitter.split(X))
def test_time_series_split_init():
    """TimeSeriesSplit records n_splits and rejects empty/None X and y."""
    splitter = TimeSeriesSplit(gap=3, max_delay=4, n_splits=5)
    assert splitter.get_n_splits() == 5

    expected_message = "Both X and y cannot be None or empty in TimeSeriesSplit.split"
    with pytest.raises(ValueError, match=expected_message):
        _ = list(splitter.split(X=None, y=None))
    with pytest.raises(ValueError, match=expected_message):
        _ = list(splitter.split(X=pd.DataFrame(), y=pd.Series([])))
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input training data of shape [n_samples, n_features].
        y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
        problem_type (ProblemType): the type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables.
        n_splits (int, None): the number of CV splits, if applicable. Default 3.
        shuffle (bool): whether or not to shuffle the data before splitting, if applicable. Default True.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: data splitting method.

    Raises:
        ValueError: If ``problem_type`` is a time series type and ``problem_configuration`` is not provided.
    """
    problem_type = handle_problem_types(problem_type)
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        # Time series data must preserve temporal ordering, so return the
        # time-series splitter before the large-data check below can replace
        # it with a (potentially shuffled) holdout split.
        return TimeSeriesSplit(n_splits=n_splits, gap=problem_configuration.get('gap'),
                               max_delay=problem_configuration.get('max_delay'))
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        # Large datasets use a single train/validation holdout instead of CV.
        # Honor the caller's shuffle flag rather than hard-coding True.
        return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
    data_splitter = None
    if problem_type == ProblemTypes.REGRESSION:
        data_splitter = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        data_splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    return data_splitter
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=None, random_seed=0):
    """Compute the data splitting method to use during AutoML search for the given training data and problem.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated - use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method.

    Raises:
        ValueError: If ``problem_type`` is a time series type and ``problem_configuration`` is not provided.
    """
    # Resolve the deprecated random_state argument into random_seed.
    random_seed = deprecate_arg("random_state", "random_seed", random_state, random_seed)
    problem_type = handle_problem_types(problem_type)

    # Time series problems must preserve temporal ordering: use the dedicated
    # splitter and never fall through to the shuffled splits below.
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError(
                "problem_configuration is required for time series problem types"
            )
        return TimeSeriesSplit(
            n_splits=n_splits,
            gap=problem_configuration.get('gap'),
            max_delay=problem_configuration.get('max_delay'))

    # Large datasets use a single train/validation holdout; smaller ones use CV.
    use_holdout = X.shape[0] > _LARGE_DATA_ROW_THRESHOLD
    if problem_type == ProblemTypes.REGRESSION:
        if use_holdout:
            return TrainingValidationSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
        return KFold(n_splits=n_splits, random_state=random_seed, shuffle=shuffle)
    if problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        if use_holdout:
            return BalancedClassificationDataTVSplit(
                test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle, random_seed=random_seed)
        return BalancedClassificationDataCVSplit(n_splits=n_splits, random_seed=random_seed, shuffle=shuffle)