import numpy as np
import pandas as pd

# Imports below are the ones these snippets assume; paths follow evalml's public API
# and sklearn.model_selection, and may need adjusting to the local module layout.
from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from evalml.preprocessing import TrainingValidationSplit
from evalml.problem_types import is_classification, is_regression, is_time_series
from evalml.utils import infer_feature_types


def test_tvsplit_size():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')

    # Fractional sizes: train_size=0.3 and test_size=0.2 of 10 rows give 3 and 2 rows.
    splitter = TrainingValidationSplit(test_size=0.2, train_size=0.3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])

    # Integer sizes: the same row counts, given as absolute numbers of rows.
    splitter = TrainingValidationSplit(test_size=2, train_size=3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])

def test_tvsplit_always_within_bounds_with_custom_index(random_state):
    N = 11000
    X = pd.DataFrame({'col1': np.arange(0, N)}, index=np.arange(20000, 20000 + N))
    splitter = TrainingValidationSplit(train_size=0.75, shuffle=True, random_state=random_state)
    splits = list(splitter.split(X, y=None))
    # Returned indices must be positional (0..N-1), not values from the custom index.
    assert np.all(np.logical_and(splits[0][0] < N, splits[0][0] >= 0))
    assert np.all(np.logical_and(splits[0][1] < N, splits[0][1] >= 0))

def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series, or np.ndarray): Target data of length [n_samples].
        problem_type (str or ProblemTypes): Type of supervised learning problem. See
            evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        # Time series data must keep its ordering, so never shuffle or stratify.
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test

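# Minimal usage sketch for split_data (added for illustration; the toy frame, the 'binary'
# problem type string, and the expected 8/2 row split are assumptions, not taken from the
# original tests). It only exercises the classification branch above.
def test_split_data_smoke_example():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series([0, 1] * 5, name='target')
    X_train, X_test, y_train, y_test = split_data(X, y, problem_type='binary', test_size=0.2, random_seed=0)
    # With test_size=0.2 on 10 rows, StratifiedShuffleSplit leaves 8 rows for train and 2 for test.
    assert len(X_train.to_dataframe()) == 8
    assert len(X_test.to_dataframe()) == 2
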
def test_tvsplit_stratify():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(5).repeat(2), name='target')
    splitter = TrainingValidationSplit(train_size=5, test_size=5, shuffle=True, stratify=y, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [1, 4, 2, 8, 7])
    np.testing.assert_equal(splits[0][1], [3, 6, 9, 0, 5])

def test_tvsplit_shuffle():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit(shuffle=True, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [9, 1, 6, 7, 3, 0, 5])
    np.testing.assert_equal(splits[0][1], [2, 8, 4])

def test_tvsplit_default():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit()
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    # sklearn train_test_split will do a 75/25 split by default
    np.testing.assert_equal(splits[0][0], [0, 1, 2, 3, 4, 5, 6])
    np.testing.assert_equal(splits[0][1], [7, 8, 9])