from sklearn.model_selection import ShuffleSplit, StratifiedShuffleSplit

from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import is_classification, is_regression, is_time_series
from evalml.utils import infer_feature_types


def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): Data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series, or np.ndarray): Target data of length [n_samples].
        problem_type (str or ProblemTypes): Type of supervised learning problem. See
            evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search.
            For example, in time series problems, values should be passed in for the
            date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set.
            Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data
            each split into train and test sets.
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        # Time series data must keep its ordering, so never shuffle or stratify.
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        # Stratify so that class proportions are preserved in both splits.
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))
    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]
    return X_train, X_test, y_train, y_test
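# A minimal usage sketch (not part of the original module). The toy frame and the
# "binary" problem-type string are illustrative assumptions; split_data itself
# accepts any of the input types documented above.
if __name__ == "__main__":
    import pandas as pd

    X_demo = pd.DataFrame({"a": range(100), "b": range(100, 200)})
    y_demo = pd.Series([0, 1] * 50)

    X_train, X_test, y_train, y_test = split_data(X_demo, y_demo, problem_type="binary", test_size=0.2)
    print(X_train.shape, X_test.shape)  # expected: (80, 2) train and (20, 2) test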
def test_tvsplit_always_within_bounds_with_custom_index(random_state):
    N = 11000
    X = pd.DataFrame({'col1': np.arange(0, N)}, index=np.arange(20000, 20000 + N))
    splitter = TrainingValidationSplit(train_size=0.75, shuffle=True, random_state=random_state)
    splits = list(splitter.split(X, y=None))
    # The returned indices must be positional (0..N-1), regardless of the
    # DataFrame's custom index starting at 20000.
    assert np.all(np.logical_and(splits[0][0] < N, splits[0][0] >= 0))
    assert np.all(np.logical_and(splits[0][1] < N, splits[0][1] >= 0))
def test_tvsplit_stratify():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(5).repeat(2), name='target')
    splitter = TrainingValidationSplit(train_size=5, test_size=5, shuffle=True, stratify=y, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [1, 4, 2, 8, 7])
    np.testing.assert_equal(splits[0][1], [3, 6, 9, 0, 5])
def test_tvsplit_shuffle():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit(shuffle=True, random_state=0)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [9, 1, 6, 7, 3, 0, 5])
    np.testing.assert_equal(splits[0][1], [2, 8, 4])
def test_tvsplit_default():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')
    splitter = TrainingValidationSplit()
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    # sklearn train_test_split will do a 75/25 split by default
    np.testing.assert_equal(splits[0][0], [0, 1, 2, 3, 4, 5, 6])
    np.testing.assert_equal(splits[0][1], [7, 8, 9])
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (pd.DataFrame, ww.DataTable): The input training data of shape [n_samples, n_features].
        y (pd.Series, ww.DataColumn): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search.
            For example, in time series problems, values should be passed in for the gap and
            max_delay variables.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (int, np.random.RandomState): The random seed/state. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method.
    """
    problem_type = handle_problem_types(problem_type)
    data_splitter = None
    if problem_type == ProblemTypes.REGRESSION:
        data_splitter = KFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        data_splitter = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)
    elif is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        data_splitter = TimeSeriesSplit(n_splits=n_splits,
                                        gap=problem_configuration.get('gap'),
                                        max_delay=problem_configuration.get('max_delay'))
    # For large datasets, fall back to a single shuffled train/validation split instead
    # of full cross-validation. Note this check sits outside the branch chain above, so
    # it replaces whichever splitter was chosen, including the time series one.
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        data_splitter = TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=True)
    return data_splitter
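# A hedged sketch of the time series branch above (not part of the original module;
# the gap/max_delay values are placeholders, and "time series regression" assumes
# evalml's problem-type strings):
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    X_ts = pd.DataFrame({"feature": np.arange(100)})
    y_ts = pd.Series(np.arange(100))

    ts_splitter = make_data_splitter(
        X_ts, y_ts,
        problem_type="time series regression",
        problem_configuration={"gap": 0, "max_delay": 2},  # placeholder values
    )
    # Omitting problem_configuration instead raises:
    # ValueError: problem_configuration is required for time series problem types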
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=None, random_seed=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search.
            For example, in time series problems, values should be passed in for the gap and
            max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated; use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method.
    """
    random_seed = deprecate_arg("random_state", "random_seed", random_state, random_seed)
    problem_type = handle_problem_types(problem_type)
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        return TimeSeriesSplit(n_splits=n_splits,
                               gap=problem_configuration.get('gap'),
                               max_delay=problem_configuration.get('max_delay'))
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        # Large datasets get a single train/validation split instead of full cross-validation.
        if problem_type == ProblemTypes.REGRESSION:
            return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
        elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
            return BalancedClassificationDataTVSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION,
                                                     shuffle=shuffle, random_seed=random_seed)
    if problem_type == ProblemTypes.REGRESSION:
        return KFold(n_splits=n_splits, random_state=random_seed, shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        return BalancedClassificationDataCVSplit(n_splits=n_splits, random_seed=random_seed, shuffle=shuffle)
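# A hedged usage sketch of the branch selection above (not part of the original
# module; 500 rows is assumed to fall below _LARGE_DATA_ROW_THRESHOLD, so a
# regression problem falls through to plain k-fold cross-validation):
if __name__ == "__main__":
    import numpy as np
    import pandas as pd

    X_small = pd.DataFrame(np.random.rand(500, 4))
    y_small = pd.Series(np.random.rand(500))

    splitter = make_data_splitter(X_small, y_small, problem_type="regression", n_splits=5)
    print(type(splitter).__name__)  # expected: KFold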
def test_tvsplit_size():
    X = pd.DataFrame({'col1': np.arange(0, 10)})
    y = pd.Series(np.arange(100, 110), name='target')

    # Sizes given as fractions of the dataset.
    splitter = TrainingValidationSplit(test_size=0.2, train_size=0.3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])

    # Sizes given as absolute row counts produce the same split.
    splitter = TrainingValidationSplit(test_size=2, train_size=3)
    splits = list(splitter.split(X, y=y))
    assert len(splits) == 1 and len(splits[0]) == 2
    np.testing.assert_equal(splits[0][0], [0, 1, 2])
    np.testing.assert_equal(splits[0][1], [3, 4])
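# Hedged sketch (not part of the original suite): the size handling exercised above
# mirrors sklearn.model_selection.train_test_split with shuffle=False, where
# fractional and absolute sizes yield the same 3-row train / 2-row validation split.
def test_tvsplit_size_matches_sklearn_sketch():
    import numpy as np
    from sklearn.model_selection import train_test_split

    train, test = train_test_split(np.arange(10), train_size=0.3, test_size=0.2, shuffle=False)
    np.testing.assert_equal(train, [0, 1, 2])
    np.testing.assert_equal(test, [3, 4])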
    y = pd.Series(y)
    X.iloc[0, :] = value
    data_split = splitter()
    # handles both TV and CV iterations
    next(data_split.split(X, y))
    data_split.transform_sample(X, y)


@pytest.mark.parametrize(
    'balanced_splitter,data_splitter',
    [(BalancedClassificationDataTVSplit(sampling_ratio=1, min_samples=50, test_size=0.2, shuffle=True, random_seed=0),
      TrainingValidationSplit(test_size=0.2, shuffle=True, random_seed=0)),
     (BalancedClassificationDataCVSplit(sampling_ratio=1, min_samples=50, shuffle=True, n_splits=3, random_seed=0),
      StratifiedKFold(shuffle=True, n_splits=3, random_state=0))])
@pytest.mark.parametrize('data_type', ['np', 'pd', 'ww'])
def test_data_splitters_data_type(data_type, balanced_splitter, data_splitter, make_data_type, X_y_binary):
    X, y = X_y_binary
    # make imbalanced
    X_extended = np.append(X, X, 0)
    y_extended = np.append(y, np.array([0] * len(y)), 0)
    sample_method = BalancedClassificationSampler(sampling_ratio=1, min_samples=50,
def test_tvsplit_nsplits():
    assert TrainingValidationSplit().get_n_splits() == 1
import time

from evalml.automl.utils import AutoMLConfig
from evalml.exceptions import PipelineScoreError
from evalml.objectives.utils import get_objective
from evalml.pipelines import BinaryClassificationPipeline
from evalml.preprocessing.data_splitters import TrainingValidationSplit


# Top-level replacement for AutoML object to supply data for testing purposes.
def err_call(*args, **kwargs):
    """No-op"""


ensembling_indices = [0]
data_splitter = TrainingValidationSplit()
problem_type = "binary"
objective = get_objective("Log Loss Binary", return_instance=True)
additional_objectives = []
optimize_thresholds = False
error_callback = err_call
random_seed = 0
automl_data = AutoMLConfig(ensembling_indices=ensembling_indices,
                           data_splitter=data_splitter,
                           problem_type=problem_type,
                           objective=objective,
                           additional_objectives=additional_objectives,
                           optimize_thresholds=optimize_thresholds,
                           error_callback=error_callback,
                           random_seed=random_seed)