def test_data_splitters_imbalanced_binary_tv():
    X = pd.DataFrame({
        "a": [i for i in range(1000)],
        "b": [i % 5 for i in range(1000)]
    })
    # make y a 9:1 class ratio
    y = pd.Series([0] * 100 + [1] * 900)
    splitter = BalancedClassificationDataTVSplit()
    for i, (train_indices, test_indices) in enumerate(splitter.split(X, y)):
        assert len(test_indices) == 250  # test_size defaults to 0.25
        # the remaining data still preserves the 9:1 ratio, which we want brought down to 4:1
        # we don't know the exact count since the split isn't stratified
        assert len(train_indices) < 500
        # we can only test the balance of the train set since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        assert max(y_train_counts.values) == 0.8
def test_data_splitters_imbalanced_multiclass_tv():
    X = pd.DataFrame({
        "a": [i for i in range(1500)],
        "b": [i % 5 for i in range(1500)]
    })
    # make y an 8:1:1 class ratio
    y = pd.Series([0] * 150 + [1] * 1200 + [2] * 150)
    splitter = BalancedClassificationDataTVSplit()
    for i, (train_indices, test_indices) in enumerate(splitter.split(X, y)):
        assert len(test_indices) == 375  # test_size defaults to 0.25
        # we don't know the exact count since the split isn't stratified
        assert len(train_indices) < 1000
        # we can only test the balance of the train set since the split isn't stratified
        y_balanced_train = y.iloc[train_indices]
        y_train_counts = y_balanced_train.value_counts(normalize=True)
        # assert the majority class makes up roughly 2/3 of the training data
        assert max(y_train_counts.values) < 7 / 10
        assert max(y_train_counts.values) > 6 / 10
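# Worked arithmetic behind the balance assertions in the two tests above. This is a
# sketch of the expected case only, and the helper below is hypothetical (not part of
# the library or the test suite). It assumes, per the 4:1 comment in the binary test,
# that the splitter downsamples the majority class to four times the size of a minority
# class in the training split. With m rows per minority class and k minority classes,
# the majority fraction is 4m / (4m + k * m): 0.8 for the binary test (k=1) and 2/3 for
# the 8:1:1 multiclass test (k=2), independent of the exact value of m.
def _expected_majority_fraction_after_balancing(n_minority_per_class, n_minority_classes):
    n_majority_kept = 4 * n_minority_per_class
    n_train = n_majority_kept + n_minority_per_class * n_minority_classes
    return n_majority_kept / n_train


# e.g. _expected_majority_fraction_after_balancing(75, 1) == 0.8
#      (binary test: ~75 minority rows remain after the 250-row test split, so ~375 train rows < 500)
#      _expected_majority_fraction_after_balancing(112, 2) == 2/3
#      (multiclass test: ~112 rows per minority class remain, so ~672 train rows < 1000)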
def make_data_splitter(X, y, problem_type, problem_configuration=None, n_splits=3, shuffle=True, random_state=None, random_seed=0):
    """Given the training data and ML problem parameters, compute a data splitting method to use during AutoML search.

    Arguments:
        X (ww.DataTable, pd.DataFrame): The input training data of shape [n_samples, n_features].
        y (ww.DataColumn, pd.Series): The target training data of length [n_samples].
        problem_type (ProblemType): The type of machine learning problem.
        problem_configuration (dict, None): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the gap and max_delay variables. Defaults to None.
        n_splits (int, None): The number of CV splits, if applicable. Defaults to 3.
        shuffle (bool): Whether or not to shuffle the data before splitting, if applicable. Defaults to True.
        random_state (None, int): Deprecated - use random_seed instead.
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        sklearn.model_selection.BaseCrossValidator: Data splitting method.
    """
    random_seed = deprecate_arg("random_state", "random_seed", random_state, random_seed)
    problem_type = handle_problem_types(problem_type)
    if is_time_series(problem_type):
        if not problem_configuration:
            raise ValueError("problem_configuration is required for time series problem types")
        return TimeSeriesSplit(
            n_splits=n_splits,
            gap=problem_configuration.get('gap'),
            max_delay=problem_configuration.get('max_delay'))
    if X.shape[0] > _LARGE_DATA_ROW_THRESHOLD:
        if problem_type == ProblemTypes.REGRESSION:
            return TrainingValidationSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle)
        elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
            return BalancedClassificationDataTVSplit(test_size=_LARGE_DATA_PERCENT_VALIDATION, shuffle=shuffle, random_seed=random_seed)
    if problem_type == ProblemTypes.REGRESSION:
        return KFold(n_splits=n_splits, random_state=random_seed, shuffle=shuffle)
    elif problem_type in [ProblemTypes.BINARY, ProblemTypes.MULTICLASS]:
        return BalancedClassificationDataCVSplit(n_splits=n_splits, random_seed=random_seed, shuffle=shuffle)
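# Minimal usage sketch for make_data_splitter. The helper below is hypothetical
# (illustration only, not part of the module) and assumes pd, ProblemTypes, and
# make_data_splitter are in scope, as they are for the definitions above. Per the
# branching above, a small binary problem gets a BalancedClassificationDataCVSplit,
# while a time series problem without a problem_configuration raises a ValueError.
def _example_make_data_splitter_usage():
    X = pd.DataFrame({"a": range(100), "b": [i % 3 for i in range(100)]})
    y = pd.Series([0] * 25 + [1] * 75)
    splitter = make_data_splitter(X, y, ProblemTypes.BINARY, n_splits=3, random_seed=0)
    # the returned object follows the sklearn cross-validator interface
    for train_indices, test_indices in splitter.split(X, y):
        X_train, X_test = X.iloc[train_indices], X.iloc[test_indices]
        y_train, y_test = y.iloc[train_indices], y.iloc[test_indices]
        # fit on X_train/y_train, evaluate on X_test/y_test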
def test_data_splitter_no_error(splitter, value, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    y = pd.Series(y)
    X.iloc[0, :] = value
    data_split = splitter()
    # handles both TV and CV iterations
    next(data_split.split(X, y))
    data_split.transform_sample(X, y)


@pytest.mark.parametrize(
    'balanced_splitter,data_splitter',
    [(BalancedClassificationDataTVSplit(sampling_ratio=1, min_samples=50, test_size=0.2, shuffle=True, random_seed=0),
      TrainingValidationSplit(test_size=0.2, shuffle=True, random_seed=0)),
     (BalancedClassificationDataCVSplit(sampling_ratio=1, min_samples=50, shuffle=True, n_splits=3, random_seed=0),
      StratifiedKFold(shuffle=True, n_splits=3, random_state=0))])
@pytest.mark.parametrize('data_type', ['np', 'pd', 'ww'])
def test_data_splitters_data_type(data_type, balanced_splitter, data_splitter, make_data_type, X_y_binary):
    X, y = X_y_binary
    # make imbalanced
    X_extended = np.append(X, X, 0)