def validate(self, X, y): """Check if any of the features are highly correlated with the target by using mutual information or Pearson correlation. If `method='mutual'`, supports all target and feature types. Otherwise, if `method='pearson'` only supports binary with numeric and boolean dtypes. Pearson correlation returns a value in [-1, 1], while mutual information returns a value in [0, 1]. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check y (ww.DataColumn, pd.Series, np.ndarray): The target data Returns: dict (DataCheckWarning): dict with a DataCheckWarning if target leakage is detected. Example: >>> import pandas as pd >>> X = pd.DataFrame({ ... 'leak': [10, 42, 31, 51, 61], ... 'x': [42, 54, 12, 64, 12], ... 'y': [13, 5, 13, 74, 24], ... }) >>> y = pd.Series([10, 42, 31, 51, 40]) >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95) >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\ "data_check_name": "TargetLeakageDataCheck",\ "level": "warning",\ "code": "TARGET_LEAKAGE",\ "details": {"column": "leak"}}],\ "errors": [],\ "actions": [{"code": "DROP_COL",\ "metadata": {"column": "leak"}}]} """ results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) y = infer_feature_types(y) if self.method == 'pearson': highly_corr_cols = self._calculate_pearson(X, y) else: X = _convert_woodwork_types_wrapper(X.to_dataframe()) y = _convert_woodwork_types_wrapper(y.to_series()) highly_corr_cols = self._calculate_mutual_information(X, y) warning_msg = "Column '{}' is {}% or more correlated with the target" results["warnings"].extend([ DataCheckWarning(message=warning_msg.format( col_name, self.pct_corr_threshold * 100), data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_LEAKAGE, details={ "column": col_name }).to_dict() for col_name in highly_corr_cols ]) results["actions"].extend([ DataCheckAction(DataCheckActionCode.DROP_COL, metadata={ "column": col_name }).to_dict() for col_name in highly_corr_cols ]) return results
def fit_resample(self, X, y): """Resampling technique for this sampler. Arguments: X (pd.DataFrame): Training data to fit and resample y (pd.Series): Training data targets to fit and resample Returns: list: Indices to keep for training data """ X_ww = infer_feature_types(X) y_ww = infer_feature_types(y) X = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) y = _convert_woodwork_types_wrapper(y_ww.to_series()) result = self._find_ideal_samples(y) indices_to_drop = [] if len(result): # iterate through the classes we need to undersample and remove the number of samples we need to remove for key, value in result.items(): indices = y.index[y == key].values indices_to_remove = self.random_state.choice(indices, value, replace=False) indices_to_drop.extend(indices_to_remove) return list(set(list(y.index.values)).difference(set(indices_to_drop)))
def transform(self, X, y=None): """No transformation needs to be done here. Arguments: X (ww.DataFrame): Training features. Ignored. y (ww.DataColumn): Target features. Ignored. Returns: ww.DataTable, ww.DataColumn: X and y data that was passed in. """ X = infer_feature_types(X) if y is not None: y = infer_feature_types(y) return X, y
def fit_transform(self, X, y): """Fit and transform the data using the data sampler. Used during training of the pipeline Arguments: X (ww.DataFrame): Training features y (ww.DataColumn): Target features Returns: ww.DataTable, ww.DataColumn: Sampled X and y data """ self.fit(X, y) _, _, X_pd, y_pd = self._prepare_data(X, y) X_new, y_new = self._component_obj.fit_resample(X_pd, y_pd) return infer_feature_types(X_new), infer_feature_types(y_new)
def test_ensemble_data(mock_fit, mock_score, dummy_binary_pipeline_class, stackable_classifiers):
    X = pd.DataFrame({"a": [i for i in range(100)]})
    y = pd.Series([i % 2 for i in range(100)])
    automl = AutoMLSearch(X_train=X, y_train=y, problem_type='binary', max_batches=19,
                          ensembling=True, _ensembling_split_size=0.25)
    mock_should_continue_callback = MagicMock(return_value=True)
    mock_pre_evaluation_callback = MagicMock()
    mock_post_evaluation_callback = MagicMock()

    training_indices, ensembling_indices, _, _ = split_data(ww.DataTable(np.arange(X.shape[0])), y,
                                                            problem_type='binary', test_size=0.25, random_seed=0)
    training_indices, ensembling_indices = training_indices.to_dataframe()[0].tolist(), ensembling_indices.to_dataframe()[0].tolist()

    engine = SequentialEngine(X_train=infer_feature_types(X),
                              y_train=infer_feature_types(y),
                              ensembling_indices=ensembling_indices,
                              automl=automl,
                              should_continue_callback=mock_should_continue_callback,
                              pre_evaluation_callback=mock_pre_evaluation_callback,
                              post_evaluation_callback=mock_post_evaluation_callback)
    pipeline1 = [dummy_binary_pipeline_class({'Mock Classifier': {'a': 1}})]
    engine.evaluate_batch(pipeline1)
    # check the fit length is correct, taking into account the data splits
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(training_indices))

    input_pipelines = [make_pipeline_from_components([classifier], problem_type='binary')
                       for classifier in stackable_classifiers]
    pipeline2 = [make_pipeline_from_components([StackedEnsembleClassifier(input_pipelines, n_jobs=1)],
                                               problem_type='binary',
                                               custom_name="Stacked Ensemble Classification Pipeline")]
    engine.evaluate_batch(pipeline2)
    assert len(mock_fit.call_args[0][0]) == int(2 / 3 * len(ensembling_indices))
def test_samplers_perform_equally(problem_type, component_sampler, imblearn_sampler, X_y_binary, X_y_multi):
    if problem_type == 'binary':
        X, _ = X_y_binary
        y = np.array([0] * 90 + [1] * 10)
        imb_learn_sampling_ratio = 0.5
        expected_y = np.array([0] * 90 + [1] * 45)
    else:
        X, _ = X_y_multi
        y = np.array([0] * 70 + [1] * 20 + [2] * 10)
        imb_learn_sampling_ratio = {0: 70, 1: 35, 2: 35}
        expected_y = np.array([0] * 70 + [1] * 35 + [2] * 35)
    sampling_ratio = 0.5
    sampling_dic = {'sampling_ratio': sampling_ratio}
    X2 = X
    random_seed = 1
    if component_sampler != SMOTENCSampler:
        component = component_sampler(**sampling_dic, random_seed=random_seed)
        imb_sampler = imblearn_sampler(sampling_strategy=imb_learn_sampling_ratio, random_state=random_seed)
    else:
        X2 = infer_feature_types(X, feature_types={1: "Categorical", 2: "Categorical", 3: "Categorical", 4: "Categorical"})
        component = component_sampler(**sampling_dic, random_seed=random_seed)
        imb_sampler = imblearn_sampler(sampling_strategy=imb_learn_sampling_ratio,
                                       categorical_features=[1, 2, 3, 4], random_state=random_seed)
    X_com, y_com = component.fit_transform(X2, y)
    X_im, y_im = imb_sampler.fit_resample(X, y)
    np.testing.assert_equal(X_com.to_dataframe().values, X_im)
    np.testing.assert_equal(y_com.to_series().values, y_im)
    np.testing.assert_equal(sorted(y_im), expected_y)
def test_oversample_seed_same_outputs(sampler, X_y_binary):
    X, y = X_y_binary
    X = pd.DataFrame(X)
    y = pd.Series([0] * 90 + [1] * 10)

    samplers = []
    for seed in [0, 0, 1]:
        oversampler = sampler(sampling_ratio=1, random_seed=seed)
        if 'NC' in sampler.name:
            X = infer_feature_types(X, feature_types={1: "Categorical"})
            oversampler = sampler(sampling_ratio=1, random_seed=seed)
        samplers.append(oversampler)

    # iterate through different index pairs in samplers
    # in group 1, the first two oversamplers in samplers should be equal
    # in group 2, calling the same oversampler twice should be equal
    # in group 3, the last two oversamplers in samplers should be different
    for s1, s2 in [[0, 1], [1, 1], [1, 2]]:
        X1, y1 = samplers[s1].fit_transform(X, y)
        X2, y2 = samplers[s2].fit_transform(X, y)
        if s2 == 2 and sampler != SMOTENSampler:
            # group 3, SMOTENSampler performance doesn't change with different random states
            with pytest.raises(AssertionError):
                pd.testing.assert_frame_equal(X1.to_dataframe(), X2.to_dataframe())
        else:
            pd.testing.assert_frame_equal(X1.to_dataframe(), X2.to_dataframe())
        pd.testing.assert_series_equal(y1.to_series(), y2.to_series())
def test_oversample_imbalanced_binary(data_type, sampler, make_data_type):
    X = np.array([[i for i in range(1000)],
                  [i % 7 for i in range(1000)],
                  [0.3 * (i % 3) for i in range(1000)]]).T
    y = np.array([0] * 150 + [1] * 850)
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    oversampler = sampler
    if oversampler.name == "SMOTENC Oversampler":
        X2 = infer_feature_types(X, feature_types={1: "Categorical"})
        if data_type == "ww":
            X2 = X2.set_types({0: "Categorical"})
        new_X, new_y = oversampler.fit_transform(X2, y)
    else:
        new_X, new_y = oversampler.fit_transform(X, y)

    new_length = 1700
    assert len(new_X) == new_length
    assert len(new_y) == new_length
    value_counts = new_y.to_series().value_counts()
    assert value_counts.values[0] == value_counts.values[1]
    pd.testing.assert_series_equal(value_counts, pd.Series([850, 850]), check_dtype=False)

    transform_X, transform_y = oversampler.transform(X, y)
    if data_type == "ww":
        X = X.to_dataframe().values
        y = y.to_series().values
    elif data_type == "pd":
        X = X.values
        y = y.values
    np.testing.assert_equal(X, transform_X.to_dataframe().values)
    np.testing.assert_equal(y, transform_y.to_series().values)
def fit_resample(self, X, y): """Resampling technique for this sampler. Arguments: X (pd.DataFrame): Training data to fit and resample y (pd.Series): Training data targets to fit and resample Returns: list: Indices to keep for training data """ y_ww = infer_feature_types(y) y = _convert_woodwork_types_wrapper(y_ww.to_series()) # if we have a dictionary provided, opt to use that if len(self.sampling_ratio_dict): result = self._sampling_dict_to_remove_dict(y) else: result = self._find_ideal_samples(y) indices_to_drop = [] if len(result): # iterate through the classes we need to undersample and remove the number of samples we need to remove for key, value in result.items(): indices = y.index[y == key].values indices_to_remove = self.random_state.choice(indices, value, replace=False) indices_to_drop.extend(indices_to_remove) # indices of the y datacolumn original_indices = list( set(y.index.values).difference(set(indices_to_drop))) return original_indices
def validate(self, X, y=None): """Checks if there are any columns in the input that are too unique in the case of classification problems or not unique enough in the case of regression problems. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): Features. y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None. Returns: dict: dict with a DataCheckWarning if there are any too unique or not unique enough columns. Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 'regression_unique_enough': [float(x) for x in range(100)], ... 'regression_not_unique_enough': [float(1) for x in range(100)] ... }) >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8) >>> assert uniqueness_check.validate(df) == {"errors": [],\ "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\ "data_check_name": "UniquenessDataCheck",\ "level": "warning",\ "code": "NOT_UNIQUE_ENOUGH",\ "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\ "actions": []} """ results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) X = _convert_woodwork_types_wrapper(X.to_dataframe()) res = X.apply(UniquenessDataCheck.uniqueness_score) if is_regression(self.problem_type): not_unique_enough_cols = list(res.index[res < self.threshold]) results["warnings"].extend([ DataCheckWarning( message=warning_not_unique_enough.format( col_name, self.problem_type), data_check_name=self.name, message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH, details={ "column": col_name, "uniqueness_score": res.loc[col_name] }).to_dict() for col_name in not_unique_enough_cols ]) elif is_multiclass(self.problem_type): too_unique_cols = list(res.index[res > self.threshold]) results["warnings"].extend([ DataCheckWarning(message=warning_too_unique.format( col_name, self.problem_type), data_check_name=self.name, message_code=DataCheckMessageCode.TOO_UNIQUE, details={ "column": col_name, "uniqueness_score": res.loc[col_name] }).to_dict() for col_name in too_unique_cols ]) return results
def _calculate_mutual_information(self, X, y):
    highly_corr_cols = []
    for col in X.columns:
        # pair each feature column with the target and compute their mutual information
        cols_to_compare = infer_feature_types(pd.DataFrame({col: X[col], str(col) + "y": y}))
        mutual_info = cols_to_compare.mutual_information()
        if len(mutual_info) > 0 and mutual_info['mutual_info'].iloc[0] > self.pct_corr_threshold:
            highly_corr_cols.append(col)
    return highly_corr_cols
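# Hedged sketch: unlike Pearson correlation, mutual information is defined for
# categorical features and targets, which is why `method='mutual'` supports all types.
# The column names and data below are illustrative assumptions.
import pandas as pd

feature = pd.Series(['red', 'blue', 'red', 'blue', 'red'])
target = pd.Series(['yes', 'no', 'yes', 'no', 'yes'])
pair = infer_feature_types(pd.DataFrame({'colour': feature, 'colour_y': target}))
mi = pair.mutual_information()  # DataFrame with a 'mutual_info' column, values in [0, 1]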
def _prepare_data(self, X, y):
    """Transforms the input data to pandas data structures that our sampler can ingest.

    Arguments:
        X (ww.DataTable): Training features
        y (ww.DataColumn): Target data

    Returns:
        ww.DataTable, ww.DataColumn, pd.DataFrame, pd.Series: Prepared X and y data, both woodwork and pandas
    """
    X = infer_feature_types(X)
    if y is None:
        raise ValueError("y cannot be none")
    y = infer_feature_types(y)
    X_pd = _convert_woodwork_types_wrapper(X.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(y.to_series())
    return X, y, X_pd, y_pd
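# Hedged sketch of the conversion pattern used throughout these components: woodwork
# types are inferred first, then unwrapped back to plain pandas for libraries (e.g.
# imbalanced-learn) that expect DataFrames/Series. Data here is illustrative.
import pandas as pd

X_ww = infer_feature_types(pd.DataFrame({"a": [1, 2, 3]}))
X_pd = _convert_woodwork_types_wrapper(X_ww.to_dataframe())
assert isinstance(X_pd, pd.DataFrame)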
def transform(self, X, y=None):
    """Transforms the input data with the fitted component and appends each transformed
    component as a new `component_{i}_fe` feature column."""
    X_return = X.to_dataframe().copy()
    # run the transform on a float32 cudf copy of the input
    X_embedded = self._component_obj.transform(cudf.from_pandas(X.to_dataframe()).astype('float32'))
    for i in range(len(X_embedded.columns)):
        X_return[f'component_{i}_fe'] = X_embedded[i].to_array()
    return infer_feature_types(X_return)
def test_tune_binary_threshold(mock_fit, mock_score, mock_predict_proba, mock_optimize_threshold,
                               dummy_binary_pipeline_class, X_y_binary):
    mock_optimize_threshold.return_value = 0.42
    mock_score.return_value = {'F1': 1.0}
    X, y = X_y_binary
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'binary', X, y)
    assert pipeline.threshold == 0.42

    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'binary', None, None)
    assert pipeline.threshold == 0.5

    pipeline = dummy_binary_pipeline_class({})
    tune_binary_threshold(pipeline, F1(), 'multiclass', X, y)
    assert pipeline.threshold is None
def split(self, X, y): """Splits and returns the indices of the training and testing using the data sampler provided. Arguments: X (ww.DataTable): DataTable of points to split y (ww.DataTable): DataColumn of points to split Returns: tuple(train, test): A tuple containing the resulting train and test indices, post sampling. """ X_ww = infer_feature_types(X) y_ww = infer_feature_types(y) X = _convert_woodwork_types_wrapper(X_ww.to_dataframe()) y = _convert_woodwork_types_wrapper(y_ww.to_series()) index_df = pd.Series(y.index) for train, test in self.splitter.split(X, y): X_train, y_train = X.iloc[train], y.iloc[train] train_index_drop = self.sampler.fit_resample(X_train, y_train) # convert the indices of the y column into index indices of the original pre-split y train_indices = index_df[index_df.isin(train_index_drop)].dropna().index.values.tolist() yield iter([train_indices, test])
def test_none_y(sampler):
    X = pd.DataFrame({"a": [i for i in range(5)], "b": [1 for i in range(5)]})
    X = infer_feature_types(X, feature_types={"a": "Categorical"})
    oversampler = sampler
    with pytest.raises(ValueError, match="y cannot be none"):
        oversampler.fit(X, None)
    with pytest.raises(ValueError, match="y cannot be none"):
        oversampler.fit_transform(X, None)
    oversampler.fit(X, pd.Series([0] * 4 + [1]))
    oversampler.transform(X, None)
def validate(self, X, y=None): """Calculates what percentage of each column's unique values exceed the count threshold and compare that percentage to the sparsity threshold stored in the class instance. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): Features. y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Returns: dict: dict with a DataCheckWarning if there are any sparse columns. Example: >>> import pandas as pd >>> df = pd.DataFrame({ ... 'sparse': [float(x) for x in range(100)], ... 'not_sparse': [float(1) for x in range(100)] ... }) >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10) >>> assert sparsity_check.validate(df) == {"errors": [],\ "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\ "data_check_name": "SparsityDataCheck",\ "level": "warning",\ "code": "TOO_SPARSE",\ "details": {"column": "sparse", 'sparsity_score': 0.0}}],\ "actions": [{"code": "DROP_COL",\ "metadata": {"column": "sparse"}}]} """ results = {"warnings": [], "errors": [], "actions": []} X = infer_feature_types(X) X = _convert_woodwork_types_wrapper(X.to_dataframe()) res = X.apply(SparsityDataCheck.sparsity_score, count_threshold=self.unique_count_threshold) too_sparse_cols = [col for col in res.index[res < self.threshold]] results["warnings"].extend([ DataCheckWarning(message=warning_too_unique.format( col_name, self.problem_type), data_check_name=self.name, message_code=DataCheckMessageCode.TOO_SPARSE, details={ "column": col_name, "sparsity_score": res.loc[col_name] }).to_dict() for col_name in too_sparse_cols ]) results["actions"].extend([ DataCheckAction(action_code=DataCheckActionCode.DROP_COL, metadata={ "column": col_name }).to_dict() for col_name in too_sparse_cols ]) return results
def transform_sample(self, X, y):
    """Transforms the input data with the balancing strategy.

    Arguments:
        X (ww.DataTable): DataTable of points to split
        y (ww.DataColumn): DataColumn of points to split

    Returns:
        list: List of indices to keep
    """
    y_ww = infer_feature_types(y)
    y = _convert_woodwork_types_wrapper(y_ww.to_series())
    index_df = pd.Series(y.index)
    train_index_drop = self.sampler.fit_resample(X, y)
    # convert the labels kept by the sampler into positional indices of the original pre-split y
    train_indices = index_df[index_df.isin(train_index_drop)].dropna().index.values.tolist()
    return train_indices
def validate(self, X, y=None): """Checks if any natural language columns contain NaN values. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): Features. y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None. Returns: dict: dict with a DataCheckError if NaN values are present in natural language columns. Example: >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np >>> data = pd.DataFrame() >>> data['A'] = [None, "string_that_is_long_enough_for_natural_language"] >>> data['B'] = ['string_that_is_long_enough_for_natural_language', 'string_that_is_long_enough_for_natural_language'] >>> data['C'] = np.random.randint(0, 3, size=len(data)) >>> data = ww.DataTable(data, logical_types={'A': 'NaturalLanguage', 'B': 'NaturalLanguage'}) >>> nl_nan_check = NaturalLanguageNaNDataCheck() >>> assert nl_nan_check.validate(data) == { ... "warnings": [], ... "actions": [], ... "errors": [DataCheckError(message='Input natural language column(s) (A) contains NaN values. Please impute NaN values or drop these rows or columns.', ... data_check_name=NaturalLanguageNaNDataCheck.name, ... message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, ... details={"columns": 'A'}).to_dict()] ... } """ results = { "warnings": [], "errors": [], "actions": [] } X = infer_feature_types(X) X = X.select('natural_language') X_describe = X.describe_dict() nan_columns = [str(col) for col in X_describe if X_describe[col]['nan_count'] > 0] if len(nan_columns) > 0: cols_str = ', '.join(nan_columns) results["errors"].append(DataCheckError(message=error_contains_nan.format(cols_str), data_check_name=self.name, message_code=DataCheckMessageCode.NATURAL_LANGUAGE_HAS_NAN, details={"columns": cols_str}).to_dict()) return results
def test_smotenc_output_shape(X_y_binary):
    X, y = X_y_binary
    y_imbalanced = pd.Series([0] * 90 + [1] * 10)
    X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'})
    snc = SMOTENCSampler()
    with pytest.raises(ComponentNotYetFittedError, match='You must fit SMOTENCSampler'):
        snc.transform(X_ww, y)
    # test sampling and no sampling
    for y_value in [y, y_imbalanced]:
        snc.fit(X_ww, y_value)
        X_out, y_out = snc.transform(X_ww, y_value)
        assert X_out.shape[1] == X_ww.shape[1]
        assert y_out.shape[0] == X_out.shape[0]

        X_out, y_out = snc.fit_transform(X_ww, y)
        assert X_out.shape[1] == X_ww.shape[1]
        assert y_out.shape[0] == X_out.shape[0]
def validate(self, X, y=None): """Checks if any datetime columns contain NaN values. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): Features. y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None. Returns: dict: dict with a DataCheckError if NaN values are present in datetime columns. Example: >>> import pandas as pd >>> import woodwork as ww >>> import numpy as np >>> dates = np.arange(np.datetime64('2017-01-01'), np.datetime64('2017-01-08')) >>> dates[0] = np.datetime64('NaT') >>> ww_input = ww.DataTable(pd.DataFrame(dates, columns=['index'])) >>> dt_nan_check = DateTimeNaNDataCheck() >>> assert dt_nan_check.validate(ww_input) == {"warnings": [], ... "actions": [], ... "errors": [DataCheckError(message='Input datetime column(s) (index) contains NaN values. Please impute NaN values or drop these rows or columns.', ... data_check_name=DateTimeNaNDataCheck.name, ... message_code=DataCheckMessageCode.DATETIME_HAS_NAN, ... details={"columns": 'index'}).to_dict()]} """ results = { "warnings": [], "errors": [], "actions": [] } X = infer_feature_types(X) datetime_cols = _convert_woodwork_types_wrapper(X.select("datetime").to_dataframe()) nan_columns = datetime_cols.columns[datetime_cols.isna().any()].tolist() if len(nan_columns) > 0: nan_columns = [str(col) for col in nan_columns] cols_str = ', '.join(nan_columns) if len(nan_columns) > 1 else nan_columns[0] results["errors"].append(DataCheckError(message=error_contains_nan.format(cols_str), data_check_name=self.name, message_code=DataCheckMessageCode.DATETIME_HAS_NAN, details={"columns": cols_str}).to_dict()) return results
def test_no_oversample(data_type, sampler, make_data_type, X_y_binary):
    X, y = X_y_binary
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    oversampler = sampler
    if oversampler.name == "SMOTENC Oversampler":
        X2 = infer_feature_types(X, feature_types={1: "Categorical"})
        if data_type == "ww":
            X2 = X2.set_types({0: "Categorical"})
        new_X, new_y = oversampler.fit_transform(X2, y)
    else:
        new_X, new_y = oversampler.fit_transform(X, y)

    if data_type == "ww":
        X = X.to_dataframe().values
        y = y.to_series().values
    elif data_type == "pd":
        X = X.values
        y = y.values
    np.testing.assert_equal(X, new_X.to_dataframe().values)
    np.testing.assert_equal(y, new_y.to_series().values)
def test_oversample_imbalanced_multiclass(data_type, sampler, sampling_ratio, make_data_type):
    X = np.array([[i for i in range(1000)],
                  [i % 7 for i in range(1000)],
                  [0.3 * (i % 3) for i in range(1000)]]).T
    y = np.array([0] * 800 + [1] * 100 + [2] * 100)
    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    X2 = X
    oversampler = sampler(sampling_ratio=sampling_ratio)
    if sampler.name == 'SMOTENC Oversampler':
        X2 = infer_feature_types(X, feature_types={0: "Categorical"})
        if data_type == "ww":
            X2 = X2.set_types({0: "Categorical"})
        oversampler = sampler(sampling_ratio=sampling_ratio)
    new_X, new_y = oversampler.fit_transform(X2, y)

    num_samples = [800, 800 * sampling_ratio, 800 * sampling_ratio]
    # check the lengths and sampled values are as we expect
    assert len(new_X) == sum(num_samples)
    assert len(new_y) == sum(num_samples)
    value_counts = new_y.to_series().value_counts()
    assert value_counts.values[1] == value_counts.values[2]
    np.testing.assert_equal(value_counts.values, np.array(num_samples))

    transform_X, transform_y = oversampler.transform(X2, y)
    if data_type == "ww":
        X = X.to_dataframe().values
        y = y.to_series().values
    elif data_type == "pd":
        X = X.values
        y = y.values
    np.testing.assert_equal(X, transform_X.to_dataframe().values)
    np.testing.assert_equal(y, transform_y.to_series().values)
def predict_proba(self, X):
    """Predicts class probabilities with the fitted estimator, converting the input to a
    float32 cudf DataFrame and the output back to pandas."""
    predictions_pandas = self._component_obj.predict_proba(
        cudf.DataFrame.from_pandas(X.to_dataframe().astype('float32'))).to_pandas()
    return infer_feature_types(predictions_pandas)
def feature_importance(self):
    """Returns the fitted estimator's feature importances, converted from cudf back to pandas."""
    return infer_feature_types(self._component_obj.feature_importances_.to_pandas())
def validate(self, X, y): """Checks if the target data contains missing or invalid values. Arguments: X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored. y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values. Returns: dict (DataCheckError): List with DataCheckErrors if any invalid values are found in the target data. Example: >>> import pandas as pd >>> X = pd.DataFrame({"col": [1, 2, 3, 1]}) >>> y = pd.Series([0, 1, None, None]) >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary') >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\ "data_check_name": "InvalidTargetDataCheck",\ "level": "error",\ "code": "TARGET_HAS_NULL",\ "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\ "warnings": [],\ "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]} """ results = {"warnings": [], "errors": [], "actions": []} if y is None: results["errors"].append( DataCheckError( message="Target is None", data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_IS_NONE, details={}).to_dict()) return results y = infer_feature_types(y) is_supported_type = y.logical_type in numeric_and_boolean_ww + [ ww.logical_types.Categorical ] if not is_supported_type: results["errors"].append( DataCheckError( message= "Target is unsupported {} type. Valid Woodwork logical types include: {}" .format( y.logical_type, ", ".join([ ltype.type_string for ltype in numeric_and_boolean_ww ])), data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, details={ "unsupported_type": y.logical_type.type_string }).to_dict()) y_df = _convert_woodwork_types_wrapper(y.to_series()) null_rows = y_df.isnull() if null_rows.all(): results["errors"].append( DataCheckError(message="Target is either empty or fully null.", data_check_name=self.name, message_code=DataCheckMessageCode. TARGET_IS_EMPTY_OR_FULLY_NULL, details={}).to_dict()) return results elif null_rows.any(): num_null_rows = null_rows.sum() pct_null_rows = null_rows.mean() * 100 results["errors"].append( DataCheckError( message="{} row(s) ({}%) of target values are null".format( num_null_rows, pct_null_rows), data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_HAS_NULL, details={ "num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows }).to_dict()) impute_strategy = "mean" if is_regression( self.problem_type) else "most_frequent" results["actions"].append( DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={ "column": None, "is_target": True, "impute_strategy": impute_strategy }).to_dict()) value_counts = y_df.value_counts() unique_values = value_counts.index.tolist() if is_binary(self.problem_type) and len(value_counts) != 2: if self.n_unique is None: details = {"target_values": unique_values} else: details = { "target_values": unique_values[:min(self.n_unique, len(unique_values))] } results["errors"].append( DataCheckError( message= "Binary class targets require exactly two unique values.", data_check_name=self.name, message_code=DataCheckMessageCode. 
TARGET_BINARY_NOT_TWO_UNIQUE_VALUES, details=details).to_dict()) if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags: results["errors"].append( DataCheckError( message= "Target data type should be numeric for regression type problems.", data_check_name=self.name, message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, details={}).to_dict()) if is_multiclass(self.problem_type): if value_counts.min() <= 1: least_populated = value_counts[value_counts <= 1] details = { "least_populated_class_labels": least_populated.index.tolist() } results["errors"].append( DataCheckError( message= "Target does not have at least two instances per class which is required for multiclass classification", data_check_name=self.name, message_code=DataCheckMessageCode. TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS, details=details).to_dict()) if len(unique_values) <= 2: details = {"num_classes": len(unique_values)} results["errors"].append( DataCheckError( message= "Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.", data_check_name=self.name, message_code=DataCheckMessageCode. TARGET_MULTICLASS_NOT_ENOUGH_CLASSES, details=details).to_dict()) num_class_to_num_value_ratio = len(unique_values) / len(y) if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold: details = { "class_to_value_ratio": num_class_to_num_value_ratio } results["warnings"].append( DataCheckWarning( message= "Target has a large number of unique values, could be regression type problem.", data_check_name=self.name, message_code=DataCheckMessageCode. TARGET_MULTICLASS_HIGH_UNIQUE_CLASS, details=details).to_dict()) any_neg = not (y_df > 0).all() if y.logical_type in [ ww.logical_types.Integer, ww.logical_types.Double ] else None if any_neg and self.objective.positive_only: details = { "Count of offending values": sum(val <= 0 for val in y_df.values.flatten()) } results["errors"].append( DataCheckError( message= f"Target has non-positive values which is not supported for {self.objective.name}", data_check_name=self.name, message_code=DataCheckMessageCode. TARGET_INCOMPATIBLE_OBJECTIVE, details=details).to_dict()) if X is not None: X = infer_feature_types(X) X_index = list(X.to_dataframe().index) y_index = list(y_df.index) X_length = len(X_index) y_length = len(y_index) if X_length != y_length: results["warnings"].append( DataCheckWarning( message= "Input target and features have different lengths", data_check_name=self.name, message_code=DataCheckMessageCode.MISMATCHED_LENGTHS, details={ "features_length": X_length, "target_length": y_length }).to_dict()) if X_index != y_index: if set(X_index) == set(y_index): results["warnings"].append( DataCheckWarning( message= "Input target and features have mismatched indices order", data_check_name=self.name, message_code=DataCheckMessageCode. MISMATCHED_INDICES_ORDER, details={}).to_dict()) else: index_diff_not_in_X = list(set(y_index) - set(X_index))[:10] index_diff_not_in_y = list(set(X_index) - set(y_index))[:10] results["warnings"].append( DataCheckWarning( message= "Input target and features have mismatched indices", data_check_name=self.name, message_code=DataCheckMessageCode. MISMATCHED_INDICES, details={ "indices_not_in_features": index_diff_not_in_X, "indices_not_in_target": index_diff_not_in_y }).to_dict()) return results
def _get_categorical(self, X):
    X = infer_feature_types(X)
    # record the positional indices of any categorical columns so the sampler can handle them separately
    self.categorical_features = [i for i, val in enumerate(X.types['Logical Type'].items())
                                 if str(val[1]) == 'Categorical']
    self._parameters['categorical_features'] = self.categorical_features
def test_smotenc_categorical_features(X_y_binary):
    X, y = X_y_binary
    X_ww = infer_feature_types(X, feature_types={0: 'Categorical', 1: 'Categorical'})
    snc = SMOTENCSampler()
    X_out, y_out = snc.fit_transform(X_ww, y)
    assert snc.categorical_features == [0, 1]