def _check_for_errors(self, column_name, count_unique, any_nulls):
    """Check whether a column has too little variance to be useful.

    Arguments:
        column_name (str): Name of the column we are checking.
        count_unique (float): Number of unique values in this column.
        any_nulls (bool): Whether this column has any missing data.

    Returns:
        DataCheckError if the column has at most one unique value,
        DataCheckWarning if it has exactly two unique values, nulls are not
        being dropped (``self._dropnan`` is falsy) and the column contains
        nulls; None otherwise.
    """
    # Built before branching so the int() conversion runs on every call.
    # The f-string is already fully interpolated and must not be passed
    # through str.format() again: a column name containing "{" or "}" would
    # then raise (KeyError/IndexError/ValueError) or be silently rewritten.
    message = f"{column_name} has {int(count_unique)} unique value."
    if count_unique <= 1:
        return DataCheckError(
            message=message,
            data_check_name=self.name,
            message_code=DataCheckMessageCode.NO_VARIANCE,
            details={"column": column_name})
    if count_unique == 2 and not self._dropnan and any_nulls:
        return DataCheckWarning(
            message=f"{column_name} has two unique values including nulls. "
            "Consider encoding the nulls for "
            "this column to be useful for machine learning.",
            data_check_name=self.name,
            message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL,
            details={"column": column_name})
    return None
def test_multicollinearity_returns_warning():
    """Columns that are linear functions of one another are reported pairwise."""
    base = pd.Series([1, 0, 2, 3, 4])
    features = pd.DataFrame({
        'col_1': base,
        'col_2': base * 3,
        'col_3': ~base,
        'col_4': base / 2,
        'col_5': base + 1,
        'not_collinear': [0, 1, 0, 0, 0],
    })

    # Every pair drawn from col_1..col_5; 'not_collinear' appears in none.
    expected_pairs = [
        ('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'),
        ('col_1', 'col_5'), ('col_2', 'col_3'), ('col_2', 'col_4'),
        ('col_2', 'col_5'), ('col_3', 'col_4'), ('col_3', 'col_5'),
        ('col_4', 'col_5'),
    ]
    expected_warning = DataCheckWarning(
        message="Columns are likely to be correlated: [('col_1', 'col_2'), ('col_1', 'col_3'), ('col_1', 'col_4'), ('col_1', 'col_5'), ('col_2', 'col_3'), ('col_2', 'col_4'), ('col_2', 'col_5'), ('col_3', 'col_4'), ('col_3', 'col_5'), ('col_4', 'col_5')]",
        data_check_name=multi_data_check_name,
        message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
        details={'columns': expected_pairs},
    ).to_dict()

    checker = MulticollinearityDataCheck(threshold=0.95)
    outcome = checker.validate(features)
    assert outcome == {"warnings": [expected_warning], "errors": []}
def test_outliers_data_check_warnings():
    """Columns holding an injected extreme value are reported together in one warning."""
    base_row = np.arange(10) * 0.01
    X = pd.DataFrame(data=np.tile(base_row, (100, 10)))

    # (row, column, value) triples for the injected extreme values.
    injected = ((0, 3, 1000), (3, 25, 1000), (5, 55, 10000), (10, 72, -1000))
    for row, column, value in injected:
        X.iloc[row, column] = value
    # Column 90 is overwritten with strings; it is absent from the expected output.
    X.iloc[:, 90] = 'string_values'

    expected_warning = DataCheckWarning(
        message="Column(s) '3', '25', '55', '72' are likely to have outlier data.",
        data_check_name=outliers_data_check_name,
        message_code=DataCheckMessageCode.HAS_OUTLIERS,
        details={"columns": [3, 25, 55, 72]},
    ).to_dict()

    outcome = OutliersDataCheck().validate(X)
    assert outcome == {"warnings": [expected_warning], "errors": []}
def validate(self, X, y):
    """Flag features that are highly correlated with the target.

    Correlation is measured with Pearson correlation when
    ``self.method == 'pearson'`` and with mutual information otherwise.
    Mutual information supports all target and feature types; Pearson only
    supports binary with numeric and boolean dtypes. Pearson correlation
    returns a value in [-1, 1], while mutual information returns a value in
    [0, 1].

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y (ww.DataColumn, pd.Series, np.ndarray): The target data

    Returns:
        dict: "warnings" holds one DataCheckWarning dict per flagged column,
            "actions" holds one DROP_COL action dict per flagged column, and
            "errors" is always empty.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({
        ...    'leak': [10, 42, 31, 51, 61],
        ...    'x': [42, 54, 12, 64, 12],
        ...    'y': [13, 5, 13, 74, 24],
        ... })
        >>> y = pd.Series([10, 42, 31, 51, 40])
        >>> target_leakage_check = TargetLeakageDataCheck(pct_corr_threshold=0.95)
        >>> assert target_leakage_check.validate(X, y) == {"warnings": [{"message": "Column 'leak' is 95.0% or more correlated with the target",\
                                                                          "data_check_name": "TargetLeakageDataCheck",\
                                                                          "level": "warning",\
                                                                          "code": "TARGET_LEAKAGE",\
                                                                          "details": {"column": "leak"}}],\
                                                            "errors": [],\
                                                            "actions": [{"code": "DROP_COL",\
                                                                         "metadata": {"column": "leak"}}]}
    """
    X = infer_feature_types(X)
    y = infer_feature_types(y)

    if self.method == 'pearson':
        leaking_columns = self._calculate_pearson(X, y)
    else:
        X = _convert_woodwork_types_wrapper(X.to_dataframe())
        y = _convert_woodwork_types_wrapper(y.to_series())
        leaking_columns = self._calculate_mutual_information(X, y)

    template = "Column '{}' is {}% or more correlated with the target"

    warnings = []
    for col_name in leaking_columns:
        warning = DataCheckWarning(
            message=template.format(col_name, self.pct_corr_threshold * 100),
            data_check_name=self.name,
            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
            details={"column": col_name})
        warnings.append(warning.to_dict())

    actions = []
    for col_name in leaking_columns:
        action = DataCheckAction(DataCheckActionCode.DROP_COL,
                                 metadata={"column": col_name})
        actions.append(action.to_dict())

    return {"warnings": warnings, "errors": [], "actions": actions}
def test_multicollinearity_nonnumeric_cols(data_type, make_data_type):
    """String-valued columns are also compared, for each parametrized input type."""
    frame = pd.DataFrame({
        'col_1': ["a", "b", "c", "d", "a"],
        'col_2': ["w", "x", "y", "z", "b"],
        'col_3': ["a", "a", "c", "d", "a"],
        'col_4': ["a", "b", "c", "d", "a"],
        'col_5': ["0", "0", "1", "2", "0"],
        'col_6': [1, 1, 2, 3, 1],
    })
    X = make_data_type(data_type, frame)

    expected_pairs = [
        ('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'),
        ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4'),
    ]
    expected_warning = DataCheckWarning(
        message="Columns are likely to be correlated: [('col_1', 'col_4'), ('col_3', 'col_5'), ('col_3', 'col_6'), ('col_5', 'col_6'), ('col_1', 'col_2'), ('col_2', 'col_4')]",
        data_check_name=multi_data_check_name,
        message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
        details={'columns': expected_pairs},
    ).to_dict()

    checker = MulticollinearityDataCheck(threshold=0.9)
    outcome = checker.validate(X)
    assert outcome == {"warnings": [expected_warning], "errors": []}
def validate(self, X, y=None):
    """Check if any set of features are likely to be multicollinear.

    Column pairs whose mutual information is at or above ``self.threshold``
    are reported together in a single warning.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y: Unused.

    Returns:
        dict: dict with a DataCheckWarning if there are any potentially multicollinear columns.
    """
    results = {"warnings": [], "errors": []}

    X = infer_feature_types(X)
    pairwise_info = X.mutual_information()
    if pairwise_info.empty:
        return results

    is_strong = pairwise_info['mutual_info'] >= self.threshold
    strong_rows = pairwise_info.loc[is_strong]
    column_pairs = list(zip(strong_rows['column_1'], strong_rows['column_2']))
    if not column_pairs:
        return results

    warning = DataCheckWarning(
        message="Columns are likely to be correlated: {}".format(column_pairs),
        data_check_name=self.name,
        message_code=DataCheckMessageCode.IS_MULTICOLLINEAR,
        details={"columns": column_pairs})
    results["warnings"].append(warning.to_dict())
    return results
def test_default_data_checks_regression(input_type):
    """Default regression checks yield the module-level ``messages``, split into warnings and errors."""
    X = pd.DataFrame({
        'lots_of_null': [None, None, None, None, "some data"],
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 5, 5],
        'id': [0, 1, 2, 3, 4],
        'has_label_leakage': [100, 200, 100, 200, 100],
    })
    y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
    y_no_variance = pd.Series([5] * 5)
    if input_type == "ww":
        X = ww.DataTable(X)
        y = ww.DataColumn(y)
        y_no_variance = ww.DataColumn(y_no_variance)

    leakage_warning = DataCheckWarning(
        message="Column 'lots_of_null' is 95.0% or more correlated with the target",
        data_check_name="TargetLeakageDataCheck",
        message_code=DataCheckMessageCode.TARGET_LEAKAGE,
        details={"column": "lots_of_null"},
    ).to_dict()
    null_leakage = [leakage_warning]

    no_variance_error = DataCheckError(
        message="Y has 1 unique value.",
        data_check_name="NoVarianceDataCheck",
        message_code=DataCheckMessageCode.NO_VARIANCE,
        details={"column": "Y"},
    ).to_dict()

    objective = get_default_primary_search_objective("regression")
    default_checks = DefaultDataChecks("regression", objective)

    expected_default = {"warnings": messages[:3], "errors": messages[3:]}
    assert default_checks.validate(X, y) == expected_default

    # Skip Invalid Target
    expected_no_variance = {
        "warnings": messages[:3] + null_leakage,
        "errors": messages[4:] + [no_variance_error],
    }
    assert default_checks.validate(X, y_no_variance) == expected_no_variance

    # Building the same checks by hand from the default class list must agree.
    check_params = {
        "InvalidTargetDataCheck": {
            "problem_type": "regression",
            "objective": get_default_primary_search_objective("regression"),
        }
    }
    manual_checks = DataChecks(DefaultDataChecks._DEFAULT_DATA_CHECK_CLASSES,
                               check_params)
    assert manual_checks.validate(X, y) == {
        "warnings": messages[:3],
        "errors": messages[3:],
    }
def test_sparsity_data_check_warnings():
    """Columns whose sparsity score is below the threshold are each warned about."""
    row_ids = range(10)
    data = pd.DataFrame({
        'most_sparse': [float(i) for i in row_ids],  # [0,1,2,3,4,5,6,7,8,9]
        'more_sparse': [i % 5 for i in row_ids],  # [0,1,2,3,4,0,1,2,3,4]
        'sparse': [i % 3 for i in row_ids],  # [0,1,2,0,1,2,0,1,2,0]
        'less_sparse': [i % 2 for i in row_ids],  # [0,1,0,1,0,1,0,1,0,1]
        'not_sparse': [float(1) for _ in row_ids],  # [1,1,1,1,1,1,1,1,1,1]
    })

    most_sparse_warning = DataCheckWarning(
        message="Input columns (most_sparse) for multiclass problem type are too sparse.",
        data_check_name=sparsity_data_check_name,
        message_code=DataCheckMessageCode.TOO_SPARSE,
        details={"column": "most_sparse", 'sparsity_score': 0},
    ).to_dict()
    more_sparse_warning = DataCheckWarning(
        message="Input columns (more_sparse) for multiclass problem type are too sparse.",
        data_check_name=sparsity_data_check_name,
        message_code=DataCheckMessageCode.TOO_SPARSE,
        details={"column": "more_sparse", 'sparsity_score': 0},
    ).to_dict()
    sparse_warning = DataCheckWarning(
        message="Input columns (sparse) for multiclass problem type are too sparse.",
        data_check_name=sparsity_data_check_name,
        message_code=DataCheckMessageCode.TOO_SPARSE,
        details={"column": "sparse", 'sparsity_score': 0.3333333333333333},
    ).to_dict()

    sparsity_check = SparsityDataCheck(problem_type="multiclass",
                                       threshold=.4,
                                       unique_count_threshold=3)
    outcome = sparsity_check.validate(data)
    assert outcome == {
        "warnings": [most_sparse_warning, more_sparse_warning, sparse_warning],
        "errors": [],
        "actions": [],
    }
def validate(self, X, y):
    """Check target labels for class imbalance in binary and multiclass problems.

    NaN values in the target are ignored. Two independent checks are made:
    labels with fewer than ``self.cv_folds`` instances produce an error, and
    labels whose share of the target is below ``self.threshold`` produce a
    warning.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
        y (ww.DataColumn, pd.Series, np.ndarray): Target labels to check for imbalanced data.

    Returns:
        dict: Dictionary with DataCheckWarnings if imbalance in classes is less than the threshold,
            and DataCheckErrors if the number of values for each target is below 2 * num_cv_folds.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame()
        >>> y = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
        >>> target_check = ClassImbalanceDataCheck(threshold=0.10)
        >>> assert target_check.validate(X, y) == {"errors": [{"message": "The number of instances of these targets is less than 2 * the number of cross folds = 6 instances: [0]",\
                                                               "data_check_name": "ClassImbalanceDataCheck",\
                                                               "level": "error",\
                                                               "code": "CLASS_IMBALANCE_BELOW_FOLDS",\
                                                               "details": {"target_values": [0]}}],\
                                                   "warnings": [{"message": "The following labels fall below 10% of the target: [0]",\
                                                                 "data_check_name": "ClassImbalanceDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "CLASS_IMBALANCE_BELOW_THRESHOLD",\
                                                                 "details": {"target_values": [0]}}]}
    """
    messages = {"warnings": [], "errors": []}

    target = _convert_woodwork_types_wrapper(infer_feature_types(y).to_series())
    label_counts = target.value_counts()

    # First: labels with too few instances for the configured fold count.
    too_few = label_counts[label_counts < self.cv_folds]
    if len(too_few) > 0:
        rare_labels = too_few.index.tolist()
        text = "The number of instances of these targets is less than 2 * the number of cross folds = {} instances: {}".format(
            self.cv_folds, rare_labels)
        fold_error = DataCheckError(
            message=text,
            data_check_name=self.name,
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_FOLDS,
            details={"target_values": rare_labels})
        DataCheck._add_message(fold_error, messages)

    # Second: labels whose fraction of the target falls below the threshold.
    label_shares = label_counts / label_counts.sum()
    under_share = label_shares[label_shares < self.threshold]
    if len(under_share) > 0:
        minority_labels = under_share.index.tolist()
        text = "The following labels fall below {:.0f}% of the target: {}".format(
            self.threshold * 100, minority_labels)
        share_warning = DataCheckWarning(
            message=text,
            data_check_name=self.name,
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
            details={"target_values": minority_labels})
        DataCheck._add_message(share_warning, messages)

    return messages
def test_default_data_checks_null_rows():
    """Default regression checks on an all-null frame report null rows, null columns and drop actions."""

    class SeriesWrap():
        # Lets a pandas Series take part in the dict equality below: a plain
        # Series compares element-wise instead of returning a single bool.
        def __init__(self, series):
            self.series = series

        def __eq__(self, series_2):
            matches = self.series.eq(series_2.series)
            return all(matches)

    X = pd.DataFrame({
        'all_null': [None, None, None, None, None],
        'also_all_null': [None, None, None, None, None],
    })
    y = pd.Series([0, 1, np.nan, 1, 0])
    data_checks = DefaultDataChecks(
        "regression", get_default_primary_search_objective("regression"))

    highly_null_rows = SeriesWrap(pd.Series([1.0, 1.0, 1.0, 1.0, 1.0]))

    expected_warnings = [
        DataCheckWarning(message="5 out of 5 rows are more than 95.0% null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_ROWS,
                         details={"pct_null_cols": highly_null_rows}).to_dict(),
        DataCheckWarning(message="Column 'all_null' is 95.0% or more null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                         details={"column": 'all_null', "pct_null_rows": 1.0}).to_dict(),
        DataCheckWarning(message="Column 'also_all_null' is 95.0% or more null",
                         data_check_name="HighlyNullDataCheck",
                         message_code=DataCheckMessageCode.HIGHLY_NULL_COLS,
                         details={"column": 'also_all_null', "pct_null_rows": 1.0}).to_dict(),
    ]
    expected_errors = [
        DataCheckError(message="1 row(s) (20.0%) of target values are null",
                       data_check_name="InvalidTargetDataCheck",
                       message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                       details={"num_null_rows": 1, "pct_null_rows": 20.0}).to_dict(),
        DataCheckError(message="all_null has 0 unique value.",
                       data_check_name="NoVarianceDataCheck",
                       message_code=DataCheckMessageCode.NO_VARIANCE,
                       details={"column": "all_null"}).to_dict(),
        DataCheckError(message="also_all_null has 0 unique value.",
                       data_check_name="NoVarianceDataCheck",
                       message_code=DataCheckMessageCode.NO_VARIANCE,
                       details={"column": "also_all_null"}).to_dict(),
    ]
    expected_actions = [
        DataCheckAction(DataCheckActionCode.DROP_ROWS,
                        metadata={"rows": [0, 1, 2, 3, 4]}).to_dict(),
        DataCheckAction(DataCheckActionCode.DROP_COL,
                        metadata={"column": 'all_null'}).to_dict(),
        DataCheckAction(DataCheckActionCode.DROP_COL,
                        metadata={"column": 'also_all_null'}).to_dict(),
        DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                        metadata={"column": None, "is_target": True,
                                  "impute_strategy": "mean"}).to_dict(),
    ]
    expected = {
        "warnings": expected_warnings,
        "errors": expected_errors,
        "actions": expected_actions,
    }

    validation_results = data_checks.validate(X, y)
    # Wrap the returned Series so it can be compared against the wrapped expectation.
    first_details = validation_results['warnings'][0]['details']
    first_details['pct_null_cols'] = SeriesWrap(first_details['pct_null_cols'])
    assert validation_results == expected
def test_uniqueness_data_check_warnings():
    """Covers the regression 'not unique enough' and multiclass 'too unique' warnings."""
    # Regression: a constant column is flagged, a fully distinct one is not.
    regression_data = pd.DataFrame({
        'regression_unique_enough': [float(i) for i in range(100)],
        'regression_not_unique_enough': [float(1) for _ in range(100)],
    })
    not_unique_warning = DataCheckWarning(
        message="Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",
        data_check_name=uniqueness_data_check_name,
        message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
        details={"column": "regression_not_unique_enough", 'uniqueness_score': 0.0},
    ).to_dict()
    regression_check = UniquenessDataCheck(problem_type="regression")
    assert regression_check.validate(regression_data) == {
        "warnings": [not_unique_warning],
        "errors": [],
        "actions": [],
    }

    # Multiclass: the column with five distinct labels is flagged, the one with two is not.
    multiclass_data = pd.DataFrame({
        'multiclass_too_unique': ["Cats", "Are", "Absolutely", "The", "Best"] * 20,
        'multiclass_not_too_unique': ["Cats", "Cats", "Best", "Best", "Best"] * 20,
    })
    too_unique_warning = DataCheckWarning(
        message="Input columns (multiclass_too_unique) for multiclass problem type are too unique.",
        data_check_name=uniqueness_data_check_name,
        message_code=DataCheckMessageCode.TOO_UNIQUE,
        details={"column": "multiclass_too_unique", 'uniqueness_score': 0.7999999999999999},
    ).to_dict()
    multiclass_check = UniquenessDataCheck(problem_type="multiclass")
    assert multiclass_check.validate(multiclass_data) == {
        "warnings": [too_unique_warning],
        "errors": [],
        "actions": [],
    }
def test_invalid_target_data_check_multiclass_problem_almostcontinuous_data():
    """A multiclass target with a high class-to-sample ratio is warned about; a low ratio is not."""
    invalid_targets_check = InvalidTargetDataCheck(
        "multiclass", get_default_primary_search_objective("multiclass"))

    # 100 classes, 300 samples, .33 class/sample ratio
    y_multiclass_high_classes = pd.Series(list(range(0, 100)) * 3)
    X = pd.DataFrame({"col": range(len(y_multiclass_high_classes))})
    # Plain string: the message has no placeholders, so no f-prefix is needed.
    expected_warning = DataCheckWarning(
        message="Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": 1 / 3}).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_high_classes) == {
        "warnings": [expected_warning],
        "errors": []
    }

    # 5 classes, 100 samples, .05 class/sample ratio
    y_multiclass_med_classes = pd.Series(list(range(0, 5)) * 20)
    X = pd.DataFrame({"col": range(len(y_multiclass_med_classes))})
    expected_warning = DataCheckWarning(
        message="Target has a large number of unique values, could be regression type problem.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
        details={"class_to_value_ratio": .05}).to_dict()
    assert invalid_targets_check.validate(X, y=y_multiclass_med_classes) == {
        "warnings": [expected_warning],
        "errors": []
    }

    # 3 classes, 300 samples, .01 class/sample ratio
    y_multiclass_low_classes = pd.Series(list(range(0, 3)) * 100)
    X = pd.DataFrame({"col": range(len(y_multiclass_low_classes))})
    assert invalid_targets_check.validate(X, y=y_multiclass_low_classes) == {
        "warnings": [],
        "errors": []
    }
def validate(self, X, y=None):
    """Check if any of the features are likely to be ID columns.

    Each column is scored from three simple signals:
        - column name is "id"
        - column name ends in "_id"
        - column contains all unique values (and is categorical / integer type)

    A column matching one signal scores 0.95; a later signal matching a
    column that already has a score raises it to 1.0. Columns scoring at
    least ``self.id_threshold`` are reported.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): The input features to check
        y: Unused.

    Returns:
        dict: "warnings" holds one DataCheckWarning dict per column at or
            above the threshold; "errors" is always empty.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     'df_id': [0, 1, 2, 3, 4],
        ...     'x': [10, 42, 31, 51, 61],
        ...     'y': [42, 54, 12, 64, 12]
        ... })
        >>> id_col_check = IDColumnsDataCheck()
        >>> assert id_col_check.validate(df) == {"errors": [],\
                                                 "warnings": [{"message": "Column 'df_id' is 100.0% or more likely to be an ID column",\
                                                               "data_check_name": "IDColumnsDataCheck",\
                                                               "level": "warning",\
                                                               "code": "HAS_ID_COLUMN",\
                                                               "details": {"column": "df_id"}}]}
    """
    X = _convert_to_woodwork_structure(X)
    all_names = list(X.columns)

    # Signal 1: the column is literally named "id" (case-insensitive).
    scores = {}
    for name in all_names:
        if str(name).lower() == "id":
            scores[name] = 0.95

    # Signal 2: every value is distinct, among Integer / Categorical columns only.
    candidates = X.select(include=['Integer', 'Categorical'])
    candidates = _convert_woodwork_types_wrapper(candidates.to_dataframe())
    is_all_unique = candidates.nunique() == len(candidates)
    unique_names = is_all_unique[is_all_unique].index.tolist()
    # The comprehension is fully evaluated before update(), so membership is
    # tested against the scores as they stood before this signal.
    scores.update({name: (1.0 if name in scores else 0.95)
                   for name in unique_names})

    # Signal 3: the column name ends with "_id" (case-insensitive).
    suffixed_names = [name for name in all_names
                      if str(name).lower().endswith("_id")]
    scores.update({name: (1.0 if str(name) in scores else 0.95)
                   for name in suffixed_names})

    flagged = {name: score for name, score in scores.items()
               if score >= self.id_threshold}

    template = "Column '{}' is {}% or more likely to be an ID column"
    warnings = []
    for col_name in flagged:
        warning = DataCheckWarning(
            message=template.format(col_name, self.id_threshold * 100),
            data_check_name=self.name,
            message_code=DataCheckMessageCode.HAS_ID_COLUMN,
            details={"column": col_name})
        warnings.append(warning.to_dict())

    return {"warnings": warnings, "errors": []}
def validate(self, X, y):
    """Return a fixed result holding a single warning; X and y are not inspected."""
    only_warning = DataCheckWarning(message="warning one",
                                    data_check_name=self.name,
                                    message_code=None)
    return {"warnings": [only_warning.to_dict()], "errors": []}
def test_invalid_target_data_check_mismatched_indices():
    """Mismatched or reordered indices between features and target produce warnings."""
    X = pd.DataFrame({"col": [1, 2, 3]})
    y_same_index = pd.Series([1, 0, 1])
    y_diff_index = pd.Series([0, 1, 0], index=[1, 5, 10])
    y_diff_index_order = pd.Series([0, 1, 0], index=[0, 2, 1])

    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))

    # No features, or identical indices: nothing to report.
    assert invalid_targets_check.validate(X=None, y=y_same_index) == {"warnings": [], "errors": [], "actions": []}
    assert invalid_targets_check.validate(X, y_same_index) == {"warnings": [], "errors": [], "actions": []}

    # Different index values: both directions of the difference are reported.
    X_index_missing = list(set(y_diff_index.index) - set(X.index))
    y_index_missing = list(set(X.index) - set(y_diff_index.index))
    assert invalid_targets_check.validate(X, y_diff_index) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing,
                                               "indices_not_in_target": y_index_missing}).to_dict()],
        "errors": [],
        "actions": []
    }

    # Same index values in a different order.
    assert invalid_targets_check.validate(X, y_diff_index_order) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices order",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES_ORDER,
                                      details={}).to_dict()],
        "errors": [],
        "actions": []
    }

    # Test that we only store ten mismatches when there are more than 10 differences in indices found
    X_large = pd.DataFrame({"col": range(20)})
    y_more_than_ten_diff_indices = pd.Series([0, 1] * 10, index=range(20, 40))
    # Compare against X_large, the frame actually validated below. Using the
    # three-row X only gave the right answer because the index sets are disjoint.
    X_index_missing = list(set(y_more_than_ten_diff_indices.index) - set(X_large.index))
    y_index_missing = list(set(X_large.index) - set(y_more_than_ten_diff_indices.index))
    assert invalid_targets_check.validate(X_large, y_more_than_ten_diff_indices) == {
        "warnings": [DataCheckWarning(message="Input target and features have mismatched indices",
                                      data_check_name=invalid_targets_data_check_name,
                                      message_code=DataCheckMessageCode.MISMATCHED_INDICES,
                                      details={"indices_not_in_features": X_index_missing[:10],
                                               "indices_not_in_target": y_index_missing[:10]}).to_dict()],
        "errors": [],
        "actions": []
    }
def test_highly_null_data_check_warnings():
    """Checks the warnings and drop actions produced at thresholds 0.0, 0.5 and 1.0."""
    data = pd.DataFrame({
        'lots_of_null': [None, None, None, None, 5],
        'all_null': [None, None, None, None, None],
        'no_null': [1, 2, 3, 4, 5],
    })

    def null_warning(column, message):
        # Expected warning dict for one highly-null column.
        return DataCheckWarning(message=message,
                                data_check_name=highly_null_data_check_name,
                                message_code=DataCheckMessageCode.HIGHLY_NULL,
                                details={"column": column}).to_dict()

    def drop_action(column):
        # Expected DROP_COL action dict for one column.
        return DataCheckAction(DataCheckActionCode.DROP_COL,
                               metadata={"column": column}).to_dict()

    # Threshold 0.0: any column containing a null is reported.
    no_null_check = HighlyNullDataCheck(pct_null_threshold=0.0)
    assert no_null_check.validate(data) == {
        "warnings": [
            null_warning("lots_of_null", "Column 'lots_of_null' is more than 0% null"),
            null_warning("all_null", "Column 'all_null' is more than 0% null"),
        ],
        "errors": [],
        "actions": [drop_action('lots_of_null'), drop_action('all_null')],
    }

    # Threshold 0.5: both null-heavy columns are still reported.
    some_null_check = HighlyNullDataCheck(pct_null_threshold=0.5)
    assert some_null_check.validate(data) == {
        "warnings": [
            null_warning("lots_of_null", "Column 'lots_of_null' is 50.0% or more null"),
            null_warning("all_null", "Column 'all_null' is 50.0% or more null"),
        ],
        "errors": [],
        "actions": [drop_action('lots_of_null'), drop_action('all_null')],
    }

    # Threshold 1.0: only the entirely-null column is reported.
    all_null_check = HighlyNullDataCheck(pct_null_threshold=1.0)
    assert all_null_check.validate(data) == {
        "warnings": [
            null_warning("all_null", "Column 'all_null' is 100.0% or more null"),
        ],
        "errors": [],
        "actions": [drop_action('all_null')],
    }
def test_class_imbalance_severe(min_samples, input_type):
    """The severe-imbalance warning is expected only when ``min_samples`` exceeds 50."""
    X = pd.DataFrame()
    # 0 will be < 10% of the data, but there will be 50 samples of it
    y_values_binary = pd.Series([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1] * 50)
    y_values_multiclass = pd.Series([0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2] * 50)
    if input_type == "ww":
        X = ww.DataTable(X)
        y_values_binary = ww.DataColumn(y_values_binary)
        y_values_multiclass = ww.DataColumn(y_values_multiclass)

    below_threshold_warning = DataCheckWarning(
        message="The following labels fall below 10% of the target: [0]",
        data_check_name=class_imbalance_data_check_name,
        message_code=DataCheckMessageCode.CLASS_IMBALANCE_BELOW_THRESHOLD,
        details={"target_values": [0]},
    ).to_dict()
    warnings = [below_threshold_warning]

    if min_samples > 50:
        severe_warning = DataCheckWarning(
            message=f"The following labels in the target have severe class imbalance because they fall under 10% of the target and have less than {min_samples} samples: [0]",
            data_check_name=class_imbalance_data_check_name,
            message_code=DataCheckMessageCode.CLASS_IMBALANCE_SEVERE,
            details={"target_values": [0]},
        ).to_dict()
        warnings.append(severe_warning)

    class_imbalance_check = ClassImbalanceDataCheck(min_samples=min_samples,
                                                    num_cv_folds=1)
    for target in (y_values_binary, y_values_multiclass):
        assert class_imbalance_check.validate(X, target) == {
            "warnings": warnings,
            "errors": [],
            "actions": [],
        }
def validate(self, X, y=None):
    """Checks if there are any outliers in a dataframe by using IQR to determine column anomalies.

    Only numeric columns are examined. A column is considered to contain
    outliers when any of its values lies more than ``k`` (2.0) interquartile
    ranges below the first quartile or above the third quartile.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: A dictionary with a single warning listing every column that
            has outliers. No warning is added when there are no numeric
            columns or when no column has outliers.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     'x': [1, 2, 3, 4, 5],
        ...     'y': [6, 7, 8, 9, 10],
        ...     'z': [-1, -2, -3, -1201, -4]
        ... })
        >>> outliers_check = OutliersDataCheck()
        >>> assert outliers_check.validate(df) == {"warnings": [{"message": "Column(s) 'z' are likely to have outlier data.",\
                                                                 "data_check_name": "OutliersDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "HAS_OUTLIERS",\
                                                                 "details": {"columns": ["z"]}}],\
                                               "errors": []}
    """
    messages = {"warnings": [], "errors": []}

    X = infer_feature_types(X)
    X = X.select('numeric')
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    if len(X.columns) == 0:
        return messages

    def get_IQR(df, k=2.0):
        # Per-column [lower_bound, upper_bound] fences, indexed by column name.
        q1 = df.quantile(0.25)
        q3 = df.quantile(0.75)
        iqr = q3 - q1
        lower_bound = pd.Series(q1 - (k * iqr), name='lower_bound')
        upper_bound = pd.Series(q3 + (k * iqr), name='upper_bound')
        return pd.concat([lower_bound, upper_bound], axis=1)

    iqr = get_IQR(X, k=2.0)
    # Comparing the frame with a Series aligns the bounds on column labels.
    has_outliers = ((X < iqr['lower_bound']) | (X > iqr['upper_bound'])).any()
    cols = list(has_outliers.index[has_outliers])
    if not cols:
        # Previously a warning with an empty column list ("Column(s)  are
        # likely to have outlier data.") was emitted for clean data.
        return messages

    warning_msg = "Column(s) {} are likely to have outlier data.".format(
        ", ".join([f"'{col}'" for col in cols]))
    messages["warnings"].append(
        DataCheckWarning(message=warning_msg,
                         data_check_name=self.name,
                         message_code=DataCheckMessageCode.HAS_OUTLIERS,
                         details={"columns": cols}).to_dict())
    return messages
def validate(self, X, y=None):
    """Checks if there are any outliers in a dataframe, one numeric column at a time.

    Each numeric column is scored with ``OutliersDataCheck._outlier_score``;
    a column is reported when a result is returned and its "score" is at
    most 0.9.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: A dictionary with a single warning listing every column that
            has outliers. No warning is added when there are no numeric
            columns or when no column is flagged.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...     'x': [1, 2, 3, 4, 5],
        ...     'y': [6, 7, 8, 9, 10],
        ...     'z': [-1, -2, -3, -1201, -4]
        ... })
        >>> outliers_check = OutliersDataCheck()
        >>> assert outliers_check.validate(df) == {"warnings": [{"message": "Column(s) 'z' are likely to have outlier data.",\
                                                                 "data_check_name": "OutliersDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "HAS_OUTLIERS",\
                                                                 "details": {"columns": ["z"]}}],\
                                               "errors": [],\
                                               "actions": []}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    X = infer_feature_types(X)
    X = X.select('numeric')
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    if len(X.columns) == 0:
        return results

    has_outliers = []
    for col in X.columns:
        outlier_results = OutliersDataCheck._outlier_score(X[col], False)
        # 0.9 is threshold indicating data needs improvement
        if outlier_results is not None and outlier_results["score"] <= 0.9:
            has_outliers.append(col)

    if not has_outliers:
        # Previously a warning with an empty column list ("Column(s)  are
        # likely to have outlier data.") was emitted for clean data.
        return results

    warning_msg = "Column(s) {} are likely to have outlier data.".format(
        ", ".join([f"'{col}'" for col in has_outliers]))
    results["warnings"].append(
        DataCheckWarning(message=warning_msg,
                         data_check_name=self.name,
                         message_code=DataCheckMessageCode.HAS_OUTLIERS,
                         details={"columns": has_outliers}).to_dict())
    return results
def test_data_check_message_to_dict():
    """``to_dict`` exposes message, level, check name, code name and details."""
    error = DataCheckError(message="test message",
                           data_check_name="same test name",
                           message_code=DataCheckMessageCode.HIGHLY_NULL,
                           details={"detail 1": "error info"})
    expected_error = {
        "message": "test message",
        "level": "error",
        "data_check_name": "same test name",
        "code": DataCheckMessageCode.HIGHLY_NULL.name,
        "details": {"detail 1": "error info"},
    }
    assert error.to_dict() == expected_error

    warning = DataCheckWarning(message="test message",
                               data_check_name="same test name",
                               message_code=DataCheckMessageCode.HIGHLY_NULL,
                               details={"detail 1": "warning info"})
    expected_warning = {
        "message": "test message",
        "level": "warning",
        "data_check_name": "same test name",
        "code": DataCheckMessageCode.HIGHLY_NULL.name,
        "details": {"detail 1": "warning info"},
    }
    assert warning.to_dict() == expected_warning
def validate(self, X, y=None):
    """Flag columns whose sparsity score falls below the configured threshold.

    ``SparsityDataCheck.sparsity_score`` is applied to every column with
    ``count_threshold=self.unique_count_threshold``; columns scoring below
    ``self.threshold`` get a warning and a DROP_COL action.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored.

    Returns:
        dict: dict with a DataCheckWarning if there are any sparse columns.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'sparse': [float(x) for x in range(100)],
        ...    'not_sparse': [float(1) for x in range(100)]
        ... })
        >>> sparsity_check = SparsityDataCheck(problem_type="multiclass", threshold=0.5, unique_count_threshold=10)
        >>> assert sparsity_check.validate(df) == {"errors": [],\
                                                   "warnings": [{"message": "Input columns (sparse) for multiclass problem type are too sparse.",\
                                                                 "data_check_name": "SparsityDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "TOO_SPARSE",\
                                                                 "details": {"column": "sparse", 'sparsity_score': 0.0}}],\
                                                   "actions": [{"code": "DROP_COL",\
                                                                 "metadata": {"column": "sparse"}}]}
    """
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())

    scores = X.apply(SparsityDataCheck.sparsity_score,
                     count_threshold=self.unique_count_threshold)
    sparse_columns = list(scores.index[scores < self.threshold])

    warnings = []
    for col_name in sparse_columns:
        warning = DataCheckWarning(
            message=warning_too_unique.format(col_name, self.problem_type),
            data_check_name=self.name,
            message_code=DataCheckMessageCode.TOO_SPARSE,
            details={"column": col_name,
                     "sparsity_score": scores.loc[col_name]})
        warnings.append(warning.to_dict())

    actions = []
    for col_name in sparse_columns:
        action = DataCheckAction(action_code=DataCheckActionCode.DROP_COL,
                                 metadata={"column": col_name})
        actions.append(action.to_dict())

    return {"warnings": warnings, "errors": [], "actions": actions}
def test_invalid_target_data_check_numeric_binary_classification_error():
    """Numeric binary targets not in [0, 1] warn; nulls and a third class are errors."""
    invalid_targets_check = InvalidTargetDataCheck(
        "binary", get_default_primary_search_objective("binary"))

    def features_for(target):
        # One-column feature frame with as many rows as the target.
        return pd.DataFrame({"col": range(len(target))})

    # Two classes, but not 0 and 1.
    y = pd.Series([1, 5, 1, 5, 1, 1])
    wrong_values_warning = DataCheckWarning(
        message="Numerical binary classification target classes must be [0, 1], got [1, 5] instead",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
        details={"target_values": [1, 5]}).to_dict()
    assert invalid_targets_check.validate(features_for(y), y) == {
        "warnings": [wrong_values_warning],
        "errors": [],
    }

    # Wrong values and nulls together.
    y = pd.Series([0, 5, np.nan, np.nan])
    wrong_float_values_warning = DataCheckWarning(
        message="Numerical binary classification target classes must be [0, 1], got [5.0, 0.0] instead",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_INVALID_VALUES,
        details={"target_values": [5.0, 0.0]}).to_dict()
    null_target_error = DataCheckError(
        message="2 row(s) (50.0%) of target values are null",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_HAS_NULL,
        details={"num_null_rows": 2, "pct_null_rows": 50}).to_dict()
    assert invalid_targets_check.validate(features_for(y), y) == {
        "warnings": [wrong_float_values_warning],
        "errors": [null_target_error],
    }

    # Three distinct classes for a binary problem.
    y = pd.Series([0, 1, 1, 0, 1, 2])
    not_two_values_error = DataCheckError(
        message="Binary class targets require exactly two unique values.",
        data_check_name=invalid_targets_data_check_name,
        message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
        details={"target_values": [1, 0, 2]}).to_dict()
    assert invalid_targets_check.validate(features_for(y), y) == {
        "warnings": [],
        "errors": [not_two_values_error],
    }
def test_outliers_data_check_string_cols():
    """Checks that OutliersDataCheck reports an outlier column by its string label."""
    n_cols = 20
    column_labels = list(string.ascii_lowercase[:n_cols])

    # 100 x 20 frame of small repeating values, labelled 'a' through 't'.
    base_values = np.arange(10) * 0.01
    X = pd.DataFrame(data=np.tile(base_values, (100, 2)), columns=column_labels)
    # Plant a single extreme value in the fourth column ('d').
    X.iloc[0, 3] = 1000

    expected_warning = DataCheckWarning(
        message="Column(s) 'd' are likely to have outlier data.",
        data_check_name=outliers_data_check_name,
        message_code=DataCheckMessageCode.HAS_OUTLIERS,
        details={"columns": ["d"]},
    ).to_dict()

    result = OutliersDataCheck().validate(X)
    assert result == {"warnings": [expected_warning], "errors": []}
def test_data_check_message_attributes_optional():
    """Checks that message_code and details are None when they are not passed to the constructor."""
    warning = DataCheckWarning(message="test warning",
                               data_check_name="test data check warning name")
    assert warning.message == "test warning"
    assert warning.data_check_name == "test data check warning name"
    assert warning.message_type == DataCheckMessageType.WARNING
    assert warning.message_code is None
    assert warning.details is None

    error = DataCheckError(message="test error",
                           data_check_name="test data check error name")
    assert error.message == "test error"
    assert error.data_check_name == "test data check error name"
    assert error.message_type == DataCheckMessageType.ERROR
    assert error.message_code is None
    assert error.details is None
def validate(self, pipeline_name, cv_scores):
    """Checks cross-validation scores and issues a warning if their coefficient of variation exceeds the threshold.

    The coefficient of variation is computed as abs(std / mean) of the scores. When the mean
    of the scores is exactly 0 the ratio is undefined, so no warning is issued.

    Arguments:
        pipeline_name (str): name of pipeline that produced cv_scores
        cv_scores (pd.Series, np.ndarray, list): list of scores of each cross-validation fold

    Returns:
        dict: Dictionary with "warnings" and "errors" lists. A warning is added (through
            DataCheck._add_message) when the coefficient of variation is strictly greater
            than self.threshold; "errors" is always empty.

    Example:
        >>> cv_scores = pd.Series([0, 1, 1, 1])
        >>> check = HighVarianceCVDataCheck(threshold=0.10)
        >>> assert check.validate("LogisticRegressionPipeline", cv_scores) == {
        ...     "warnings": [{"message": "High coefficient of variation (cv >= 0.1) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.",
        ...                   "data_check_name": "HighVarianceCVDataCheck",
        ...                   "level": "warning",
        ...                   "code": "HIGH_VARIANCE",
        ...                   "details": {"variance": 2.0/3.0, "pipeline_name": "LogisticRegressionPipeline"}}],
        ...     "errors": []}
    """
    messages = {"warnings": [], "errors": []}
    if not isinstance(cv_scores, pd.Series):
        cv_scores = pd.Series(cv_scores)

    mean_score = cv_scores.mean()
    if mean_score == 0:
        # std / mean is undefined for a zero mean; report nothing.
        return messages

    # Reported under the "variance" key, but this is the coefficient of variation.
    variance = abs(cv_scores.std() / mean_score)
    if variance > self.threshold:
        warning_msg = f"High coefficient of variation (cv >= {self.threshold}) within cross validation scores. {pipeline_name} may not perform as estimated on unseen data."
        DataCheck._add_message(
            DataCheckWarning(
                message=warning_msg,
                data_check_name=self.name,
                message_code=DataCheckMessageCode.HIGH_VARIANCE,
                details={
                    "variance": variance,
                    "pipeline_name": pipeline_name
                }),
            messages)
    return messages
def test_id_cols_data_check_input_formats():
    """Checks that IDColumnsDataCheck gives the same result for Woodwork, 2D list and np.array inputs."""
    def _rows():
        # Fresh copy of the 5 x 2 input on every call.
        return [[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]]

    def _id_warning(message, column):
        return DataCheckWarning(message=message,
                                data_check_name=id_data_check_name,
                                message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                details={"column": column}).to_dict()

    def _drop_action(column):
        return DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": column}).to_dict()

    def _both_columns_flagged():
        # Expected result when columns 0 and 1 are both reported as ID columns.
        return {
            "warnings": [
                _id_warning("Column '0' is 80.0% or more likely to be an ID column", 0),
                _id_warning("Column '1' is 80.0% or more likely to be an ID column", 1),
            ],
            "errors": [],
            "actions": [_drop_action(0), _drop_action(1)],
        }

    check = IDColumnsDataCheck(id_threshold=0.8)

    # test empty pd.DataFrame
    assert check.validate(pd.DataFrame()) == {"warnings": [], "errors": [], "actions": []}

    # test Woodwork
    ww_input = ww.DataTable(np.array(_rows()))
    assert check.validate(ww_input) == _both_columns_flagged()

    # test 2D list
    assert check.validate(_rows()) == _both_columns_flagged()

    # test np.array
    assert check.validate(np.array(_rows())) == _both_columns_flagged()
def test_high_variance_cv_data_check_negative():
    """Checks that HighVarianceCVDataCheck warns on negative scores with the default threshold."""
    scores = pd.Series([0, -1, -1, -1])
    expected_cv = abs(scores.std() / scores.mean())

    expected_warning = DataCheckWarning(
        message="High coefficient of variation (cv >= 0.2) within cross validation scores. LogisticRegressionPipeline may not perform as estimated on unseen data.",
        data_check_name=high_variance_data_check_name,
        message_code=DataCheckMessageCode.HIGH_VARIANCE,
        details={"variance": expected_cv, "pipeline_name": hv_pipeline_name},
    ).to_dict()

    check = HighVarianceCVDataCheck()
    result = check.validate(pipeline_name=hv_pipeline_name, cv_scores=scores)
    assert result == {"warnings": [expected_warning], "errors": []}
def test_id_columns_warning():
    """Checks which numeric columns IDColumnsDataCheck flags at thresholds 0.95 and 1.0."""
    def _id_warning(message, column):
        return DataCheckWarning(message=message,
                                data_check_name=id_data_check_name,
                                message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                details={"column": column}).to_dict()

    def _drop_action(column):
        return DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": column}).to_dict()

    X_dict = {
        'col_1_id': [0, 1, 2, 3],
        'col_2': [2, 3, 4, 5],
        'col_3_id': [1, 1, 2, 3],
        'Id': [3, 1, 2, 0],
        'col_5': [0, 0, 1, 2],
        'col_6': [0.1, 0.2, 0.3, 0.4],
    }

    # Threshold 0.95: four columns are reported.
    X = pd.DataFrame.from_dict(X_dict)
    expected_at_95 = {
        "warnings": [
            _id_warning("Column 'Id' is 95.0% or more likely to be an ID column", "Id"),
            _id_warning("Column 'col_1_id' is 95.0% or more likely to be an ID column", "col_1_id"),
            _id_warning("Column 'col_2' is 95.0% or more likely to be an ID column", "col_2"),
            _id_warning("Column 'col_3_id' is 95.0% or more likely to be an ID column", "col_3_id"),
        ],
        "errors": [],
        "actions": [
            _drop_action("Id"),
            _drop_action("col_1_id"),
            _drop_action("col_2"),
            _drop_action("col_3_id"),
        ],
    }
    assert IDColumnsDataCheck(id_threshold=0.95).validate(X) == expected_at_95

    # Threshold 1.0: only 'Id' and 'col_1_id' are reported.
    X = pd.DataFrame.from_dict(X_dict)
    expected_at_100 = {
        "warnings": [
            _id_warning("Column 'Id' is 100.0% or more likely to be an ID column", "Id"),
            _id_warning("Column 'col_1_id' is 100.0% or more likely to be an ID column", "col_1_id"),
        ],
        "errors": [],
        "actions": [_drop_action("Id"), _drop_action("col_1_id")],
    }
    assert IDColumnsDataCheck(id_threshold=1.0).validate(X) == expected_at_100
def test_id_columns_strings():
    """Checks which string-valued columns IDColumnsDataCheck flags at thresholds 0.95 and 1.0."""
    def _id_warning(message, column):
        return DataCheckWarning(message=message,
                                data_check_name=id_data_check_name,
                                message_code=DataCheckMessageCode.HAS_ID_COLUMN,
                                details={"column": column}).to_dict()

    def _drop_action(column):
        return DataCheckAction(DataCheckActionCode.DROP_COL, details={"column": column}).to_dict()

    X_dict = {
        'col_1_id': ["a", "b", "c", "d"],
        'col_2': ["w", "x", "y", "z"],
        'col_3_id': ["123456789012345", "234567890123456", "3456789012345678", "45678901234567"],
        'Id': ["z", "y", "x", "a"],
        'col_5': ["0", "0", "1", "2"],
        'col_6': [0.1, 0.2, 0.3, 0.4],
    }
    X = pd.DataFrame.from_dict(X_dict)

    # Threshold 0.95: four columns are reported.
    expected_at_95 = {
        "warnings": [
            _id_warning("Column 'Id' is 95.0% or more likely to be an ID column", "Id"),
            _id_warning("Column 'col_1_id' is 95.0% or more likely to be an ID column", "col_1_id"),
            _id_warning("Column 'col_2' is 95.0% or more likely to be an ID column", "col_2"),
            _id_warning("Column 'col_3_id' is 95.0% or more likely to be an ID column", "col_3_id"),
        ],
        "errors": [],
        "actions": [
            _drop_action("Id"),
            _drop_action("col_1_id"),
            _drop_action("col_2"),
            _drop_action("col_3_id"),
        ],
    }
    assert IDColumnsDataCheck(id_threshold=0.95).validate(X) == expected_at_95

    # Threshold 1.0: only 'Id' and 'col_1_id' are reported; the same frame is reused.
    expected_at_100 = {
        "warnings": [
            _id_warning("Column 'Id' is 100.0% or more likely to be an ID column", "Id"),
            _id_warning("Column 'col_1_id' is 100.0% or more likely to be an ID column", "col_1_id"),
        ],
        "errors": [],
        "actions": [_drop_action("Id"), _drop_action("col_1_id")],
    }
    assert IDColumnsDataCheck(id_threshold=1.0).validate(X) == expected_at_100
"warnings": [], "errors": [labels_1_unique], "actions": [] }), (all_distinct_X, all_null_y, False, { "warnings": [], "errors": [labels_0_unique], "actions": [] }), (two_distinct_with_nulls_X, two_distinct_with_nulls_y, True, { "warnings": [ DataCheckWarning( message= "feature has two unique values including nulls. Consider encoding the nulls for " "this column to be useful for machine learning.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, details={ "column": "feature" }).to_dict(), DataCheckWarning( message= "Y has two unique values including nulls. Consider encoding the nulls for " "this column to be useful for machine learning.", data_check_name=no_variance_data_check_name, message_code=DataCheckMessageCode.NO_VARIANCE_WITH_NULL, details={ "column": "Y" }).to_dict() ], "errors": [],