def validate(self, X, y=None):
    """Report columns whose uniqueness score is inappropriate for the problem type.

    For regression problem types, every column whose uniqueness score is below
    ``self.threshold`` is reported as not unique enough. For multiclass problem
    types, every column whose uniqueness score is above ``self.threshold`` is
    reported as too unique. Any other problem type yields no warnings.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
        y (ww.DataColumn, pd.Series, np.ndarray): Ignored. Defaults to None.

    Returns:
        dict: dict with "warnings", "errors" and "actions" lists; "warnings" holds one
            DataCheckWarning (as a dict) per offending column.

    Example:
        >>> import pandas as pd
        >>> df = pd.DataFrame({
        ...    'regression_unique_enough': [float(x) for x in range(100)],
        ...    'regression_not_unique_enough': [float(1) for x in range(100)]
        ... })
        >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
        >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                     "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                             "data_check_name": "UniquenessDataCheck",\
                                                             "level": "warning",\
                                                             "code": "NOT_UNIQUE_ENOUGH",\
                                                             "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
                                                     "actions": []}
    """
    X = infer_feature_types(X)
    X = _convert_woodwork_types_wrapper(X.to_dataframe())
    scores = X.apply(UniquenessDataCheck.uniqueness_score)

    # Pick the offending columns plus the message template/code for this problem type.
    flagged_columns = []
    message_template = None
    message_code = None
    if is_regression(self.problem_type):
        flagged_columns = list(scores.index[scores < self.threshold])
        message_template = warning_not_unique_enough
        message_code = DataCheckMessageCode.NOT_UNIQUE_ENOUGH
    elif is_multiclass(self.problem_type):
        flagged_columns = list(scores.index[scores > self.threshold])
        message_template = warning_too_unique
        message_code = DataCheckMessageCode.TOO_UNIQUE

    warnings = []
    for col_name in flagged_columns:
        warning = DataCheckWarning(
            message=message_template.format(col_name, self.problem_type),
            data_check_name=self.name,
            message_code=message_code,
            details={
                "column": col_name,
                "uniqueness_score": scores.loc[col_name],
            },
        )
        warnings.append(warning.to_dict())

    return {"warnings": warnings, "errors": [], "actions": []}
def test_invalid_target_data_action_for_data_with_null(problem_type):
    """A partially-null target yields a TARGET_HAS_NULL error and an IMPUTE_COL action.

    Binary and multiclass problem types additionally expect the class-count
    errors (and, for multiclass, the high-unique-class warning) triggered by
    the single remaining class.
    """
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(
        problem_type, get_default_primary_search_objective(problem_type))

    if is_regression(problem_type):
        impute_strategy = "mean"
    else:
        impute_strategy = "most_frequent"

    expected_warnings = []
    expected_errors = [
        DataCheckError(
            message="3 row(s) (30.0%) of target values are null",
            data_check_name=invalid_targets_data_check_name,
            message_code=DataCheckMessageCode.TARGET_HAS_NULL,
            details={"num_null_rows": 3, "pct_null_rows": 30.0},
        ).to_dict()
    ]
    expected_actions = [
        DataCheckAction(
            DataCheckActionCode.IMPUTE_COL,
            metadata={
                "column": None,
                "is_target": True,
                "impute_strategy": impute_strategy,
            },
        ).to_dict()
    ]

    if is_binary(problem_type):
        expected_errors.append(
            DataCheckError(
                message="Binary class targets require exactly two unique values.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                details={"target_values": [0]},
            ).to_dict())
    elif is_multiclass(problem_type):
        expected_errors.append(
            DataCheckError(
                message="Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                details={"num_classes": 1},
            ).to_dict())
        expected_warnings.append(
            DataCheckWarning(
                message="Target has a large number of unique values, could be regression type problem.",
                data_check_name=invalid_targets_data_check_name,
                message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                details={"class_to_value_ratio": 0.1},
            ).to_dict())

    expected = {
        "warnings": expected_warnings,
        "errors": expected_errors,
        "actions": expected_actions,
    }
    assert invalid_targets_check.validate(X, y) == expected
def test_split_data(problem_type, data_type, X_y_binary, X_y_multi,
                    X_y_regression, make_data_type):
    """split_data honours test_size and returns Woodwork structures for every problem type."""
    if is_binary(problem_type):
        X, y = X_y_binary
    if is_multiclass(problem_type):
        X, y = X_y_multi
    if is_regression(problem_type):
        X, y = X_y_regression

    # Only time series problems need an explicit problem configuration.
    problem_configuration = {'gap': 1, 'max_delay': 7} if is_time_series(problem_type) else None

    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    test_pct = 0.25
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        test_size=test_pct,
        problem_type=problem_type,
        problem_configuration=problem_configuration,
    )

    expected_test_size = len(X) * test_pct
    expected_train_size = len(X) - expected_test_size

    size_expectations = (
        (X_train, expected_train_size),
        (X_test, expected_test_size),
        (y_train, expected_train_size),
        (y_test, expected_test_size),
    )
    for split, expected_size in size_expectations:
        assert len(split) == expected_size

    type_expectations = (
        (X_train, ww.DataTable),
        (X_test, ww.DataTable),
        (y_train, ww.DataColumn),
        (y_test, ww.DataColumn),
    )
    for split, expected_type in type_expectations:
        assert isinstance(split, expected_type)
def test_explain_predictions_stacked_ensemble(
        problem_type, dummy_stacked_ensemble_binary_estimator,
        dummy_stacked_ensemble_multiclass_estimator,
        dummy_stacked_ensemble_regressor_estimator, X_y_binary, X_y_multi,
        X_y_regression):
    """Both explanation entry points reject stacked ensemble pipelines with a ValueError."""
    if is_binary(problem_type):
        data, pipeline = X_y_binary, dummy_stacked_ensemble_binary_estimator
    elif is_multiclass(problem_type):
        data, pipeline = X_y_multi, dummy_stacked_ensemble_multiclass_estimator
    else:
        data, pipeline = X_y_regression, dummy_stacked_ensemble_regressor_estimator
    X, y = data

    expected_message = "Cannot explain predictions for a stacked ensemble pipeline"

    with pytest.raises(ValueError, match=expected_message):
        explain_predictions(pipeline, X, y, indices_to_explain=[0])

    with pytest.raises(ValueError, match=expected_message):
        explain_predictions_best_worst(pipeline, X, y)
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression, ts_data,
                                                   helper_functions):
    """Estimators fit and predict when feature names contain every printable ASCII character.

    ARIMARegressor is skipped. For each remaining estimator and each of its
    supported problem types, random features are generated with one column
    per character of ``string.printable``.
    """
    estimator_classes = (
        candidate for candidate in _all_estimators_used_in_search()
        if candidate.__name__ != 'ARIMARegressor'
    )
    for estimator_class in estimator_classes:
        problem_types = [
            handle_problem_types(pt)
            for pt in estimator_class.supported_problem_types
        ]
        for problem_type in problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(estimator_class)

            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            # Replace the fixture features with random values, keeping the row count.
            rng = get_random_state(clf.random_seed)
            random_features = rng.random((X.shape[0], len(string.printable)))
            col_names = [f'column_{ascii_char}' for ascii_char in string.printable]
            X = pd.DataFrame(random_features, columns=col_names)

            assert clf.input_feature_names is None
            clf.fit(X, y)

            importance = clf.feature_importance
            assert len(importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()

            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()

            assert (clf.input_feature_names == col_names)
def test_type_checks(problem_type):
    """Each is_* predicate agrees with membership in the matching group of ProblemTypes."""
    regression_types = [
        ProblemTypes.REGRESSION,
        ProblemTypes.TIME_SERIES_REGRESSION,
    ]
    binary_types = [
        ProblemTypes.BINARY,
        ProblemTypes.TIME_SERIES_BINARY,
    ]
    multiclass_types = [
        ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_MULTICLASS,
    ]
    classification_types = [
        ProblemTypes.BINARY,
        ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY,
        ProblemTypes.TIME_SERIES_MULTICLASS,
    ]
    time_series_types = [
        ProblemTypes.TIME_SERIES_BINARY,
        ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_REGRESSION,
    ]

    predicate_table = (
        (is_regression, regression_types),
        (is_binary, binary_types),
        (is_multiclass, multiclass_types),
        (is_classification, classification_types),
        (is_time_series, time_series_types),
    )
    for predicate, members in predicate_table:
        assert predicate(problem_type) == (problem_type in members)
def __init__(self, problem_type, threshold, unique_count_threshold=10):
    """Checks each column in the input to determine the sparsity of the values in those columns.

    Arguments:
        problem_type (str or ProblemTypes): The specific problem type to data check for.
            'multiclass' or 'time series multiclass' is the only accepted problem type.
        threshold (float): The threshold value, or percentage of each column's unique values,
            below which, a column exhibits sparsity. Should be between 0 and 1.
        unique_count_threshold (int): The minimum number of times a unique value has to be present
            in a column to not be considered "sparse." Must be a non-negative int. Default is 10.

    Raises:
        ValueError: If the problem type is not a multiclass type, if threshold is not within
            [0, 1] (NaN included), or if unique_count_threshold is not an int or is negative.
    """
    self.problem_type = handle_problem_types(problem_type)
    if not is_multiclass(self.problem_type):
        raise ValueError("Sparsity is only defined for multiclass problem types.")

    # A chained comparison (rather than `< 0 or > 1`) also rejects NaN, for which
    # both of the individual comparisons are False.
    if not 0 <= threshold <= 1:
        raise ValueError("Threshold must be a float between 0 and 1, inclusive.")
    self.threshold = threshold

    # Check the type before comparing: ordering a non-numeric value (e.g. None or a str)
    # against 0 raises TypeError, which would mask the intended ValueError.
    if not isinstance(unique_count_threshold, int) or unique_count_threshold < 0:
        raise ValueError("Unique count threshold must be positive integer.")
    self.unique_count_threshold = unique_count_threshold
def train_and_score_pipeline(pipeline, automl, full_X_train, full_y_train):
    """Train and score a pipeline on each split produced by the search's data splitter.

    Arguments:
        pipeline (PipelineBase): The pipeline to score
        automl (AutoMLSearch): The AutoML search, used to access config and for the error callback
        full_X_train (ww.DataTable): Training features
        full_y_train (ww.DataColumn): Training target

    Returns:
        dict: A dict containing cv_score_mean, cv_scores, training_time and a cv_data structure with details.
    """
    start = time.time()
    cv_data = []
    logger.info("\tStarting cross validation")

    X_pd = _convert_woodwork_types_wrapper(full_X_train.to_dataframe())
    y_pd = _convert_woodwork_types_wrapper(full_y_train.to_series())

    # Encode the target for classification problems so that float targets are supported.
    # This is safe because the encoded target is only used to obtain split indices.
    y_pd_encoded = y_pd
    if is_classification(automl.problem_type):
        label_to_code = {
            label: code
            for code, label in enumerate(y_pd.value_counts().index)
        }
        y_pd_encoded = y_pd.map(label_to_code)

    splits = automl.data_splitter.split(X_pd, y_pd_encoded)
    for i, (train, valid) in enumerate(splits):
        if pipeline.model_family == ModelFamily.ENSEMBLE and i > 0:
            # Stacked ensembles do CV internally, so only the first fold is run here
            # for performance reasons.
            logger.debug(f"Skipping fold {i} because CV for stacked ensembles is not supported.")
            break

        logger.debug(f"\t\tTraining and scoring on fold {i}")
        X_train = full_X_train.iloc[train]
        X_valid = full_X_train.iloc[valid]
        y_train = full_y_train.iloc[train]
        y_valid = full_y_train.iloc[valid]

        if is_binary(automl.problem_type) or is_multiclass(automl.problem_type):
            # Every target value must appear on both sides of the split.
            missing_in_train = set(
                np.setdiff1d(full_y_train.to_series(), y_train.to_series()))
            missing_in_valid = set(
                np.setdiff1d(full_y_train.to_series(), y_valid.to_series()))
            complaints = []
            if missing_in_train:
                complaints.append(
                    f"Missing target values in the training set after data split: {missing_in_train}. ")
            if missing_in_valid:
                complaints.append(
                    f"Missing target values in the validation set after data split: {missing_in_valid}.")
            if complaints:
                raise Exception("".join(complaints))

        objectives_to_score = [automl.objective] + automl.additional_objectives
        cv_pipeline = None
        try:
            logger.debug(f"\t\t\tFold {i}: starting training")
            cv_pipeline = EngineBase.train_pipeline(pipeline, X_train, y_train,
                                                    automl.optimize_thresholds,
                                                    automl.objective)
            logger.debug(f"\t\t\tFold {i}: finished training")
            if (automl.optimize_thresholds
                    and pipeline.can_tune_threshold_with_objective(automl.objective)
                    and automl.objective.can_optimize_threshold):
                logger.debug(f"\t\t\tFold {i}: Optimal threshold found ({cv_pipeline.threshold:.3f})")
            logger.debug(f"\t\t\tFold {i}: Scoring trained pipeline")
            scores = cv_pipeline.score(X_valid, y_valid, objectives=objectives_to_score)
            logger.debug(f"\t\t\tFold {i}: {automl.objective.name} score: {scores[automl.objective.name]:.3f}")
            score = scores[automl.objective.name]
        except Exception as e:
            if automl.error_callback is not None:
                automl.error_callback(
                    exception=e,
                    traceback=traceback.format_tb(sys.exc_info()[2]),
                    automl=automl,
                    fold_num=i,
                    pipeline=pipeline)
            if isinstance(e, PipelineScoreError):
                # Objectives that failed are recorded as NaN; the rest keep their scores.
                merged_scores = dict.fromkeys(e.exceptions, np.nan)
                merged_scores.update(e.scored_successfully)
                scores = OrderedDict(
                    (objective.name, merged_scores[objective.name])
                    for objective in [automl.objective] + automl.additional_objectives)
                score = scores[automl.objective.name]
            else:
                score = np.nan
                scores = OrderedDict(
                    (objective.name, np.nan)
                    for objective in automl.additional_objectives)

        # The primary objective is listed first, followed by the remaining scores and fold sizes.
        ordered_scores = OrderedDict()
        ordered_scores[automl.objective.name] = score
        ordered_scores.update(scores)
        ordered_scores["# Training"] = y_train.shape[0]
        ordered_scores["# Validation"] = y_valid.shape[0]

        evaluation_entry = {
            "all_objective_scores": ordered_scores,
            "score": score,
            'binary_classification_threshold': None,
        }
        has_threshold = (is_binary(automl.problem_type)
                         and cv_pipeline is not None
                         and cv_pipeline.threshold is not None)
        if has_threshold:
            evaluation_entry['binary_classification_threshold'] = cv_pipeline.threshold
        cv_data.append(evaluation_entry)

    training_time = time.time() - start
    cv_scores = pd.Series([fold['score'] for fold in cv_data])
    cv_score_mean = cv_scores.mean()
    logger.info(f"\tFinished cross validation - mean {automl.objective.name}: {cv_score_mean:.3f}")
    return {
        'cv_data': cv_data,
        'training_time': training_time,
        'cv_scores': cv_scores,
        'cv_score_mean': cv_score_mean,
    }
def validate(self, X, y):
    """Checks if the target data contains missing or invalid values.

    Besides null handling, the target is checked against the problem type
    (binary / multiclass class counts, numeric target for regression), against
    objectives that require positive values, and against the index and length
    of X when X is provided.

    Arguments:
        X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Only its index is compared with the target's.
        y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

    Returns:
        dict: dict with "warnings", "errors" and "actions" lists describing any problems found in the target data.

    Example:
        >>> import pandas as pd
        >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
        >>> y = pd.Series([0, 1, None, None])
        >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
        >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                               "data_check_name": "InvalidTargetDataCheck",\
                                                               "level": "error",\
                                                               "code": "TARGET_HAS_NULL",\
                                                               "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                   "warnings": [],\
                                                   "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
    """
    results = {"warnings": [], "errors": [], "actions": []}

    def add_error(message, code, details):
        # Record a DataCheckError attributed to this check.
        results["errors"].append(
            DataCheckError(message=message,
                           data_check_name=self.name,
                           message_code=code,
                           details=details).to_dict())

    def add_warning(message, code, details):
        # Record a DataCheckWarning attributed to this check.
        results["warnings"].append(
            DataCheckWarning(message=message,
                             data_check_name=self.name,
                             message_code=code,
                             details=details).to_dict())

    if y is None:
        add_error("Target is None", DataCheckMessageCode.TARGET_IS_NONE, {})
        return results

    y = infer_feature_types(y)
    supported_types = numeric_and_boolean_ww + [ww.logical_types.Categorical]
    if y.logical_type not in supported_types:
        valid_type_names = ", ".join(
            ltype.type_string for ltype in numeric_and_boolean_ww)
        add_error(
            "Target is unsupported {} type. Valid Woodwork logical types include: {}"
            .format(y.logical_type, valid_type_names),
            DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
            {"unsupported_type": y.logical_type.type_string})

    y_df = _convert_woodwork_types_wrapper(y.to_series())
    null_rows = y_df.isnull()
    if null_rows.all():
        # Nothing else can be checked on an empty or fully-null target.
        add_error("Target is either empty or fully null.",
                  DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL, {})
        return results
    if null_rows.any():
        num_null_rows = null_rows.sum()
        pct_null_rows = null_rows.mean() * 100
        add_error(
            "{} row(s) ({}%) of target values are null".format(
                num_null_rows, pct_null_rows),
            DataCheckMessageCode.TARGET_HAS_NULL,
            {"num_null_rows": num_null_rows, "pct_null_rows": pct_null_rows})
        if is_regression(self.problem_type):
            impute_strategy = "mean"
        else:
            impute_strategy = "most_frequent"
        results["actions"].append(
            DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                            metadata={
                                "column": None,
                                "is_target": True,
                                "impute_strategy": impute_strategy
                            }).to_dict())

    value_counts = y_df.value_counts()
    unique_values = value_counts.index.tolist()

    if is_binary(self.problem_type) and len(value_counts) != 2:
        if self.n_unique is None:
            reported_values = unique_values
        else:
            # Cap the number of reported values at n_unique.
            reported_values = unique_values[:min(self.n_unique, len(unique_values))]
        add_error("Binary class targets require exactly two unique values.",
                  DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                  {"target_values": reported_values})

    if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
        add_error("Target data type should be numeric for regression type problems.",
                  DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE, {})

    if is_multiclass(self.problem_type):
        if value_counts.min() <= 1:
            least_populated = value_counts[value_counts <= 1]
            add_error(
                "Target does not have at least two instances per class which is required for multiclass classification",
                DataCheckMessageCode.TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                {"least_populated_class_labels": least_populated.index.tolist()})
        if len(unique_values) <= 2:
            add_error(
                "Target has two or less classes, which is too few for multiclass problems. Consider changing to binary.",
                DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                {"num_classes": len(unique_values)})

        num_class_to_num_value_ratio = len(unique_values) / len(y)
        if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
            add_warning(
                "Target has a large number of unique values, could be regression type problem.",
                DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                {"class_to_value_ratio": num_class_to_num_value_ratio})

    # Only Integer / Double targets are inspected for non-positive values.
    any_neg = None
    if y.logical_type in [ww.logical_types.Integer, ww.logical_types.Double]:
        any_neg = not (y_df > 0).all()
    if any_neg and self.objective.positive_only:
        offending_count = sum(val <= 0 for val in y_df.values.flatten())
        add_error(
            f"Target has non-positive values which is not supported for {self.objective.name}",
            DataCheckMessageCode.TARGET_INCOMPATIBLE_OBJECTIVE,
            {"Count of offending values": offending_count})

    if X is not None:
        X = infer_feature_types(X)
        X_index = list(X.to_dataframe().index)
        y_index = list(y_df.index)
        X_length = len(X_index)
        y_length = len(y_index)
        if X_length != y_length:
            add_warning("Input target and features have different lengths",
                        DataCheckMessageCode.MISMATCHED_LENGTHS,
                        {"features_length": X_length, "target_length": y_length})

        if X_index != y_index:
            X_labels = set(X_index)
            y_labels = set(y_index)
            if X_labels == y_labels:
                add_warning("Input target and features have mismatched indices order",
                            DataCheckMessageCode.MISMATCHED_INDICES_ORDER, {})
            else:
                # Report at most ten labels from each side of the mismatch.
                index_diff_not_in_X = list(y_labels - X_labels)[:10]
                index_diff_not_in_y = list(X_labels - y_labels)[:10]
                add_warning("Input target and features have mismatched indices",
                            DataCheckMessageCode.MISMATCHED_INDICES,
                            {
                                "indices_not_in_features": index_diff_not_in_X,
                                "indices_not_in_target": index_diff_not_in_y
                            })

    return results