Code example #1
    def validate(self, X, y=None):
        """Checks if there are any columns in the input that are too unique in the case of classification
        problems or not unique enough in the case of regression problems.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features.
            y (ww.DataColumn, pd.Series, np.ndarray): Ignored.  Defaults to None.

        Returns:
            dict: A dict with DataCheckWarnings if any columns are too unique or not
                unique enough.

        Example:
            >>> import pandas as pd
            >>> df = pd.DataFrame({
            ...    'regression_unique_enough': [float(x) for x in range(100)],
            ...    'regression_not_unique_enough': [float(1) for x in range(100)]
            ... })
            >>> uniqueness_check = UniquenessDataCheck(problem_type="regression", threshold=0.8)
            >>> assert uniqueness_check.validate(df) == {"errors": [],\
                                                         "warnings": [{"message": "Input columns (regression_not_unique_enough) for regression problem type are not unique enough.",\
                                                                 "data_check_name": "UniquenessDataCheck",\
                                                                 "level": "warning",\
                                                                 "code": "NOT_UNIQUE_ENOUGH",\
                                                                 "details": {"column": "regression_not_unique_enough", 'uniqueness_score': 0.0}}],\
                                                         "actions": []}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        X = infer_feature_types(X)
        X = _convert_woodwork_types_wrapper(X.to_dataframe())

        res = X.apply(UniquenessDataCheck.uniqueness_score)

        if is_regression(self.problem_type):
            not_unique_enough_cols = list(res.index[res < self.threshold])
            results["warnings"].extend([
                DataCheckWarning(
                    message=warning_not_unique_enough.format(
                        col_name, self.problem_type),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.NOT_UNIQUE_ENOUGH,
                    details={
                        "column": col_name,
                        "uniqueness_score": res.loc[col_name]
                    }).to_dict() for col_name in not_unique_enough_cols
            ])
        elif is_multiclass(self.problem_type):
            too_unique_cols = list(res.index[res > self.threshold])
            results["warnings"].extend([
                DataCheckWarning(message=warning_too_unique.format(
                    col_name, self.problem_type),
                                 data_check_name=self.name,
                                 message_code=DataCheckMessageCode.TOO_UNIQUE,
                                 details={
                                     "column": col_name,
                                     "uniqueness_score": res.loc[col_name]
                                 }).to_dict() for col_name in too_unique_cols
            ])
        return results
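
The doctest above covers the regression branch. Below is a minimal, hedged usage sketch for the classification branch (the is_multiclass path), where columns that are too unique get flagged instead. The import path is an assumption about this version of evalml and the exact uniqueness scores are not guaranteed; the "column" detail key matches the dictionaries built in the method above.

import pandas as pd
from evalml.data_checks import UniquenessDataCheck  # assumed import path

df = pd.DataFrame({
    "id_like_column": range(100),                     # every value distinct -> high uniqueness
    "category_column": [i % 3 for i in range(100)],   # only three distinct values
})

uniqueness_check = UniquenessDataCheck(problem_type="multiclass", threshold=0.8)
results = uniqueness_check.validate(df)

# The id-like column is expected to exceed the threshold and be flagged as too unique.
print([w["details"]["column"] for w in results["warnings"]])
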
Code example #2
def test_invalid_target_data_action_for_data_with_null(problem_type):
    y = pd.Series([None, None, None, 0, 0, 0, 0, 0, 0, 0])
    X = pd.DataFrame({"col": range(len(y))})
    invalid_targets_check = InvalidTargetDataCheck(problem_type, get_default_primary_search_objective(problem_type))
    impute_strategy = "mean" if is_regression(problem_type) else "most_frequent"

    expected = {
        "warnings": [],
        "errors": [DataCheckError(message="3 row(s) (30.0%) of target values are null",
                                  data_check_name=invalid_targets_data_check_name,
                                  message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                                  details={"num_null_rows": 3, "pct_null_rows": 30.0}).to_dict()],
        "actions": [DataCheckAction(DataCheckActionCode.IMPUTE_COL, metadata={"column": None, "is_target": True, "impute_strategy": impute_strategy}).to_dict()]
    }
    if is_binary(problem_type):
        expected["errors"].append(DataCheckError(message="Binary class targets require exactly two unique values.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                                                 details={"target_values": [0]}).to_dict())
    elif is_multiclass(problem_type):
        expected["errors"].append(DataCheckError(message=f"Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                                                 data_check_name=invalid_targets_data_check_name,
                                                 message_code=DataCheckMessageCode.TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                                                 details={"num_classes": 1}).to_dict())
        expected["warnings"].append(DataCheckWarning(message=f"Target has a large number of unique values, could be regression type problem.",
                                                     data_check_name=invalid_targets_data_check_name,
                                                     message_code=DataCheckMessageCode.TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                                                     details={"class_to_value_ratio": 0.1}).to_dict())

    messages = invalid_targets_check.validate(X, y)
    assert messages == expected
Code example #3
def test_split_data(problem_type, data_type, X_y_binary, X_y_multi,
                    X_y_regression, make_data_type):
    if is_binary(problem_type):
        X, y = X_y_binary
    if is_multiclass(problem_type):
        X, y = X_y_multi
    if is_regression(problem_type):
        X, y = X_y_regression
    problem_configuration = None
    if is_time_series(problem_type):
        problem_configuration = {'gap': 1, 'max_delay': 7}

    X = make_data_type(data_type, X)
    y = make_data_type(data_type, y)

    test_pct = 0.25
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        test_size=test_pct,
        problem_type=problem_type,
        problem_configuration=problem_configuration)
    test_size = len(X) * test_pct
    train_size = len(X) - test_size
    assert len(X_train) == train_size
    assert len(X_test) == test_size
    assert len(y_train) == train_size
    assert len(y_test) == test_size
    assert isinstance(X_train, ww.DataTable)
    assert isinstance(X_test, ww.DataTable)
    assert isinstance(y_train, ww.DataColumn)
    assert isinstance(y_test, ww.DataColumn)
Code example #4
def test_estimators_feature_name_with_random_ascii(X_y_binary, X_y_multi,
                                                   X_y_regression, ts_data,
                                                   helper_functions):
    for estimator_class in _all_estimators_used_in_search():
        if estimator_class.__name__ == 'ARIMARegressor':
            continue
        supported_problem_types = [
            handle_problem_types(pt)
            for pt in estimator_class.supported_problem_types
        ]
        for problem_type in supported_problem_types:
            clf = helper_functions.safe_init_component_with_njobs_1(
                estimator_class)
            if is_binary(problem_type):
                X, y = X_y_binary
            elif is_multiclass(problem_type):
                X, y = X_y_multi
            elif is_regression(problem_type):
                X, y = X_y_regression

            X = get_random_state(clf.random_seed).random(
                (X.shape[0], len(string.printable)))
            col_names = [
                'column_{}'.format(ascii_char)
                for ascii_char in string.printable
            ]
            X = pd.DataFrame(X, columns=col_names)
            assert clf.input_feature_names is None
            clf.fit(X, y)
            assert len(clf.feature_importance) == len(X.columns)
            assert not np.isnan(clf.feature_importance).all().all()
            predictions = clf.predict(X).to_series()
            assert len(predictions) == len(y)
            assert not np.isnan(predictions).all()
            assert (clf.input_feature_names == col_names)
Code example #5
File: utils.py  Project: passion4energy/evalml
def split_data(X, y, problem_type, problem_configuration=None, test_size=.2, random_seed=0):
    """Splits data into train and test sets.

    Arguments:
        X (ww.DataTable, pd.DataFrame or np.ndarray): data of shape [n_samples, n_features]
        y (ww.DataColumn, pd.Series, or np.ndarray): target data of length [n_samples]
        problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
        problem_configuration (dict): Additional parameters needed to configure the search. For example,
            in time series problems, values should be passed in for the date_index, gap, and max_delay variables.
        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
        random_seed (int): Seed for the random number generator. Defaults to 0.

    Returns:
        ww.DataTable, ww.DataTable, ww.DataColumn, ww.DataColumn: Feature and target data each split into train and test sets
    """

    X = infer_feature_types(X)
    y = infer_feature_types(y)

    data_splitter = None
    if is_time_series(problem_type):
        data_splitter = TrainingValidationSplit(test_size=test_size, shuffle=False, stratify=None, random_seed=random_seed)
    elif is_regression(problem_type):
        data_splitter = ShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)
    elif is_classification(problem_type):
        data_splitter = StratifiedShuffleSplit(n_splits=1, test_size=test_size, random_state=random_seed)

    train, test = next(data_splitter.split(X.to_dataframe(), y.to_series()))

    X_train = X.iloc[train]
    X_test = X.iloc[test]
    y_train = y.iloc[train]
    y_test = y.iloc[test]

    return X_train, X_test, y_train, y_test
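
A minimal usage sketch of split_data as documented above, assuming it is exported from evalml.preprocessing in this version of the library; the 80/20 sizes follow the test_size default of 0.2.

import pandas as pd
from evalml.preprocessing import split_data  # assumed import path

X = pd.DataFrame({"feature": range(100)})
y = pd.Series([i % 2 for i in range(100)])

# A binary problem type uses a stratified shuffle split per the function above.
X_train, X_test, y_train, y_test = split_data(X, y, problem_type="binary", test_size=0.2)

# Per the docstring, the splits come back as Woodwork DataTable/DataColumn objects.
print(len(X_train.to_dataframe()), len(X_test.to_dataframe()))  # roughly 80 and 20
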
Code example #6
def test_type_checks(problem_type):
    assert is_regression(problem_type) == (problem_type in [
        ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION
    ])
    assert is_binary(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY
    ])
    assert is_multiclass(problem_type) == (problem_type in [
        ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_classification(problem_type) == (problem_type in [
        ProblemTypes.BINARY, ProblemTypes.MULTICLASS,
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS
    ])
    assert is_time_series(problem_type) == (problem_type in [
        ProblemTypes.TIME_SERIES_BINARY, ProblemTypes.TIME_SERIES_MULTICLASS,
        ProblemTypes.TIME_SERIES_REGRESSION
    ])
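
The test above pins down exactly which ProblemTypes each helper accepts. The following standalone sketch (not evalml's actual implementation, which also handles problem types given as strings) mirrors those membership sets: each predicate is just a set-membership check.

from enum import Enum

class ProblemTypes(Enum):
    BINARY = "binary"
    MULTICLASS = "multiclass"
    REGRESSION = "regression"
    TIME_SERIES_BINARY = "time series binary"
    TIME_SERIES_MULTICLASS = "time series multiclass"
    TIME_SERIES_REGRESSION = "time series regression"

def is_regression(problem_type):
    return problem_type in {ProblemTypes.REGRESSION, ProblemTypes.TIME_SERIES_REGRESSION}

def is_binary(problem_type):
    return problem_type in {ProblemTypes.BINARY, ProblemTypes.TIME_SERIES_BINARY}

def is_multiclass(problem_type):
    return problem_type in {ProblemTypes.MULTICLASS, ProblemTypes.TIME_SERIES_MULTICLASS}

def is_classification(problem_type):
    return is_binary(problem_type) or is_multiclass(problem_type)

def is_time_series(problem_type):
    return problem_type in {ProblemTypes.TIME_SERIES_BINARY,
                            ProblemTypes.TIME_SERIES_MULTICLASS,
                            ProblemTypes.TIME_SERIES_REGRESSION}

assert is_classification(ProblemTypes.TIME_SERIES_BINARY)
assert is_time_series(ProblemTypes.TIME_SERIES_REGRESSION)
assert not is_time_series(ProblemTypes.REGRESSION)
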
Code example #7
File: explainers.py  Project: skvorekn/evalml
def explain_predictions_best_worst(pipeline,
                                   input_features,
                                   y_true,
                                   num_to_explain=5,
                                   top_k_features=3,
                                   include_shap_values=False,
                                   metric=None,
                                   output_format="text"):
    """Creates a report summarizing the top contributing features for the best and worst points in the dataset as measured by error to true labels.

    XGBoost models and CatBoost multiclass classifiers are not currently supported.

    Arguments:
        pipeline (PipelineBase): Fitted pipeline whose predictions we want to explain with SHAP.
        input_features (ww.DataTable, pd.DataFrame): Input data to evaluate the pipeline on.
        y_true (ww.DataColumn, pd.Series): True labels for the input data.
        num_to_explain (int): How many of the best and worst data points to explain.
        top_k_features (int): How many of the highest/lowest contributing features to include in the table for each
            data point.
        include_shap_values (bool): Whether SHAP values should be included in the table. Default is False.
        metric (callable): The metric used to identify the best and worst points in the dataset. Function must accept
            the true labels and predicted value or probabilities as the only arguments and lower values
            must be better. By default, this will be the absolute error for regression problems and cross entropy loss
            for classification problems.
        output_format (str): Either "text" or "dict". Default is "text".

    Returns:
        str, dict, or pd.DataFrame - A report explaining the top contributing features for the best/worst predictions in the input_features.
            For each of the best/worst rows of input_features, the predicted values, true labels, metric value,
            feature names, prediction contribution, and SHAP Value (optional) will be listed.

    Raises:
        ValueError: if input_features has fewer than num_to_explain * 2 rows.
        ValueError: if y_true and input_features have mismatched lengths.
        ValueError: if an output_format outside of "text", "dict", or "dataframe" is provided.
    """
    input_features = infer_feature_types(input_features)
    input_features = _convert_woodwork_types_wrapper(
        input_features.to_dataframe())
    y_true = infer_feature_types(y_true)
    y_true = _convert_woodwork_types_wrapper(y_true.to_series())

    if not (input_features.shape[0] >= num_to_explain * 2):
        raise ValueError(
            f"Input features must be a dataframe with more than {num_to_explain * 2} rows! "
            "Convert to a dataframe and select a smaller value for num_to_explain if you do not have "
            "enough data.")
    if y_true.shape[0] != input_features.shape[0]:
        raise ValueError(
            "Parameters y_true and input_features must have the same number of data points. Received: "
            f"true labels: {y_true.shape[0]} and {input_features.shape[0]}")
    if output_format not in {"text", "dict", "dataframe"}:
        raise ValueError(
            f"Parameter output_format must be either text, dict, or dataframe. Received {output_format}"
        )
    if not metric:
        metric = DEFAULT_METRICS[pipeline.problem_type]

    try:
        if is_regression(pipeline.problem_type):
            if is_time_series(pipeline.problem_type):
                y_pred = pipeline.predict(input_features, y=y_true).to_series()
            else:
                y_pred = pipeline.predict(input_features).to_series()
            y_pred_values = None
            y_true_no_nan, y_pred_no_nan = drop_rows_with_nans(y_true, y_pred)
            errors = metric(y_true_no_nan, y_pred_no_nan)
        else:
            if is_time_series(pipeline.problem_type):
                y_pred = pipeline.predict_proba(input_features,
                                                y=y_true).to_dataframe()
                y_pred_values = pipeline.predict(input_features,
                                                 y=y_true).to_series()
            else:
                y_pred = pipeline.predict_proba(input_features).to_dataframe()
                y_pred_values = pipeline.predict(input_features).to_series()
            y_true_no_nan, y_pred_no_nan, y_pred_values_no_nan = drop_rows_with_nans(
                y_true, y_pred, y_pred_values)
            errors = metric(pipeline._encode_targets(y_true_no_nan),
                            y_pred_no_nan)
    except Exception as e:
        tb = traceback.format_tb(sys.exc_info()[2])
        raise PipelineScoreError(exceptions={metric.__name__: (e, tb)},
                                 scored_successfully={})

    errors = pd.Series(errors, index=y_pred_no_nan.index)
    sorted_scores = errors.sort_values()
    best_indices = sorted_scores.index[:num_to_explain]
    worst_indices = sorted_scores.index[-num_to_explain:]
    index_list = best_indices.tolist() + worst_indices.tolist()

    pipeline_features = pipeline.compute_estimator_features(
        input_features, y_true).to_dataframe()

    data = _ReportData(pipeline, pipeline_features, input_features, y_true,
                       y_pred, y_pred_values, errors, index_list, metric)

    report_creator = _report_creator_factory(
        data,
        report_type="explain_predictions_best_worst",
        output_format=output_format,
        top_k_features=top_k_features,
        include_shap_values=include_shap_values,
        num_to_explain=num_to_explain)
    return report_creator(data)
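
A hedged call sketch for explain_predictions_best_worst. The names fitted_pipeline, X_holdout, and y_holdout are placeholders for an already-fitted evalml pipeline and held-out data (they are not defined here), and the import path is an assumption for this evalml version.

from evalml.model_understanding.prediction_explanations import explain_predictions_best_worst  # assumed path

report = explain_predictions_best_worst(
    pipeline=fitted_pipeline,      # placeholder: a fitted PipelineBase
    input_features=X_holdout,      # placeholder: must have at least num_to_explain * 2 rows
    y_true=y_holdout,              # placeholder: true labels aligned with X_holdout
    num_to_explain=3,              # report the 3 best and 3 worst predictions
    top_k_features=3,
    include_shap_values=True,
    output_format="text",          # "text", "dict", or "dataframe"
)
print(report)
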
Code example #8
    def validate(self, X, y):
        """Checks if the target data contains missing or invalid values.

        Arguments:
            X (ww.DataTable, pd.DataFrame, np.ndarray): Features. Ignored.
            y (ww.DataColumn, pd.Series, np.ndarray): Target data to check for invalid values.

        Returns:
            dict: Dictionary with DataCheckErrors if any invalid values are found in the target data.

        Example:
            >>> import pandas as pd
            >>> X = pd.DataFrame({"col": [1, 2, 3, 1]})
            >>> y = pd.Series([0, 1, None, None])
            >>> target_check = InvalidTargetDataCheck('binary', 'Log Loss Binary')
            >>> assert target_check.validate(X, y) == {"errors": [{"message": "2 row(s) (50.0%) of target values are null",\
                                                                   "data_check_name": "InvalidTargetDataCheck",\
                                                                   "level": "error",\
                                                                   "code": "TARGET_HAS_NULL",\
                                                                   "details": {"num_null_rows": 2, "pct_null_rows": 50}}],\
                                                       "warnings": [],\
                                                       "actions": [{'code': 'IMPUTE_COL', 'metadata': {'column': None, 'impute_strategy': 'most_frequent', 'is_target': True}}]}
        """
        results = {"warnings": [], "errors": [], "actions": []}

        if y is None:
            results["errors"].append(
                DataCheckError(
                    message="Target is None",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_IS_NONE,
                    details={}).to_dict())
            return results

        y = infer_feature_types(y)
        is_supported_type = y.logical_type in numeric_and_boolean_ww + [
            ww.logical_types.Categorical
        ]
        if not is_supported_type:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target is unsupported {} type. Valid Woodwork logical types include: {}"
                    .format(
                        y.logical_type, ", ".join([
                            ltype.type_string
                            for ltype in numeric_and_boolean_ww
                        ])),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={
                        "unsupported_type": y.logical_type.type_string
                    }).to_dict())
        y_df = _convert_woodwork_types_wrapper(y.to_series())
        null_rows = y_df.isnull()
        if null_rows.all():
            results["errors"].append(
                DataCheckError(message="Target is either empty or fully null.",
                               data_check_name=self.name,
                               message_code=DataCheckMessageCode.
                               TARGET_IS_EMPTY_OR_FULLY_NULL,
                               details={}).to_dict())
            return results
        elif null_rows.any():
            num_null_rows = null_rows.sum()
            pct_null_rows = null_rows.mean() * 100
            results["errors"].append(
                DataCheckError(
                    message="{} row(s) ({}%) of target values are null".format(
                        num_null_rows, pct_null_rows),
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_HAS_NULL,
                    details={
                        "num_null_rows": num_null_rows,
                        "pct_null_rows": pct_null_rows
                    }).to_dict())
            impute_strategy = "mean" if is_regression(
                self.problem_type) else "most_frequent"
            results["actions"].append(
                DataCheckAction(DataCheckActionCode.IMPUTE_COL,
                                metadata={
                                    "column": None,
                                    "is_target": True,
                                    "impute_strategy": impute_strategy
                                }).to_dict())

        value_counts = y_df.value_counts()
        unique_values = value_counts.index.tolist()

        if is_binary(self.problem_type) and len(value_counts) != 2:
            if self.n_unique is None:
                details = {"target_values": unique_values}
            else:
                details = {
                    "target_values":
                    unique_values[:min(self.n_unique, len(unique_values))]
                }
            results["errors"].append(
                DataCheckError(
                    message=
                    "Binary class targets require exactly two unique values.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_BINARY_NOT_TWO_UNIQUE_VALUES,
                    details=details).to_dict())

        if self.problem_type == ProblemTypes.REGRESSION and "numeric" not in y.semantic_tags:
            results["errors"].append(
                DataCheckError(
                    message=
                    "Target data type should be numeric for regression type problems.",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
                    details={}).to_dict())

        if is_multiclass(self.problem_type):
            if value_counts.min() <= 1:
                least_populated = value_counts[value_counts <= 1]
                details = {
                    "least_populated_class_labels":
                    least_populated.index.tolist()
                }
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target does not have at least two instances per class which is required for multiclass classification",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_TWO_EXAMPLES_PER_CLASS,
                        details=details).to_dict())
            if len(unique_values) <= 2:
                details = {"num_classes": len(unique_values)}
                results["errors"].append(
                    DataCheckError(
                        message=
                        "Target has two or less classes, which is too few for multiclass problems.  Consider changing to binary.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_NOT_ENOUGH_CLASSES,
                        details=details).to_dict())

            num_class_to_num_value_ratio = len(unique_values) / len(y)
            if num_class_to_num_value_ratio >= self.multiclass_continuous_threshold:
                details = {
                    "class_to_value_ratio": num_class_to_num_value_ratio
                }
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Target has a large number of unique values, could be regression type problem.",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.
                        TARGET_MULTICLASS_HIGH_UNIQUE_CLASS,
                        details=details).to_dict())

        any_neg = not (y_df > 0).all() if y.logical_type in [
            ww.logical_types.Integer, ww.logical_types.Double
        ] else None
        if any_neg and self.objective.positive_only:
            details = {
                "Count of offending values":
                sum(val <= 0 for val in y_df.values.flatten())
            }
            results["errors"].append(
                DataCheckError(
                    message=
                    f"Target has non-positive values which is not supported for {self.objective.name}",
                    data_check_name=self.name,
                    message_code=DataCheckMessageCode.
                    TARGET_INCOMPATIBLE_OBJECTIVE,
                    details=details).to_dict())

        if X is not None:
            X = infer_feature_types(X)
            X_index = list(X.to_dataframe().index)
            y_index = list(y_df.index)
            X_length = len(X_index)
            y_length = len(y_index)
            if X_length != y_length:
                results["warnings"].append(
                    DataCheckWarning(
                        message=
                        "Input target and features have different lengths",
                        data_check_name=self.name,
                        message_code=DataCheckMessageCode.MISMATCHED_LENGTHS,
                        details={
                            "features_length": X_length,
                            "target_length": y_length
                        }).to_dict())

            if X_index != y_index:
                if set(X_index) == set(y_index):
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices order",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES_ORDER,
                            details={}).to_dict())
                else:
                    index_diff_not_in_X = list(set(y_index) -
                                               set(X_index))[:10]
                    index_diff_not_in_y = list(set(X_index) -
                                               set(y_index))[:10]
                    results["warnings"].append(
                        DataCheckWarning(
                            message=
                            "Input target and features have mismatched indices",
                            data_check_name=self.name,
                            message_code=DataCheckMessageCode.
                            MISMATCHED_INDICES,
                            details={
                                "indices_not_in_features": index_diff_not_in_X,
                                "indices_not_in_target": index_diff_not_in_y
                            }).to_dict())

        return results
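
The doctest above exercises the null-target branch for a binary problem. Below is a complementary, hedged sketch of the multiclass branch that flags a target with too few classes; the import path and objective name are assumptions for this evalml version, while the "code" key mirrors the error dictionaries shown above.

import pandas as pd
from evalml.data_checks import InvalidTargetDataCheck  # assumed import path

X = pd.DataFrame({"col": range(6)})
y = pd.Series([0, 0, 0, 1, 1, 1])  # only two classes, too few for multiclass

check = InvalidTargetDataCheck("multiclass", "Log Loss Multiclass")
results = check.validate(X, y)

# Expected to include "TARGET_MULTICLASS_NOT_ENOUGH_CLASSES" per the branch above.
print([err["code"] for err in results["errors"]])
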
Code example #9
def _best_worst_predicted_values_section(data, regression, classification):
    """Get and initialize the predicted values section maker given the data."""
    predicted_values_class = regression if is_regression(
        data.pipeline.problem_type) else classification
    return predicted_values_class(data.metric.__name__, data.y_pred_values)
Code example #10
def _compute_shap_values(pipeline, features, training_data=None):
    """Computes SHAP values for each feature.

    Arguments:
        pipeline (PipelineBase): Trained pipeline whose predictions we want to explain with SHAP.
        features (pd.DataFrame): Dataframe of features - needs to correspond to data the pipeline was fit on.
        training_data (pd.DataFrame): Training data the pipeline was fit on.
            For non-tree estimators, we need a sample of training data for the KernelSHAP algorithm.

    Returns:
        dict or list(dict): For regression problems, a dictionary mapping a feature name to a list of SHAP values.
            For classification problems, a list of dictionaries, one for each class.
    """
    estimator = pipeline.estimator
    if estimator.model_family == ModelFamily.BASELINE:
        raise ValueError(
            "You passed in a baseline pipeline. These are simple enough that SHAP values are not needed."
        )

    feature_names = features.columns

    # This is to make sure all dtypes are numeric - SHAP algorithms will complain otherwise.
    # Sklearn components do this under-the-hood so we're not changing the data the model was trained on.
    # Catboost can naturally handle string-encoded categorical features so we don't need to convert to numeric.
    if estimator.model_family != ModelFamily.CATBOOST:
        features = check_array(features.values)

    if estimator.model_family.is_tree_estimator():
        # Use tree_path_dependent to avoid linear runtime with dataset size
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.TreeExplainer(
                estimator._component_obj,
                feature_perturbation="tree_path_dependent")
        if ws:
            logger.debug(
                f"_compute_shap_values TreeExplainer: {ws[0].message}")
        shap_values = explainer.shap_values(features, check_additivity=False)
        # shap only outputs values for positive class for Catboost/Xgboost binary estimators.
        # this modifies the output to match the output format of other binary estimators.
        # Ok to fill values of negative class with zeros since the negative class will get dropped
        # in the UI anyways.
        if estimator.model_family in {
                ModelFamily.CATBOOST, ModelFamily.XGBOOST
        } and is_binary(pipeline.problem_type):
            shap_values = [np.zeros(shap_values.shape), shap_values]
    else:
        if training_data is None:
            raise ValueError(
                "You must pass in a value for parameter 'training_data' when the pipeline "
                "does not have a tree-based estimator. "
                f"Current estimator model family is {estimator.model_family}.")

        # More than 100 datapoints can negatively impact runtime according to SHAP
        # https://github.com/slundberg/shap/blob/master/shap/explainers/kernel.py#L114
        sampled_training_data_features = shap.sample(training_data, 100)
        sampled_training_data_features = check_array(
            sampled_training_data_features)

        if is_regression(pipeline.problem_type):
            link_function = "identity"
            decision_function = estimator._component_obj.predict
        else:
            link_function = "logit"
            decision_function = estimator._component_obj.predict_proba
        with warnings.catch_warnings(record=True) as ws:
            explainer = shap.KernelExplainer(decision_function,
                                             sampled_training_data_features,
                                             link_function)
            shap_values = explainer.shap_values(features)
        if ws:
            logger.debug(
                f"_compute_shap_values KernelExplainer: {ws[0].message}")

    # classification problem
    if isinstance(shap_values, list):
        mappings = []
        for class_shap_values in shap_values:
            mappings.append(
                _create_dictionary(class_shap_values, feature_names))
        return mappings
    # regression problem
    elif isinstance(shap_values, np.ndarray):
        return _create_dictionary(shap_values, feature_names)
    else:
        raise ValueError(
            f"Unknown shap_values datatype {str(type(shap_values))}!")
Code example #11
File: test_explainers.py  Project: skvorekn/evalml
def test_explain_predictions_best_worst_and_explain_predictions(
        mock_make_table, mock_default_metrics, problem_type, output_format,
        answer, explain_predictions_answer, custom_index):
    if output_format == "text":
        mock_make_table.return_value = "table goes here"
    elif output_format == "dataframe":
        shap_table = pd.DataFrame({
            "feature_names": [0],
            "feature_values": [0],
            "qualitative_explanation": [0],
            "quantitative_explanation": [0],
        })
        # Use side effect so that we always get a new copy of the dataframe
        mock_make_table.side_effect = lambda *args, **kwargs: shap_table.copy()
    else:
        mock_make_table.return_value = {
            "explanations": ["explanation_dictionary_goes_here"]
        }

    pipeline = MagicMock()
    pipeline.parameters = "Parameters go here"
    input_features = pd.DataFrame({"a": [3, 4]}, index=custom_index)
    pipeline.problem_type = problem_type
    pipeline.name = "Test Pipeline Name"
    pipeline.compute_estimator_features.return_value = ww.DataTable(
        input_features)

    def _add_custom_index(answer, index_best, index_worst, output_format):

        if output_format == "text":
            answer = answer.format(index_0=index_best, index_1=index_worst)
        elif output_format == "dataframe":
            col_name = "prefix" if "prefix" in answer.columns else "rank"
            n_repeats = answer[col_name].value_counts().tolist()[0]
            answer['index_id'] = [index_best] * n_repeats + [index_worst] * n_repeats
        else:
            answer["explanations"][0]["predicted_values"][
                "index_id"] = index_best
            answer["explanations"][1]["predicted_values"][
                "index_id"] = index_worst
        return answer

    if is_regression(problem_type):
        abs_error_mock = MagicMock(__name__="abs_error")
        abs_error_mock.return_value = pd.Series([4., 1.], dtype="float64")
        mock_default_metrics.__getitem__.return_value = abs_error_mock
        pipeline.predict.return_value = ww.DataColumn(pd.Series([2, 1]))
        y_true = pd.Series([3, 2], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[1],
                                   index_worst=custom_index[0],
                                   output_format=output_format)
    elif is_binary(problem_type):
        pipeline.classes_.return_value = ["benign", "malignant"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.2, 0.78])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({
                "benign": [0.05, 0.1],
                "malignant": [0.95, 0.9]
            }))
        pipeline.predict.return_value = ww.DataColumn(
            pd.Series(["malignant"] * 2))
        y_true = pd.Series(["malignant", "benign"], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[0],
                                   index_worst=custom_index[1],
                                   output_format=output_format)
    else:
        # Multiclass text output is formatted slightly different so need to account for that
        if output_format == "text":
            mock_make_table.return_value = multiclass_table
        pipeline.classes_.return_value = ["setosa", "versicolor", "virginica"]
        cross_entropy_mock = MagicMock(__name__="cross_entropy")
        mock_default_metrics.__getitem__.return_value = cross_entropy_mock
        cross_entropy_mock.return_value = pd.Series([0.15, 0.34])
        pipeline.predict_proba.return_value = ww.DataTable(
            pd.DataFrame({
                "setosa": [0.8, 0.2],
                "versicolor": [0.1, 0.75],
                "virginica": [0.1, 0.05]
            }))
        pipeline.predict.return_value = ww.DataColumn(
            pd.Series(["setosa", "versicolor"]))
        y_true = pd.Series(["setosa", "versicolor"], index=custom_index)
        answer = _add_custom_index(answer,
                                   index_best=custom_index[0],
                                   index_worst=custom_index[1],
                                   output_format=output_format)

    report = explain_predictions(pipeline,
                                 input_features,
                                 y=y_true,
                                 indices_to_explain=[0, 1],
                                 output_format=output_format)
    if output_format == "text":
        compare_two_tables(report.splitlines(),
                           explain_predictions_answer.splitlines())
    elif output_format == "dataframe":
        assert report.columns.tolist() == explain_predictions_answer.columns.tolist()
        pd.testing.assert_frame_equal(
            report, explain_predictions_answer[report.columns])
    else:
        assert report == explain_predictions_answer

    best_worst_report = explain_predictions_best_worst(
        pipeline,
        input_features,
        y_true=y_true,
        num_to_explain=1,
        output_format=output_format)
    if output_format == "text":
        compare_two_tables(best_worst_report.splitlines(), answer.splitlines())
    elif output_format == "dataframe":
        # Check dataframes equal without caring about column order
        assert sorted(best_worst_report.columns.tolist()) == sorted(
            answer.columns.tolist())
        pd.testing.assert_frame_equal(best_worst_report,
                                      answer[best_worst_report.columns])
    else:
        assert best_worst_report == answer