Example #1
def _explain_trees(
    model: Model,
    transformed_data: Table,
    transformed_reference_data: Table,
    progress_callback: Callable,
) -> Tuple[
    Optional[List[np.ndarray]], Optional[np.ndarray], Optional[np.ndarray]
]:
    """
    Computes and returns SHAP values for learners that are explained by
    TreeExplainer: all sci-kit models based on trees. In case that explanation
    with TreeExplainer is not possible it returns None
    """
    if sparse.issparse(transformed_data.X):
        # sparse not supported by TreeExplainer, KernelExplainer can handle it
        return None, None, None
    try:
        explainer = TreeExplainer(
            model.skl_model, data=sample(transformed_reference_data.X, 100),
        )
    # catching Exception is too broad, but this is what TreeExplainer throws
    except Exception:
        return None, None, None

    # TreeExplainer cannot explain more than ~1000 cases in reasonable time
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)
    num_classes = (
        len(model.domain.class_var.values)
        if model.domain.class_var.is_discrete
        else None
    )

    # process in batches: explaining one instance at a time roughly doubles
    # the processing time compared to a batch size of 10
    shap_values = []
    batch_size = 1  # currently set to 1 to minimize widget blocking
    for i in range(0, len(data_sample), batch_size):
        progress_callback(i / len(data_sample))
        batch = data_sample.X[i : i + batch_size]
        shap_values.append(
            explainer.shap_values(batch, check_additivity=False)
        )

    shap_values = _join_shap_values(shap_values)
    base_value = explainer.expected_value
    # when a class value was missing in the training phase, skl_model does not
    # output a probability for it; for other models this is handled by Orange
    if num_classes is not None:
        missing_d = num_classes - len(shap_values)
        shap_values += [
            np.zeros(shap_values[0].shape) for _ in range(missing_d)
        ]
        base_value = np.hstack((base_value, np.zeros(missing_d)))

    return shap_values, sample_mask, base_value
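A minimal usage sketch for this helper, assuming Orange3 and shap are installed and the function is importable from its module (e.g. the orange3-explain add-on); the module path, learner, dataset and no-op progress callback below are illustrative assumptions, not part of the snippet. The snippet itself relies on TreeExplainer and sample from shap, and Table/Model from Orange, imported at module level.

import Orange

data = Orange.data.Table("iris")
model = Orange.classification.RandomForestLearner()(data)
# transform the data into the domain the model was trained on
transformed = data.transform(model.domain)

shap_values, sample_mask, base_value = _explain_trees(
    model, transformed, transformed, progress_callback=lambda p: None
)
if shap_values is not None:
    # typically one array of shape (n_explained_rows, n_features) per class value
    print(len(shap_values), shap_values[0].shape)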
Example #2
def _explain_other_models(
    model: Model,
    transformed_data: Table,
    transformed_reference_data: Table,
    progress_callback: Callable,
) -> Tuple[List[np.ndarray], np.ndarray, np.ndarray]:
    """
    Computes SHAP values for any learner with KernelExplainer.
    """
    # 1000 rows is a size that, for typical data and models, does not take too long
    data_sample, sample_mask = _subsample_data(transformed_data, 1000)

    try:
        ref = kmeans(transformed_reference_data.X, k=10)
    except ValueError:
        # k-means raises ValueError when it cannot produce enough clusters;
        # in that case use a random sample instead of cluster centers
        ref = sample(transformed_reference_data.X, nsamples=100)

    explainer = KernelExplainer(
        lambda x: (
            model(x)
            if model.domain.class_var.is_continuous
            else model(x, model.Probs)
        ),
        ref,
    )

    shap_values = []
    for i, row in enumerate(data_sample.X):
        progress_callback(i / len(data_sample))
        shap_values.append(
            explainer.shap_values(
                row, nsamples=100, silent=True, l1_reg="num_features(90)"
            )
        )
    return (
        _join_shap_values(shap_values),
        sample_mask,
        explainer.expected_value,
    )
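A minimal usage sketch under the same assumptions as above (Orange3 and shap installed, helper importable from its module). KernelExplainer is model-agnostic, so this path covers learners without tree-based explanations, at the cost of speed; the logistic-regression learner and no-op callback are illustrative only.

import Orange

data = Orange.data.Table("iris")
model = Orange.classification.LogisticRegressionLearner()(data)
transformed = data.transform(model.domain)

shap_values, sample_mask, expected_value = _explain_other_models(
    model, transformed, transformed, progress_callback=lambda p: None
)
print(len(shap_values))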
Example #3
def shap_calc(
    model,
    X,
    return_explainer=False,
    verbose=0,
    sample_size=100,
    approximate=False,
    check_additivity=True,
    **shap_kwargs,
):
    """
    Helper function to calculate the Shapley values for a given model.

    Args:
        model (binary model):
            Trained model.

        X (pd.DataFrame or np.ndarray):
            Feature set.

        return_explainer (boolean):
            If True, returns a tuple (shap_values, explainer).

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only the most important warnings are shown
            - 51 - 100 - other warnings and prints are shown
            - above 100 - all prints and all warnings (including SHAP warnings) are shown.

        approximate (boolean):
            If True, uses SHAP approximations - less accurate, but very fast.
            Applies to tree-based explainers only.

        check_additivity (boolean):
            If False, SHAP disables the additivity check for tree-based models.

        **shap_kwargs: kwargs of the shap.Explainer

    Returns:
        (np.ndarray or tuple(np.ndarray, shap.Explainer)):
            Shapley values for the model; optionally also the explainer.

    """
    if isinstance(model, Pipeline):
        raise TypeError(
            "The provided model is a Pipeline. Unfortunately, the features based on SHAP do not support "
            "pipelines, because they cannot be used in combination with shap.Explainer. Please apply any "
            "data transformations before running the probatus module."
        )
    # Suppress warnings regarding XGBoost and LightGBM models.
    with warnings.catch_warnings():
        if verbose <= 100:
            warnings.simplefilter("ignore")

        # Create the background data, required for non-tree-based models.
        # A single datapoint can be passed as the masker
        # (https://github.com/slundberg/shap/issues/955#issuecomment-569837201)

        # If X has fewer rows than sample_size, shrink the sample to 20% of the rows.
        if X.shape[0] < sample_size:
            sample_size = int(np.ceil(X.shape[0] * 0.2))
        mask = sample(X, sample_size)

        if X.select_dtypes("category").shape[1] > 0:
            if verbose > 0:
                warnings.warn(
                    "Using tree_dependent feature_perturbation (in shap) without background"
                    " data for tree-based model with categorical features.")
            explainer = Explainer(model, **shap_kwargs)
        else:
            explainer = Explainer(model, masker=mask, **shap_kwargs)

        # For tree explainers, allow the check_additivity and approximate arguments
        if isinstance(explainer, Tree):
            # Calculate Shap values.
            shap_values = explainer.shap_values(
                X, check_additivity=check_additivity, approximate=approximate)
        else:
            # Calculate Shap values.
            shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list) and len(shap_values) == 2:
            warnings.warn(
                "Shap values are related to the output probabilities of class 1 for this model, instead of "
                "log odds.")
            shap_values = shap_values[1]

    if return_explainer:
        return shap_values, explainer
    return shap_values
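A minimal usage sketch, assuming scikit-learn and shap are installed and shap_calc is importable; the dataset and classifier below are illustrative, not part of the snippet. X is passed as a DataFrame because the helper inspects categorical dtypes via select_dtypes.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

# load a binary classification problem as a DataFrame
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

shap_values, explainer = shap_calc(clf, X, return_explainer=True, verbose=0)
# for binary classifiers shap may return one array per class; shap_calc keeps class 1
print(np.shape(shap_values))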