Esempio n. 1
0
    def _(self, background_data, *args,
          **kwargs) -> Union[shap.common.Data, pd.core.frame.DataFrame]:
        """
        Prepare background data supplied as a `pandas.core.frame.DataFrame`.

        `args` must unpack to exactly three items; the first one is discarded, the
        second (`groups`) and third (`weights`) are used. If `self.use_groups` is
        falsy the frame is returned unchanged and initialisation is left to `shap`.
        Otherwise the frame's values and column headers are wrapped explicitly in a
        `DenseData` object, or in a `DenseDataWithIndex` object (which additionally
        receives the index values and the index name) when the `keep_index` keyword
        argument is truthy. `weights` is forwarded only when it is not `None`.
        """

        _, groups, weights = args
        if not self.use_groups:
            return background_data

        logger.info(
            "Group names are specified by column headers, group_names will be ignored!"
        )

        # trailing positional arguments shared by both container types
        trailing = (groups, ) if weights is None else (groups, weights)
        values = background_data.values
        column_names = list(background_data.columns)

        if kwargs.get("keep_index", False):
            row_index = background_data.index
            return DenseDataWithIndex(
                values,
                column_names,
                row_index.values,
                row_index.name,
                *trailing,
            )

        return DenseData(values, column_names, *trailing)
def kmeans(X, k, round_values=True):
    """
    Summarise a dataset by `k` weighted cluster centres.

    This function should be imported from shap.kmeans. Remove it when they
    merge and release the following changes:
    https://github.com/slundberg/shap/pull/1135

    Parameters
    ----------
    X
        Two-dimensional data (anything exposing `.shape[1]`). If it is a
        `pandas` data frame its column headers are used as group names and
        its `.values` are clustered; otherwise the group names are the
        stringified column positions.
    k
        Number of clusters requested from `KMeans`.
    round_values
        If `True`, each coordinate of each centre is replaced by the value
        from the same column of the (imputed) data that is closest to it.

    Returns
    -------
    A `DenseData` object built from the cluster centres, the group names,
    `None` for the groups and one weight per centre equal to the number of
    samples assigned to that cluster.
    """
    group_names = [str(i) for i in range(X.shape[1])]
    # the type is recognised by its name, so pandas is not referenced here
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values

    # in case there are any missing values in data impute them
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    X = imp.fit_transform(X)

    # named `clustering` so the enclosing function name is not shadowed
    clustering = KMeans(n_clusters=k, random_state=0).fit(X)
    centers = clustering.cluster_centers_  # same array object, updated in place

    if round_values:
        for j in range(X.shape[1]):
            # extract (and densify, if sparse) each column once instead of
            # once per cluster
            xj = X[:, j].toarray().flatten() if issparse(X) else X[:, j]
            for i in range(k):
                ind = np.argmin(np.abs(xj - centers[i, j]))
                centers[i, j] = X[ind, j]

    return DenseData(
        centers,
        group_names,
        None,
        # minlength keeps one weight per centre even if the highest-numbered
        # clusters received no samples
        1.0 * np.bincount(clustering.labels_, minlength=k),
    )
def get_data(kind='array', n_rows=15, n_cols=49, fnames=None, seed=None):
    """
    Build a random dataset in the requested container type, for testing
    the grouping functionality of the wrapper.

    The global `numpy` random state is seeded with `seed` first. `kind` must
    be a member of `SUPPORTED_BACKGROUND_DATA_TYPES`, otherwise a `ValueError`
    is raised. The matrix itself comes from `get_random_matrix(n_rows, n_cols)`
    and is returned as:

    * ``'array'``  - the matrix as produced;
    * ``'sparse'`` - a `scipy.sparse.csr_matrix` of it;
    * ``'frame'``  - a `pandas.DataFrame` with `fnames` as columns;
    * ``'series'`` - one randomly chosen row of that data frame;
    * ``'data'``   - a `DenseData` object with `fnames` as group names.

    If `fnames` is empty or `None`, names ``feature_0, feature_1, ...`` are
    generated. Any other supported kind yields ``0``.
    """

    np.random.seed(seed)

    if kind not in SUPPORTED_BACKGROUND_DATA_TYPES:
        template = ("Selected data type, {}, is not an allowed type. "
                    "Allowed types are {}")
        raise ValueError(template.format(kind, SUPPORTED_BACKGROUND_DATA_TYPES))

    X = get_random_matrix(n_rows=n_rows, n_cols=n_cols)

    if kind == 'array':
        return X
    if kind == 'sparse':
        return scipy.sparse.csr_matrix(X)
    if not (kind == 'frame' or kind == 'series' or kind == 'data'):
        return 0

    # the remaining kinds all need one name per column
    names = fnames
    if not names:
        names = ['feature_{}'.format(col) for col in range(X.shape[-1])]

    if kind == 'data':
        return DenseData(X, names)

    frame = pd.DataFrame(data=X, columns=names)
    if kind == 'frame':
        return frame

    # 'series': pick a single random row of the frame
    row = np.random.choice(np.arange(X.shape[0]))
    return frame.iloc[row, :]
    def __call__(self, background_data, n_background_samples):
        """Wrap the output of `self._mock_kmeans` in a `DenseData` object.

        Group names are the column headers when `background_data` is a
        `pandas.DataFrame`, otherwise the stringified column positions
        (`background_data.shape[1]` of them). The groups argument is `None`.
        """
        summary = self._mock_kmeans(background_data, n_background_samples)

        if isinstance(background_data, pandas.DataFrame):
            names = background_data.columns
        else:
            names = [str(col) for col in range(background_data.shape[1])]

        return DenseData(summary, names, None)
    def take_subset(self, explain_subset):
        """Restrict the wrapped dataset to the given columns, at most once.

        Does nothing if a subset has already been taken. When the dataset is a
        `DenseData` summary, the subset is taken from the summary (data columns
        and matching group names) rather than recomputing it; otherwise the
        dataset is column-indexed directly.

        :param explain_subset: A list of column indexes to take from the original dataset.
        :type explain_subset: list
        """
        if not self._subset_taken:
            columns = np.array(explain_subset)
            current = self._dataset
            if isinstance(current, DenseData):
                # Edge case: subsetting the existing summary is cheaper than
                # recomputing it.
                kept_names = np.array(current.group_names)[columns].tolist()
                self._dataset = DenseData(current.data[:, columns], kept_names)
            else:
                self._dataset = current[:, columns]
            self._subset_taken = True
Esempio n. 6
0
    def _(self, background_data, *args,
          **kwargs) -> Union[np.ndarray, shap.common.Data]:
        """
        Prepare background data supplied as an `np.ndarray`.

        `args` must unpack to exactly `(group_names, groups, weights)`. If
        `self.use_groups` is falsy the array is returned unchanged and handled by
        the `shap` library internally. Otherwise a `DenseData` object is built from
        the array, the group names and the groups; `weights` is passed along only
        when it is not `None`.
        """

        group_names, groups, weights = args
        if not self.use_groups:
            return background_data

        if weights is None:
            return DenseData(background_data, group_names, groups)
        return DenseData(background_data, group_names, groups, weights)
Esempio n. 7
0
    def _(self, background_data, *args,
          **kwargs) -> Union[shap.common.Data, pd.core.frame.Series]:
        """
        Prepare background data supplied as a `pandas.Series`.

        `args` must unpack to exactly three items, of which only the second
        (`groups`) is used. If `self.use_groups` is falsy the series is returned
        unchanged. Otherwise its values are reshaped to a single row and wrapped in
        a `DenseData` object whose group names are the series' index labels.
        """

        _, groups, _ = args
        if not self.use_groups:
            return background_data

        # a series is one sample: shape (1, number of entries)
        single_row = background_data.values.reshape(1, len(background_data))
        index_labels = list(background_data.index)
        return DenseData(single_row, index_labels, groups)
Esempio n. 8
0
    def _(self, background_data, *args,
          **kwargs) -> Union[shap.common.Data, sparse.spmatrix]:
        """
        Prepare background data supplied as a sparse matrix.

        `args` must unpack to exactly `(group_names, groups, weights)`. If
        `self.use_groups` is falsy the sparse matrix is returned unchanged.
        Otherwise a warning is logged, the matrix is converted to a dense array via
        `toarray()` and wrapped in a `DenseData` object; `weights` is passed along
        only when it is not `None`.
        """

        group_names, groups, weights = args
        if not self.use_groups:
            return background_data

        logger.warning(
            "Grouping is not currently compatible with sparse matrix inputs. "
            "Converting background data sparse array to dense matrix.")
        dense = background_data.toarray()

        if weights is None:
            extra = (group_names, groups)
        else:
            extra = (group_names, groups, weights)
        return DenseData(dense, *extra)
Esempio n. 9
0
def force_plot(base_value,
               shap_values,
               features=None,
               feature_names=None,
               out_names=None,
               link="identity",
               plot_cmap="RdBu",
               matplotlib=False,
               show=True,
               figsize=(20, 3),
               ordering_keys=None,
               ordering_keys_time_format=None):
    """ Visualize the given SHAP values with an additive force layout.

    Parameters
    ----------
    base_value : float
        Value the feature contributions start from. A length-1 `numpy` array
        is unwrapped to its single element.
    shap_values : numpy.array
        SHAP values, either 1D (# features) or 2D (# samples x # features).
        A 1D array or a single row gives one force plot, several rows give a
        stacked plot. Any non-array, non-list object is handed to `visualize`
        unchanged.
    features : numpy.array, pandas.DataFrame, pandas.Series or list
        Feature values to display. For a data frame / series the column
        headers / index labels are used as `feature_names` when none are
        given. A list is used as `feature_names` (when none are given) and
        then discarded; a 1D array is treated as names when `feature_names`
        is None.
    feature_names : list
        Names of the features; defaults to names generated from
        `labels['FEATURE']`.
    out_names : str or list
        Name(s) of the model output. A single string is wrapped in a list;
        defaults to ``["output value"]``.
    link
        Passed through `convert_to_link`.
    plot_cmap, matplotlib, show, figsize
        Forwarded to `visualize` for a single-sample plot (`plot_cmap` also
        for multi-sample plots).
    ordering_keys, ordering_keys_time_format
        Forwarded to `visualize` for multi-sample plots.

    Returns
    -------
    Whatever `visualize` returns.

    Raises
    ------
    Exception
        If `base_value` is an array/list but `shap_values` is not a list of
        the same length, if the number of feature values does not match the
        number of SHAP values, or if `matplotlib` is true for several samples.
    AssertionError
        If `shap_values` is a list.
    """

    # auto unwrap the base_value
    if type(base_value) == np.ndarray and len(base_value) == 1:
        base_value = base_value[0]

    if (type(base_value) == np.ndarray or type(base_value) == list):
        if type(shap_values) != list or len(shap_values) != len(base_value):
            raise Exception("In v0.20 force_plot now requires the base value as the first parameter! " \
                            "Try shap.force_plot(explainer.expected_value, shap_values) or " \
                            "for multi-output models try " \
                            "shap.force_plot(explainer.expected_value[0], shap_values[0]).")

    assert not type(
        shap_values
    ) == list, "The shap_values arg looks looks multi output, try shap_values[i]."

    link = convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(
            features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]
    elif isinstance(out_names, str):
        # a bare string would otherwise reach Model un-wrapped
        out_names = [out_names]

    # default names depend only on the number of columns, so build them once
    if feature_names is None:
        feature_names = [
            labels['FEATURE'] % str(j) for j in range(shap_values.shape[1])
        ]

    if shap_values.shape[0] == 1:
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        # check that the shape of the shap_values and features match
        if len(features) != shap_values.shape[1]:
            msg = "Length of features is not equal to the length of shap_values!"
            if len(features) == shap_values.shape[1] - 1:
                msg += " You might be using an old format shap_values array with the base value " \
                       "as the last column. In this case just pass the array without the last column."
            raise Exception(msg)

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            base_value,
            np.sum(shap_values[0, :]) + base_value, shap_values[0, :], None,
            instance, link, Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)))

        return visualize(e, plot_cmap, matplotlib, figsize=figsize, show=show)

    else:
        if matplotlib:
            raise Exception(
                "matplotlib = True is not yet supported for force plots with multiple samples!"
            )

        if shap_values.shape[0] > 3000:
            warnings.warn(
                "shap.force_plot is slow many thousands of rows, try subsampling your data."
            )

        exps = []
        # `row` is kept distinct from the comprehension variables below
        for row in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[row, :]

            instance = Instance(np.ones((1, len(feature_names))),
                                display_features)
            e = AdditiveExplanation(
                base_value,
                np.sum(shap_values[row, :]) + base_value, shap_values[row, :],
                None, instance, link, Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))),
                          list(feature_names)))
            exps.append(e)

        return visualize(exps,
                         plot_cmap=plot_cmap,
                         ordering_keys=ordering_keys,
                         ordering_keys_time_format=ordering_keys_time_format)
Esempio n. 10
0
def force_plot(
    base_value,
    shap_values,
    features=None,
    feature_names=None,
    out_names=None,
    link="identity",
    plot_cmap="RdBu",
    show=True,
    figsize=(20, 3),
    ordering_keys=None,
    ordering_keys_time_format=None,
    text_rotation=0,
):
    """ Visualize the given SHAP values with an additive force layout.

    Parameters
    ----------
    base_value : float
        This is the reference value that the feature contributions start from. For SHAP values it should
        be the value of explainer.expected_value. A length-1 numpy array is unwrapped to its element.
    shap_values : numpy.array
        Matrix of SHAP values (# features) or (# samples x # features). If this is a 1D array then a single
        force plot will be drawn, if it is a 2D array then a stacked force plot will be drawn. Any
        non-array, non-list object is handed to `visualize` unchanged.
    features : numpy.array
        Matrix of feature values (# features) or (# samples x # features). This provides the values of all the
        features, and should be the same shape as the shap_values argument. A pandas data frame or series
        also supplies `feature_names` (column headers / index labels) when none are given; a list or a
        1D array given without `feature_names` is used as the feature names instead.
    feature_names : list
        List of feature names (# features). Defaults to names generated from `labels["FEATURE"]`.
    out_names : str
        The name of the output of the model (plural to support multi-output plotting in the future).
        A single string is wrapped in a list; defaults to ``["output value"]``.
    link : "identity" or "logit"
        The transformation used when drawing the tick mark labels. Using logit will change log-odds numbers
        into probabilities.
    plot_cmap
        Forwarded to `visualize`.
    show, figsize
        Forwarded to `visualize` for a single-sample plot.
    ordering_keys, ordering_keys_time_format
        Forwarded to `visualize` for a multi-sample plot.
    text_rotation
        Forwarded to `visualize`.

    Returns
    -------
    Whatever `visualize` returns.

    Raises
    ------
    Exception
        If `base_value` is an array/list but `shap_values` is not a list of the same length, or if the
        number of feature values does not match the number of SHAP values of a single sample.
    AssertionError
        If `shap_values` is a list.
    """

    # auto unwrap the base_value
    if type(base_value) == np.ndarray and len(base_value) == 1:
        base_value = base_value[0]

    if type(base_value) == np.ndarray or type(base_value) == list:
        if type(shap_values) != list or len(shap_values) != len(base_value):
            raise Exception(
                "In v0.20 force_plot now requires the base value as the first parameter! "
                "Try shap.force_plot(explainer.expected_value, shap_values) or "
                "for multi-output models try "
                "shap.force_plot(explainer.expected_value[0], shap_values[0])."
            )

    assert (
        not type(shap_values) == list
    ), "The shap_values arg looks looks multi output, try shap_values[i]."

    link = convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]
    elif type(out_names) == str:
        out_names = [out_names]

    # default names depend only on the number of columns, so build them once
    # here rather than on every iteration of the per-sample loop below
    if feature_names is None:
        feature_names = [
            labels["FEATURE"] % str(i) for i in range(shap_values.shape[1])
        ]

    if shap_values.shape[0] == 1:
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        # check that the shape of the shap_values and features match
        if len(features) != shap_values.shape[1]:
            msg = "Length of features is not equal to the length of shap_values!"
            if len(features) == shap_values.shape[1] - 1:
                msg += (
                    " You might be using an old format shap_values array with the base value "
                    "as the last column. In this case just pass the array without the last column."
                )
            raise Exception(msg)

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            base_value,
            np.sum(shap_values[0, :]) + base_value,
            shap_values[0, :],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)),
        )

        return visualize(
            e, plot_cmap, figsize=figsize, show=show, text_rotation=text_rotation
        )

    else:

        if shap_values.shape[0] > 3000:
            warnings.warn(
                "shap.force_plot is slow for many thousands of rows, try subsampling your data."
            )

        exps = []
        for k in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[k, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                base_value,
                np.sum(shap_values[k, :]) + base_value,
                shap_values[k, :],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names)),
            )
            exps.append(e)

        return visualize(
            exps,
            plot_cmap=plot_cmap,
            ordering_keys=ordering_keys,
            ordering_keys_time_format=ordering_keys_time_format,
            text_rotation=text_rotation,
        )