def _(self, background_data, *args, **kwargs) -> Union[shap.common.Data, pd.core.frame.DataFrame]:
    """
    Prepare background data supplied as a `pandas.core.frame.DataFrame`.

    When grouping is enabled (`self.use_groups`), the frame is wrapped explicitly in a
    `DenseData` object (or a `DenseDataWithIndex` object if the `keep_index` keyword
    argument is truthy), because the `shap` library does not do this internally for
    grouped data frames. Column headers are used as group names, so the first positional
    argument (the group names) is ignored. When grouping is disabled the frame is
    returned untouched and initialisation is left to the `shap` library.

    `args` must unpack to exactly `(group_names, groups, weights)`.
    """

    _ignored_names, groups, weights = args

    if not self.use_groups:
        return background_data

    logger.info(
        "Group names are specified by column headers, group_names will be ignored!"
    )

    # Weights are only forwarded when they were actually provided.
    trailing = (groups,) if weights is None else (groups, weights)
    values = background_data.values
    column_names = list(background_data.columns)

    if kwargs.get("keep_index", False):
        frame_index = background_data.index
        return DenseDataWithIndex(
            values,
            column_names,
            frame_index.values,
            frame_index.name,
            *trailing,
        )

    return DenseData(values, column_names, *trailing)
def kmeans(X, k, round_values=True):
    """
    Summarise a dataset with `k` k-means centroids, each weighted by its cluster size.

    This function should be imported from shap.kmeans. Remove it when they merge and
    release the following changes: https://github.com/slundberg/shap/pull/1135

    Parameters
    ----------
    X
        Data to summarise, indexed as (samples, features). If it is a
        `pandas.core.frame.DataFrame`, its column labels become the group names and its
        `.values` are clustered; otherwise the group names are the stringified column
        positions.
    k
        Number of clusters requested from `KMeans`.
    round_values
        If `True`, every centroid coordinate is replaced by the value of the same
        feature in the (imputed) data that lies closest to it.

    Returns
    -------
    A `DenseData` object holding the centroids, the group names, and one weight per
    centroid equal to the number of samples assigned to it.
    """

    group_names = [str(i) for i in range(X.shape[1])]
    if str(type(X)).endswith("'pandas.core.frame.DataFrame'>"):
        group_names = X.columns
        X = X.values

    # in case there are any missing values in data impute them
    imp = SimpleImputer(missing_values=np.nan, strategy="mean")
    X = imp.fit_transform(X)

    fitted = KMeans(n_clusters=k, random_state=0).fit(X)
    centers = fitted.cluster_centers_  # same array object; edited in place below

    if round_values:
        sparse_input = issparse(X)
        for j in range(X.shape[1]):
            # Densify each column once instead of once per cluster.
            column = X[:, j].toarray().flatten() if sparse_input else X[:, j]
            for i in range(k):
                ind = np.argmin(np.abs(column - centers[i, j]))
                centers[i, j] = column[ind]

    # `minlength=k` guarantees one weight per centroid even if the highest-numbered
    # clusters received no samples; without it the weight vector can be shorter than
    # the number of centroids.
    cluster_sizes = 1.0 * np.bincount(fitted.labels_, minlength=k)

    return DenseData(
        centers,
        group_names,
        None,
        cluster_sizes,
    )
def get_data(kind='array', n_rows=15, n_cols=49, fnames=None, seed=None):
    """
    Build a random dataset of the requested container type, used to test the grouping
    functionality of the wrapper.

    The global numpy random state is seeded with `seed` before anything else happens.
    `kind` must be a member of `SUPPORTED_BACKGROUND_DATA_TYPES`, otherwise a
    `ValueError` is raised. The kinds handled here are:

    * ``'array'``: the raw random matrix;
    * ``'sparse'``: the matrix as a `scipy.sparse.csr_matrix`;
    * ``'frame'``: a `pandas.DataFrame` whose columns are `fnames`;
    * ``'series'``: one randomly chosen row of that data frame;
    * ``'data'``: a `DenseData` object whose group names are `fnames`.

    When `fnames` is empty or `None`, names of the form ``feature_<i>`` are generated.
    Any other supported kind falls through and returns ``0``.
    """

    np.random.seed(seed)

    if kind not in SUPPORTED_BACKGROUND_DATA_TYPES:
        msg = "Selected data type, {}, is not an allowed type. " \
              "Allowed types are {}"
        raise ValueError(msg.format(kind, SUPPORTED_BACKGROUND_DATA_TYPES))

    X = get_random_matrix(n_rows=n_rows, n_cols=n_cols)

    if kind == 'array':
        return X

    if kind == 'sparse':
        return scipy.sparse.csr_matrix(X)

    if kind in ('frame', 'series'):
        if not fnames:
            fnames = ['feature_{}'.format(col) for col in range(X.shape[-1])]
        if kind == 'series':
            # The row is drawn before the frame is built to keep the RNG call order.
            row = np.random.choice(np.arange(X.shape[0]))
            return pd.DataFrame(data=X, columns=fnames).iloc[row, :]
        return pd.DataFrame(data=X, columns=fnames)

    if kind == 'data':
        if fnames:
            group_names = fnames
        else:
            group_names = ['feature_{}'.format(col) for col in range(X.shape[-1])]
        return DenseData(X, group_names)

    return 0
def __call__(self, background_data, n_background_samples):
    """
    Summarise `background_data` via `self._mock_kmeans` and wrap the result in a
    `DenseData` object.

    Group names are the column labels when `background_data` is a `pandas.DataFrame`,
    and the stringified column positions otherwise. No explicit groups are passed
    (the third `DenseData` argument is `None`).
    """

    summary = self._mock_kmeans(background_data, n_background_samples)

    if isinstance(background_data, pandas.DataFrame):
        names = background_data.columns
    else:
        names = [str(col) for col in range(background_data.shape[1])]

    return DenseData(summary, names, None)
def take_subset(self, explain_subset):
    """Restrict the dataset to the given columns, at most once.

    Calling this again after a subset has been taken is a no-op.

    :param explain_subset: A list of column indexes to take from the original dataset.
    :type explain_subset: list
    """
    if self._subset_taken:
        return

    columns = np.array(explain_subset)
    current = self._dataset

    if isinstance(current, DenseData):
        # Edge case: slice the existing summary (data and group names) rather than
        # recomputing the summary, which is more optimal.
        kept_names = np.array(current.group_names)[columns].tolist()
        self._dataset = DenseData(current.data[:, columns], kept_names)
    else:
        self._dataset = current[:, columns]

    self._subset_taken = True
def _(self, background_data, *args, **kwargs) -> Union[np.ndarray, shap.common.Data]:
    """
    Prepare background data supplied as an `np.ndarray`.

    With feature grouping enabled (`self.use_groups`), a `DenseData` object is built
    from the array, the group names and the groups. Weights are appended only when
    they are not `None`, to avoid triggering an assertion inside the `shap` library.
    Without grouping, the array is returned unchanged and handled by `shap` internally.

    `args` must unpack to exactly `(group_names, groups, weights)`.
    """

    group_names, groups, weights = args

    if not self.use_groups:
        return background_data

    if weights is None:
        return DenseData(background_data, group_names, groups)

    return DenseData(background_data, group_names, groups, weights)
def _(self, background_data, *args, **kwargs) -> Union[shap.common.Data, pd.core.frame.Series]:
    """
    Prepare background data supplied as a `pandas.Series`.

    If no group structure is in use, the series is returned as-is and initialised
    internally by `shap`. Otherwise the values are reshaped into a single row and
    wrapped in a `DenseData` object, with the series index as group names. The group
    names and weights passed in `args` are not used.

    `args` must unpack to exactly `(group_names, groups, weights)`.
    """

    _unused_names, groups, _unused_weights = args

    if not self.use_groups:
        return background_data

    single_row = background_data.values.reshape(1, len(background_data))
    index_labels = list(background_data.index)

    return DenseData(single_row, index_labels, groups)
def _(self, background_data, *args, **kwargs) -> Union[shap.common.Data, sparse.spmatrix]:
    """
    Prepare background data supplied as a sparse matrix.

    With feature grouping enabled (`self.use_groups`), a warning is logged, the matrix
    is converted to a dense array and wrapped in a `DenseData` object. Weights are
    forwarded only when they are not `None`. Without grouping, the sparse matrix is
    returned unchanged and handled internally by the `shap` library.

    `args` must unpack to exactly `(group_names, groups, weights)`.
    """

    group_names, groups, weights = args

    if not self.use_groups:
        return background_data

    logger.warning(
        "Grouping is not currently compatible with sparse matrix inputs. "
        "Converting background data sparse array to dense matrix."
    )
    dense = background_data.toarray()

    if weights is None:
        trailing = (group_names, groups)
    else:
        trailing = (group_names, groups, weights)

    return DenseData(dense, *trailing)
def force_plot(base_value, shap_values, features=None, feature_names=None, out_names=None, link="identity",
               plot_cmap="RdBu", matplotlib=False, show=True, figsize=(20, 3), ordering_keys=None,
               ordering_keys_time_format=None):
    """ Visualize the given SHAP values with an additive force layout.

    Parameters
    ----------
    base_value : float
        Reference value that the feature contributions start from. A numpy array of
        length one is unwrapped to its single element.

    shap_values : numpy.array
        SHAP values, either 1D (# features) or 2D (# samples x # features). A 1D array is
        reshaped to a single row. One row produces a single force plot, any other number
        of rows produces a stacked plot. Any object that is not exactly a `numpy.ndarray`
        is passed straight to `visualize`.

    features : numpy.array, pandas.DataFrame, pandas.Series or list
        Feature values to display. For a DataFrame / Series the column / index labels
        are used as `feature_names` when none are given. A list is interpreted as feature
        names (when none are given) and carries no values. A 1D array is interpreted as
        feature names when `feature_names` is None.

    feature_names : list
        Names of the features. Defaults to names generated from `labels['FEATURE']`.

    out_names : str or list
        Name(s) of the model output. A single string is wrapped in a list; defaults to
        ["output value"].

    link : "identity" or "logit"
        Passed through `convert_to_link`.

    plot_cmap, matplotlib, show, figsize
        Forwarded to `visualize` for a single-sample plot. `matplotlib=True` is rejected
        for multi-sample plots.

    ordering_keys, ordering_keys_time_format
        Forwarded to `visualize` for a multi-sample plot.

    Raises
    ------
    Exception
        If `base_value` is an array/list but `shap_values` is not a list of the same
        length, if the number of features does not match the SHAP values in the
        single-sample case, or if `matplotlib=True` is used with multiple samples.
    AssertionError
        If `shap_values` is a list.
    """

    # auto unwrap the base_value
    if type(base_value) == np.ndarray and len(base_value) == 1:
        base_value = base_value[0]

    if (type(base_value) == np.ndarray or type(base_value) == list):
        if type(shap_values) != list or len(shap_values) != len(base_value):
            raise Exception("In v0.20 force_plot now requires the base value as the first parameter! "
                            "Try shap.force_plot(explainer.expected_value, shap_values) or "
                            "for multi-output models try "
                            "shap.force_plot(explainer.expected_value[0], shap_values[0]).")

    assert not type(shap_values) == list, "The shap_values arg looks looks multi output, try shap_values[i]."

    link = convert_to_link(link)

    if type(shap_values) != np.ndarray:
        return visualize(shap_values)

    # convert from a DataFrame or other types
    if str(type(features)) == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif str(type(features)) == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names = features
        features = None

    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, len(shap_values)))

    if out_names is None:
        out_names = ["output value"]
    elif isinstance(out_names, str):
        # A bare string would otherwise be handed to Model as-is, while the default is
        # a one-element list of names; wrap it so both cases have the same shape.
        out_names = [out_names]

    if shap_values.shape[0] == 1:
        if feature_names is None:
            feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])]
        if features is None:
            features = ["" for _ in range(len(feature_names))]
        if type(features) == np.ndarray:
            features = features.flatten()

        # check that the shape of the shap_values and features match
        if len(features) != shap_values.shape[1]:
            msg = "Length of features is not equal to the length of shap_values!"
            if len(features) == shap_values.shape[1] - 1:
                msg += " You might be using an old format shap_values array with the base value " \
                       "as the last column. In this case just pass the array without the last column."
            raise Exception(msg)

        instance = Instance(np.zeros((1, len(feature_names))), features)
        e = AdditiveExplanation(
            base_value,
            np.sum(shap_values[0, :]) + base_value,
            shap_values[0, :],
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names))
        )
        return visualize(e, plot_cmap, matplotlib, figsize=figsize, show=show)

    else:
        if matplotlib:
            raise Exception("matplotlib = True is not yet supported for force plots with multiple samples!")

        if shap_values.shape[0] > 3000:
            warnings.warn("shap.force_plot is slow many thousands of rows, try subsampling your data.")

        # Default names do not depend on the row, so build them once up front.
        if feature_names is None:
            feature_names = [labels['FEATURE'] % str(i) for i in range(shap_values.shape[1])]

        exps = []
        for row in range(shap_values.shape[0]):
            if features is None:
                display_features = ["" for _ in range(len(feature_names))]
            else:
                display_features = features[row, :]

            instance = Instance(np.ones((1, len(feature_names))), display_features)
            e = AdditiveExplanation(
                base_value,
                np.sum(shap_values[row, :]) + base_value,
                shap_values[row, :],
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names))
            )
            exps.append(e)

        return visualize(
            exps,
            plot_cmap=plot_cmap,
            ordering_keys=ordering_keys,
            ordering_keys_time_format=ordering_keys_time_format
        )
def force_plot(
    base_value,
    shap_values,
    features=None,
    feature_names=None,
    out_names=None,
    link="identity",
    plot_cmap="RdBu",
    show=True,
    figsize=(20, 3),
    ordering_keys=None,
    ordering_keys_time_format=None,
    text_rotation=0,
):
    """ Draw an additive force layout for the supplied SHAP values.

    Parameters
    ----------
    base_value : float
        Reference value the feature contributions start from; for SHAP values this
        should be explainer.expected_value. A length-one numpy array is unwrapped.

    shap_values : numpy.array
        SHAP values with shape (# features) or (# samples x # features). A 1D array
        yields a single force plot, a 2D array a stacked force plot.

    features : numpy.array
        Feature values with shape (# features) or (# samples x # features); should have
        the same shape as shap_values. pandas DataFrame / Series inputs also supply
        feature names when none are given; a list or a 1D array given without
        feature_names is taken to be the feature names.

    feature_names : list
        Feature names (# features).

    out_names : str
        Name of the model output (plural to support multi-output plotting in the
        future). A single string is wrapped in a list.

    link : "identity" or "logit"
        Transformation used when drawing the tick mark labels. Using logit will change
        log-odds numbers into probabilities.

    plot_cmap, show, figsize, text_rotation
        Forwarded to `visualize`; `show` and `figsize` only for a single-sample plot.

    ordering_keys, ordering_keys_time_format
        Forwarded to `visualize` for a stacked plot.
    """

    # A length-one array stands for the scalar it contains.
    if type(base_value) is np.ndarray and len(base_value) == 1:
        base_value = base_value[0]

    if type(base_value) in (np.ndarray, list):
        matches_base = type(shap_values) is list and len(shap_values) == len(base_value)
        if not matches_base:
            raise Exception(
                "In v0.20 force_plot now requires the base value as the first parameter! "
                "Try shap.force_plot(explainer.expected_value, shap_values) or "
                "for multi-output models try "
                "shap.force_plot(explainer.expected_value[0], shap_values[0])."
            )

    assert (
        type(shap_values) is not list
    ), "The shap_values arg looks looks multi output, try shap_values[i]."

    link = convert_to_link(link)

    # Anything that is not a plain ndarray is handed to the visualizer directly.
    if type(shap_values) is not np.ndarray:
        return visualize(shap_values)

    # Normalise `features`: pandas objects, lists and bare 1D arrays may carry names.
    features_type = str(type(features))
    if features_type == "<class 'pandas.core.frame.DataFrame'>":
        if feature_names is None:
            feature_names = list(features.columns)
        features = features.values
    elif features_type == "<class 'pandas.core.series.Series'>":
        if feature_names is None:
            feature_names = list(features.index)
        features = features.values
    elif isinstance(features, list):
        if feature_names is None:
            feature_names = features
        features = None
    elif features is not None and len(features.shape) == 1 and feature_names is None:
        feature_names, features = features, None

    # Promote a flat vector of SHAP values to a single-row matrix.
    if len(shap_values.shape) == 1:
        shap_values = np.reshape(shap_values, (1, shap_values.shape[0]))

    if out_names is None:
        out_names = ["output value"]
    elif type(out_names) is str:
        out_names = [out_names]

    n_samples = shap_values.shape[0]
    n_columns = shap_values.shape[1]

    if n_samples == 1:
        if feature_names is None:
            feature_names = [labels["FEATURE"] % str(col) for col in range(n_columns)]
        if features is None:
            features = [""] * len(feature_names)
        if type(features) is np.ndarray:
            features = features.flatten()

        # The feature values must line up with the SHAP values.
        n_given = len(features)
        if n_given != n_columns:
            msg = "Length of features is not equal to the length of shap_values!"
            if n_given == n_columns - 1:
                msg += (
                    " You might be using an old format shap_values array with the base value "
                    "as the last column. In this case just pass the array without the last column."
                )
            raise Exception(msg)

        only_row = shap_values[0, :]
        instance = Instance(np.zeros((1, len(feature_names))), features)
        explanation = AdditiveExplanation(
            base_value,
            np.sum(only_row) + base_value,
            only_row,
            None,
            instance,
            link,
            Model(None, out_names),
            DenseData(np.zeros((1, len(feature_names))), list(feature_names)),
        )
        return visualize(
            explanation,
            plot_cmap,
            figsize=figsize,
            show=show,
            text_rotation=text_rotation,
        )

    if n_samples > 3000:
        warnings.warn(
            "shap.force_plot is slow for many thousands of rows, try subsampling your data."
        )

    explanations = []
    for row_idx, row_values in enumerate(shap_values):
        if feature_names is None:
            feature_names = [labels["FEATURE"] % str(col) for col in range(n_columns)]

        if features is None:
            display_features = [""] * len(feature_names)
        else:
            display_features = features[row_idx, :]

        instance = Instance(np.ones((1, len(feature_names))), display_features)
        explanations.append(
            AdditiveExplanation(
                base_value,
                np.sum(row_values) + base_value,
                row_values,
                None,
                instance,
                link,
                Model(None, out_names),
                DenseData(np.ones((1, len(feature_names))), list(feature_names)),
            )
        )

    return visualize(
        explanations,
        plot_cmap=plot_cmap,
        ordering_keys=ordering_keys,
        ordering_keys_time_format=ordering_keys_time_format,
        text_rotation=text_rotation,
    )