def _create_linear_explainer(model, multiclass, mean, covariance, seed):
    """Create the linear explainer or, in multiclass case, list of explainers.

    :param model: The linear model to compute the shap values for.
    :type model: linear model that implements sklearn.predict or sklearn.predict_proba
    :param multiclass: True if this is a multiclass model.
    :type multiclass: bool
    :param mean: The mean of the dataset by columns.
    :type mean: numpy.array
    :param covariance: The covariance matrix of the dataset.
    :type covariance: numpy.array
    :param seed: Random number seed.
    :type seed: int
    """
    np.random.seed(seed)
    if multiclass:
        explainers = []
        coefs = model.coef_
        intercepts = model.intercept_
        if isinstance(intercepts, np.ndarray):
            intercepts = intercepts.tolist()
        if isinstance(intercepts, list):
            coef_intercept_list = zip(coefs, intercepts)
        else:
            coef_intercept_list = [(coef, intercepts) for coef in coefs]
        for class_coef, intercept in coef_intercept_list:
            linear_explainer = shap.LinearExplainer((class_coef, intercept),
                                                    (mean, covariance))
            explainers.append(linear_explainer)
        return explainers
    else:
        model_coef = model.coef_
        model_intercept = model.intercept_
        return shap.LinearExplainer((model_coef, model_intercept),
                                    (mean, covariance))
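A hedged usage sketch of the helper above on a small synthetic multiclass problem; the dataset, the LogisticRegression model, and the keyword-style call are illustrative assumptions, not part of the original module:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Illustrative data and model (hypothetical): any sklearn linear model with coef_/intercept_ works
X, y = make_classification(n_samples=200, n_features=5, n_informative=3,
                           n_classes=3, random_state=0)
model = LogisticRegression(max_iter=1000).fit(X, y)

# Background statistics expected by the helper
mean = X.mean(axis=0)
covariance = np.cov(X, rowvar=False)

# multiclass=True yields one shap.LinearExplainer per class
explainers = _create_linear_explainer(model, multiclass=True, mean=mean,
                                      covariance=covariance, seed=0)
shap_values_class0 = explainers[0].shap_values(X[:5])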
Example #2
    def fit(self, X_exp, y_exp, X_train, y_train, X_val, y_val):
        """[Trains a model over the input data and constructs the Explainer object using trained model]

        Args:
            X_exp ([np.array or pd.DataFrame]): [Input data of which Shapley values are calculated]
            y_exp ([np.array or pd.DataFrame]): [Input labels]
            X_train ([np.array or pd.DataFrame]): [Train partition of input data]
            y_train ([np.array or pd.DataFrame]): [Train partition of input labels]
            X_val ([np.array or pd.DataFrame]): [Test partition of input data]
            y_val ([np.array or pd.DataFrame]): [Test partition of input labels]

        Returns:
            [self]: [Returns model itself with `explainer_model` and `base_model`]
        """

        if self.explainer_type == "Linear":
            self.base_model = LinearRegression().fit(X_exp, y_exp)
        else:
            eval = [
                (xgboost.DMatrix(X_train, label=y_train), "train"),
                (xgboost.DMatrix(X_val, label=y_val), "val"),
            ]
            self.base_model = xgboost.train(self.explainer_params,
                                            xgboost.DMatrix(X_train,
                                                            label=y_train),
                                            evals=eval,
                                            **self.keyword_args)

        if self.explainer_type == "Linear":
            self.explainer = shap.LinearExplainer(
                self.base_model, X_exp, feature_dependence="independent")
        else:
            self.explainer = shap.TreeExplainer(self.base_model)

        return self
Example #3
    def __init__(self, *argv, **kwargs):
        """
        Initialize shap LinearExplainer object.
        """
        super(LinearExplainer, self).__init__(*argv, **kwargs)

        self.explainer = shap.LinearExplainer(*argv, **kwargs)
Example #4
    def shap_plot(self, X_train, Y_train):

        explainer = shap.LinearExplainer(
            self.model, X_train
        )  # The background data here should be the data the model was trained on
        shap_values = explainer.shap_values(X_train)
        global_shap_values = np.abs(shap_values).mean(0)

        # Get Features report:
        self.features_report(X_train, Y_train, global_shap_values)

        # Summary plot:
        plt.figure()
        shap.summary_plot(shap_values,
                          X_train,
                          feature_names=X_train.columns,
                          show=False,
                          max_display=45)
        plt.savefig(f"Summary_plot.png", bbox_inches='tight', dpi=600)
        plt.show()

        # Bar summary plot:
        plt.figure()
        shap.summary_plot(shap_values,
                          X_train,
                          plot_type="bar",
                          show=False,
                          max_display=45)
        plt.savefig(f"bar_plot.png", bbox_inches='tight', dpi=600)
        plt.show()
    def explainer(self, X):
        """Compute the importance of each feature for the underlying regressor."""

        try:
            sklearn.utils.validation.check_is_fitted(
                estimator=self.pipe, attributes='_final_estimator')
        except AttributeError:
            print(
                'The pipeline has not been built. Please use the fit method beforehand.'
            )

        if self.explainer_type == 'tree':
            explainer = shap.TreeExplainer(
                model=self.pipe.named_steps['reg'],
                feature_perturbation='interventional',
                data=X)
            shap_values = explainer.shap_values(X=X)
        elif self.explainer_type == 'linear':
            explainer = shap.LinearExplainer(
                model=self.pipe.named_steps['reg'],
                feature_perturbation='correlation_dependent',
                data=X)
            shap_values = explainer.shap_values(X=X)
        elif self.explainer_type == 'kernel':
            explainer = shap.KernelExplainer(
                model=self.pipe.named_steps['reg'].predict, data=X)
            shap_values = explainer.shap_values(X=X, l1_reg='aic')

        return explainer, shap_values
Example #6
 def top_shap_dimensions(self,
                         task: str,
                         k: int,
                         clf=None,
                         X_train=None) -> List[int]:
     """
     Averages over absolute shap values for each dimension, returning k top dimensions.
     """
     if not clf:
         # Unpacking from task_data
         clf = self.task_data[task]['clf']
         X_train = self.task_data[task]['X_train']
         task = None
     explainer = shap.LinearExplainer(clf,
                                      X_train,
                                      feature_dependence="independent")
     shap_values = explainer(X_train)
     logger.log(f"Classifier has {len(clf.classes_)} classes")
     if len(clf.classes_) == 2:
         vals = np.abs(shap_values.values).mean(0)
         # Each dimension index, sorted descending by sum of shap score
         sorted_dimensions = np.argsort(-vals, axis=0)
     else:
         vals = np.sum(np.abs(shap_values.values), axis=2).mean(0)
         sorted_dimensions = np.argsort(-vals, axis=0)
     return sorted_dimensions[:k]
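The ranking step above boils down to a few lines; here is a standalone sketch under the same (older) shap API used in the example, where top_dimensions, clf, and X_train are illustrative names rather than part of the original class:

import numpy as np
import shap

def top_dimensions(clf, X_train, k):
    """Return the indices of the k dimensions with the largest mean |SHAP| value."""
    explainer = shap.LinearExplainer(clf, X_train,
                                     feature_dependence="independent")
    shap_values = explainer(X_train)
    vals = np.abs(shap_values.values)
    if vals.ndim == 3:  # multiclass: (samples, features, classes)
        vals = vals.sum(axis=2)
    return np.argsort(-vals.mean(0))[:k]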
    def explain(self,
                x: Optional[TrainData] = None,
                save_shap_values: bool = True) -> np.ndarray:

        assert self.model is not None, "Model must be trained!"

        if self.explainer is None:
            mean = self._calculate_big_mean()
            self.explainer: shap.LinearExplainer = shap.LinearExplainer(  # type: ignore
                self.model, (mean, None),
                feature_dependence="independent")

        if x is None:
            test_arrays_loader = self.get_dataloader(mode="test",
                                                     batch_file_size=1,
                                                     shuffle_data=False)

            _, val = list(next(iter(test_arrays_loader)).items())[0]
            x = val.x

        reshaped_x = self._concatenate_data(x)
        explanations = self.explainer.shap_values(reshaped_x)

        if save_shap_values:
            analysis_folder = self.model_dir / "analysis"
            if not analysis_folder.exists():
                analysis_folder.mkdir()

            np.save(analysis_folder / f"shap_values.npy", explanations)
            np.save(analysis_folder / f"input.npy", reshaped_x)

        return explanations
def create_shap_values(model_name, model, X_train):
    shap_values = None
    if model_name in ['LR']:
        print()
        print(f"train LinearExplainer on {model_name}")
        explainer = shap.LinearExplainer(model, X_train)
        shap_values = explainer.shap_values(X_train)

    elif model_name in ['DTC', 'RF']:
        # size = min(X_train.shape[0], 2000)
        print()
        print(f"train TreeExplainer on {model_name}")
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_train)

    elif model_name in ['DNN', 'CNN']:
        size = min(X_train.shape[0], 2500)
        print()
        print(f"train DeepExplainer on {model_name}")
        explainer = shap.DeepExplainer(model, X_train[:size])
        shap_values = explainer.shap_values(X_train.iloc[:size].values,
                                            check_additivity=False)
        shap_values = clean_deep_shap_values(shap_values,
                                             X_train.iloc[:size].shape)

    shap_values_as_df, shap_indices = shap_values_to_df(
        shap_values, list(X_train.columns))
    return shap_values_as_df, shap_indices
Example #9
    def shap_by_class(self, task: str, k: int = 10):
        """
        Identifies those dimensions with the highest average shap score for each predicted class.
        """
        # Unpack from task_data dict
        clf = self.task_data[task]['clf']
        X_train = self.task_data[task]['X_train']

        logger.status_update("Finding top dimensions across classes...")
        explainer = shap.LinearExplainer(clf,
                                         X_train,
                                         feature_dependence="independent")
        shap_values = explainer(X_train)
        self.task_data[task]['class_shaps'] = {}
        if len(clf.classes_) == 2:
            # For binary classification: negative shap implies push to class 0, positive to 1
            sorted_values = np.argsort(shap_values.values.mean(0), axis=0)
            class_zero_dims = sorted_values[:k]
            class_one_dims = sorted_values[-k:]
            self.task_data[task]['class_shaps'] = {
                0: class_zero_dims,
                1: class_one_dims
            }
        else:
            vals = np.sum(shap_values.values, axis=0)
            for label_ind in range(vals.shape[1]):
                scores = vals[:, label_ind]
                sorted_dimensions = np.argsort(-scores, axis=0)
                self.task_data[task]['class_shaps'][
                    label_ind] = sorted_dimensions[:k]
def get_shap(clf_name, pipeline, train_dev_tokens, test_tokens):
    feature = pipeline.named_steps['feature']
    clf = pipeline.named_steps['clf']
    vocab = feature.vocabulary_
    index_feature_d = {}
    for word, index in vocab.items():
        index_feature_d[index] = word
    X_train = feature.transform(train_dev_tokens)
    X_test = feature.transform(test_tokens).toarray()
    explainer = None
    if 'svm' in clf_name:
        explainer = shap.LinearExplainer(clf,
                                         X_train,
                                         feature_dependence="independent")
    else:
        explainer = shap.TreeExplainer(clf)
    shap_values = explainer.shap_values(X_test)
    # get all features
    features_l, importance_l = [], []
    for idx, row in enumerate(shap_values):
        word_shap_val_d = {}
        for idx_b, shap_val in enumerate(row):
            feature = index_feature_d[idx_b]
            word_shap_val_d[feature] = abs(shap_val)  # taking absolute value
        features_tmp = list(word_shap_val_d.keys())
        features = " ".join(features_tmp)
        features_l.append(features)
        scores = list(word_shap_val_d.values())
        importance_l.append(scores)
    return features_l, importance_l
Example #11
def get_shap_analysis(model, classifier, X_train, X_test, satscan=False):
    test_df = get_test_df(satscan=satscan)
    st.markdown("""
        ### Análise SHAP
        A biblioteca SHAP permite uma melhor compreensão dos atributos que mais impactaram a decisão do classificador.
        - **Output value**: é a previsão para o município em questão
        - **Base value**: é o valor que seria predito caso não houvessem informações sobre os atributos daquele município, ou seja, é o valor médio das previsões
        - **Vermelho/Azul**: as cores indicam os atributos que mais "empurram" a previsão para a direita (mostrados em vermelho) ou para a esquerda (mostrados em azul)
    """)

    cadmun = get_cadmun(test_df)
    city = st.selectbox('Selecione uma cidade:', list(cadmun['MUNNOME']))
    muncod = int(cadmun[cadmun["MUNNOME"] == city]["MUNCOD"])
    col_target = "TARGET" if satscan == False else "RISK"
    data_for_prediction = test_df.loc[test_df['MUNCOD'] == muncod].drop(
        columns=["MUNCOD", col_target])
    data_for_prediction_array = data_for_prediction.values.reshape(1, -1)
    if muncod:
        if model == "SVC (Linear)" or model == "Regressão Logística":
            explainer = shap.LinearExplainer(
                classifier, X_train, feature_perturbation="interventional")
            shap_values = explainer.shap_values(data_for_prediction)
            shap.initjs()
            st_shap(
                shap.force_plot(explainer.expected_value, shap_values,
                                data_for_prediction), 400)
        elif model == "Random Forest":
            explainer = shap.TreeExplainer(classifier)
            shap_values = explainer.shap_values(data_for_prediction)
            shap.initjs()
            st_shap(
                shap.force_plot(explainer.expected_value[1], shap_values[1],
                                data_for_prediction), 400)
Example #12
def explain_model(model, nlp_args, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    # use the transformed training data as the background dataset to integrate over
    if model.named_steps['clf'].__class__.__name__ == "LogisticRegression":
        explainer = shap.LinearExplainer(
            model.named_steps['clf'],
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)))
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    elif model.named_steps[
            'clf'].__class__.__name__ == "RandomForestClassifier":
        explainer = shap.TreeExplainer(model.named_steps['clf'])
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    else:
        X_train_summary = shap.kmeans(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)).toarray(),
            10)
        explainer = shap.KernelExplainer(
            model.named_steps['clf'].predict_proba, X_train_summary)
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())

    return explainer, shap_values
Example #13
def test_sparse():
    """ Validate running LinearExplainer on scipy sparse data
    """
    import sklearn.linear_model
    from sklearn.datasets import make_multilabel_classification
    from scipy.special import expit
    import numpy as np
    import shap

    np.random.seed(0)
    n_features = 20
    X, y = make_multilabel_classification(n_samples=100,
                                          sparse=True,
                                          n_features=n_features,
                                          n_classes=1,
                                          n_labels=2)

    # train linear model
    model = sklearn.linear_model.LogisticRegression()
    model.fit(X, y)

    # explain the model's predictions using SHAP values
    explainer = shap.LinearExplainer(model, X)
    shap_values = explainer.shap_values(X)
    assert np.max(np.abs(expit(explainer.expected_value + shap_values.sum(1)) - model.predict_proba(X)[:, 1])) < 1e-6
Example #14
def test_shape_values_linear_many_features():
    import numpy as np
    import shap
    from sklearn.linear_model import Ridge

    np.random.seed(0)

    coef = np.array([1, 2]).T

    # generate linear data
    X = np.random.normal(1, 10, size=(1000, len(coef)))
    y = np.dot(X, coef) + 1 + np.random.normal(scale=0.1, size=1000)

    # train linear model
    model = Ridge(0.1)
    model.fit(X, y)

    # explain the model's predictions using SHAP values
    explainer = shap.LinearExplainer(model, X.mean(0).reshape(1,-1))

    values = explainer.shap_values(X)

    assert values.shape == (1000, 2)

    expected = (X - X.mean(0)) * coef
    np.testing.assert_allclose(expected - values, 0, atol=0.01)
Example #15
    def explain(self, x: Optional[TrainData] = None,
                save_shap_values: bool = True) -> np.ndarray:

        assert self.model is not None, 'Model must be trained!'

        if self.explainer is None:
            mean = self._calculate_big_mean()
            self.explainer: shap.LinearExplainer = shap.LinearExplainer(
                self.model, (mean, None), feature_dependence='independent')

        if x is None:
            test_arrays_loader = DataLoader(data_path=self.data_path, batch_file_size=1,
                                            experiment=self.experiment,
                                            shuffle_data=False, mode='test')
            _, val = list(next(iter(test_arrays_loader)).items())[0]
            x = val.x

        reshaped_x = self._concatenate_data(x)
        explanations = self.explainer.shap_values(reshaped_x)

        if save_shap_values:
            analysis_folder = self.model_dir / 'analysis'
            if not analysis_folder.exists():
                analysis_folder.mkdir()

            np.save(analysis_folder / f'shap_values.npy', explanations)
            np.save(analysis_folder / f'input.npy', reshaped_x)

        return explanations
Example #16
    def explainShapley(self, X_exp, y_exp, X_train, y_train, X_val, y_val):
        params = {
            "eta": 0.01,
            "max_depth": 1,
            "objective": "reg:squarederror",
            "subsample": 0.5,
            "eval_metric": "rmse"
        }

        eval_results = {}
        kwargs = {
            'num_boost_round':500,
            'verbose_eval': 500,
            'evals_result' : {},
            'early_stopping_rounds' : 100
        }
        if self.explainer_type == 'Linear':
            whole_model = LinearRegression().fit(X_exp, y_exp)
        else:
            eval_set = [
                (xgboost.DMatrix(X_train, label=y_train), "train"),
                (xgboost.DMatrix(X_val, label=y_val), "val"),
            ]
            whole_model = xgboost.train(params,
                                        xgboost.DMatrix(X_train, label=y_train),
                                        evals=eval_set,
                                        **kwargs)

        if self.explainer_type == 'Linear':
            self.explainer = shap.LinearExplainer(whole_model,X_exp)
        else:
            self.explainer = shap.TreeExplainer(whole_model)

        shap_values = self.explainer.shap_values(X_exp)

        return shap_values
Example #17
    def __init__(self, model, x_train, x_test, y_test, learner: str):

        self.model = model
        self.x_train = x_train
        self.x_test = x_test
        self.y_test = y_test

        if learner == "linear":
            self.explainer = shap.LinearExplainer(
                self.model, self.x_train, feature_dependence="independent")
        elif learner == "tree":
            self.explainer = shap.TreeExplainer(self.model)
            self.shap_interaction_values = self.explainer.shap_interaction_values(
                self.x_test)
        elif learner == "kernel":

            if hasattr(self.model, "predict_proba"):
                func = self.model.predict_proba
            else:
                func = self.model.predict

            self.explainer = shap.KernelExplainer(func, self.x_train)
        else:
            raise ValueError(f"Learner: {learner} is not supported yet.")

        self.expected_value = self.explainer.expected_value
        self.shap_values = np.array(self.explainer.shap_values(
            self.x_test)).astype(float)

        # Calculate misclassified values
        self.misclassified_values = self._calculate_misclassified()

        # As per SHAP guidelines, test data needs to be dense for plotting functions
        self.x_test_array = self.x_test.values
    def init_explainer(self, X):
        """Initialize the explainer.
        
        If model_type is None then use shap.KernelExplainer class.
        
        Else if it's 'tree' then shap.TreeExplainer.
        
        Else use shap.LinearExplainer.
        
        Parameters
        ----------
        X: array like
            data (possibly training) to start the explainer
        
        Returns
        -------
        shap.KernelExplainer, shap.TreeExplainer, shap.LinearExplainer:
            explainer initialized
        """
        if self.model_type is None:
            explainer = shap.KernelExplainer(self.predict, X)        
        elif self.model_type == 'tree':
            explainer = shap.TreeExplainer(self.model, X)
        else:
            explainer = shap.LinearExplainer(self.model, X)

        return explainer
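The same dispatch can be written as a small free function; a minimal sketch, where make_explainer, model, X_bg, and model_type are illustrative names, not part of the original class:

import shap

def make_explainer(model, X_bg, model_type=None):
    if model_type is None:
        # KernelExplainer expects a prediction function, not the estimator itself
        return shap.KernelExplainer(model.predict, X_bg)
    if model_type == 'tree':
        return shap.TreeExplainer(model, X_bg)
    return shap.LinearExplainer(model, X_bg)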
    def run(self) -> None:
        """
        This is the main function that runs all the functionality for the class
        :return: None
        """
        if self.model_name == 'Logistic Regression':
            if self.only_test:
                explainer_lr_test = shap.LinearExplainer(self.model, self.x_test_scaled)
                shap_values_lr_test = explainer_lr_test.shap_values(self.x_test_scaled)
                plt.figure(figsize=(5, 16))
                shap.summary_plot(shap_values_lr_test,
                                  self.x_test_scaled,
                                  feature_names=self.x_test.columns, show=False)
                st.pyplot(plt.gcf(), bbox_inches='tight', pad_inches=1)
                plt.clf()

        elif self.model_name == 'Random Forest':
            if self.only_test:
                rf_summary_plot(self.model, self.x_test_scaled, self.x_test.columns)
                # explainer_rf_test = shap.TreeExplainer(self.model)
                # shap_values_rf_test = explainer_rf_test.shap_values(self.x_test_scaled)
                # shap.summary_plot(shap_values_rf_test,
                #                   self.x_test_scaled,
                #                   feature_names=self.x_test.columns, show=False)
                #
                st.pyplot(plt.gcf(), bbox_inches='tight')
                plt.clf()
Example #20
def test_tied_pair():
    np.random.seed(0)
    beta = np.array([1, 0, 0])
    mu = np.zeros(3)
    Sigma = np.array([[1, 0.999999, 0], [0.999999, 1, 0], [0, 0, 1]])
    X = np.ones((1, 3))
    explainer = shap.LinearExplainer((beta, 0), (mu, Sigma), feature_dependence="correlation")
    assert np.abs(explainer.shap_values(X) - np.array([0.5, 0.5, 0])).max() < 0.05
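For contrast, a minimal companion test (a sketch, assuming the same older shap API as above): with independent features the linear explainer reduces to beta * (x - mu), so the full credit stays on the single non-zero coefficient instead of being split across the tied pair.

def test_tied_pair_independent():
    np.random.seed(0)
    beta = np.array([1, 0, 0])
    mu = np.zeros(3)
    Sigma = np.array([[1, 0.999999, 0], [0.999999, 1, 0], [0, 0, 1]])
    X = np.ones((1, 3))
    explainer = shap.LinearExplainer((beta, 0), (mu, Sigma), feature_dependence="independent")
    # attribution is beta * (X - mu) = [1, 0, 0]
    assert np.abs(explainer.shap_values(X) - np.array([1.0, 0.0, 0.0])).max() < 1e-6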
Example #21
def show_aud(df):
    """the function returns two list of words that are discussed on Yelp about the particular restaurant."""

    #train test split
    features = df['text']
    target = df['pos_neg']
    X_train, X_test, y_train, y_test = train_test_split(features,
                                                        target,
                                                        test_size=0.25,
                                                        random_state=42)

    #vec
    my_stop_words = text.ENGLISH_STOP_WORDS.union([
        'highly', 'amazing', 'great', 'did', 'make', 'wa', 'wasn', 'don',
        'didn', 'oh', 've', 'definitely', 'absolutely', 'cool', 'best', 'like'
    ])
    vectorizer = TfidfVectorizer(stop_words=my_stop_words)

    #vec for SHAP
    features_train_transformed = vectorizer.fit_transform(X_train)
    features_test_transformed = vectorizer.transform(X_test)

    # Fit a linear logistic regression model
    model = LogisticRegression(solver='lbfgs')
    model.fit(features_train_transformed, y_train)

    # Explain the linear model
    explainer = shap.LinearExplainer(model,
                                     features_train_transformed,
                                     feature_dependence="independent")
    shap_values = explainer.shap_values(features_test_transformed)
    X_test_array = features_test_transformed.toarray()

    #build dataframe for illustration
    res_sv_df = pd.DataFrame(shap_values,
                             columns=vectorizer.get_feature_names())

    #present max/min SHAP values for
    neg_df = res_sv_df.min(axis=0)
    neg_sort = neg_df.sort_values().head(15)
    neg_list = list(dict(neg_sort).keys())
    neg_df_final = pd.DataFrame(
        neg_list,
        index=range(1, 16),
        columns=['Words with Negative Impact on Ratings'])

    pos_df = res_sv_df.max(axis=0)
    pos_sort = pos_df.sort_values(ascending=False).head(15)
    pos_list = list(dict(pos_sort).keys())
    pos_df_final = pd.DataFrame(
        pos_list,
        index=range(1, 16),
        columns=['Words with Positive Impact on Ratings'])

    combo_df = pd.concat([pos_df_final, neg_df_final], axis=1)

    return combo_df
Example #22
def test_tied_triple():
    np.random.seed(0)
    beta = np.array([0, 1, 0, 0])
    mu = 1*np.ones(4)
    Sigma = np.array([[1, 0.999999, 0.999999, 0], [0.999999, 1, 0.999999, 0], [0.999999, 0.999999, 1, 0], [0, 0, 0, 1]])
    X = 2*np.ones((1, 4))
    explainer = shap.LinearExplainer((beta, 0), (mu, Sigma), feature_dependence="correlation")
    assert explainer.expected_value == 1
    assert np.abs(explainer.shap_values(X) - np.array([0.33333, 0.33333, 0.33333, 0])).max() < 0.05
Example #23
    def get_shap_feature_importance(self, base_model, X_test, X_train):

        try:
            import shap
        except ImportError:
            raise ImportError('You must have the shap library installed to compute SHAP feature importances')

        if self.flags['tree'] or self.flags['linear']:

            if self.flags['linear']:

                fp = self.shap_params.linear_feature_perturbation
                n = self.shap_params.linear_nsamples
                explainer = shap.LinearExplainer(base_model, X_train,
                                                 nsamples=n,
                                                 feature_perturbation=fp)

                shap_values = explainer.shap_values(X_test)

            elif self.flags['tree']:

                tmo = self.shap_params.tree_model_output
                tfp = self.shap_params.tree_feature_perturbation
                explainer =\
                    shap.TreeExplainer(
                        base_model, X_train,
                        model_output=tmo,
                        feature_perturbation=tfp)

                ttl = self.shap_params.tree_tree_limit
                shap_values =\
                    explainer.shap_values(X_test,
                                          tree_limit=ttl)

        # Kernel
        else:

            nkmean = self.shap_params.kernel_nkmean

            if nkmean is not None:
                X_train_summary = shap.kmeans(X_train, nkmean)
            else:
                X_train_summary = X_train

            explainer =\
                self.get_kernel_explainer(base_model, X_train_summary,
                                          self.shap_params.kernel_link)

            klr = self.shap_params.kernel_l1_reg
            kns = self.shap_params.kernel_nsamples

            shap_values =\
                explainer.shap_values(np.array(X_test),
                                      l1_reg=klr,
                                      nsamples=kns)

        return self.proc_shap_vals(shap_values)
Example #24
 def _explain(self, X_specimens):
     X_bg = _get_X_bg(self.X_bg, X_specimens)
     impacts = np.asarray(
         shap.LinearExplainer(
             self.model, X_bg,
             **self.constr_kwargs).shap_values(X_specimens))
     return [
         TabularExplanation(X_specimens[idx], impacts[..., idx, :])
         for idx in range(X_specimens.shape[0])
     ]
Example #25
def test_tied_pair_new():
    import numpy as np
    import shap
    np.random.seed(0)
    beta = np.array([1, 0, 0])
    mu = np.zeros(3)
    Sigma = np.array([[1, 0.999999, 0], [0.999999, 1, 0], [0, 0, 1]])
    X = np.ones((1,3))
    explainer = shap.LinearExplainer((beta, 0), shap.maskers.Impute({"mean": mu, "cov": Sigma}))
    assert np.abs(explainer.shap_values(X) - np.array([0.5, 0.5, 0])).max() < 0.05
Example #26
    def linear(self, print_result=False):
        result = {}
        result['name'] = self.name
        result['shap_method'] = 'linear'

        explainer = shap.LinearExplainer(self.model,
                                         self.X_train,
                                         feature_dependence="correlation")
        shap_values = explainer.shap_values(self.X_test)

        return self._run(shap_values, result, print_result)
Example #27
 def _find_right_explainer(self, x_train):
     if isinstance(self._model_to_explain, LogisticRegression):
         return shap.LinearExplainer(self._model_to_explain,
                                     data=x_train.values)
     if isinstance(self._model_to_explain,
                   (BaseDecisionTree, ForestClassifier, XGBClassifier)):
         return shap.TreeExplainer(self._model_to_explain)
     if isinstance(self._model_to_explain, KerasClassifier):
         return shap.DeepExplainer(self._model_to_explain.model,
                                   data=x_train.values)
     return shap.KernelExplainer(self._model_to_explain,
                                 data=x_train.values)
 def show_summary(self, model_type=None):
     # load JS visualization code to notebook
     shap.initjs()
     self.explainer = None
     if model_type == 'linear':
         self.explainer = shap.LinearExplainer(self.model, self.X_train)
     elif model_type == 'tree':
         self.explainer = shap.TreeExplainer(self.model, self.X_train)
     else:
         # KernelExplainer expects a prediction function, not the estimator itself
         self.explainer = shap.KernelExplainer(self.model.predict, self.X_train)
     shap_values = self.explainer.shap_values(self.X_train)
     display(shap.summary_plot(shap_values,self.X_train))
     display(shap.summary_plot(shap_values,self.X_train,plot_type='bar'))
Example #29
 def __init__(self, X_train, y_train, n_runs, results_path):
     self.explainer = None
     y_train = np.squeeze(np.array(y_train))
     os.mkdir(results_path + '/logistic_models')
     print("Training Logistic Regression Model....")
     for n in range(n_runs):
         lr_model = LogisticRegression()
         lr_model.fit(X_train, y_train)
         filename = results_path + '/logistic_models/logistic_model_' + str(
             n) + '.sav'
         pickle.dump(lr_model, open(filename, 'wb'))
     print("Training Completed.")
     self.explainer = shap.LinearExplainer(lr_model, X_train)
Example #30
    def analyze_sentence(self, task, text, ngram_size=3, k=50):
        if task not in BINARY_CLASSIFICATION_TASKS:
            raise ValueError(
                "Can only analyze with binary classification tasks")
        # Unpack from task_data
        clf = self.task_data[task]['clf']
        X_train = self.task_data[task]["X_train"]
        if not clf:
            logger.yellow(
                f"No classifier cached for {task}, training one now...")
            self.model_inference(task)
            clf = self.task_data[task]['clf']
            X_train = self.task_data[task]["X_train"]
        if not self.task_data[task]['class_shaps'] or len(
                list(self.task_data[task]['class_shaps'].values())[0]) != k:
            logger.yellow(
                f"Class shaps not cached for {task}, calculating now...")
            self.shap_by_class(task, k=k)
        vector_dict = self.get_vector_dict(original=True, inf_default=True)
        text = [t.lower() for t in nltk.word_tokenize(text)]
        v = np.reshape(self.get_sentence_vector(text, vector_dict), (1, -1))

        class_pred = clf.predict(v)
        logger.status_update(f"Predicted class: {class_pred[0]}")

        explainer = shap.LinearExplainer(clf,
                                         X_train,
                                         feature_dependence="independent")
        shap_values = explainer(v)
        pos_class_dims = np.argsort(-shap_values.values, axis=1)[0][:k]
        neg_class_dims = np.argsort(shap_values.values, axis=1)[0][:k]

        if task in TASK_EXPLANATIONS:
            print()
            logger.yellow(TASK_EXPLANATIONS[task])
            print()

        ngrams = self.get_ngrams(text, n=ngram_size, unigrams=True)
        out = {0: [], 1: []}
        for gram in ngrams:
            out[0].append((" ".join(gram),
                           self.subspace_score(gram, neg_class_dims,
                                               vector_dict)))
            out[1].append((" ".join(gram),
                           self.subspace_score(gram, pos_class_dims,
                                               vector_dict)))
        out[0] = sorted(out[0], key=lambda x: x[1], reverse=True)[:10]
        out[1] = sorted(out[1], key=lambda x: x[1], reverse=True)[:10]
        out['pred'] = class_pred[0]

        return out