Example #1
    def explain(self, images: np.ndarray, texts: np.ndarray, mode: str):
        """ Main API to calculate shap values.

        Args:
            images: np.ndarray of shape (N, D1, D2, C);
                N = number of samples
                D1, D2 = spatial dimensions, C = number of channels
            texts: np.ndarray of shape (N,) holding the input strings
            mode: the explanation mode, e.g. "text_only" or "image_only";
                must be one of self._supported_modes

        Returns:
            The calculated shap values.

        """

        # input validations
        if mode not in self._supported_modes:
            raise ValueError(f"Mode '{mode}' is not supported!")

        if images.shape[0] != texts.shape[0]:
            raise ValueError(
                "Shape mismatch: the first dimensions of images and texts must be equal!")

        if mode == "text_only":
            self._fixed_images = images
            if not isinstance(images[0], Image.Image):
                self._fixed_images = utils.arr_to_img(images)
            # tokenizer and masker
            tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                                      use_fast=True)
            text_masker = shap.maskers.Text(tokenizer)
            # NOTE: when using the text heatmap, pass the output_names arg so shap
            # knows the output is text.
            explainer = shap.Explainer(self._f_text,
                                       text_masker,
                                       algorithm=self.algorithm)
            shap_values = explainer(texts,
                                    max_evals=self.max_evals,
                                    batch_size=self.batch_size)

        elif mode == "image_only":
            self._fixed_texts = texts
            image_masker = shap.maskers.Image("inpaint_telea", images[0].shape)
            image_explainer = shap.Explainer(self._f_image,
                                             image_masker,
                                             algorithm=self.algorithm)
            shap_values = image_explainer(images,
                                          max_evals=self.max_evals,
                                          batch_size=self.batch_size)

        return shap_values
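
A minimal usage sketch for the explain() API above. The instance name multimodal_explainer is hypothetical and stands in for an object of the surrounding class; the final call is commented out because the class itself is not shown here.

import numpy as np

# Hypothetical inputs: two RGB images with paired captions.
images = np.random.randint(0, 255, size=(2, 224, 224, 3), dtype=np.uint8)
texts = np.array(["a photo of a dog", "a photo of a cat"])

# shap_values = multimodal_explainer.explain(images, texts, mode="text_only")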
Example #2

def display_hover_data(n_clicks, value):

    # explain the model on two sample inputs
    explainer = shap.Explainer(model)
    shap_values = explainer([value])

    # visualize the first prediction's explanation for the POSITIVE output class
    return text_plot(shap_values[0, :, "POSITIVE"])._repr_html_()
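
text_plot is not defined in this snippet; one plausible reading is a thin wrapper around shap's text plot. A hedged sketch, assuming shap.plots.text(..., display=False) returns the plot's HTML markup as a string:

import shap

def text_plot(shap_values):
    # Wrap the raw HTML string so callers can invoke _repr_html_() on the result.
    class _HtmlWrapper:
        def __init__(self, html):
            self._html = html

        def _repr_html_(self):
            return self._html

    return _HtmlWrapper(shap.plots.text(shap_values, display=False))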


# app.clientside_callback(
#     """
#     function(figure, scale) {
#         if(figure === undefined) {
#             return {'data': [], 'layout': {}};
#         }
#         const fig = Object.assign({}, figure, {
#             'layout': {
#                 ...figure.layout,
#                 'yaxis': {
#                     ...figure.layout.yaxis, type: scale
#                 }
#              }
#         });
#         return fig;
#     }
#     """,
#     Output('clientside-graph-px', 'figure'),
#     Input('clientside-figure-store-px', 'data'),
#     Input('clientside-graph-scale-px', 'value')
# )
Example #3
def test_pyfunc_serve_and_score():
    X, y = shap.datasets.boston()
    reg = sklearn.ensemble.RandomForestRegressor(n_estimators=10).fit(X, y)
    model = shap.Explainer(
        reg.predict,
        masker=X,
        algorithm="permutation",
        # `link` defaults to `shap.links.identity` which is decorated by `numba.jit` and causes
        # the following error when loading the explainer for serving:
        # ```
        # Exception: The passed link function needs to be callable and have a callable
        # .inverse property!
        # ```
        # As a workaround, use an identity function that's NOT decorated by `numba.jit`.
        link=create_identity_function(),
    )
    artifact_path = "model"
    with mlflow.start_run():
        mlflow.shap.log_explainer(model, artifact_path)
        model_uri = mlflow.get_artifact_uri(artifact_path)

    resp = pyfunc_serve_and_score_model(
        model_uri,
        data=pd.DataFrame(X[:3]),
        content_type=pyfunc_scoring_server.CONTENT_TYPE_JSON_SPLIT_ORIENTED,
    )
    scores = pd.read_json(resp.content, orient="records").values
    np.testing.assert_allclose(scores, model(X[:3]).values, rtol=100, atol=100)
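
create_identity_function is not shown in this listing. Going by the comment in the test above, a minimal sketch is a plain identity function carrying a callable .inverse attribute, which is the interface the quoted error message asks for:

def create_identity_function():
    # A plain identity link, deliberately NOT decorated by numba.jit.
    def identity(x):
        return x

    # shap requires the link to be callable and to expose a callable
    # .inverse; the inverse of the identity is the identity itself.
    identity.inverse = identity
    return identity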
Example #4
def test_sklearn_log_explainer():
    """
    Tests mlflow.shap log_explainer with mlflow serialization of the underlying model
    """

    with mlflow.start_run() as run:

        run_id = run.info.run_id

        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
        shap_values_original = explainer_original(X[:5])

        mlflow.shap.log_explainer(explainer_original, "test_explainer")

        explainer_uri = "runs:/" + run_id + "/test_explainer"

        explainer_loaded = mlflow.shap.load_explainer(explainer_uri)
        shap_values_new = explainer_loaded(X[:5])

        explainer_path = _download_artifact_from_uri(artifact_uri=explainer_uri)
        flavor_conf = _get_flavor_configuration(
            model_path=explainer_path, flavor_name=mlflow.shap.FLAVOR_NAME
        )
        underlying_model_flavor = flavor_conf["underlying_model_flavor"]

        assert underlying_model_flavor == mlflow.sklearn.FLAVOR_NAME
        np.testing.assert_array_equal(shap_values_original.base_values, shap_values_new.base_values)
        np.testing.assert_allclose(
            shap_values_original.values, shap_values_new.values, rtol=100, atol=100
        )
Example #5
def test_sklearn_log_explainer_pyfunc():
    """
    Tests mlflow.shap log_explainer with mlflow
    serialization of the underlying model using pyfunc flavor
    """

    with mlflow.start_run() as run:

        run_id = run.info.run_id

        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict,
                                            X,
                                            algorithm="permutation")
        shap_values_original = explainer_original(X[:2])

        mlflow.shap.log_explainer(explainer_original, "test_explainer")

        explainer_pyfunc = mlflow.pyfunc.load_model("runs:/" + run_id +
                                                    "/test_explainer")
        shap_values_new = explainer_pyfunc.predict(X[:2])

        np.testing.assert_allclose(shap_values_original.values,
                                   shap_values_new,
                                   rtol=100,
                                   atol=100)
Example #6
def test_serialization_exact():
    xgboost = pytest.importorskip('xgboost')
    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='exact')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()
    # Serialization
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization
    explainer_new = shap.Explainer.load(temp_serialization_file)

    temp_serialization_file.close()

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert explainer_original.masker.feature_names[i] == explainer_new.masker.feature_names[i]

    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
Example #7

def get_shap_explainations(model, data):
    '''
    Plot SHAP's output explanations.
    "SHAP (SHapley Additive exPlanations) is a game theoretic approach to explain the output
    of any machine learning model. It connects optimal credit allocation with local explanations
    using the classic Shapley values from game theory and their related extensions."
    https://github.com/slundberg/shap
    :param model: the model to explain
    :param data: the data to explain
    :return: None
    '''
    # explain the model's predictions using SHAP
    explainer = shap.Explainer(model)
    shap_values = explainer(data)

    # visualize the first prediction's explanation
    shap.plots.waterfall(shap_values[0])

    # visualize the first prediction's explanation with a force plot
    shap.plots.force(shap_values[0])

    # visualize all the training set predictions
    shap.plots.force(shap_values)

    # create a dependence scatter plot to show the effect of a single feature across the whole dataset
    # shap.plots.scatter(shap_values[:, "RM"], color=shap_values)

    # summarize the effects of all the features
    shap.plots.beeswarm(shap_values)

    shap.plots.bar(shap_values)

    shap.summary_plot(shap_values, data)
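
A note on the force plots above: in a notebook, shap's interactive force plot needs its JavaScript bundle loaded once before plotting:

import shap

shap.initjs()  # load the JS visualization code used by shap.plots.force(...)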
Example #8
def test_serialization_exact_numpy_custom_model_save():
    xgboost = pytest.importorskip('xgboost')
    pickle = pytest.importorskip('pickle')

    # get a dataset on income prediction
    X, y = shap.datasets.adult()
    X = X.values

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='exact')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()

    # Serialization
    explainer_original.model.save = lambda out_file, model: pickle.dump(model, out_file)
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization
    model_loader = lambda in_file: pickle.load(in_file)
    explainer_new = shap.Explainer.load(temp_serialization_file, model_loader = model_loader)

    temp_serialization_file.close()


    shap_values_new = explainer_new(X[:1])

    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
Example #9
def get_reg_shap_explainer_global_and_local(model: object, X_train):
    """return the shap explainer object and shap values for
       global and local plot

    Args:
        model (object): a trained pycaret model
        X_train (pd.DataFrame): the X training data
    """
    sample_values = None

    if model.__class__.__name__ == "CatBoostRegressor":
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_train)
    elif model.__class__.__name__ in (
        "RANSACRegressor", "KernelRidge", "SVR",
        "MLPRegressor", "KNeighborsRegressor", "AdaBoostRegressor",
    ):
        explainer, shap_values, sample_values = get_shap_kernel(model, X_train)
    else:
        explainer = shap.Explainer(model, X_train)
        shap_values = explainer(X_train)

    return explainer, shap_values, sample_values
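
get_shap_kernel is not defined in this listing; only its call signature can be inferred. A hedged sketch of what it might do, using shap.KernelExplainer on a subsample (the body is an assumption, not the original helper):

import shap

def get_shap_kernel(model, X_train, n_sample=100):
    # KernelExplainer is model-agnostic but slow, so explain a subsample only.
    sample_values = shap.sample(X_train, n_sample)
    explainer = shap.KernelExplainer(model.predict, sample_values)
    shap_values = explainer.shap_values(sample_values)
    return explainer, shap_values, sample_values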
Example #10
def test_sklearn_log_explainer_self_serialization():
    """
    Tests mlflow.shap log_explainer with SHAP internal serialization of the underlying model
    """

    with mlflow.start_run() as run:

        run_id = run.info.run_id

        X, y = shap.datasets.boston()
        model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
        model.fit(X, y)

        explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
        shap_values_original = explainer_original(X[:5])

        mlflow.shap.log_explainer(
            explainer_original, "test_explainer", serialize_model_using_mlflow=False
        )

        explainer_loaded = mlflow.shap.load_explainer("runs:/" + run_id + "/test_explainer")
        shap_values_new = explainer_loaded(X[:5])

        np.testing.assert_array_equal(shap_values_original.base_values, shap_values_new.base_values)
        np.testing.assert_allclose(
            shap_values_original.values, shap_values_new.values, rtol=100, atol=100
        )
Example #11
    def _get_shap_values(
        self,
        estimator,
        X,
        shap_kwargs,
    ):
        """
        FOR INTERNAL PURPOSES ONLY.

        """
        masker = shap_kwargs.get('masker')
        algorithm = shap_kwargs.get('algorithm', 'auto')

        if self.estimator_output == "probability":
            model = estimator.predict_proba
        else:
            model = estimator.predict

        explainer = shap.Explainer(
            model=model,
            masker=masker,
            algorithm=algorithm,
        )

        shap_results = explainer(X)

        if self.estimator_output == "probability":
            shap_results = shap_results[..., 1]

        contributions = shap_results.values
        bias = shap_results.base_values

        return contributions, bias
Example #12
def test_serialization_permutation():
    import shap
    import xgboost
    import pickle
    import numpy as np

    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba, X, algorithm='permutation')
    shap_values_original = explainer_original(X[:1])


    # Serialization 
    out_file = open(r'test_serialization_permutation_dataframe_scratch_file.bin', "wb")
    explainer_original.save(out_file)
    out_file.close()

    # Deserialization
    in_file = open(r'test_serialization_permutation_dataframe_scratch_file.bin', "rb")
    explainer_new = shap.Explainer.load(in_file)
    in_file.close()

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert explainer_original.masker.feature_names[i] == explainer_new.masker.feature_names[i]

    assert np.array_equal(shap_values_original.base_values, shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
Example #13
    def get_feature_importance(self, input_data) -> Dict[str, float]:
        """Computes feature importance for each feature based on an input data.

        Most of models are supported by SHAP (https://github.com/slundberg/shap). For
        unsupported models, please override this method by a workable solution.
        """
        explainer = shap.Explainer(self.model)
        shap_values = explainer(input_data)

        def _get_shap_values_one_sample(shap_values, index: int):
            # For LightGBM and XGBoost, shap_values[index].values is a 2d array
            # representing logit of two classes. They are basically negative of each
            # other, we only need one.
            # Related issue https://github.com/slundberg/shap/issues/526.
            if len(shap_values[index].values.shape) == 2:  # binary classification
                return shap_values[index].values[:, 0]
            assert len(shap_values[index].values.shape) == 1, \
                len(shap_values[index].values)
            return shap_values[index].values

        feature_importances = np.mean(
            [
                np.abs(_get_shap_values_one_sample(shap_values, i))
                for i in range(len(shap_values))
            ],
            axis=0,
        ).tolist()

        feature_names = input_data.columns.tolist()
        feature_importance_dict = dict(zip(feature_names, feature_importances))
        return feature_importance_dict
Example #14
    def __init__(self, X, y, model, n_samples=1000):
        store_attr()

        model.fit(X, y)

        self.samples = samples = X.iloc[:n_samples]
        self.explainer = shap.Explainer(model, samples)
        self.shap_values = self.explainer(samples)
Example #15
def shap_calc(model, X, approximate=False, return_explainer=False, verbose=0, sample_size=100, **shap_kwargs):
    """
    Helper function to calculate the shapley values for a given model.

    Args:
        model (binary model):
            Trained model.

        X (pd.DataFrame or np.ndarray):
            feature set.

        approximate (boolean):
            if True uses shap approximations - less accurate, but very fast.

        return_explainer (boolean):
            if True, returns a tuple (shap_values, explainer).

        verbose (int, optional):
            Controls verbosity of the output:

            - 0 - neither prints nor warnings are shown
            - 1 - 50 - only most important warnings
            - 51 - 100 - shows other warnings and prints
            - above 100 - presents all prints and all warnings (including SHAP warnings).

        **shap_kwargs: kwargs of the shap.Explainer

    Returns:
        (np.ndarray or tuple(np.ndarray, shap.Explainer)):
            shapley_values for the model, optionally also returns the explainer.

    """
    # Suppress warnings regarding XGboost and Lightgbm models.
    with warnings.catch_warnings():
        if verbose <= 100:
            warnings.simplefilter("ignore")

        # Create the background data, required for non-tree-based models.
        # A single datapoint can be passed as the mask
        # (https://github.com/slundberg/shap/issues/955#issuecomment-569837201)
        if X.shape[0] < sample_size:
            sample_size = int(np.ceil(X.shape[0] * 0.2))
        mask = shap.utils.sample(X, sample_size)

        explainer = shap.Explainer(model, masker=mask, **shap_kwargs)
        # Calculate Shap values.
        shap_values = explainer.shap_values(X)

        if isinstance(shap_values, list) and len(shap_values) == 2:
            warnings.warn('Shap values are related to the output probabilities of class 1 for this model, instead of '
                          'log odds.')
            shap_values = shap_values[1]

    if return_explainer:
        return shap_values, explainer
    return shap_values
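
A small usage example for shap_calc, assuming the function above is in scope; the model and data are synthetic placeholders:

import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=500, n_features=8, random_state=0)
X = pd.DataFrame(X, columns=[f"f{i}" for i in range(8)])
clf = RandomForestClassifier(n_estimators=50, random_state=0).fit(X, y)

# return_explainer=True makes shap_calc return a (shap_values, explainer) tuple.
shap_values, explainer = shap_calc(clf, X, return_explainer=True)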
Example #16
def explainer():
    df_new = dataLoader()

    # label each timestep 1 if y increased relative to the previous step, else 0
    gradY_df = [0]
    for i in range(250):
        if df_new['y'][i + 1] > df_new['y'][i]:
            gradY_df.append(1)
        else:
            gradY_df.append(0)
    gradY_df.append(0)
    df_new.insert(6, 'gradY_df', gradY_df)
    shap_df = df_new[['x1', 'x2', 'x3', 'x4', 'x5', 'gradY_df']]

    n_train_time = int(len(shap_df) * 0.90)
    train = shap_df[:n_train_time]
    test = shap_df[n_train_time:]
    train_x, train_y = train[['x1', 'x2', 'x3', 'x4',
                              'x5']], train[['gradY_df']]
    test_x, test_y = test[['x1', 'x2', 'x3', 'x4', 'x5']], test[['gradY_df']]

    yrr = shap_df[['gradY_df']].values.reshape(252, )

    # train an XGBoost model
    X = shap_df[['x1', 'x2', 'x3', 'x4', 'x5']]
    y = yrr
    model = xgboost.XGBRegressor().fit(X, y)

    # explain the model's predictions using SHAP
    # (same syntax works for LightGBM, CatBoost, scikit-learn, transformers, Spark, etc.)
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    # visualize the first prediction's explanation
    shap.plots.waterfall(shap_values[0], show=False)
    plt.savefig("static/img/img_number1.png")
    plt.close()

    shap.summary_plot(shap_values, X, show=False)
    plt.savefig("static/img/img_number2.png")
    plt.close()

    shap.summary_plot(shap_values, X, plot_type="bar", show=False)
    plt.savefig('static/img/img_number3.png')
    plt.close()

    shap.plots.bar(shap_values, show=False)
    plt.savefig('static/img/img_number4.png')
    plt.close()
Example #17

    def __init__(self, clf, X, X_df=None):

        self.clf = clf
        self.X = X
        self.explainer = shap.Explainer(clf.steps[-1][1])
        if X_df is None:
            self.X_df = DataReconstructor(self.X, self.clf).make()
        else:
            self.X_df = X_df
        self.shap_values = self.explainer(self.X_df)
Example #18
    def __init__(self, model):
        '''
        Only with BERT
        '''
        label2id = model.model.config.label2id
        labels = sorted(label2id, key=label2id.get)

        self.tokenizer = model.tokenizer
        self.predict_proba = lambda s: model.predict_proba_batch(s)
        self.exp = shap.Explainer(
            self.predict_proba, self.tokenizer, output_names=labels)
        self.model = model
Example #19
def shap_plots(model, train_features, test_features, test_labels):
    print("Computing shapley values..")
    # compute SHAP values
    if isinstance(
            model,
        (MLP, MLPRegressor, MLPClassifier, ElasticNet, LogisticRegression)):
        train_sample = shap.sample(train_features, 10)
        explainer = shap.Explainer(model.predict, train_sample)
    elif isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
        explainer = shap.TreeExplainer(model, train_features)
    else:
        explainer = shap.Explainer(model, train_features)

    shap_values = explainer(test_features)
    shap.plots.bar(shap_values, max_display=10)
    # shap.plots.bar(shap_values[0]) # Local

    # beeswarm plot
    shap.plots.beeswarm(shap_values)

    # Decision plot
    expected_value = explainer.expected_value
    select = range(20)
    features_sample = test_features.iloc[select]
    shap.decision_plot(expected_value, explainer.shap_values(features_sample),
                       features_sample)

    # Heatmap
    shap.plots.heatmap(shap_values, max_display=10)

    # Scatter
    shap.plots.scatter(shap_values[:, "hs_child_age_None"],
                       color=shap_values,
                       alpha=0.8)

    # Feature clustering (redundant feature detection)
    clustering = shap.utils.hclust(
        test_features, test_labels
    )  # by default this trains (X.shape[1] choose 2) 2-feature XGBoost models
    shap.plots.bar(shap_values, clustering=clustering, clustering_cutoff=0.5)
Example #20
def pickle(target, title, max_depth=3, n_esti=160, lr=0.1, withexperience=False, color='YlGnBu'):
    matrics = []
    seed(2145)
    df_small = df_model_draft[df_model_draft['surgyear'].isin([2015])]
    print(df_small.shape)
    groups = df_small['HospID']
    print(groups)
    if withexperience is False:
        X = df_small.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM'], axis=1)
        y = df_small[target]
    else:
        X = df_small.drop(
            ['SiteID', 'surgid', 'Complics', 'STSRCHOSPD', 'STSRCOM',
             'HospID_Reop_CABG', 'HospID_total_CABG', 'surgyear', 'HospID_total_cardiac_surgery',
             'surgid_total_cardiac_surgery', 'surgid_total_CABG', 'surgid_Reop_CABG'], axis=1)
        y = df_small[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    sm = SMOTE()  # SVMSMOTE(random_state=21)
    # fit and apply the transform
    X_over, y_over = sm.fit_resample(X_train, y_train)

    # summarize class distribution
    print("after oversampling")
    counter = Counter(y_over)
    print(counter)
    estimate = counter[0] / counter[1]
    print('Estimate: %.3f' % estimate)

    model = XGBClassifier(objective='binary:logistic', eval_metric='logloss', max_depth=max_depth, learning_rate=lr, n_estimators=n_esti)
    model.fit(X_over, y_over)
    y_pred = model.predict(X_test)
    explainer = shap.Explainer(model)
    shap_values = explainer(X)

    # visualize the first prediction's explanation
    shap.plots.beeswarm(shap_values, max_display=50, show=False)
    shaptitle = path + 'SHAP ' + title + '.png'
    print(shaptitle)
    plt.savefig(shaptitle, bbox_inches='tight')
    # plt.show()
    auc = roc_auc_score(y_test, model.predict_proba(X_test.values)[:, 1])
    cm = confusion_matrix(y_test, y_pred)
    mats = Make_Confusion_Matrix(cm, roc=auc, categories=categories, cmap=color,
                                 title=title, group_names=labels,
                                 y_pred=y_pred, y_test=y_test)

    mats['AUROC'] = auc
    matrics.append(mats)

    return matrics,title
Example #21
def test_wrapping_for_topk_lm_model():
    """ This tests using the Explainer class to auto wrap a masker in a language modelling scenario.
    """

    transformers = pytest.importorskip("transformers")

    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    wrapped_model = shap.models.TopKLM(model, tokenizer)
    masker = shap.maskers.Text(tokenizer, mask_token="...")

    explainer = shap.Explainer(wrapped_model, masker, seed=1)

    assert shap.utils.safe_isinstance(explainer.masker, "shap.maskers.FixedComposite")
Example #22
    def _fixed_mode_shap_vals(self, X, to_fix, func, masker):
        """ Helper stub to compute shap values in single-modal mode where one modality's input is held fixed """
        assert len(X) == len(to_fix)
        out = []
        explainer = shap.Explainer(func, masker, algorithm=self.algorithm)
        last_shape = X[0].shape
        # loop through samples
        for i in range(len(X)):
            self._curr_fixed = to_fix[i]
            if self._mode == "fix_text" and X[i].shape != last_shape:
                # reinitialise the explainer with a new masker if the image shape differs
                masker = shap.maskers.Image("inpaint_telea", X[i].shape)
                explainer = shap.Explainer(func, masker, algorithm=self.algorithm)
                last_shape = X[i].shape
            if self._mode == "fix_image" and self.algorithm == "permutation":
                # Workaround: if the given max_evals is too low, raise it to an acceptable value
                self.max_evals = self._min_acceptable_evals(
                    len(self.tokenizer.tokenize(X[i]))
                )
            values = explainer(
                X[i : i + 1], max_evals=self.max_evals, batch_size=self.batch_size
            )
            out.append(values[0])  # select the first (and only) element
        return out
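
_min_acceptable_evals is referenced above but not shown. shap's Permutation explainer rejects max_evals below 2 * num_features + 1 (one forward and one backward permutation pass plus the baseline), so a plausible sketch is:

    def _min_acceptable_evals(self, num_tokens):
        # Permutation SHAP needs at least 2 * num_features + 1 evaluations.
        return 2 * num_tokens + 1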
Example #23
def test_load_pyfunc(tmpdir):

    X, y = shap.datasets.boston()
    model = sklearn.ensemble.RandomForestRegressor(n_estimators=100)
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict, X, algorithm="permutation")
    shap_values_original = explainer_original(X[:2])
    path = tmpdir.join("pyfunc_test").strpath
    mlflow.shap.save_explainer(explainer_original, path)

    explainer_pyfunc = mlflow.shap._load_pyfunc(path)
    shap_values_new = explainer_pyfunc.predict(X[:2])

    np.testing.assert_allclose(shap_values_original.values, shap_values_new, rtol=100, atol=100)
Example #24
def main():
    df = 'features_gfa_cts.csv'
    save = './'
    seed = 22478
    model = 'xgboost_model.pickle'

    os.makedirs(save, exist_ok=True)

    with open(model, 'rb') as f:
        model = pickle.load(f)

    # Setting seed for reproducibility
    # (note: np.random.RandomState(seed) builds a generator that is immediately
    # discarded; the np.random.seed(seed) call is what takes effect here)
    np.random.seed(seed)
    np.random.RandomState(seed)
    model.random_state = seed
    model.random_state = seed

    df = pd.read_csv(df)
    df = df.sample(frac=1, random_state=seed).reset_index(drop=True)

    y = df['gfa'].values
    y = [1 if i == 'good' else 0 for i in y]

    feats = df.drop(['comp', 'gfa'], axis=1)
    X = feats.values

    scaler = StandardScaler()
    scaler.fit(X)
    X = scaler.transform(X)

    feat_names = np.array(feats.columns)

    names = [label[i] for i in list(feats.columns)]

    model.fit(X, y)
    pickle.dump(model, open(save + '/xgboost_model.pickle', 'wb'))
    explainer = shap.Explainer(model, X, feature_names=names)

    ranks = explainer(X)

    with open(save + '/shap.pickle', 'wb') as f:
        pickle.dump(ranks, f)

    shap.summary_plot(ranks,
                      X,
                      class_names=['Poor GFA', 'Good GFA'],
                      show=False)
    pl.savefig(save + '/shap')
    pl.close('all')
Example #25
def test_serialization_permutation_no_model_or_masker():
    import shap
    import xgboost
    import pickle
    import numpy as np
    import tempfile

    # get a dataset on income prediction
    X, y = shap.datasets.adult()

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    explainer_original = shap.Explainer(model.predict_proba,
                                        X,
                                        algorithm='permutation')
    shap_values_original = explainer_original(X[:1])

    temp_serialization_file = tempfile.TemporaryFile()

    # Serialization
    explainer_original.model.save = None
    explainer_original.masker.save = None
    explainer_original.save(temp_serialization_file)

    temp_serialization_file.seek(0)

    # Deserialization
    explainer_new = shap.Explainer.load(temp_serialization_file)

    temp_serialization_file.close()

    # manually insert model and masker
    explainer_new.model = explainer_original.model
    explainer_new.masker = explainer_original.masker

    shap_values_new = explainer_new(X[:1])

    for i in range(len(explainer_original.masker.feature_names)):
        assert (explainer_original.masker.feature_names[i]
                == explainer_new.masker.feature_names[i])

    assert np.array_equal(shap_values_original.base_values,
                          shap_values_new.base_values)
    assert type(explainer_original) == type(explainer_new)
    assert type(explainer_original.masker) == type(explainer_new.masker)
Example #26

    def grid_search_fit_svc(self, c=None, scale=False):
        if scale:
            s = StandardScaler()
            s.fit(self.x_train)
            self.x_train = s.transform(self.x_train)
            self.x_pool = s.transform(self.x_pool)
            self.x_test = s.transform(self.x_test)
        if c is None:
            c = [0.8, 1, 2]
        max_iter = 1000
        best_f1 = 0
        model = None
        for c_option in c:
            m = SVC(max_iter=max_iter,
                    C=c_option,
                    kernel='linear',
                    class_weight='balanced',
                    probability=True)
            m.fit(self.x_train, self.y_train)
            predictions = m.predict(self.x_test)
            f1 = f1_score(predictions, self.y_test)
            if f1 > best_f1:
                self.model = m
                best_f1 = f1
        pred = self.model.predict(self.x_test)
        print("F1 score on test set ", f1_score(self.y_test, pred))
        print("Confusion matrix on test set ",
              confusion_matrix(self.y_test, pred))
        print("Accuracy test set", accuracy_score(self.y_test, pred))
        pred = self.model.predict(self.x_pool)
        print("F1 score on pool ", f1_score(self.y_pool, pred))
        print("Confusion matrix of final model on pool ",
              confusion_matrix(self.y_pool, pred))
        print("Accuracy of final model on pool",
              accuracy_score(self.y_pool, pred))
        explainer = shap.Explainer(self.model,
                                   self.x_train,
                                   feature_perturbation="independent")
        # TODO extract feature importance value of each feature
        self.shap_values_train = explainer.shap_values(self.x_train)
        self.shap_values_pool = explainer.shap_values(self.x_pool)
        # len(feature_names) = number of columns in shap_values_pool
        feature_names = np.array(self.tfid.get_feature_names())
        shap.summary_plot(self.shap_values_train,
                          self.x_train,
                          feature_names=feature_names)
        return self.model, explainer
Example #27
def test_wrapping_for_text_to_text_teacher_forcing_model():
    """ This tests using the Explainer class to auto wrap a masker in a text to text scenario.
    """

    transformers = pytest.importorskip("transformers")

    def f(x): # pylint: disable=unused-argument
        pass

    tokenizer = transformers.AutoTokenizer.from_pretrained("gpt2")
    model = transformers.AutoModelForCausalLM.from_pretrained("gpt2")
    wrapped_model = shap.models.TeacherForcing(f, similarity_model=model, similarity_tokenizer=tokenizer)
    masker = shap.maskers.Text(tokenizer, mask_token="...")

    explainer = shap.Explainer(wrapped_model, masker, seed=1)

    assert shap.utils.safe_isinstance(explainer.masker, "shap.maskers.OutputComposite")
Example #28
def test_single_class_independent_auto_api():
    xgboost = pytest.importorskip('xgboost')

    # get a dataset on income prediction
    X, y = shap.datasets.adult()
    X = X.iloc[:100]
    y = y[:100]

    # train an XGBoost model (but any other model type would also work)
    model = xgboost.XGBClassifier()
    model.fit(X, y)

    # build an Exact explainer and explain the model predictions on the given dataset
    explainer = shap.Explainer(model.predict, X, algorithm="permutation")
    shap_values = explainer(X)

    assert np.max(np.abs(shap_values.base_values + shap_values.values.sum(1) - model.predict(X[:100]))) < 1e6
Example #29
def Explain_model(Model, X_train):
    'Get shap values for linear models'
    explainer = shap.Explainer(Model, X_train, feature_names=X_train.columns)
    shap_values = explainer(X_train)
    # Plot
    fig = shap.plots.waterfall(shap_values[1], show=False)
    plt.savefig('scratch.png')

    # Importance values
    vals = np.abs(shap_values[1].values).mean(0)
    feature_importance = pd.DataFrame(
        list(zip(X_train.columns, vals)),
        columns=['col_name', 'feature_importance_vals'])
    feature_importance.sort_values(by=['feature_importance_vals'],
                                   ascending=False,
                                   inplace=True)
    print(feature_importance)
Example #30
def test_raw_function():
    """ Make sure passing a simple masking function works.
    """

    X, _ = shap.datasets.boston()

    def test(X):
        return np.sum(X, 1)

    def custom_masker(mask, x):
        # just zero out the features we are masking
        return (x * mask).reshape(1, len(x))

    explainer = shap.Explainer(test, custom_masker)
    shap_values = explainer(X[:100])

    assert np.var(shap_values.values - shap_values.data) < 1e-6