def shapley_kernel_wrapper(model, trainx, testx, config):
    """ This function is called by the prediction algorithms (ml_functions) 
    to compute the Shapley values. Note that the decision tree based models
    such as random forest do provide faster and exact (non-approximated)
     Shapley values with the TreeShapExplainer"""
    if config.exp_shap_background >= trainx.shape[0]:
        background = trainx
    else:
        background = shap.kmeans(trainx, config.exp_shap_background)

        # random sample of background values
        # ixx = np.random.choice(trainx.shape[0], config.exp_shap_background, replace=False)
        # background = trainx[ixx, :]
        # print(background.shape)

    explainer = shap.KernelExplainer(model.predict_proba, background)
    if isinstance(
            model, LogisticRegression
    ):  # one background instance is enough if we use a linear model
        background = shap.kmeans(trainx, 1)
        backup_base = explainer.fnull
        explainer = shap.KernelExplainer(model.predict_proba, background)
        explainer.fnull = backup_base

    fnull_save = explainer.fnull[1]
    out = [
        explainer.shap_values(testx[i, :], l1_reg=0.0)[1]
        for i in np.arange(len(testx))
    ]
    return np.vstack(out)
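
A minimal, self-contained sketch of the docstring's point above: shap.TreeExplainer returns exact Shapley values for tree ensembles, while KernelExplainer approximates them from a summarized background. The toy data and model are illustrative only, not taken from the original project.

import numpy as np
import shap
from sklearn.ensemble import RandomForestClassifier

# toy data, for illustration only
X = np.random.rand(200, 5)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
model = RandomForestClassifier(n_estimators=50).fit(X, y)

# exact and fast for tree ensembles
tree_explainer = shap.TreeExplainer(model)
tree_shap = tree_explainer.shap_values(X[:10])

# sampled approximation with a k-means summarized background
background = shap.kmeans(X, 10)
kernel_explainer = shap.KernelExplainer(model.predict_proba, background)
kernel_shap = kernel_explainer.shap_values(X[:10])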
Example #2
def get_shap_kernel(estimator: object, X_train):
    """compute the shap value importance for non-tree based model

    Args:
        estimator (a none tree based sklearn estimator): a sklearn non tree based estimator
        x_train ((pd.DataFrame, np.ndarray),): X training data
        x_test ((pd.DataFrame, np.ndarray),): X testing data

    Returns:
        shap plot
    """
    warnings.filterwarnings("ignore")
    # The kernel explainer is extremely slow for non-tree-based models,
    # so we use kmeans to summarize the main information in X_train
    # and speed up the calculation.
    if X_train.shape[1] > 3:
        x_train_summary = shap.kmeans(X_train, 3)
    else:
        x_train_summary = shap.kmeans(X_train, X_train.shape[1])
    explainer = shap.KernelExplainer(estimator.predict, x_train_summary)

    size = len(X_train)
    if size >= 50:
        size = 50 if size * 0.2 > 50 else int(size * 0.2)
    sample_values = shap.sample(X_train, size)
    shap_values = explainer.shap_values(sample_values,
                                        l1_reg='num_features(10)')

    return explainer, shap_values, sample_values
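
A possible call pattern for get_shap_kernel, assuming the function is importable together with its shap/warnings dependencies; the estimator and data below are hypothetical stand-ins.

import numpy as np
import pandas as pd
import shap
from sklearn.svm import SVR

# hypothetical data and a non-tree estimator with a .predict method
rng = np.random.RandomState(0)
X_train = pd.DataFrame(rng.rand(200, 5), columns=list("abcde"))
y_train = X_train["a"] + 2 * X_train["b"]
svr = SVR().fit(X_train, y_train)

explainer, shap_values, sample_values = get_shap_kernel(svr, X_train)
shap.summary_plot(shap_values, sample_values, plot_type="bar")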
Example #3
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass):
    def predict(d):
        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        return np.array(
            [p['probabilities'][-1] for p in estimator.predict(input_fn)])

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type))
Example #4
    def init_shap(self):
        """
        Initialize shap. Calculate shap values. This operation is time consuming.
        :return: void (Sets the value of the shap_values variable)
        """
        import shap

        if not self.shap_values:

            log.info("Initializing Shap - calculating shap values."
                     " This operation is time-consuming so please be patient.")

            logger = log.getLogger('shap')
            logger.setLevel(log.WARN)

            shap_kernel_explainer = shap.KernelExplainer(
                self.model[1].predict_proba, shap.kmeans(self.X_test_ohe, 1))
            shap_values = shap_kernel_explainer.shap_values(self.X_test_ohe)
            # from util.commons import RANDOM_NUMBER
            # shap_values = shap_kernel_explainer.shap_values(self.X_test_ohe.sample(66, random_state=RANDOM_NUMBER))

            self.shap_kernel_explainer = shap_kernel_explainer
            self.shap_values = shap_values
        else:
            log.info("Shap is already initialized.")
Example #5
def _summarize_data(X, k=10, to_round_values=True):
    """Summarize a dataset.

    For dense dataset, use k mean samples weighted by the number of data points they
    each represent.
    For sparse dataset, use a sparse row for the background with calculated
    median for dense columns.

    :param X: Matrix of data samples to summarize (# samples x # features).
    :type X: numpy.array or pandas.DataFrame or scipy.sparse.csr_matrix
    :param k: Number of cluster centroids to use for approximation.
    :type k: int
    :param to_round_values: When using kmeans, round each element of every cluster centroid
        to the nearest value from X in the corresponding dimension. This ensures discrete
        features always get a valid value. Ignored for sparse data samples.
    :type to_round_values: bool
    :return: DenseData or SparseData object.
    :rtype: iml.datatypes.DenseData or iml.datatypes.SparseData
    """
    is_sparse = issparse(X)
    if not isinstance(X, DenseData):
        if is_sparse:
            module_logger.debug('Creating sparse data summary as csr matrix')
            # calculate median of sparse background data
            median_dense = csc_median_axis_0(X.tocsc())
            return csr_matrix(median_dense)
        elif len(X) > 10 * k:
            module_logger.debug('Create dense data summary with k-means')
            # use kmeans to summarize the examples for initialization
            # if there are more than 10 x k of them
            return shap.kmeans(X, k, to_round_values)
    return X
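
A rough usage sketch for _summarize_data, assuming it is defined with its imports (shap, scipy.sparse, DenseData, csc_median_axis_0): the returned summary is what gets passed to KernelExplainer as the background data. The model and data here are placeholders.

import numpy as np
import shap
from sklearn.linear_model import LogisticRegression

# placeholder dense data and model
X = np.random.rand(500, 8)
y = (X[:, 0] > 0.5).astype(int)
clf = LogisticRegression().fit(X, y)

background = _summarize_data(X, k=10)  # k-means summary, since 500 > 10 * k
explainer = shap.KernelExplainer(clf.predict_proba, background)
shap_values = explainer.shap_values(X[:5])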
Example #6
def get_shap_values():
    global data
    model = []
    model_params = []
    xx = []
    y = []
    y_train = []
    x_train = []
    for d in data:
        model.append(d['model'])
        model_params.append(d['model_params'])
        xx.append(d['x'])
        y.append(d['y'])
        y_train.append(d['y_train'])
        x_train.append(d['x_train'])
    all_shap = []
    shap_names = get_shap_names()
    for l, g, g_params, x, bd in zip(y, model, model_params, xx, x_train):
        r_model = reg_model(g, g_params)
        k_meaned = shap.kmeans(bd, 20)
        shap_values = shap.KernelExplainer(r_model, k_meaned).shap_values(x)
        all_shap.extend(shap_values)
    all_shap = np.array(all_shap)
    all_x = np.concatenate(xx)
    shap.summary_plot(all_shap,
                      features=all_x,
                      feature_names=shap_names,
                      max_display=8)
    print('nice')
Example #7
def test_log_explanation_with_small_features():
    """
    Verifies that `log_explanation` does not fail even when `features` has less records than
    `_MAXIMUM_BACKGROUND_DATA_SIZE`.
    """
    num_rows = 50
    assert num_rows < mlflow.shap._MAXIMUM_BACKGROUND_DATA_SIZE

    X, y = get_boston()
    X, y = X.iloc[:num_rows], y[:num_rows]
    model = RandomForestRegressor()
    model.fit(X, y)

    with mlflow.start_run() as run:
        explanation_uri = mlflow.shap.log_explanation(model.predict, X)

    artifact_path = "model_explanations_shap"
    artifacts = set(yield_artifacts(run.info.run_id))

    assert explanation_uri == os.path.join(run.info.artifact_uri, artifact_path)
    assert artifacts == {
        os.path.join(artifact_path, "base_values.npy"),
        os.path.join(artifact_path, "shap_values.npy"),
        os.path.join(artifact_path, "summary_bar_plot.png"),
    }

    explainer = shap.KernelExplainer(model.predict, shap.kmeans(X, num_rows))
    shap_values_expected = explainer.shap_values(X)

    base_values = np.load(os.path.join(explanation_uri, "base_values.npy"))
    shap_values = np.load(os.path.join(explanation_uri, "shap_values.npy"))
    np.testing.assert_array_equal(base_values, explainer.expected_value)
    np.testing.assert_array_equal(shap_values, shap_values_expected)
Example #8
def explain_model(model, nlp_args, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    # choose an explainer and background dataset based on the classifier type
    if model.named_steps['clf'].__class__.__name__ == "LogisticRegression":
        explainer = shap.LinearExplainer(
            model.named_steps['clf'],
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)))
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    elif model.named_steps[
            'clf'].__class__.__name__ == "RandomForestClassifier":
        explainer = shap.TreeExplainer(model.named_steps['clf'])
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())
    else:
        X_train_summary = shap.kmeans(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_train)).toarray(),
            10)
        explainer = shap.KernelExplainer(
            model.named_steps['clf'].predict_proba, X_train_summary)
        shap_values = explainer.shap_values(
            model.named_steps['vectorizer'].transform(
                model.named_steps['preprocess'].transform(X_test)).toarray())

    return explainer, shap_values
Example #9
def script(num_models, k, scoring, num_clust):
    X = in_data.X
    y = in_data.Y
    best_params = {'hidden_layer_sizes': (28, 28), 'solver': 'adam'}

    print(in_data.X.shape)
    print(best_params)

    results = {}
    scores = {}
    models = {}
    for m in range(num_models):
        model = MLPRegressor(**best_params)
        kfold = model_selection.KFold(n_splits=k)
        cv_results = model_selection.cross_validate(model,
                                                    X,
                                                    y,
                                                    cv=kfold,
                                                    scoring=scoring)
        print(cv_results)
        score = sum(cv_results['test_r2']) / len(cv_results['test_r2'])
        results[m] = cv_results
        scores[m] = score
        models[m] = model
        print(m, max(scores.values()), score)

    print(max(scores, key=scores.get), scores, results[max(scores,
                                                           key=scores.get)])
    best_model = models[max(scores, key=scores.get)]

    feat_names = [x.name for x in in_data.domain.attributes]
    #new_feat_names = [x.split('=')[-1] for x in feat_names]
    print(feat_names)

    shap.initjs()

    best_model.fit(X, y)
    Xsum = shap.kmeans(X, num_clust) if num_clust is not None else X
    explainer = shap.KernelExplainer(best_model.predict, Xsum)

    shap_values = explainer.shap_values(X)

    #feat_names = ['{0}: {1}'.format(x[0], x[1]) for x in list(zip(feat_names, np.around(np.mean(np.absolute(shap_values), axis=0), 3)))]
    #id = len(Window.get_all_windows())
    #summary_plot(shap_values, features=X, feature_names=feat_names, class_names=class_names, plot_type='bar', color='black', id=id)
    #summary_plot(shap_values, features=X, feature_names=feat_names, class_names=class_names, plot_type='dot', title=title, id=id)

    d = Orange.data.Domain(
        [Orange.data.ContinuousVariable(f) for f in feat_names] +
        [Orange.data.ContinuousVariable('S_' + f) for f in feat_names],
        class_vars=in_data.domain.class_vars,
        metas=in_data.domain.metas)

    xs = np.concatenate((in_data.X, shap_values), axis=1)
    ys = in_data.Y
    metas = in_data.metas

    out_data = Orange.data.Table.from_numpy(d, xs, Y=ys, metas=metas)

    return out_data, None, None, shap_values
Example #10
    def run_callback(self):
        self.model_idx = self.model_names.index(self.model_select.value)
        self.kfold = int(self.kfold_select.value)


        # set all the parameters of the new kfold
        self.geos_train, self.X_train_df, self.y_train, self.train_probas, \
        self.geos_test, self.X_test_df, self.y_test, self.test_probas, \
        self.models, self.model_names, self.auc_scores = self._extract_model_results(self.full_results, self.model_idx,
                                                                                     self.kfold)

        self.num_kfold = len(self.full_results[0]['train_idx'])
        self.features_df = pd.concat([self.X_train_df, self.X_test_df])
        self.features_kmeans = shap.kmeans(self.features_df, 10)
        self.all_probas = np.concatenate([self.train_probas[self.model_idx], self.test_probas[self.model_idx]])
        self.all_probas_df = pd.Series(data=self.all_probas, index=self.features_df.index)

        # create new precision recall curve
        test_probas_df = pd.DataFrame(
            {model_name: test_proba for model_name, test_proba in zip(self.model_names, self.test_probas)})
        pr_curve = bokeh_multiple_pr_curves(test_probas_df, self.y_test.values)
        self.left_column.children[0] = pr_curve

        # create new folium map
        print(f"Choose model: {self.model_select.value}\t kfold: {self.kfold}")
        new_folium = bokeh_scored_polygons_folium([self.all_probas_df],
                                                  [True], train_geos=self.geos_train,
                                                  test_geos=self.geos_test, start_zoom=self.folium_zoom,
                                                  start_location=self.start_location, file_name=self.folium_name,
                                                  width=700, lonlat_text_inputs=self.lonlat_text_inputs)
        self.folium_column.children[-2] = new_folium

        self.sample_feature_importance_update()
Example #11
File: SHAP.py Project: timm/LAS
 def explain(self):
     shap.initjs()
     X_train_summary = shap.kmeans(self.X_train, self.n_cluster)
     rf_explainer = shap.KernelExplainer(self.clf.predict, X_train_summary)
     shap_values_RF_test = rf_explainer.shap_values(self.X_test)
     self.shap = shap_values_RF_test
     return shap_values_RF_test
Example #12
def SHAP_values(model, X_train):
    k_sample = shap.kmeans(X_train, 5)
    explainer = shap.KernelExplainer(model.predict, k_sample)
    shap_values = explainer.shap_values(X_train)

    shap.summary_plot(shap_values, X_train, plot_type='bar')
    plt.show()
Example #13
def calculate_deepshap_values(model, training_data, test_data, K=None):
    """Calculate deepSHAP values for the given model.

    Args:
        model: trained deep model
        training_data: data of shape (num_samples, num_features) on which model was trained
        test_data: data of shape (num_samples, num_features) on which model was tested 
        K: number of means to use for K-means summarization of training_data (used for
            calculating feature importances in SHAP values); if None, then all the
            training data are used for this calculation (potentially expensive)
    
    Returns:
        A tuple consisting of 1. the explainer's expected values and 2. the SHAP values.
    """

    if K:
        kmeans_summary = shap.kmeans(training_data, K)
        print("Finished k-means summarization!")
        explainer = shap.DeepExplainer(model, kmeans_summary, link="logit")
    else:
        explainer = shap.DeepExplainer(model, training_data, link="logit")

    shap_values = explainer.shap_values(test_data)

    return explainer.expected_value, shap_values
Example #14
    def __init__(self, task: TaskHandler, results_dir=BUILDING_RESULTS_DIR):
        self.task = task
        self.main_panel = column()
        self.folium_name = task.__str__()

        # unpack building experiment results
        self.kfold = 0
        self.model_idx = 0
        # self.full_results = load_experiment_results(results_dir)['model_results']
        self.full_results = load_separate_model_results(results_dir)['model_results']
        self.model_results = self.full_results[self.model_idx]
        self.geos_train, self.X_train_df, self.y_train, self.train_probas, \
        self.geos_test, self.X_test_df, self.y_test, self.test_probas, \
        self.models, self.model_names, self.auc_scores = self._extract_model_results(self.full_results, self.model_idx,
                                                                                     self.kfold)

        self.num_kfold = len(self.full_results[0]['train_idx'])
        self.features_df = pd.concat([self.X_train_df, self.X_test_df])
        self.features_kmeans = shap.kmeans(self.features_df, 10)
        self.all_probas = np.concatenate([self.train_probas[self.model_idx], self.test_probas[self.model_idx]])
        self.all_probas_df = pd.Series(data=self.all_probas, index=self.features_df.index)

        self.mean_auc = self.mean_auc_panel(self.model_names, self.auc_scores)
        plot = self.bokeh_plot()
        self.main_panel.children.append(plot)
Example #15
 def run(self, X_train, nsamples=1000):
     X_train_summary = shap.kmeans(X_train, 10)
     self.sensor_names = X_train.columns
     self.explainer = shap.KernelExplainer(self.predict_LR,
                                           X_train_summary,
                                           l1_reg="auto")
     self.shap_values = np.array(
         self.explainer.shap_values(X_test, nsamples=nsamples))  #[0]
Example #16
def get_shap_mean_values(model, df, features):
    explainer = shap.KernelExplainer(model=model.predict,
                                     data=shap.kmeans(df[features], 10))
    shap_values = explainer.shap_values(df[features].sample(10),
                                        l1_reg='aic',
                                        silent=True)

    mean_shap_values = shap_values.mean(axis=0)
    return mean_shap_values
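
A small usage sketch for get_shap_mean_values, assuming a fitted regressor and a DataFrame whose columns include the listed features; every name below is hypothetical.

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# hypothetical frame and model
features = ["f1", "f2", "f3"]
df = pd.DataFrame(np.random.rand(100, 3), columns=features)
model = LinearRegression().fit(df[features], df["f1"] + df["f2"])

mean_shap = get_shap_mean_values(model, df, features)
print(dict(zip(features, mean_shap)))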
Example #17
    def get_shap_feature_importance(self, base_model, X_test, X_train):

        try:
            import shap
        except ImportError:
            raise ImportError('You must have shap installed to use shap')

        if self.flags['tree'] or self.flags['linear']:

            if self.flags['linear']:

                fp = self.shap_params.linear_feature_perturbation
                n = self.shap_params.linear_nsamples
                explainer = shap.LinearExplainer(base_model, X_train,
                                                 nsamples=n,
                                                 feature_perturbation=fp)

                shap_values = explainer.shap_values(X_test)

            elif self.flags['tree']:

                tmo = self.shap_params.tree_model_output
                tfp = self.shap_params.tree_feature_perturbation
                explainer =\
                    shap.TreeExplainer(
                        base_model, X_train,
                        model_output=tmo,
                        feature_perturbation=tfp)

                ttl = self.shap_params.tree_tree_limit
                shap_values =\
                    explainer.shap_values(X_test,
                                          tree_limit=ttl)

        # Kernel
        else:

            nkmean = self.shap_params.kernel_nkmean

            if nkmean is not None:
                X_train_summary = shap.kmeans(X_train, nkmean)
            else:
                X_train_summary = X_train

            explainer =\
                self.get_kernel_explainer(base_model, X_train_summary,
                                          self.shap_params.kernel_link)

            klr = self.shap_params.kernel_l1_reg
            kns = self.shap_params.kernel_nsamples

            shap_values =\
                explainer.shap_values(np.array(X_test),
                                      l1_reg=klr,
                                      nsamples=kns)

        return self.proc_shap_vals(shap_values)
Example #18
def regressor():
    X, y = get_boston()
    model = RandomForestRegressor()
    model.fit(X, y)

    explainer = shap.KernelExplainer(model.predict, shap.kmeans(X, 100))
    shap_values = explainer.shap_values(X)

    return ModelWithExplanation(model, X, shap_values, explainer.expected_value)
Example #19
def classifier():
    X, y = get_iris()
    model = RandomForestClassifier()
    model.fit(X, y)

    explainer = shap.KernelExplainer(model.predict_proba, shap.kmeans(X, 100))
    shap_values = explainer.shap_values(X)

    return ModelWithExplanation(model, X, shap_values, explainer.expected_value)
Example #20
def main(toolName, datasetName):

    tool = getToolObject(toolName)

    #load the dataset [used as data in shapley plot]
    sequences = getDataset(datasetName, toolName)
    print("Loaded the dataset " + datasetName + " of " + str(len(sequences)) +
          " data points.")

    #Calculate the features for the dataset
    print(
        "Calculating features for the dataset. [Might take a while for wu-crispr]"
    )
    feature_set = []
    cnt = 1
    for seq in sequences:
        features = tool.getFeatures(seq)
        feature_set.append(features)
        if toolName == 'wu-crispr' and cnt % 100 == 0:  #inform on progress [wu-crispr takes a while]
            print("-- Calculated features for " + str(cnt))
        cnt = cnt + 1
    print("Calculated the features for this dataset.")

    #Get feature names [for printing in the shapley plot]
    feature_names = tool.loadFeatureNames()
    print("Loaded the names of all " + str(len(feature_names)) + " features.")

    #Put together features with names in one dataframe
    dataset_df = pd.DataFrame(np.array(feature_set), columns=feature_names)

    #training set (must be loaded before model)
    train_df = tool.loadTrainingSet()
    print("Loaded the training set used for tool " + toolName + ", size: " +
          str(train_df.shape) + ".")

    #load model of the tool
    model = tool.loadModel()
    print("Loaded the model for tool " + toolName + ".")

    #summarize training set and subsample test set
    summary_train_df = shap.kmeans(train_df, 2)
    dataset_sub_df = dataset_df  # optional to speed up things can use dataset_df.sample(400)

    #compute and plot shapley values
    shap_explainer = shap.KernelExplainer(model.predict, summary_train_df)
    print(dataset_sub_df)
    shap_values = shap_explainer.shap_values(dataset_sub_df)

    #save the values
    with open("../results/SHAP-" + toolName + "-" + datasetName, 'wb') as file:
        pickle.dump(shap_values, file)  #the SHAP values
        pickle.dump(dataset_sub_df, file)  #the data used
    print("Computed and saved SHAP values.")

    #plotting
    shap.summary_plot(shap_values, dataset_sub_df)
Example #21
 def _kmeans(self, means: int) -> DenseData:
     """
     Wrapper to cache kmeans results for repeated runs
     :param means: amount of centers for kmeans
     :return: kmeaned background data for shap (either from cache or newly counted)
     """
     if means in self.sampled_background.keys():
         return self.sampled_background[means]
     self.sampled_background[means] = kmeans(self.X_train, means)
     return self.sampled_background[means]
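
The caching idea in _kmeans can be sketched as a self-contained helper; the class name and attributes below are hypothetical, not from the original project.

import numpy as np
from shap import kmeans

class BackgroundCache:
    """Caches shap.kmeans summaries keyed by the number of centers."""

    def __init__(self, X_train):
        self.X_train = X_train
        self.sampled_background = {}

    def _kmeans(self, means):
        if means in self.sampled_background:
            return self.sampled_background[means]
        self.sampled_background[means] = kmeans(self.X_train, means)
        return self.sampled_background[means]

cache = BackgroundCache(np.random.rand(100, 4))
bg = cache._kmeans(10)        # computed once
bg_again = cache._kmeans(10)  # returned from the cache on repeated runs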
Example #22
def build_shap_explainer(model, dataset):
    # --- Summarizing the dataset ---
    # Can be fine tuned further (Kmeans, etc.)
    # summ = np.median(dataset, axis=0).reshape((1,dataset.shape[1]))
    summ = shap.kmeans(dataset, 100)
    print("Summary data for SHAP:", summ)
    explainer = shap.KernelExplainer(model.run_model_data_prob,
                                     summ,
                                     link="identity")
    print("Expected value for SHAP:", explainer.expected_value[1])
    return explainer
Example #23
def run_shap(features_csv, labels_csv, output_file):
    # Import dfs
    labels = pd.read_csv(os.path.join(os.getcwd(), labels_csv))
    imp_feat = pd.read_csv(os.path.join(os.getcwd(), features_csv))

    # set label index
    labels.set_index('respondent_id', inplace=True)

    # IMPUTED
    imp_feat.set_index('Unnamed: 0', inplace=True)
    imp_feat.sort_index(inplace=True)

    # merge_df options

    merged_df = imp_feat.join(labels)
    # merged_df = imp_feat_small.join(labels)

    df_h1n1 = merged_df.reset_index(drop=True).drop(['seasonal_vaccine'],
                                                    axis=1)
    print(df_h1n1.shape)

    X = df_h1n1.iloc[:, :-1]
    y = df_h1n1.iloc[:, -1]

    X_train, X_val, y_train, y_val = train_test_split(X,
                                                      y,
                                                      test_size=0.1,
                                                      stratify=y,
                                                      random_state=42)

    # get feature names
    feature_names = list(X_train)

    # check shape
    print(X.shape)
    print(X_train.shape)

    # IMPUTED: scale train and validation data with the same scaler fit on train
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    print(X_train.shape)

    X_val = scaler.transform(X_val)

    clf = SVC(kernel='rbf', probability=True).fit(X_train, y_train)
    X_train_summary = shap.kmeans(X_train, 10)
    explainer = shap.KernelExplainer(clf.predict_proba, X_train_summary)
    shap_values_train = explainer.shap_values(X_train)
    shap_values_test = explainer.shap_values(X_val)

    df_SVC = pd.DataFrame(shap_values_train[0].mean(0),
                          index=X.columns,
                          columns=['SVC']).sort_values('SVC', ascending=False)
    df_SVC.to_csv(output_file)
Example #24
def run_shap(results, X_test, new_path, hp):
    for key, item in results.items():
        model = item[0]
        # X_test = X_test.iloc[:10, :]
        k_X = shap.kmeans(X_test, 5)
        if key == 'MLP':
            explainer = shap.KernelExplainer(model.model.predict_classes, k_X)
        else:
            explainer = shap.KernelExplainer(model.model.predict, k_X)
        shap_values = explainer.shap_values(X_test)
        f = plt.figure()
        shap.summary_plot(shap_values, X_test, show=False, plot_type='bar')
        f.savefig(new_path + "/" + hp + '-' + key + "-summary_plot.pdf", bbox_inches='tight', dpi=600)
Example #25
def _get_kernel_explainer(predict_func, bkgrd_data, kmeans_size=10):
    if predict_func is None:
        raise ValueError(
            "No target to compute shap values. Expected either model or predict_func"
        )
    # rather than use the whole training set to estimate expected values,
    # summarize with a set of weighted kmeans, each weighted by
    # the number of points they represent.
    if kmeans_size is None:
        x_bkgrd_summary = bkgrd_data
    else:
        x_bkgrd_summary = shap.kmeans(bkgrd_data, kmeans_size)
    return shap.KernelExplainer(predict_func, x_bkgrd_summary)
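
A brief sketch of how _get_kernel_explainer might be called: pass any prediction (or probability) function plus background data, then compute SHAP values on a handful of rows. The model and data are stand-ins.

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# stand-in background data and model
X = np.random.rand(300, 6)
y = (X[:, 0] + X[:, 1] > 1).astype(int)
clf = GradientBoostingClassifier().fit(X, y)

explainer = _get_kernel_explainer(clf.predict_proba, X, kmeans_size=10)
shap_values = explainer.shap_values(X[:5])  # list of arrays, one per class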
Example #26
def calculate_shap_values(model, train_data, shap_data, model_type="kernel"):
    # use faster tree explainer for tree based models
    if model_type == "tree":
        explainer = shap.TreeExplainer(model)
    else:
        if len(train_data) > 100:
            explainer = shap.KernelExplainer(model,
                                             shap.kmeans(train_data, 100))
        else:
            explainer = shap.KernelExplainer(model, train_data)

    shap_values = explainer.shap_values(shap_data)
    return explainer, shap_values
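
calculate_shap_values switches between TreeExplainer and a k-means-summarized KernelExplainer; a possible invocation for each path, with placeholder models and data.

import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

X = np.random.rand(150, 4)
y = X[:, 0] * 2 + X[:, 1]

# tree path: pass the fitted model itself
rf = RandomForestRegressor(n_estimators=20).fit(X, y)
tree_explainer, tree_values = calculate_shap_values(rf, X, X[:5], model_type="tree")

# kernel path: pass a prediction function; the background gets k-means summarized
ridge = Ridge().fit(X, y)
kernel_explainer, kernel_values = calculate_shap_values(ridge.predict, X, X[:5])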
Example #27
    def kernel(self, print_result=False):
        result = {}
        result['name'] = self.name
        result['shap_method'] = 'kernel'

        X_train_summary = shap.kmeans(self.X_train, 50)
        explainer = shap.KernelExplainer(model=self.model.predict_proba,
                                         data=X_train_summary)
        shap_values = explainer.shap_values(self.X_test)
        if isinstance(shap_values, list):
            shap_values = shap_values[1]

        return self._run(shap_values, result, print_result)
Example #28
def explain_dnns(datasource, estimator, shap_dataset, plot_type, result_table,
                 feature_column_names, is_pai, pai_table, hdfs_namenode_addr,
                 hive_location, hdfs_user, hdfs_pass, oss_dest, oss_ak, oss_sk,
                 oss_endpoint, oss_bucket_name):
    def predict(d):
        if len(d) == 1:
            # This is to make sure the progress bar of SHAP display properly:
            # 1. The newline makes the progress bar string captured in pipe
            # 2. The ASCII control code moves cursor up twice for alignment
            print("\033[A" * 2)

        def input_fn():
            return tf.data.Dataset.from_tensor_slices(
                dict(pd.DataFrame(d,
                                  columns=shap_dataset.columns))).batch(1000)

        if plot_type == 'bar':
            predictions = [
                p['logits'] if 'logits' in p else p['predictions']
                for p in estimator.predict(input_fn)
            ]
        else:
            predictions = [
                p['logits'][-1] if 'logits' in p else p['predictions'][-1]
                for p in estimator.predict(input_fn)
            ]
        return np.array(predictions)

    if len(shap_dataset) > 100:
        # Reduce to 16 weighted samples to speed up
        shap_dataset_summary = shap.kmeans(shap_dataset, 16)
    else:
        shap_dataset_summary = shap_dataset
    shap_values = shap.KernelExplainer(
        predict, shap_dataset_summary).shap_values(shap_dataset, l1_reg="aic")
    if result_table != "":
        if is_pai:
            write_shap_values(shap_values, "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values, conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
    explainer.plot_and_save(
        lambda: shap.summary_plot(
            shap_values, shap_dataset, show=False, plot_type=plot_type),
        is_pai, oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #29
def get_model_explanations(model, train_data, test_data, background_samples=10, nsamples='auto', num_features=50):
    """runs KernelSHAP on model and returns subjectwise model explanations

    model : trained sklearn estimator
    train_data : n x p array of training data used in model training
    test_data : n x p array of test data
    background_samples : number of kmeans clusters used to summarise training data, fewer=faster
    nsamples : number of times to reevaluate model to estimate Shapley values, more=lower variance
    num_features : number of features to include in local model
    """
    explainer = shap.KernelExplainer(model.predict, shap.kmeans(train_data, background_samples), link='identity')
    explanations = explainer.shap_values(test_data, nsamples=nsamples, l1_reg='num_features({:})'.format(num_features))

    return explanations
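
A usage sketch for get_model_explanations with a small fitted regressor; the data, model and parameter values are illustrative only.

import numpy as np
from sklearn.linear_model import Ridge

# illustrative train/test arrays and model
train = np.random.rand(200, 12)
test = np.random.rand(20, 12)
target = train[:, 0] - train[:, 3]
model = Ridge().fit(train, target)

explanations = get_model_explanations(model, train, test,
                                       background_samples=10,
                                       nsamples=200, num_features=5)
print(explanations.shape)  # (20, 12): one row of SHAP values per test subject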
Example #30
    def show_federate_shap_on_each_client(self):
        logging.info("################load data for shap")

        client = self.client_list[0]
        fed_pos = 0  # starting index of the federated features
        for client_idx in range(self.args.client_num_in_total):
            if self.test_data_local_dict[client_idx] is None:
                continue
            client.update_local_dataset(
                0, self.train_data_local_dict[client_idx],
                self.test_data_local_dict[client_idx],
                self.train_data_local_num_dict[client_idx])
            train_X, train_y, test_X = client.get_all_X()

            # federate shapley
            train_X_all_pd = pd.DataFrame(train_X.numpy())
            f_knn = lambda x: self.model_trainer.model.forward(x)
            med = train_X_all_pd.median().values.reshape(
                (1, train_X_all_pd.shape[1]))
            feature_num = len(self.feature_name) - 1  # number of features
            fs = FederateShap()

            # Aggregated and average federated shap
            data = shap.kmeans(train_X_all_pd, 20)
            step = 3
            shap_values_whole = []
            cols_federated = self.feature_name[:-1]
            cols_federated[fed_pos] = 'Federated'
            del cols_federated[fed_pos + 1:fed_pos + step]

            for x in data.data:
                phi = fs.kernel_shap_federated_with_step(
                    f_knn, x, med, feature_num, fed_pos, step)
                base_value = phi[-1]
                shap_values = phi[:-1]
                shap_values_whole.append(list(shap_values))
            shap_values_whole = np.array(shap_values_whole)
            shap_values_whole_mean = np.mean(shap_values_whole,
                                             axis=0).transpose()
            # plot the Shapley values of the federated features
            shap.summary_plot(shap_values_whole_mean,
                              feature_names=cols_federated,
                              sort=False)
            shap.summary_plot(shap_values_whole_mean,
                              feature_names=cols_federated,
                              sort=False,
                              plot_type="bar")
            fed_pos += step