Example #1
    def decisions_regression(
        df_preds,
        shap_values,
        expected_value,
        X_vald,
        y_vald,
        model_file_path,
        learner_name,
    ):
        fig = plt.gcf()
        shap.decision_plot(
            expected_value,
            shap_values[df_preds.lp[:10], :],
            X_vald.loc[df_preds.index[:10]],
            show=False,
        )
        fig.tight_layout(pad=2.0)
        fig.savefig(
            os.path.join(model_file_path,
                         f"{learner_name}_shap_worst_decisions.png"))
        plt.close("all")

        fig = plt.gcf()
        shap.decision_plot(
            expected_value,
            shap_values[df_preds.lp[-10:], :],
            X_vald.loc[df_preds.index[-10:]],
            show=False,
        )
        fig.tight_layout(pad=2.0)
        fig.savefig(
            os.path.join(model_file_path,
                         f"{learner_name}_shap_best_decisions.png"))
        plt.close("all")
Example #2
def explain_row_shap(scaled_row, explainer, nsamples=100, verbose=0):
    shap_values = explainer.shap_values(scaled_row.reshape(1, shape_size),
                                        nsamples=nsamples,
                                        l1_reg="num_features(32)")

    if (verbose == 1):
        shap.decision_plot(explainer.expected_value[0],
                           shap_values[0][0, :],
                           scaled_row,
                           feature_names=list(
                               df.drop('loan_repaid', axis=1).columns),
                           link="logit")

    map_values = {}
    for class_value in range(len(shap_values)):
        s = shap_values[class_value][0]
        sorted_indices = sorted(range(len(s)),
                                key=lambda k: s[k],
                                reverse=True)
        #         print(sorted_indices)
        ordered_list = [(a, shap_values[class_value][0][a])
                        for a in sorted_indices
                        if shap_values[class_value][0][a] > 0]
        map_values[class_value] = ordered_list
    #     print(map_values)
    return map_values
Example #3
def get_decision_plot(patente, step_id_week):
    f = plt.figure()
    ranker = joblib.load(
        'C:/Users/raskolnnikov/Desktop/projects/samtech/samtech_entrega/modelos/ranker_v_1.0.joblib'
    )
    explainer = shap.TreeExplainer(ranker)
    ranker_features = ranker.feature_names
    expected_value = explainer.expected_value
    entry = RankingEntry.query.filter_by(
        patente=patente, step_id_week=step_id_week).one().instance_json
    instance = pd.read_json(entry, orient='records').T
    instance.columns = ranker_features
    shap_value = ranker.predict(
        xgb.DMatrix(instance[ranker_features]),
        pred_contribs=True,
    )[0, :-1]
    shap.decision_plot(expected_value,
                       shap_value,
                       instance.iloc[0, :],
                       link='logit',
                       highlight=0,
                       show=False)
    buf = BytesIO()
    f.savefig(buf, format="png", dpi=150, bbox_inches='tight')
    buf.seek(0)
    f.clear()
    plt.close(f)
    img_base64 = base64.b64encode(buf.read())
    response = {
        "image": img_base64.decode(),
    }
    return response, 200
Example #4
def shap_decision_plot(expected_value,
                       shap_values,
                       samples,
                       save=True,
                       crop_feature_names=20):

    if crop_feature_names:
        feature_names = []
        for col in samples.columns:
            if len(col) > crop_feature_names:
                # truncate to the configured maximum length
                feature_names.append(col[:crop_feature_names])
            else:
                feature_names.append(col)

        shap.decision_plot(expected_value,
                           shap_values,
                           features=samples,
                           feature_names=feature_names,
                           show=False)
    else:
        shap.decision_plot(expected_value,
                           shap_values,
                           features=samples,
                           show=False)
    f = plt.gcf()
    f.show()
    if save:
        f.savefig("shap_decision_plot.png")
Example #5
def combine_summary_decision_curve(shap_value, expected_value, features,
                                   feature_names, n_features,
                                   examples_subset_index, misclassified, link,
                                   save_path):
    """
    Generate a combined SHAP Summary and Decision Plot
    Parameters
    ----------
    shap_value: np.ndarray
        SHAP value for a particular output class
    expected_value: float
        Average of the classifier/model output over training dataset
    features: np.ndarray
        Features in testing dataset
    feature_names: np.array
        List of feature names
    n_features: int
        Maximum count of features to use for the plots
    examples_subset_index: list of indices
        Samples to select for the decision plot
    misclassified: list of Boolean values
        Denotes which selected samples are misclassified (set to true)
    link: string
       Link type to be used for the decision plot
    save_path: string
        File path where the plot will be saved
    """

    figsize = (12, 6)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)
    # Generate Summary plot
    plt.sca(ax1)
    shap.summary_plot(shap_value,
                      features=features,
                      feature_names=feature_names,
                      sort=True,
                      show=False,
                      max_display=n_features)
    # Generate Decision plot
    plt.gcf().set_size_inches(figsize)
    plt.sca(ax2)
    feature_order = np.argsort(np.sum(np.abs(shap_value), axis=0))
    shap.decision_plot(expected_value,
                       shap_value[examples_subset_index],
                       features[examples_subset_index, :],
                       feature_names=list(feature_names),
                       feature_display_range=slice(None, -(n_features + 1),
                                                   -1),
                       ignore_warnings=True,
                       highlight=misclassified,
                       show=False,
                       plot_color='viridis',
                       feature_order=feature_order,
                       link=link)
    plt.plot([0.5, 0.5], [0, n_features], ':k', alpha=0.3)
    ax2.set_yticklabels([])
    plt.savefig(save_path)
    plt.close()
Example #6
 def decision_plot(self, class_id=0, row_idx=-1, **kwargs):
     "Visualize model decision using cumulative `SHAP` values."
     shap_vals, exp_val = _get_values(self, class_id)
     n_rows = shap_vals.shape[0]
     if row_idx == -1:
         print(f'Displaying rows 0-9 of {n_rows} (use `row_idx` to specify another row)')
         return shap.decision_plot(exp_val, shap_vals[:10], self.test_data.iloc[:10], **kwargs)
     print(f'Displaying row {row_idx} of {n_rows} (use `row_idx` to specify another row)')
     return shap.decision_plot(exp_val, shap_vals[row_idx], self.test_data.iloc[row_idx], **kwargs)
Example #7
def plot_shap_force(drug_id, expected_value, shap_values_test, data_for_shap,
                    drug_names, X_train, force_plot_file_type, dpi,
                    eval_label):
    curr_shap_value = shap_values_test.loc[
        drug_id, :]  # explainer.shap_values(curr_drug_features)
    curr_features = data_for_shap.loc[drug_id, :]
    drug_name = drug_names.loc[drug_id, 'Drug name']
    title = 'Probability higher risk (%s)' % (drug_name)

    curr_feature_vals = np.array([
        'Yes' if bool(x) else
        "No" if X_train.columns[i] != 'Number of Category' else x
        for i, x in enumerate(curr_features.values)
    ])

    p = shap.force_plot(
        base_value=expected_value,
        shap_values=curr_shap_value.values,
        #feature_names=[x.replace("Cluster: ",'').replace(';','\n') for x in X_train.columns], #x.split(': ')[1] if ': ' in x else
        feature_names=[
            x.split(': ')[1] if ': ' in x else x for x in X_train.columns
        ],
        features=
        curr_feature_vals,  #['Yes' if x else 'No' for x in curr_features.values],
        out_names=[title],
        figsize=(20, 4)  #
        ,
        show=False,
        matplotlib=True,
        text_rotation=int(45 / 2))
    p.savefig(os.path.join('output', 'SHAP' + "_" + eval_label,
                           drug_id + '.' + force_plot_file_type),
              dpi=dpi,
              bbox_inches='tight')
    # p.show()
    plt.close('all')

    shap.decision_plot(
        base_value=expected_value,
        shap_values=curr_shap_value.values,
        feature_names=[
            x.split(': ')[1] if ': ' in x else x for x in X_train.columns
        ],
        features=curr_feature_vals,
        feature_display_range=slice(-1, -11, -1),
        title=title,
        show=False,
        #link='logit',
        #highlight=0
    )
    p = plt.gcf()
    p.savefig(os.path.join('output', 'SHAP' + "_" + eval_label,
                           drug_id + '_decision_plot.' + force_plot_file_type),
              dpi=dpi,
              bbox_inches='tight')
    plt.close('all')
Example #8
    def decision_plot(self, X, y):
        """Visualization of the additive feature attribution."""

        # Automates single-target slicing
        y = super()._check_target_index(y=y)

        for index in range(_n_targets(y)):
            self.fit(X=X, y=y, index=index)
            explainer, shap_values = self.explainer(X=X)
            shap.decision_plot(base_value=explainer.expected_value,
                               shap_values=shap_values,
                               feature_names=list(X.columns),
                               show=self.show)
Example #9
def visualise_explanation(explanation, per_class=True):
    """
    Visualises an explanation of classification performance.
    
    Parameters
    ----------
    explanation
        Output of `classification.explain_classifier()`.
    per_class : bool, optional
        Whether to also plot explanations at the level of the individual
        classes, or just the summary.
    """

    # Summarise across all classes
    fig, ax = plt.subplots()
    shap.summary_plot(explanation.shap_values,
                      explanation.data,
                      class_names=explanation.clf_categories,
                      max_display=15)
    ax.set_ylabel("Feature")
    ax.set_title("Classifier feature weightings (all classes)")
    fig.tight_layout()

    # And then break down by class
    if per_class:
        for i in range(len(explanation.shap_values)):
            # Summary plot: break down by feature
            fig, ax = plt.subplots()
            shap.summary_plot(explanation.shap_values[i],
                              explanation.data,
                              max_display=10)
            ax.set_ylabel("Feature")
            ax.set_title("Classifier feature weightings (class: {})".format(
                explanation.clf_categories[i]))
            fig.tight_layout()

            # Decision plot: break down by observation
            fig, ax = plt.subplots()
            shap.decision_plot(explanation.explainer.expected_value[i],
                               explanation.shap_values[i],
                               link='logit',
                               features=explanation.data)
            ax.set_ylabel("Feature")
            ax.set_title("Classifier decision weightings (class: {})".format(
                explanation.clf_categories[i]))
            fig.tight_layout()

    return
Example #10
def shap_explain(booster,
                 datasource,
                 dataset,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None):
    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        if is_pai:
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a
        # list of shape [3, num_samples, num_features];
        # use the first dimension here and find out
        # when to use the other two. When shap_values is
        # not a list, it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))
        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]
        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    explainer.plot_and_save(plot_func,
                            oss_dest=oss_dest,
                            oss_ak=oss_ak,
                            oss_sk=oss_sk,
                            oss_endpoint=oss_endpoint,
                            oss_bucket_name=oss_bucket_name,
                            filename='summary')
Example #11
    def decision_plot(self,
                      fold,
                      supersample_frac=.1,
                      feature_display_range=None,
                      cmap=None):
        """
        For a small sample of test data points, start from the model's base prediction value,
        incrementally add features to each prediction, and plot the path of the model score
        as a continuous line. Good for visualising how features combine across the model to
        create a diverse range of model scores.
        """
        expected_value = self.fold_expected_values[fold]
        shap_values = self.shapley_values_array[:, :, fold]
        X = self.SHAP_X_sample

        super_sample = X.sample(frac=supersample_frac, replace=False)
        super_sample_idx = super_sample.reset_index().index.values

        super_sampled_shapley_values = shap_values[super_sample_idx, :]

        return shap.decision_plot(expected_value,
                                  super_sampled_shapley_values,
                                  super_sample,
                                  feature_display_range=feature_display_range,
                                  plot_color=cmap)
Example #12
def local_plot(name,
               explainer,
               shap_values,
               feature_names,
               chosen_sample,
               estimand_name,
               X_test,
               plot_type='force_plot'):
    if plot_type == 'force_plot':
        h = shap.force_plot(base_value=explainer.expected_value,
                            shap_values=shap_values[chosen_sample],
                            features=X_test[chosen_sample],
                            feature_names=feature_names,
                            link="identity",
                            out_names=estimand_name,
                            matplotlib=False,
                            show=False)
        save_plot(h, name)
    elif plot_type == 'decision_plot':
        h = shap.decision_plot(base_value=explainer.expected_value,
                               shap_values=shap_values[chosen_sample],
                               features=X_test[chosen_sample],
                               feature_names=feature_names,
                               link="identity",
                               # shap.decision_plot has no `matplotlib` argument; it always renders with matplotlib
                               show=False)
        save_plot(h, name)
    return h
Example #13
    def decision_plot(self, X, y):
        """Visualization of the additive feature attribution."""

        y = self._slice_target_index(y=y)

        for index in range(_n_targets(y)):
            if sklearn.utils.multiclass.type_of_target(
                    y) == 'continuous-multioutput':
                self.fit(X, y.iloc[:, index].values.ravel(order='K'))
            else:
                self.fit(X, y)
            explainer, shap_values = self.explainer(X=X)
            shap.decision_plot(base_value=explainer.expected_value,
                               shap_values=shap_values,
                               feature_names=list(X.columns),
                               show=self.show)
Example #14
def shap_explain(datasource,
                 select,
                 feature_field_meta,
                 feature_column_names,
                 label_meta,
                 summary_params,
                 result_table="",
                 is_pai=False,
                 pai_explain_table="",
                 oss_dest=None,
                 oss_ak=None,
                 oss_sk=None,
                 oss_endpoint=None,
                 oss_bucket_name=None,
                 transform_fn=None,
                 feature_column_code=""):
    x = xgb_shap_dataset(datasource,
                         select,
                         feature_column_names,
                         label_meta,
                         feature_field_meta,
                         is_pai,
                         pai_explain_table,
                         transform_fn=transform_fn,
                         feature_column_code=feature_column_code)
    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)
    if result_table != "":
        if is_pai:
            from runtime.dbapi.paiio import PaiIOConnection
            conn = PaiIOConnection.from_table(result_table)
        else:
            conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a
        # list of shape [3, num_samples, num_features];
        # use the first dimension here and find out
        # when to use the other two. When shap_values is
        # not a list, it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values
        write_shap_values(to_write, conn, result_table, feature_column_names)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), oss_dest, oss_ak, oss_sk,
            oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), oss_dest,
            oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #15
def shap_explain(booster, datasource, dataset, summary_params, result_table):

    tree_explainer = shap.TreeExplainer(booster)
    shap_values = tree_explainer.shap_values(dataset)
    if result_table:
        conn = db.connect_with_data_source(datasource)
        # TODO(typhoonzero): shap_values may be a
        # list of shape [3, num_samples, num_features];
        # use the first dimension here and find out
        # when to use the other two. When shap_values is
        # not a list, it can be used directly.
        if isinstance(shap_values, list):
            to_write = shap_values[0]
        else:
            to_write = shap_values

        columns = list(dataset.columns)
        dtypes = [DataType.to_db_field_type(conn.driver, DataType.FLOAT32)
                  ] * len(columns)
        _create_table(conn, result_table, columns, dtypes)
        with db.buffered_db_writer(conn, result_table, columns) as w:
            for row in to_write:
                w.write(list(row))

        conn.close()

    if summary_params.get("plot_type") == "decision":
        shap_interaction_values = tree_explainer.shap_interaction_values(
            dataset)
        expected_value = tree_explainer.expected_value
        if isinstance(shap_interaction_values, list):
            shap_interaction_values = shap_interaction_values[0]
        if isinstance(expected_value, list):
            expected_value = expected_value[0]

        plot_func = lambda: shap.decision_plot(  # noqa: E731
            expected_value,
            shap_interaction_values,
            dataset,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1)
    else:
        plot_func = lambda: shap.summary_plot(  # noqa: E731
            shap_values, dataset, show=False, **summary_params)

    filename = 'summary.png'
    with temp_file.TemporaryDirectory(as_cwd=True):
        explainer.plot_and_save(plot_func, filename=filename)
        with open(filename, 'rb') as f:
            img = f.read()

    img = base64.b64encode(img)
    if six.PY3:
        img = img.decode('utf-8')
    img = "<div align='center'><img src='data:image/png;base64,%s' /></div>" \
          % img
    print(img)
Example #16
    def decision_plot(
        self, num_samples=0.25, sample_no=None, output_file="", **decisionplot_kwargs
    ):
        """
        Plots a SHAP decision plot.
        
        Parameters
        ----------
        num_samples : int, float, or 'all', optional
            Number of samples to display; if less than 1 it is treated as a percentage of the samples,
            and 'all' includes every sample, by default 0.25

        sample_no : int, optional
            Sample number to isolate and analyze; if provided it overrides num_samples, by default None

        output_file : str, optional
            File name for saving the plot, by default "" (the plot is not saved)

        Returns
        -------
        DecisionPlotResult 
            If return_objects=True (the default). Returns None otherwise.
        """

        return_objects = decisionplot_kwargs.pop("return_objects", True)
        highlight = decisionplot_kwargs.pop("highlight", None)

        if sample_no is not None:
            if sample_no < 1 or not isinstance(sample_no, int):
                raise ValueError("Sample number must be greater than 1.")

            samples = slice(sample_no - 1, sample_no)
        else:
            if num_samples == "all":
                samples = slice(0, len(self.x_test_array))
            elif num_samples <= 0:
                raise ValueError(
                    "Number of samples must be greater than 0. If it is less than 1, it will be treated as a percentage."
                )
            elif num_samples > 0 and num_samples < 1:
                samples = slice(0, int(num_samples * len(self.x_test_array)))
            else:
                samples = slice(0, num_samples)

        if highlight is not None:
            highlight = highlight[samples]

        s = shap.decision_plot(
            self.expected_value,
            self.shap_values[samples],
            self.x_train.columns,
            return_objects=return_objects,
            highlight=highlight,
            show=False,
            **decisionplot_kwargs,
        )

        if output_file:  # pragma: no cover
            pl.savefig(os.path.join(IMAGE_DIR, self.model_name, output_file))

        return s
Example #17
def get_shap(model, exp_data, i):
    shap.initjs()
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(exp_data)
    force_plot = shap.force_plot(explainer.expected_value[i],
                                 shap_values[i][0, :], exp_data.iloc[0, :])
    decision_plot = shap.decision_plot(explainer.expected_value[i],
                                       shap_values[i][0, :],
                                       exp_data.iloc[0, :])
    return force_plot
Example #18
    def explain(self, options, instance=None):

        if instance is None:
            raise ValueError("Instance was not provided")

        initjs()
        instance = instance.to_numpy()
        data = self._kmeans(options['kmeans_count']) \
            if options['background_data'] == 'kmeans' else options['data']
        nsamples = 'auto' if options['auto_nsamples'] else options['nsamples']
        explainer = KernelExplainer(model=self.predict_function,
                                    data=data,
                                    link=options['link'])
        shap_values = explainer.shap_values(X=instance,
                                            nsamples=nsamples,
                                            l1_reg=options['l1_reg'])
        if self.is_classification:
            shap_values = shap_values[options['class_to_explain']]
            base_value = explainer.expected_value[[
                options['class_to_explain']
            ]]
        else:
            base_value = explainer.expected_value

        if options['plot_type'] == 'force' or options['plot_type'] == 'both':
            display(
                force_plot(base_value=base_value,
                           shap_values=shap_values,
                           features=instance,
                           feature_names=self.feature_names,
                           show=True,
                           link=options['link']))

        if options['plot_type'] == 'decision' or options['plot_type'] == 'both':
            decision_plot(base_value=base_value,
                          shap_values=shap_values,
                          features=instance,
                          feature_names=list(self.feature_names),
                          show=True,
                          color_bar=True,
                          link=options['link'])
Example #19
    def decisions_binary(
        df_preds,
        shap_values,
        expected_value,
        X_vald,
        y_vald,
        model_file_path,
        learner_name,
    ):
        # classes are from 0 ...
        for t in np.unique(y_vald):
            fig = plt.gcf()
            shap.decision_plot(
                expected_value,
                shap_values[df_preds[df_preds.target == t].lp[:10], :],
                X_vald.loc[df_preds[df_preds.target == t].index[:10]],
                show=False,
            )
            fig.tight_layout(pad=2.0)
            fig.savefig(
                os.path.join(
                    model_file_path,
                    f"{learner_name}_shap_class_{t}_worst_decisions.png",
                )
            )
            plt.close("all")

            fig = plt.gcf()
            shap.decision_plot(
                expected_value,
                shap_values[df_preds[df_preds.target == t].lp[-10:], :],
                X_vald.loc[df_preds[df_preds.target == t].index[-10:]],
                show=False,
            )
            fig.tight_layout(pad=2.0)
            fig.savefig(
                os.path.join(
                    model_file_path, f"{learner_name}_shap_class_{t}_best_decisions.png"
                )
            )
            plt.close("all")
Example #20
def explain(datasource,
            select,
            feature_field_meta,
            feature_column_names,
            label_spec,
            summary_params,
            result_table="",
            is_pai=False,
            pai_explain_table="",
            hdfs_namenode_addr="",
            hive_location="",
            hdfs_user="",
            hdfs_pass="",
            oss_dest=None,
            oss_ak=None,
            oss_sk=None,
            oss_endpoint=None,
            oss_bucket_name=None):
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_field_meta, is_pai, pai_explain_table)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if result_table != "":
        if is_pai:
            # TODO(typhoonzero): the shape of shap_values is (3, num_samples, num_features)
            # use the first dimension here, should find out how to use the other two.
            write_shap_values(shap_values[0], "pai_maxcompute", None,
                              result_table, feature_column_names,
                              hdfs_namenode_addr, hive_location, hdfs_user,
                              hdfs_pass)
        else:
            conn = connect_with_data_source(datasource)
            write_shap_values(shap_values[0], conn.driver, conn, result_table,
                              feature_column_names, hdfs_namenode_addr,
                              hive_location, hdfs_user, hdfs_pass)
        return

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(
            lambda: shap.decision_plot(expected_value,
                                       shap_interaction_values,
                                       x,
                                       show=False,
                                       feature_display_range=slice(
                                           None, -40, -1),
                                       alpha=1), is_pai, oss_dest, oss_ak,
            oss_sk, oss_endpoint, oss_bucket_name)
    else:
        explainer.plot_and_save(
            lambda: shap.summary_plot(
                shap_values, x, show=False, **summary_params), is_pai,
            oss_dest, oss_ak, oss_sk, oss_endpoint, oss_bucket_name)
Example #21
def shap_plots(model, train_features, test_features, test_labels):
    print("Computing shapley values..")
    # compute SHAP values
    if isinstance(
            model,
        (MLP, MLPRegressor, MLPClassifier, ElasticNet, LogisticRegression)):
        train_sample = shap.sample(train_features, 10)
        explainer = shap.Explainer(model.predict, train_sample)
    elif isinstance(model, (RandomForestRegressor, RandomForestClassifier)):
        explainer = shap.TreeExplainer(model, train_features)
    else:
        explainer = shap.Explainer(model, train_features)

    shap_values = explainer(test_features)
    shap.plots.bar(shap_values, max_display=10)
    # shap.plots.bar(shap_values[0]) # Local

    # beeswarm plot
    shap.plots.beeswarm(shap_values)

    # Decision plot
    expected_value = explainer.expected_value
    select = range(20)
    features_sample = test_features.iloc[select]
    shap.decision_plot(expected_value, explainer.shap_values(features_sample),
                       features_sample)

    # Heatmap
    shap.plots.heatmap(shap_values, max_display=10)

    # Scatter
    shap.plots.scatter(shap_values[:, "hs_child_age_None"],
                       color=shap_values,
                       alpha=0.8)

    # Feature clustering (redundant feature detection)
    clustering = shap.utils.hclust(
        test_features, test_labels
    )  # by default this trains (X.shape[1] choose 2) 2-feature XGBoost models
    shap.plots.bar(shap_values, clustering=clustering, clustering_cutoff=0.5)
Example #22
 def decision_plot(self, class_id=0, **kwargs):
     """
     Visualize model decisions using cumulative SHAP values. Each colored line in the plot represents the model
     prediction for a single observation. Note that plotting too many samples at once can make the plot unintelligible.
     
     `class_id` indicates the class of interest for classification models; it can be an int or a string
     
     For an up-to-date list of the parameters, see: https://github.com/slundberg/shap/blob/master/shap/plots/decision.py
     For more information, see: https://github.com/slundberg/shap/blob/master/notebooks/plots/decision_plot.ipynb
     """
     # NOTE: there is a shap.multioutput_decision_plot but it uses a single row
     shap_values, expected_value = _get_values(self, class_id)
     return shap.decision_plot(expected_value, shap_values, self.test_data, **kwargs)
Example #23
 def explain_one_sample(self, X):
     '''
     Draws decision plot for one sample
     @X => one sample dataframe
     '''
     if X.shape[0] > 1:
         raise Exception(
             'You need to pass exactly one sample to this function, i.e. input of shape (1, n_features).'
         )
     explainer = shap.TreeExplainer(self.model[0]['clf'])
     shap_values = explainer.shap_values(self.process_data(X))[1]
     try:
         shap.decision_plot(
             explainer.expected_value[1],
             shap_values,
             ignore_warnings=False,
             feature_names=self.model[0]['teach_cols'].tolist())
     except IndexError:
         shap.decision_plot(
             explainer.expected_value,
             shap_values,
             ignore_warnings=False,
             feature_names=self.model[0]['teach_cols'].tolist())
Example #24
 def _plot_decision_(self,
                     expected_value: float,
                     shap_values: Union[List[np.ndarray], np.ndarray],
                     title: str = None,
                     gene_names: bool = True,
                     auto_size_plot: bool = True,
                     minimum: float = 0.0,
                     maximum: float = 0.0,
                     feature_display_range=None,
                     save: Path = None):
     #shap.summary_plot(shap_values, self.partitions.X, show=False)
     feature_names = None if gene_names is False else self.feature_names
     min_max = (self.partitions.data.y.min(), self.partitions.data.y.max())
     print(f"min_max dataset values: {min_max}")
     xlim = (min(min_max[0], minimum), max(min_max[1], maximum))
     shap.decision_plot(expected_value,
                        shap_values,
                        xlim=xlim,
                        feature_names=feature_names.tolist(),
                        title=title,
                        auto_size_plot=auto_size_plot,
                        feature_display_range=feature_display_range,
                        show=False)
     return self.make_figure(save)
Example #25
def explain(datasource, select, feature_field_meta, label_spec,
            summary_params):
    feature_column_names = [k["name"] for k in feature_field_meta]
    feature_specs = {k['name']: k for k in feature_field_meta}
    x = xgb_shap_dataset(datasource, select, feature_column_names, label_spec,
                         feature_specs)

    shap_values, shap_interaction_values, expected_value = xgb_shap_values(x)

    if summary_params.get("plot_type") == "decision":
        explainer.plot_and_save(lambda: shap.decision_plot(
            expected_value,
            shap_interaction_values,
            x,
            show=False,
            feature_display_range=slice(None, -40, -1),
            alpha=1))
    else:
        explainer.plot_and_save(lambda: shap.summary_plot(
            shap_values, x, show=False, **summary_params))
Example #26
            """)
            explainer, shap_values = load_shap_explainer(
                root + 'shap_treeExplainer.bz2', X_test_transformed)
            st_shap(
                shap.force_plot(explainer.expected_value,
                                shap_values[0, :],
                                X_test.iloc[0, :],
                                link='logit'))

            # -----------

            shap.decision_plot(base_value=explainer.expected_value,
                               shap_values=shap_values[0],
                               features=X_test.iloc[0, :],
                               link='logit',
                               feature_display_range=slice(
                                   None, -X_test.shape[1] - 1, -1),
                               return_objects=True,
                               show=False,
                               y_demarc_color='#00172b')
            fig = plt.gcf()
            ax = plt.gca()
            fig.patch.set_facecolor('#00172b')
            ax.set_facecolor('#00172b')
            ax.set_xlabel('Probability', fontsize=16, color='white')
            ax.tick_params(axis='both', colors='white')
            ax.grid(axis='both', color='white', linestyle='-', linewidth=0.25)
            for ln in ax.lines:
                ln.set_linewidth(3)
            for text in ax.texts:
                text.set_color('white')
Example #27
Next, we use the SHAP values to build 2D scatter graphs for every feature. They show the effect of each feature on the prediction for every instance.

fig, axs = plt.subplots(7, 3, figsize=(16, 22), squeeze=True)
ind = 0
for ax in axs.flat:
    feat = bst.feature_names[ind]
    ax.scatter(x_df[feat], shap_values_XGB_test[:, ind], s=1, color='gray')
    # ax.set_ylim([-0.2, 0.2])
    ax.set_title(feat)
    ind += 1
plt.subplots_adjust(hspace=0.8)
plt.savefig('shap_sc.png')

**Decision_plot()** is interesting as it shows how the prediction is formed from the contributions of different features.

shap.decision_plot(explainerXGB.expected_value, shap_values_XGB_test[0:100], features)
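
The call above assumes `explainerXGB`, `shap_values_XGB_test` and `features` already exist. A minimal, self-contained sketch (synthetic data and placeholder names, not from the original notebook) of how such inputs are typically produced:

import shap
import xgboost as xgb
from sklearn.datasets import make_regression

# Synthetic stand-in data for illustration only.
X, y = make_regression(n_samples=300, n_features=10, random_state=0)
bst = xgb.XGBRegressor(n_estimators=100).fit(X, y)

explainerXGB = shap.TreeExplainer(bst)
shap_values_XGB_test = explainerXGB.shap_values(X)

# Cumulative contribution paths for the first 100 rows.
shap.decision_plot(explainerXGB.expected_value, shap_values_XGB_test[0:100], X[0:100])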

**Force_plot** is similar to decision_plot. We plot only the first 100 instances because it would be very slow to draw a force_plot with all the instances.

shap.force_plot(explainerXGB.expected_value, shap_values_XGB_test[0:100], features, figsize=(20, 10))

**Waterfall_plot** is great when you want to analyse one instance.

shap.waterfall_plot(explainerXGB.expected_value, shap_values_XGB_test[2000], x_df.iloc[2000], features)
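
Newer versions of the shap library typically drive the waterfall plot from an `Explanation` object instead of raw arrays; a minimal sketch (synthetic data, assuming the current `shap.plots.waterfall` API) looks like this:

import shap
import xgboost as xgb
from sklearn.datasets import make_regression

# Synthetic stand-in data for illustration only.
X, y = make_regression(n_samples=300, n_features=10, random_state=0)
model = xgb.XGBRegressor(n_estimators=100).fit(X, y)

explainer = shap.Explainer(model)
explanation = explainer(X)  # returns a shap.Explanation object

# Waterfall plot for a single instance (here, row 0).
shap.plots.waterfall(explanation[0])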

### Other interpretation methods

For the following methods, we need to use XGBoost's scikit-learn wrapper **XGBRegressor()** so that our XGBoost model is compatible with the scikit-learn ecosystem, as sketched below.
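
As a rough, self-contained sketch (synthetic data and generic hyper-parameters, not the original notebook's setup), the wrapper plugs straight into scikit-learn utilities:

import xgboost as xgb
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score

# Synthetic stand-in data for illustration only.
X, y = make_regression(n_samples=300, n_features=10, random_state=0)
sk_model = xgb.XGBRegressor(max_depth=5, n_estimators=100)

# XGBRegressor follows the scikit-learn estimator API,
# so helpers such as cross_val_score accept it unchanged.
print(cross_val_score(sk_model, X, y, cv=3))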

m_depth = 5
for mode in [df_features_ec_season, df_features_ec_season_permuted]:
    shap_values = explainer.shap_values(mode,
                                        approximate=True,
                                        check_additivity=True)

    # dependence plots
    for name in mode.columns:
        shap.dependence_plot(name, shap_values[1], mode)

    # Summary plots
    shap.summary_plot(shap_values, mode, plot_type="bar")
    shap.summary_plot(shap_values[1], mode, plot_type="bar")
    shap.summary_plot(shap_values[1], mode)  # Failure

    # Decision plots explaining decisions to classify
    shap.decision_plot(explainer.expected_value[1], shap_values[1], mode)
    shap.decision_plot(explainer.expected_value[1], shap_values[1][1],
                       mode.iloc[1])  #2012 year

    # Calculate force plot for a given value 2012
    shap.initjs()
    shap_values_2012 = explainer.shap_values(mode.iloc[[4]])
    shap_display = shap.force_plot(explainer.expected_value[1],
                                   shap_values_2012[1],
                                   mode.iloc[[4]],
                                   matplotlib=True)

#%% Predictions for 2C degree
y_pred_2C = brf_model.predict(df_features_ec_2C_season)
score_prc_2C = sum(y_pred_2C) / len(y_pred_2C)
print("The ratio of failure seasons by total seasons for 2C is:", score_prc_2C)
Example #29
shap.dependence_plot("user_level", shap_values, X_test)
shap.dependence_plot("user_level", shap_values, X_test, interaction_index=None)

shap.dependence_plot("education", shap_values, X_test)
shap.dependence_plot("education", shap_values, X_test, interaction_index=None)

# load JS visualization code to notebook
shap.initjs()

plt.clf()
shap.force_plot(explainer.expected_value,
                shap_values[1594, :],
                X_test_disp.iloc[1594, :],
                matplotlib=True,
                show=False,
                figsize=(20, 4))
plt.savefig("prediction1.png", bbox_inches='tight')

plt.clf()
shap.force_plot(explainer.expected_value,
                shap_values[1594, :],
                X_test_disp.iloc[1594, :],
                matplotlib=True,
                show=False,
                figsize=(20, 4))
plt.savefig("prediction1.eps", bbox_inches='tight', format='eps')

shap.decision_plot(explainer.expected_value, shap_values[1594, :],
                   X_test_disp.iloc[1594, :])
Example #30
def app():
    st.markdown("""<style>.big-font {font-size:100px !important;}</style>""", unsafe_allow_html=True) 
    st.markdown(
        """<style>
        .boxBorder {
            border: 2px solid #990066;
            padding: 10px;
            outline: #990066 solid 5px;
            outline-offset: 5px;
            font-size:25px;
        }</style>
        """, unsafe_allow_html=True) 
    st.markdown('<div class="boxBorder"><font color="RED">Disclaimer: This predictive tool is only for research purposes</font></div>', unsafe_allow_html=True)
    st.write("## Model Perturbation Analysis")

    @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_model2():
        with open('saved_models/trainXGB_class_map.pkl', 'rb') as f:
            class_names = list(pickle.load(f))
        return class_names

    class_names = load_model2()

    # @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_model():
        M_dict = {}
        for classname in class_names:
            M_dict[classname] = joblib.load('saved_models/trainXGB_gpu_{}.model'.format(classname))
        return M_dict

    M_dict = load_model()

    @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
    def load_model1():
        with open('saved_models/trainXGB_gpu_{}.data'.format(class_names[0]), 'rb') as f:
            train = pickle.load(f)
        with open('saved_models/trainXGB_categorical_map.pkl', 'rb') as f:
            col_dict_map = pickle.load(f)
        return train, col_dict_map

    train, col_dict_map = load_model1()

    X = train[1]['X_valid'].copy() 
    ids = list(train[3]['ID_test'])
    X.index = ids
    labels_pred =  list(train[3]['y_pred_test']) 
    labels_actual = list(train[3]['y_test']) 
    # select_patient = st.selectbox("Select the patient", list(X.index), index=0)
    
    categorical_columns = []
    numerical_columns = []
    X_new = X.fillna('X')
    for col in X_new.columns:
        # if len(X_new[col].value_counts()) <= 10:
        if col_dict_map.get(col, None) is not None:
            categorical_columns.append(col)
        else:
            numerical_columns.append(col) 
    
    st.write('### Please enter the following {} factors to perform prediction or select a random patient'.format(len(categorical_columns + numerical_columns)))
    # st.write("***Categorical Columns:***", categorical_columns) 
    # st.write("***Numerical Columns:***", numerical_columns) 
    from collections import defaultdict
    if st.button("Random Patient"):
        import random
        select_patient = random.choice(list(X.index))
    else:
        select_patient = list(X.index)[0]

    select_patient_index = ids.index(select_patient) 
    new_feature_input = defaultdict(list) 
    for key, val in col_dict_map.items():
        rval = {j:i for i,j in val.items()}
        X_new[key] = X_new[key].map(lambda x: rval.get(x, x))
    
    st.write('--'*10)
    st.write('##### Note: X denotes NA values')
    col1, col2, col3, col4 = st.beta_columns(4)
    for i in range(0, len(categorical_columns), 4):
        with col1:
            if (i+0) >= len(categorical_columns):
                continue
            c1 = categorical_columns[i+0] 
            idx = list(X_new[c1].unique()).index(X_new.loc[select_patient, c1]) 
            f1 = st.selectbox("{}".format(feature_mapping[c1]), list(X_new[c1].unique()), index=idx)
            new_feature_input[c1].append(col_dict_map[c1].get(f1, np.nan))
        with col2:
            if (i+1) >= len(categorical_columns):
                continue
            c2 = categorical_columns[i+1] 
            idx = list(X_new[c2].unique()).index(X_new.loc[select_patient, c2]) 
            f2 = st.selectbox("{}".format(feature_mapping[c2]), list(X_new[c2].unique()), index=idx)
            new_feature_input[c2].append(col_dict_map[c2].get(f2, np.nan))
        with col3:
            if (i+2) >= len(categorical_columns):
                continue
            c3 = categorical_columns[i+2] 
            idx = list(X_new[c3].unique()).index(X_new.loc[select_patient, c3]) 
            f3 = st.selectbox("{}".format(feature_mapping[c3]), list(X_new[c3].unique()), index=idx)
            new_feature_input[c3].append(col_dict_map[c3].get(f3, np.nan))
        with col4:
            if (i+3) >= len(categorical_columns):
                continue
            c4 = categorical_columns[i+3] 
            idx = list(X_new[c4].unique()).index(X_new.loc[select_patient, c4]) 
            f4 = st.selectbox("{}".format(feature_mapping[c4]), list(X_new[c4].unique()), index=idx)
            new_feature_input[c4].append(col_dict_map[c4].get(f4, np.nan))
    
    for col in numerical_columns:
        X_new[col] = X_new[col].map(lambda x: float(x) if not x=='X' else np.nan)
    for i in range(0, len(numerical_columns), 4):
        with col1:
            if (i+0) >= len(numerical_columns):
                continue
            c1 = numerical_columns[i+0] 
            idx = X_new.loc[select_patient, c1]
            f1 = st.number_input("{}".format(feature_mapping[c1]), min_value=X_new[c1].min(),  max_value=X_new[c1].max(), value=idx)
            new_feature_input[c1].append(f1)
        with col2:
            if (i+1) >= len(numerical_columns):
                continue
            c2 = numerical_columns[i+1] 
            idx = X_new.loc[select_patient, c2]
            f2 = st.number_input("{}".format(feature_mapping[c2]), min_value=X_new[c2].min(),  max_value=X_new[c2].max(), value=idx)
            new_feature_input[c2].append(f2)
        with col3:
            if (i+2) >= len(numerical_columns):
                continue
            c3 = numerical_columns[i+2] 
            idx = X_new.loc[select_patient, c3]
            f3 = st.number_input("{}".format(feature_mapping[c3]), min_value=X_new[c3].min(),  max_value=X_new[c3].max(), value=idx)
            new_feature_input[c3].append(f3)
        with col4:
            if (i+3) >= len(numerical_columns):
                continue
            c4 = numerical_columns[i+3] 
            idx = X_new.loc[select_patient, c4]
            f4 = st.number_input("{}".format(feature_mapping[c4]), min_value=X_new[c4].min(),  max_value=X_new[c4].max(), value=idx)
            new_feature_input[c4].append(f4)
    
    st.write('--'*10)
    st.write("### Do you want to see the effect of changing a factor on this patient?")
    color_discrete_map = {}
    color_discrete_map_list = ["red", "green", "blue", "goldenrod", "magenta", "yellow", "pink", "grey"]
    for e, classname in enumerate(class_names):
        color_discrete_map[classname] = color_discrete_map_list[e] 
    
    show_whatif = st.checkbox("Enable what-if analysis")
    col01, col02 = st.beta_columns(2)
    with col01:
        st.write('### Prediction on actual feature values')
        feature_print = X_new.loc[select_patient, :].fillna('X')
        feature_print.index = feature_print.index.map(lambda x: feature_mapping[x])
        feature_print = feature_print.reset_index()
        feature_print.columns = ["Feature Name", "Feature Value"] 
        st.table(feature_print.set_index("Feature Name").astype(str))
        predicted_prob = defaultdict(list)
        predicted_class = -1
        max_val = -1
        for key, val in M_dict.items():
            predicted_prob['predicted_probability'].append(val.predict(xgb.DMatrix(X.loc[select_patient, :].values.reshape(1, -1), feature_names=X.columns))[0])
            predicted_prob['classname'].append(key)
            if predicted_prob['predicted_probability'][-1] > max_val:
                predicted_class = key
                max_val = predicted_prob['predicted_probability'][-1] 
        K = pd.DataFrame(predicted_prob)
        K['predicted_probability'] = K['predicted_probability'] / K['predicted_probability'].sum()
        K['color'] = ['zed' if i==predicted_class else 'red' for i in list(predicted_prob['classname']) ]
        # fig = px.bar(K, x='predicted_probability', y='classname', color='color', width=500, height=400, orientation='h')
        # # fig = px.bar(K, y='predicted_probability', x=sorted(list(predicted_prob['classname'])), width=500, height=400)
        # fig.update_layout(
        #     legend=None,
        #     yaxis_title="Class Labels",
        #     xaxis_title="Predicted Probability",
        #     font=dict(
        #         family="Courier New, monospace",
        #         size=12,
        #         color="RebeccaPurple"
        #     ),
        #     margin=dict(l=10, r=10, t=10, b=10),
        # )
        # st.plotly_chart(fig)
        import altair as alt
        K = K.rename(columns={"classname": "Class Labels", "predicted_probability": "Predicted Probability"})
        f = alt.Chart(K).mark_bar().encode(
                    y=alt.Y('Class Labels:N',sort=alt.EncodingSortField(field="Predicted Probability", order='descending')),
                    x=alt.X('Predicted Probability:Q'),
                    color=alt.Color('color', legend=None),
                ).properties(width=500, height=300)
        st.write(f)
        # st.write('#### Trajectory for Predicted Class')
        st.write('#### Model Output Trajectory for {} Class using SHAP values'.format(predicted_class))

        @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
        def load_model5():
            with open('saved_models/trainXGB_gpu_{}.data'.format(predicted_class), 'rb') as f:
                new_train = pickle.load(f)
            return new_train
        new_train = load_model5()
        exval = new_train[2]['explainer_train'] 
        explainer_train = shap.TreeExplainer(M_dict[predicted_class])
        t1 = pd.DataFrame(X.loc[select_patient, :]).T
        t2 = pd.DataFrame(X_new.loc[select_patient, :].fillna('X')).T
        shap_values_train = explainer_train.shap_values(t1)
        shap.force_plot(exval, shap_values_train, t1, show=False, matplotlib=True)
        st.pyplot()
        fig, ax = plt.subplots()
        r = shap.decision_plot(exval, shap_values_train, t2, link='logit', return_objects=True, new_base_value=0, highlight=0)
        st.pyplot(fig)
    if show_whatif:
        with col02:
            dfl = pd.DataFrame(new_feature_input)
            ndfl = dfl.copy()
            for key, val in col_dict_map.items():
                rval = {j:i for i,j in val.items()}
                ndfl[key] = ndfl[key].map(lambda x: rval.get(x, x))
            st.write('### Prediction with what-if analysis')

            feature_print_what = ndfl.iloc[0].fillna('X')
            feature_print_what.index = feature_print_what.index.map(lambda x: feature_mapping[x])
            feature_print_what = feature_print_what.reset_index()
            feature_print_what.columns = ["Feature Name", "Feature Value"] 
            selected = []
            for i in range(len(feature_print_what)):
                if feature_print.iloc[i]["Feature Value"] == feature_print_what.iloc[i]["Feature Value"]:
                    pass
                else:
                    selected.append(feature_print.iloc[i]["Feature Name"])

            # st.table(feature_print)

            st.table(feature_print_what.astype(str).set_index("Feature Name").style.apply(lambda x: ['background: yellow' if (x.name in selected) else 'background: lightgreen' for i in x], axis=1))
            dfl = dfl[X.columns].replace('X', np.nan)
            predicted_prob = defaultdict(list)
            predicted_class = -1
            max_val = -1
            for key, val in M_dict.items():
                predicted_prob['predicted_probability'].append(val.predict(xgb.DMatrix(dfl.iloc[0, :].values.reshape(1, -1), feature_names=dfl.columns))[0])
                predicted_prob['classname'].append(key)
                if predicted_prob['predicted_probability'][-1] > max_val:
                    predicted_class = key
                    max_val = predicted_prob['predicted_probability'][-1] 
            K = pd.DataFrame(predicted_prob)
            K['predicted_probability'] = K['predicted_probability'] / K['predicted_probability'].sum()
            K['color'] = ['zed' if i==predicted_class else 'red' for i in list(predicted_prob['classname']) ]
            import altair as alt
            K = K.rename(columns={"classname": "Class Labels", "predicted_probability": "Predicted Probability"})
            f = alt.Chart(K).mark_bar().encode(
                y=alt.Y('Class Labels:N',sort=alt.EncodingSortField(field="Predicted Probability", order='descending')),
                    x=alt.X('Predicted Probability:Q'),
                    color=alt.Color('color', legend=None),
                ).properties( width=500, height=300)
            st.write(f)
            # fig = px.bar(K, x='predicted_probability', y='classname', color='color', width=500, height=400, orientation='h')
            # # fig = px.bar(K, y='predicted_probability', x=sorted(list(predicted_prob['classname'])), width=500, height=400)
            # fig.update_layout(
            # legend=None,
            # yaxis_title="Class Labels",
            # xaxis_title="Predicted Probability",
            # font=dict(
            #     family="Courier New, monospace",
            #     size=12,
            #     color="RebeccaPurple"
            # ),
            # margin=dict(l=10, r=10, t=10, b=10),
            # )  
            # st.plotly_chart(fig)
            st.write('#### Model Output Trajectory for {} Class using SHAP values'.format(predicted_class))

            @st.cache(hash_funcs={"MyUnhashableClass": lambda _: None})
            def load_model6():
                with open('saved_models/trainXGB_gpu_{}.data'.format(predicted_class), 'rb') as f:
                    new_train = pickle.load(f)
                return new_train

            new_train = load_model6()
            exval = new_train[2]['explainer_train']
            explainer_train = shap.TreeExplainer(M_dict[predicted_class])
            t1 = dfl.copy() 
            shap_values_train = explainer_train.shap_values(t1)
            shap.force_plot(exval, shap_values_train, t1, show=False, matplotlib=True)
            st.pyplot()
            fig, ax = plt.subplots()
            _ = shap.decision_plot(exval, shap_values_train, ndfl.fillna('X'), link='logit', feature_order=r.feature_idx, return_objects=True, new_base_value=0, highlight=0)
            st.pyplot(fig)