Code example #1
import numpy as np
import eli5
import waterfall_chart  # pip install waterfallcharts


def explain_pred_contrib(id,
                         clf,
                         X,
                         features,
                         cats=None,
                         waterfall={
                             'rotation_value': 60,
                             'threshold': None
                         }):
    try:
        p = clf.predict_proba(X.loc[X.index == id])[:, 1]
    except Exception:
        # some estimators reject DataFrames; retry with the raw array
        p = clf.predict_proba(X.loc[X.index == id].values)[:, 1]
    print(f'Prediction explanation for ID: {id}; '
          f'Probability of event (y=1): {np.round(p[0], 3)}\n'
          f'Model used: {type(clf)}')
    try:
        df = eli5.show_prediction(clf,
                                  X.loc[id],
                                  show_feature_values=True,
                                  feature_names=features)
        exp = eli5.explain_prediction_df(clf,
                                         X.loc[id],
                                         feature_names=features)
    except Exception:
        # retry with the raw numpy row for estimators that reject Series
        df = eli5.show_prediction(clf,
                                  X.loc[id].values,
                                  show_feature_values=True,
                                  feature_names=features)
        exp = eli5.explain_prediction_df(clf,
                                         X.loc[id].values,
                                         feature_names=features)

    if cats is not None:
        # id2class is a project-local helper mapping encoded category ids
        # back to their original labels
        c = id2class(exp, cats)
        for k, v in c.items():
            df.data = df.data.replace(k, v)

    if waterfall is not None:
        rot = waterfall['rotation_value']
        threshold = waterfall['threshold']
        waterfall_chart.plot(exp.feature,
                             exp.weight,
                             rotation_value=rot,
                             net_label="Final Score/Proba",
                             other_label="Minor Features",
                             formatting="{:,.2f}",
                             threshold=threshold,
                             Title='Waterfall of features contributions')
    return df
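A minimal usage sketch (hypothetical names: clf is a fitted binary classifier, X_test a DataFrame indexed by record ID, feature_cols its column list):

# Hypothetical usage of explain_pred_contrib.
html_expl = explain_pred_contrib(id=42,
                                 clf=clf,
                                 X=X_test,
                                 features=feature_cols,
                                 waterfall={'rotation_value': 90,
                                            'threshold': 0.05})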
Code example #2
import eli5


def get_feature_importance(model, features, n_top_features=3):
    """
    Given a model and a DataFrame of observations, calculate the top
    important features for each individual prediction.

    Parameters
    ----------
    model: fitted model
        The fitted model to analyze (e.g. LGBMClassifier).
    features: DataFrame
        Input matrix used by the model (rows are observations, columns are
        features).
    n_top_features: int
        Number of top features wanted. By default, the top 3 features are
        returned.

    Returns
    -------
    List with one entry per observation, each a list of the n_top_features
    most important feature names translated to human language in the format
    "what on who" (e.g. 'tp_n_tenders' -> 'tenders on type of procurement').
    """

    features_list = []

    for _, row in features.iterrows():
        important_features = eli5.explain_prediction_df(
            model, row,
            feature_names=list(features.columns))['feature'][:n_top_features]
        features_list.append(
            [translate_features_to_human_language(f)
             for f in important_features])

    return features_list
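translate_features_to_human_language is project-specific and not shown here; a minimal stand-in consistent with the docstring's "what on who" convention could look like this (the mapping entries are illustrative):

# Illustrative stand-in for the project-local translation helper.
FEATURE_TRANSLATIONS = {
    'tp_n_tenders': 'tenders on type of procurement',  # example from the docstring
}

def translate_features_to_human_language(feature_name):
    # Fall back to the raw feature name when no translation is registered.
    return FEATURE_TRANSLATIONS.get(feature_name, feature_name)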
Code example #3
File: model.py Project: kaljuvee/aging-biomarkers
    def explain(self, data):
        data = self.build_input(data)
        X = data.iloc[0].values
        feature_names = list(data.columns)
        # pass the underlying xgboost Booster to eli5, together with the
        # column names (lost when converting the row to a numpy array)
        return explain_prediction_df(self.model.model.get_booster(),
                                     X,
                                     feature_names=feature_names)
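Hypothetical call, assuming m is an instance of the surrounding model class and raw is a one-row input DataFrame:

contrib_df = m.explain(raw)
# explain_prediction_df returns one row per feature (plus <BIAS>) with a
# 'weight' column holding each feature's contribution.
print(contrib_df.sort_values('weight', ascending=False).head())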
Code example #4
import pandas as pd
import pytest
from pandas.testing import assert_series_equal
from sklearn.datasets import load_boston  # removed in scikit-learn >= 1.2
from eli5 import explain_prediction_df

import build_model  # project-local model-building helpers
import ttrees       # project-local xgboost tree parser/explainer


def test_prediction_decomposition_equal_eli5():
    """Test that the prediction decomposition outputs from xgb.explainer.decompose_prediction are equal to the outputs from eli5."""

    model = build_model.build_depth_3_model()

    boston = load_boston()

    boston_data = pd.DataFrame(boston["data"], columns=boston["feature_names"])

    row_data = boston_data.iloc[[0]]

    eli5_decomposition = explain_prediction_df(model, row_data)

    column_mapping = {"<BIAS>": "base"}

    # create mapping because eli5 output will have feature names x0, x1 etc..
    for i, x in enumerate(boston["feature_names"]):

        column_mapping[f"x{i}"] = x

    eli5_decomposition["feature_mapped"] = eli5_decomposition["feature"].map(
        column_mapping
    )

    ttrees_trees_df = ttrees.xgb.parser.parse_model(model)

    ttrees_decomposition = ttrees.xgb.explainer.decompose_prediction(
        ttrees_trees_df.tree_data, row_data
    )

    # aggregate ttrees output to variable level, by default it is at tree x node level
    ttrees_decomposition_agg = pd.DataFrame(
        ttrees_decomposition.groupby("contributing_var").contribution.sum()
    ).reset_index()

    decomposition_compare_df = ttrees_decomposition_agg.merge(
        eli5_decomposition[["feature_mapped", "weight"]],
        how="left",
        left_on="contributing_var",
        right_on="feature_mapped",
        indicator=True,
    )

    # check merge is 1:1 i.e. both have same variables
    if (
        decomposition_compare_df["_merge"] == "both"
    ).sum() < decomposition_compare_df.shape[0]:
        pytest.fail(
            f"different features in eli5 and ttrees (merge not 1:1)\n\n{decomposition_compare_df}"
        )

    # check equality between prediction decomposition values
    assert_series_equal(
        left=decomposition_compare_df["weight"],
        right=decomposition_compare_df["contribution"],
        check_names=False,
        check_exact=False,
    )
Code example #5
File: models.py Project: thiesgehrmann/JuSiPy_Asser
    def predictb(self, mdl, data):
        # Assumes self._models maps a model name to its fitted estimator.
        for name, model in self._models.items():
            if name == 'svr':
                # SVR has no predict_proba, so only raw predictions are returned.
                ypred = pd.DataFrame({'predict_dataframe': pd.Series(model.predict(data))})
            else:
                ypred = pd.DataFrame({'predicted_probability': pd.Series(model.predict_proba(data)[:, 1]),
                                      'predicted_class': pd.Series(model.predict(data))})
                # feature-level explanation (computed but not returned)
                feats = eli5.explain_prediction_df(model, data, feature_names=list(self._data))
            return ypred  # note: returns after the first model in self._models
Code example #6
from sklearn.linear_model import LinearRegression
from eli5 import (explain_prediction, format_as_dataframe,
                  explain_prediction_df, explain_prediction_dfs)


def test_explain_prediction(boston_train):
    # boston_train is a pytest fixture; check_prediction_df is a helper
    # defined alongside this test in the eli5 test suite.
    X, y, feature_names = boston_train
    reg = LinearRegression()
    reg.fit(X, y)
    expl = explain_prediction(reg, X[0])
    df = format_as_dataframe(expl)
    check_prediction_df(df, expl)
    check_prediction_df(explain_prediction_df(reg, X[0]), expl)
    df_dict = explain_prediction_dfs(reg, X[0])
    assert set(df_dict.keys()) == {'targets'}
    check_prediction_df(df_dict['targets'], expl)
Code example #7
    def analyze_fc_dp(self):
        "Feature Contribution - Decision Path"

        # source for plotting decision tree
        # https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
        # Get all trees of depth 2 in the random forest
        depths2 = [
            tree for tree in self.estimator.estimators_
            if tree.tree_.max_depth == 2
        ]
        # grab the first one
        tree = depths2[0]
        # plot the tree
        dot_data = StringIO()
        export_graphviz(tree,
                        out_file=dot_data,
                        feature_names=self.features,
                        filled=True,
                        rounded=True,
                        special_characters=True)
        graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
        png_str = graph.create_png(prog='dot')

        # treat the dot output string as an image file
        bio = BytesIO()
        bio.write(png_str)
        bio.seek(0)
        img = mpimg.imread(bio)

        # plot the image
        imgplot = plt.imshow(img, aspect='equal')

        # simple example of a player with a 4.6 Forty and a Wt of 260 lbs
        example = np.array([4.6, 260, 0, 0, 0, 0, 0, 0])
        eli5.explain_prediction_df(tree, example, feature_names=self.features)

        example_pred = tree.predict(example.reshape(1, -1))
Code example #8
    def get_features_contribution(self, instance_as_dataframe):
        from eli5 import explain_prediction_df

        explain_df = explain_prediction_df(self.estimator,
                                           instance_as_dataframe.values)
        # NOTE: passing .values drops the column names, so eli5 reports
        # positional feature names (x0, x1, ...) unless feature_names= is given.

        # separate the bias from features contribution
        mask_bias = explain_df.feature == "<BIAS>"

        self.biais = explain_df[mask_bias].weight.values[0]

        feature_names = explain_df[~mask_bias].feature.values
        feature_contributions = explain_df[~mask_bias].weight.values

        return feature_names, feature_contributions
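A usage sketch (assuming explainer is an instance of the surrounding class with a fitted self.estimator, and row is a single-row DataFrame):

import matplotlib.pyplot as plt

# Hypothetical usage: explain one observation and plot its contributions.
names, contributions = explainer.get_features_contribution(row)
order = contributions.argsort()
plt.barh(names[order], contributions[order])
plt.xlabel('contribution to prediction')
plt.show()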
Code example #9
File: gbm_regression.py Project: zhilyaev/RecoTour
import random

import seaborn as sns
import matplotlib.pyplot as plt
import lime
import shap

from eli5.sklearn import PermutationImportance
from eli5 import explain_weights_df, explain_prediction_df
from lime.lime_tabular import LimeTabularExplainer

# model, train, X_valid, all_cols and cat_cols are defined earlier in the
# script (fitted GBM model, training DataFrame, validation array, and the
# feature / categorical-feature column lists).

# eli5
feat_imp_df = explain_weights_df(model, feature_names=all_cols)
feat_imp_df.head(10)

X_train = train.values
exp_pred_df = explain_prediction_df(estimator=model,
                                    doc=X_train[0],
                                    feature_names=all_cols)

# lime
explainer = LimeTabularExplainer(X_train,
                                 mode='regression',
                                 feature_names=all_cols,
                                 categorical_features=cat_cols,
                                 random_state=1981,
                                 discretize_continuous=True)
exp = explainer.explain_instance(X_valid[10], model.predict, num_features=20)

# Shap
X_valid_rn = X_valid[random.sample(range(X_valid.shape[0]), 10000)]
shap_explainer = shap.TreeExplainer(model)
valid_shap_vals = shap_explainer.shap_values(X_valid_rn)
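The SHAP values computed above are usually visualized with shap's built-in plots; a short sketch using the same variables:

# Global view: distribution of SHAP values per feature.
shap.summary_plot(valid_shap_vals, X_valid_rn, feature_names=all_cols)
# Local view for a single validation row (call shap.initjs() first in notebooks).
shap.force_plot(shap_explainer.expected_value, valid_shap_vals[0],
                X_valid_rn[0], feature_names=all_cols)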
Code example #10
def analyse():
    # Flask view; the @app.route decorator is omitted in this snippet.
    try:
        # store form input in dictionary
        form_input = request.form.to_dict()
        form_input = {k: None if not v else v for k, v in form_input.items()}

        # moving DEBTINC column to last
        temp = form_input['DEBTINC']
        form_input.pop('DEBTINC', None)
        form_input['DEBTINC'] = temp

        # converting form input dictionary to dataframe & calling function for preprocessing
        dataframe_for_prediction = preProcessData(
            pd.DataFrame.from_dict([form_input]))
        # converting processed dataframe to np-array to feed it to model for prediction
        processed_input = dataframe_for_prediction.iloc[:, :].values

        temp_result = model.predict(processed_input)[0]

        if temp_result == 0:
            category = 'GOOD'
            resType = 'success'
        else:
            category = 'BAD'
            resType = 'warning'

        # converting data to send to view in tabular format
        int_columns = ['LOAN', 'YOJ', 'DEROG', 'DELINQ', 'NINQ', 'CLNO']
        float_columns = ['MORTDUE', 'VALUE', 'CLAGE', 'DEBTINC']

        for i, v in dataframe_for_prediction.iloc[0].to_dict().items():
            if i not in ['HOMEIMP', 'JOBIND']:
                if i in int_columns:
                    form_input[i] = int(float(v))
                elif i in float_columns:
                    form_input[i] = round(float(v), 3)
            else:
                if i == 'HOMEIMP':
                    if v == 1:
                        form_input['REASON'] = 'HOMEIMP'
                    else:
                        form_input['REASON'] = 'DEBTCON'
                else:
                    job_frequency_dict = {
                        'Other': 0,
                        'ProfExe': 1,
                        'Office': 2,
                        'Mgr': 3,
                        'Self': 4,
                        'Sales': 5
                    }
                    # reverse lookup: recover the job label from its
                    # frequency-encoded value
                    form_input['JOB'] = next(
                        k for k, val in job_frequency_dict.items()
                        if val == v)

        # Per-feature contribution weights for this prediction (eli5 library)
        explained_pred = eli5.explain_prediction_df(
            estimator=model, doc=dataframe_for_prediction.iloc[0])
        # drop the <BIAS> row; keep only the first six features for the plot
        explained_pred = explained_pred[
            explained_pred.feature != '<BIAS>'].reset_index(drop=True)
        explained_pred = explained_pred.iloc[:6]

        plt.figure(figsize=(12, 6))
        sns.barplot(x='weight', y='feature', data=explained_pred)
        plt.title("Top Features",
                  fontsize=30,
                  loc='center',
                  pad=10,
                  fontdict={
                      'family': 'serif',
                      'color': 'darkred',
                      'weight': 'normal',
                      'size': 25
                  })
        plt.xlabel('Importance',
                   fontdict={
                       'family': 'serif',
                       'color': 'darkred',
                       'weight': 'normal',
                       'size': 16
                   })
        plt.ylabel('Features',
                   fontdict={
                       'family': 'serif',
                       'color': 'darkred',
                       'weight': 'normal',
                       'size': 16
                   })
        plt.tick_params(labelsize=12)

        # Saving Feature Importance Bar Plot to img and sending to view
        img_buffer = BytesIO()
        plt.savefig(img_buffer,
                    format='png',
                    bbox_inches='tight',
                    pad_inches=0)
        plt.close()
        img_buffer.seek(0)
        plot_url = base64.b64encode(img_buffer.getvalue()).decode('utf-8')

        features = [str(i) for i in form_input.keys()]
        values = [str(i) for i in form_input.values()]
    except Exception:
        return jsonify(status='error',
                       error='Something went wrong! Contact the webmaster.')
    else:
        return jsonify(status='success',
                       category=category,
                       features=features,
                       values=values,
                       resType=resType,
                       imgUrl=plot_url)