def explain_pred_contrib(id, clf, X, features, cats=None,
                         waterfall={'rotation_value': 60, 'threshold': None}):
    """
    Explain a single prediction with eli5 and optionally draw a waterfall chart.

    The probability of the positive class for the row whose index equals
    ``id`` is printed, an eli5 explanation is built for that row and, unless
    ``waterfall`` is None, the per-feature weights are plotted with
    ``waterfall_chart.plot``.

    Parameters
    ----------
    id : index label
        Label of the observation in ``X.index`` to explain.
    clf : fitted classifier
        Must provide ``predict_proba``.
    X : pandas.DataFrame
        Input matrix; ``id`` is looked up in its index.
    features : list
        Feature names forwarded to eli5 as ``feature_names``.
    cats : optional
        Forwarded to ``id2class(exp, cats)``. The mapping it returns is applied
        to ``df.data`` with ``str.replace``-style ``replace(k, v)`` calls.
    waterfall : dict or None
        Plot options. Recognised keys are ``'rotation_value'`` (default 60) and
        ``'threshold'`` (default None); missing keys fall back to those
        defaults. Pass None to skip the chart. The default dict is only read,
        never mutated.

    Returns
    -------
    The object returned by ``eli5.show_prediction`` (with ``.data`` rewritten
    when ``cats`` is given).
    """
    row_frame = X.loc[X.index == id]
    try:
        p = clf.predict_proba(row_frame)[:, 1]
    except Exception:
        # Fall back to the raw array -- presumably for estimators that do not
        # accept a DataFrame. Only ordinary errors trigger the retry, so
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        p = clf.predict_proba(row_frame.values)[:, 1]

    print(
        f'Prediction explanation for ID: {id}; Probability of event (y=1): {np.round(p[0], 3)}\nModel used: {type(clf)}'
    )

    try:
        df = eli5.show_prediction(clf, X.loc[id], show_feature_values=True,
                                  feature_names=features)
        exp = eli5.explain_prediction_df(clf, X.loc[id], feature_names=features)
    except Exception:
        # Same fallback as above: retry both eli5 calls with the raw values.
        df = eli5.show_prediction(clf, X.loc[id].values,
                                  show_feature_values=True,
                                  feature_names=features)
        exp = eli5.explain_prediction_df(clf, X.loc[id].values,
                                         feature_names=features)

    if cats is not None:
        replacements = id2class(exp, cats)
        for k, v in replacements.items():
            df.data = df.data.replace(k, v)

    if waterfall is not None:
        # .get keeps a partially specified dict (e.g. only 'threshold') usable.
        rot = waterfall.get('rotation_value', 60)
        threshold = waterfall.get('threshold')
        waterfall_chart.plot(exp.feature, exp.weight,
                             rotation_value=rot,
                             net_label="Final Score/Proba",
                             other_label="Minor Features",
                             formatting="{:,.2f}",
                             threshold=threshold,
                             Title='Waterfall of features contributions')
    return df
def get_feature_importance(model, features, n_top_features=3):
    """
    Return the top contributing features for each individual prediction.

    For every row of ``features`` an eli5 prediction explanation is computed,
    the first ``n_top_features`` entries of its ``feature`` column are taken
    and each name is passed through ``translate_features_to_human_language``.

    Parameters
    -----------
    model: model fit
        Fit of the model wanted to analyze (e.g. LGBMClassifer).
    features: DataFrame
        Pandas DataFrame with the input matrix used for the model (rows are
        different observations and columns are features).
    n_top_features: int
        Number of top features wanted. By default, the top 3 features will be
        displayed.

    Returns
    -------
    list of list
        One inner list per row of ``features``, in row order, holding the
        translated feature names in the format "what on who"
        (e.g. 'tp_n_tenders' -> 'tenders on type of procurement').
        An empty DataFrame yields an empty list.
    """
    # Column names do not change between rows, so build the list once.
    feature_names = list(features.columns)

    features_list = []
    for _, row in features.iterrows():
        important_features = eli5.explain_prediction_df(
            model, row, feature_names=feature_names)['feature'][:n_top_features]
        # append() instead of rebuilding the whole list on every iteration.
        features_list.append(
            [translate_features_to_human_language(f) for f in important_features]
        )
    return features_list
def explain(self, data):
    """Explain the model's prediction for the first row of the built input.

    ``data`` is converted with ``self.build_input``; the values of its first
    row and its column names are then handed to ``explain_prediction_df``
    together with the booster obtained from ``self.model.model.get_booster()``.

    Returns whatever ``explain_prediction_df`` returns for that row.
    """
    frame = self.build_input(data)
    first_row = frame.iloc[0].values
    column_names = list(frame.columns)
    return explain_prediction_df(
        self.model.model.get_booster(),
        first_row,
        feature_names=column_names,
    )
def test_prediction_decomposition_eqal_eli5():
    """Test that the prediction decomposition outputs from
    xgb.explainer.decompose_prediction are equal to the outputs from eli5.

    Both decompositions are computed for the first row of the Boston data,
    brought to one row per variable, joined on the variable name and then
    compared value by value.
    """
    model = build_model.build_depth_3_model()

    boston = load_boston()
    feature_names = boston["feature_names"]
    row_data = pd.DataFrame(boston["data"], columns=feature_names).iloc[[0]]

    eli5_decomposition = explain_prediction_df(model, row_data)

    # eli5 labels features positionally (x0, x1, ...) and the intercept as
    # "<BIAS>"; translate both to the names used on the ttrees side.
    column_mapping = {"<BIAS>": "base"}
    column_mapping.update(
        (f"x{i}", name) for i, name in enumerate(feature_names)
    )
    eli5_decomposition["feature_mapped"] = eli5_decomposition["feature"].map(
        column_mapping
    )

    ttrees_trees_df = ttrees.xgb.parser.parse_model(model)
    ttrees_decomposition = ttrees.xgb.explainer.decompose_prediction(
        ttrees_trees_df.tree_data, row_data
    )

    # ttrees reports contributions at tree x node level; sum them per variable.
    ttrees_decomposition_agg = (
        ttrees_decomposition.groupby("contributing_var")["contribution"]
        .sum()
        .reset_index()
    )

    decomposition_compare_df = ttrees_decomposition_agg.merge(
        eli5_decomposition[["feature_mapped", "weight"]],
        how="left",
        left_on="contributing_var",
        right_on="feature_mapped",
        indicator=True,
    )

    # every row of the (left) merge must have found a partner on the eli5 side
    every_row_matched = (decomposition_compare_df["_merge"] == "both").all()
    if not every_row_matched:
        pytest.fail(
            f"different features in eli5 and ttrees (merge not 1:1)\n\n{decomposition_compare_df}"
        )

    # the decomposition values themselves must agree (approximately)
    assert_series_equal(
        left=decomposition_compare_df["weight"],
        right=decomposition_compare_df["contribution"],
        check_names=False,
        check_exact=False,
    )
def predictb(self, mdl, data):
    """Predict ``data`` with the models in ``self._models``.

    NOTE(review): this assumes ``self._models`` is an iterable of fitted
    estimators and that ``mdl`` names the model kind -- confirm against the
    callers. The previous code compared each estimator to the string 'svr'
    (never true for an estimator object) and took the class prediction from
    an undefined ``loaded_model``.

    Parameters
    ----------
    mdl : str
        Model kind. ``'svr'`` selects plain ``predict`` output; any other
        value is treated as a classifier exposing ``predict_proba``.
    data : array-like
        Passed unchanged to each model.

    Returns
    -------
    pandas.DataFrame
        For ``mdl == 'svr'`` a single column ``predict_dataframe``; otherwise
        the columns ``predicted_probability`` (second column of
        ``predict_proba``) and ``predicted_class``. When several models are
        present the frame built from the last one is returned.

    Raises
    ------
    ValueError
        If ``self._models`` is empty.
    """
    ypred = None
    for model in self._models:
        if mdl == 'svr':
            ypred = pd.DataFrame(
                {'predict_dataframe': pd.Series(model.predict(data))}
            )
        else:
            # Probability and class come from the same estimator.
            ypred = pd.DataFrame({
                'predicted_probability': pd.Series(model.predict_proba(data)[:, 1]),
                'predicted_class': pd.Series(model.predict(data)),
            })

    if ypred is None:
        raise ValueError("no models available to predict with")
    return ypred
def test_explain_prediction(boston_train):
    """The DataFrame views of a LinearRegression prediction explanation agree.

    The explanation of the first training sample is compared, through
    ``check_prediction_df``, with the frame from ``format_as_dataframe``, the
    frame from ``explain_prediction_df`` and the ``'targets'`` entry from
    ``explain_prediction_dfs``.
    """
    X, y, _feature_names = boston_train
    sample = X[0]

    reg = LinearRegression()
    reg.fit(X, y)

    expl = explain_prediction(reg, sample)

    formatted = format_as_dataframe(expl)
    check_prediction_df(formatted, expl)

    direct = explain_prediction_df(reg, sample)
    check_prediction_df(direct, expl)

    frames_by_name = explain_prediction_dfs(reg, sample)
    assert set(frames_by_name) == {'targets'}
    check_prediction_df(frames_by_name['targets'], expl)
def analyze_fc_dp(self):
    """Feature Contribution - Decision Path.

    Picks the first depth-2 tree from ``self.estimator.estimators_``, renders
    it through graphviz and shows it with matplotlib, then runs an eli5
    prediction explanation and a plain prediction for one hard-coded example.
    Nothing is returned; the results of the last two calls are discarded.
    """
    # Tree plotting approach taken from:
    # https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176

    # Keep only the ensemble members of depth 2 and use the first of them.
    shallow_trees = [
        member for member in self.estimator.estimators_
        if member.tree_.max_depth == 2
    ]
    tree = shallow_trees[0]

    # Export the tree to DOT text and let graphviz turn it into PNG bytes.
    dot_buffer = StringIO()
    export_graphviz(
        tree,
        out_file=dot_buffer,
        feature_names=self.features,
        filled=True,
        rounded=True,
        special_characters=True,
    )
    graph = pydotplus.graph_from_dot_data(dot_buffer.getvalue())
    png_bytes = graph.create_png(prog='dot')

    # Wrap the PNG bytes in a file-like object so matplotlib can read them.
    png_file = BytesIO(png_bytes)
    image = mpimg.imread(png_file)

    # Show the rendered tree.
    imgplot = plt.imshow(image, aspect='equal')

    # Simple example of a player with a 4.6 Forty and a Wt of 260 lbs.
    example = np.array([4.6, 260, 0, 0, 0, 0, 0, 0])
    eli5.explain_prediction_df(tree, example, feature_names=self.features)
    example_prec = tree.predict(example.reshape(1, -1))
def get_features_contribution(self, instance_as_dataframe):
    """Split an eli5 prediction explanation into bias and feature parts.

    The explanation is computed for ``instance_as_dataframe.values`` with
    ``self.estimator``. The weight of the ``"<BIAS>"`` row is stored on
    ``self.biais``; the remaining rows are returned.

    Returns
    -------
    tuple of (ndarray, ndarray)
        Feature names and their weights, in the order eli5 reports them.
    """
    from eli5 import explain_prediction_df

    explanation = explain_prediction_df(
        self.estimator, instance_as_dataframe.values
    )

    # The intercept is reported as a pseudo-feature named "<BIAS>".
    is_bias = explanation.feature == "<BIAS>"
    self.biais = explanation.weight[is_bias].values[0]

    contributions = explanation[~is_bias]
    return contributions.feature.values, contributions.weight.values
import random

import seaborn as sns
import matplotlib.pyplot as plt
import lime
import shap
from eli5.sklearn import PermutationImportance
from eli5 import explain_weights_df, explain_prediction_df
from lime.lime_tabular import LimeTabularExplainer

# Model-explanation walkthrough using eli5, LIME and SHAP.
# Expects `model`, `train`, `all_cols`, `cat_cols` and `X_valid` to be defined
# earlier in the file / session.

# eli5: global feature weights and the explanation of one training row
feat_imp_df = explain_weights_df(model, feature_names=all_cols)
feat_imp_df.head(10)  # notebook-style preview; has no effect when run as a script

X_train = train.values
exp_pred_df = explain_prediction_df(estimator=model,
                                    doc=X_train[0],
                                    feature_names=all_cols)

# lime: local surrogate explanation of one validation row
explainer = LimeTabularExplainer(X_train,
                                 mode='regression',
                                 feature_names=all_cols,
                                 categorical_features=cat_cols,
                                 random_state=1981,
                                 discretize_continuous=True)
exp = explainer.explain_instance(X_valid[10], model.predict, num_features=20)

# Shap: SHAP values on a random subset of the validation rows.
# random.sample raises ValueError when asked for more items than exist, so the
# subset size is capped at the number of available rows.
n_valid_rows = X_valid.shape[0]
shap_sample_size = min(10000, n_valid_rows)
X_valid_rn = X_valid[random.sample(range(n_valid_rows), shap_sample_size)]
shap_explainer = shap.TreeExplainer(model)
valid_shap_vals = shap_explainer.shap_values(X_valid_rn)
def analyse():
    """Score the submitted form and return the verdict as a JSON response.

    Steps:
      1. read ``request.form``, turning empty fields into None;
      2. preprocess it with ``preProcessData`` and predict with ``model``
         (prediction 0 -> 'GOOD', anything else -> 'BAD');
      3. copy the preprocessed values back into the form dict for display,
         decoding HOMEIMP into REASON and JOBIND into JOB;
      4. explain the prediction with eli5 and render the six leading
         non-bias features as a bar chart.

    Returns
    -------
    flask JSON response
        On success: ``status='success'`` with ``category``, ``features``,
        ``values``, ``resType`` and ``imgUrl`` (base64-encoded PNG text).
        On any failure: ``status='error'`` with a generic message; the
        exception is logged with its traceback.
    """
    import logging

    try:
        # store form input in a dictionary; empty fields become None
        form_input = {
            k: (v if v else None)
            for k, v in request.form.to_dict().items()
        }

        # Re-insert DEBTINC so it becomes the last key (dicts keep insertion
        # order) -- presumably the column order preProcessData expects.
        form_input['DEBTINC'] = form_input.pop('DEBTINC')

        # convert the form dictionary to a dataframe and preprocess it
        dataframe_for_prediction = preProcessData(
            pd.DataFrame.from_dict([form_input]))

        # the model is fed the processed frame as a plain array
        processed_input = dataframe_for_prediction.iloc[:, :].values
        temp_result = model.predict(processed_input)[0]
        if temp_result == 0:
            category, resType = 'GOOD', 'success'
        else:
            category, resType = 'BAD', 'warning'

        # convert the processed values back into a displayable form
        int_columns = ['LOAN', 'YOJ', 'DEROG', 'DELINQ', 'NINQ', 'CLNO']
        float_columns = ['MORTDUE', 'VALUE', 'CLAGE', 'DEBTINC']
        job_by_code = {
            0: 'Other',
            1: 'ProfExe',
            2: 'Office',
            3: 'Mgr',
            4: 'Self',
            5: 'Sales',
        }
        processed_row = dataframe_for_prediction.iloc[0]
        for column, value in processed_row.to_dict().items():
            if column == 'HOMEIMP':
                form_input['REASON'] = 'HOMEIMP' if value == 1 else 'DEBTCON'
            elif column == 'JOBIND':
                # an unknown code raises and is reported as a generic error
                form_input['JOB'] = job_by_code[value]
            elif column in int_columns:
                form_input[column] = int(float(value))
            elif column in float_columns:
                form_input[column] = round(float(value), 3)

        # feature contributions for this prediction (eli5), bias row removed
        explained_pred = eli5.explain_prediction_df(
            estimator=model, doc=processed_row)
        explained_pred = explained_pred[
            explained_pred.feature != '<BIAS>'].reset_index(drop=True)
        explained_pred = explained_pred.iloc[:6]

        plot_url = _render_top_features_png(explained_pred)

        features = [str(i) for i in form_input.keys()]
        values = [str(i) for i in form_input.values()]
    except Exception:
        # The client only gets a generic message, so keep the real cause in
        # the log. KeyboardInterrupt / SystemExit are deliberately not caught.
        logging.getLogger(__name__).exception(
            "analyse() failed while processing the submitted form")
        return jsonify(status='error',
                       error='Something went wrong!! Contact Webmaster')
    else:
        return jsonify(status='success',
                       category=category,
                       features=features,
                       values=values,
                       resType=resType,
                       imgUrl=plot_url)


def _render_top_features_png(explained_pred):
    """Draw the feature-weight bar chart and return it as base64 PNG text."""
    label_font = {
        'family': 'serif',
        'color': 'darkred',
        'weight': 'normal',
        'size': 16,
    }
    fig = plt.figure(figsize=(12, 6))
    try:
        sns.barplot(x='weight', y='feature', data=explained_pred)
        plt.title("Top Features",
                  fontsize=30,
                  loc='center',
                  pad=10,
                  fontdict={
                      'family': 'serif',
                      'color': 'darkred',
                      'weight': 'normal',
                      'size': 25
                  })
        plt.xlabel('Importance', fontdict=label_font)
        plt.ylabel('Features', fontdict=label_font)
        plt.tick_params(labelsize=12)

        img_buffer = BytesIO()
        plt.savefig(img_buffer,
                    format='png',
                    bbox_inches='tight',
                    pad_inches=0)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # otherwise figures accumulate in pyplot's global registry.
        plt.close(fig)
    return base64.b64encode(img_buffer.getvalue()).decode('utf-8')