def pdp_interact_explain(X, y, feature):
    """Fit a random forest on ``X``/``y`` and draw a two-feature PDP contour.

    ``X`` is ordinal-encoded and imputed, a ``RandomForestClassifier`` is
    fitted on the result, and the partial-dependence interaction of
    ``feature`` is plotted with pdpbox.

    Args:
        X: Feature table handed to ``category_encoders.OrdinalEncoder``.
        y: Target values used to fit the classifier.
        feature: The features to interact; forwarded to pdpbox as both
            ``features`` and ``feature_names``.
    """
    import category_encoders as ce
    import pandas as pd
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.impute import SimpleImputer
    from pdpbox.pdp import pdp_interact, pdp_interact_plot

    # Encode, then impute.  The imputed values are put back into a DataFrame
    # so the model is explained on exactly the data it was fitted on (the
    # un-imputed frame may still contain NaNs the forest cannot score).
    # NOTE(review): assumes SimpleImputer keeps every column, i.e. no column
    # of X is entirely missing -- confirm for your data.
    X_encoded = ce.OrdinalEncoder().fit_transform(X)
    X_processed = pd.DataFrame(
        SimpleImputer().fit_transform(X_encoded),
        columns=X_encoded.columns,
        index=X_encoded.index,
    )

    # Pick a model and fit the data
    pdp_model = RandomForestClassifier(n_estimators=200, n_jobs=-1,
                                       random_state=6)
    pdp_model.fit(X_processed, y)

    # Compute the interaction; the result gets its own name so it does not
    # shadow the imported ``pdp_interact`` function.
    interaction = pdp_interact(model=pdp_model,
                               dataset=X_processed,
                               model_features=X_processed.columns,
                               features=feature)

    # pdpbox raises a TypeError while labelling the axes.  Only that error is
    # ignored so the plot still renders; anything else propagates.
    try:
        pdp_interact_plot(interaction, feature_names=feature,
                          plot_type='contour')
    except TypeError:
        pass
def plot_2d_pdp(self, features_2d_plot):
    """Save a contour PDP of ``features_2d_plot`` as ``dep_plot_2d.jpg``.

    The interaction is computed for ``self.model`` on ``self.x``; the figure
    is written under ``self.out`` at 300 dpi and then closed.
    """
    # Styling handed to pdpbox (fonts, contour colour, colour map, alpha).
    style = dict(
        title_fontsize=15,
        subtitle_fontsize=12,
        contour_color='white',
        font_family='Arial',
        cmap='viridis',
        inter_fill_alpha=0.8,
        inter_fontsize=9,
    )

    interaction = pdp.pdp_interact(
        model=self.model,
        dataset=self.x,
        model_features=list(self.x.columns),
        features=features_2d_plot,
    )
    pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=features_2d_plot,
        plot_type='contour',
        plot_params=style,
    )

    # Persist the figure, then release it.
    target = self.out + '/dep_plot_2d.jpg'
    plt.tight_layout()
    plt.savefig(target, dpi=300)
    plt.close()
def show_PDP_interact(self, features=()):
    """Show one interaction PDP per entry of ``features``.

    Args:
        features: Iterable whose items are each passed to pdpbox as the
            ``features`` / ``feature_names`` of one plot.  Defaults to an
            empty (immutable) tuple, so calling without arguments plots
            nothing.
    """
    for pair in features:
        # Named ``interaction`` so the pdpbox function name is not shadowed.
        interaction = pdp.pdp_interact(self.model,
                                       self.X_train,
                                       model_features=self.feature_names,
                                       features=pair)
        pdp.pdp_interact_plot(interaction, feature_names=pair)
        plt.xticks(rotation=90)
        plt.show()
def part_plot_2D(model, total_features, X_val, y_val, feature1, feature2):
    """Show a grid-type interaction PDP for ``feature1`` x ``feature2``.

    ``y_val`` is accepted but not used by this function.
    """
    pair = [feature1, feature2]
    interaction = pdp.pdp_interact(
        model=model,
        dataset=X_val,
        model_features=total_features,
        features=pair,
    )
    pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=list(pair),
        plot_type='grid',
    )
    plt.show()
def two_dim_pdp(f):
    '''Show a grid-type two-feature PDP for the features in ``f``.

    Uses the module-level ``rf`` model and ``X_train`` frame.
    '''
    interaction = pdp.pdp_interact(
        model=rf,
        dataset=X_train,
        model_features=X_train.columns,
        features=f,
    )
    pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=f,
        plot_type='grid',
    )
    plt.show()
def pdp_plot_bivariate(self, model, X_val, feature_pair):
    '''
    Show a contour PDP for a pair of features.

    model: fitted model
    X_val: validation dataset; all of its columns are used as model features
    feature_pair: pair of features (list)
    '''
    all_columns = list(X_val.columns)
    interaction = pdp.pdp_interact(
        model=model,
        dataset=X_val,
        model_features=all_columns,
        features=feature_pair,
    )
    pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=feature_pair,
        plot_type='contour',
    )
    plt.show()
def interaction(model, X, features, type = 'grid'):
    """
    Plot the partial-dependence interaction between ``features``.

    ``type`` is forwarded to pdpbox as ``plot_type``.
    """
    # compute the interaction on every column of X
    pdp_result = pdp_interact(
        model = model,
        dataset = X,
        model_features = X.columns,
        features = features,
    )

    # draw it
    pdp_interact_plot(
        pdp_result,
        plot_type = type,
        feature_names = features,
    )
def plot_2D_partial_dependency(fitted_model,
                               X_test : pd.DataFrame,
                               model_features : list,
                               features : list,
                               plot_type = 'contour'):
    """
    Show a two-feature partial dependence plot.

    Args:
        fitted_model: model handed to pdpbox.
        X_test: dataset the partial dependence is computed on.
        model_features: feature names the model was fitted with.
        features: the two features to interact; also used as axis labels.
        plot_type: forwarded to ``pdp.pdp_interact_plot``.

    Note (from the original author): there is an error with matplotlib
    version 3.0.0.
    """
    # Create the data that we will plot
    pdp_obj = pdp.pdp_interact(model = fitted_model,
                               dataset = X_test,
                               model_features = model_features,
                               features = features)

    # Plot it.  The labels come from ``features``; the previous code referred
    # to an undefined name ``feature`` here.
    pdp.pdp_interact_plot(pdp_interact_out = pdp_obj,
                          feature_names = features,
                          plot_type = plot_type)
    plt.show()
def test_grid(self, pdp_interact_out):
    """Grid plot returns a Figure plus ``pdp_inter_ax`` / ``title_ax`` axes."""
    fig, axes = pdp_interact_plot(pdp_interact_out=pdp_interact_out,
                                  feature_names=['age', 'fare'],
                                  plot_type='grid',
                                  x_quantile=True)
    assert isinstance(fig, matplotlib.figure.Figure)
    assert sorted(axes.keys()) == ['pdp_inter_ax', 'title_ax']
    # Check against the public Axes class; the private
    # ``matplotlib.axes._subplots.Subplot`` is not available in newer
    # matplotlib releases.
    for key in axes:
        assert isinstance(axes[key], matplotlib.axes.Axes)
def pdp_2d(clf, X, Y, features_to_plot, label, plot_type='contour'):
    """Plot the two-feature PDP of the per-label classifier for ``label``.

    Args:
        clf: multi-label wrapper exposing ``classifiers_`` (one per label).
        X: dataset the partial dependence is computed on.
        Y: label frame; the position of ``label`` in ``Y.columns`` selects
            the classifier.
        features_to_plot: the two features to interact.
        label: name of the label column to explain.
        plot_type: ``'grid'`` or ``'contour'``.

    Returns:
        The ``(fig, ax)`` pair returned by ``pdp.pdp_interact_plot``.

    Raises:
        ValueError: if ``plot_type`` is neither ``'grid'`` nor ``'contour'``.
    """
    # Fail fast: an unknown plot type used to surface only after the costly
    # computation, as an unbound-variable error at ``return``.
    if plot_type not in ('grid', 'contour'):
        raise ValueError(
            "plot_type must be 'grid' or 'contour', got {!r}".format(plot_type))

    from pdpbox import pdp

    # Extract the classifier object from the clf multilearn object
    index = Y.columns.to_list().index(label)
    clf = clf.classifiers_[index]

    # Silence the classifier while plotting and restore its previous setting
    # afterwards, even if pdpbox raises.
    previous_verbose = getattr(clf, 'verbose', True)
    clf.verbose = False
    try:
        inter = pdp.pdp_interact(model=clf,
                                 dataset=X,
                                 model_features=X.columns.to_list(),
                                 features=features_to_plot,
                                 percentile_ranges=[(5, 95), (5, 95)])
        if plot_type == 'grid':
            fig, ax = pdp.pdp_interact_plot(pdp_interact_out=inter,
                                            feature_names=features_to_plot,
                                            plot_type='grid',
                                            x_quantile=True,
                                            plot_pdp=True)
        else:
            fig, ax = pdp.pdp_interact_plot(pdp_interact_out=inter,
                                            feature_names=features_to_plot,
                                            plot_type='contour')
    finally:
        clf.verbose = previous_verbose
    return fig, ax
def ind_cond_exp(model_line, X_train, y_data):
    """Plot a two-feature PDP and publish it as a 3D surface in the Dash app.

    The interaction is computed for the 2nd and 3rd columns of ``X_train``,
    drawn as a pdpbox grid, pivoted into a matrix and rendered as a plotly
    surface that becomes the layout of the module-level app ``ee``.

    Args:
        model_line: model handed to pdpbox.
        X_train: dataset; all columns are model features.
        y_data: accepted but not used by this function.

    Returns:
        Whatever ``ee.index()`` returns.
    """
    from pdpbox.pdp import pdp_interact, pdp_interact_plot

    X_features = list(X_train.columns)
    for col in X_features:
        print(col)
    features = X_features[1:3]

    interaction = pdp_interact(model=model_line,
                               dataset=X_train,
                               model_features=X_features,
                               features=features)
    pdp_interact_plot(interaction, plot_type='grid', feature_names=features)

    # Rows = second feature, columns = first feature.  The [::-1] reverses
    # the index order so the y axis is ascending.
    pdp_grid = interaction.pdp.pivot_table(values='preds',
                                           columns=features[0],
                                           index=features[1])[::-1]

    # NOTE(review): ``go``, ``ee``, ``html`` and ``dcc`` are expected at
    # module level (plotly / dash) -- confirm they are imported there.
    surface = go.Surface(x=pdp_grid.columns,
                         y=pdp_grid.index,
                         z=pdp_grid.values)
    fig = go.Figure(surface)
    fig.show()
    ee.layout = html.Div([dcc.Graph(figure=fig)])
    print("done")
    return ee.index()
def test_grid(self, pdp_interact_out):
    """Multi-class grid plot returns a title axis and nine interaction axes."""
    fig, axes = pdp_interact_plot(pdp_interact_out=pdp_interact_out,
                                  feature_names=['feat_67', 'feat_24'],
                                  plot_type='grid',
                                  x_quantile=True)
    assert isinstance(fig, matplotlib.figure.Figure)
    assert sorted(axes.keys()) == ['pdp_inter_ax', 'title_ax']
    assert len(axes['pdp_inter_ax']) == 9
    # Public Axes class instead of the private
    # ``matplotlib.axes._subplots.Subplot``, which newer matplotlib lacks.
    assert isinstance(axes['title_ax'], matplotlib.axes.Axes)
    for i in range(9):
        assert isinstance(axes['pdp_inter_ax'][i], matplotlib.axes.Axes)
def plot_pdp_interact(model, X_train, feats):
    """
    Plot the joint dependency of the target on two features.

    :param model: Trained model
    :param X_train: Dataframe passed to ``get_sample(X_train, 1000)``;
        the result is what the model is evaluated on
    :param feats: List (size 2) of features to plot target dependency for
    :return: result of ``pdp.pdp_interact_plot``
    """
    subset = get_sample(X_train, 1000)
    interaction = pdp.pdp_interact(model, subset, subset.columns, feats)
    plot = pdp.pdp_interact_plot(interaction, feats, plot_pdp=True)
    return plot
def show_partial_dependence(model, val_X, features):
    '''
    Plot the partial dependence of the model on one or two features.

    model: fitted model
    val_X: validation dataframe; only its int64 columns are passed to pdpbox
        as model features
    features: a single feature name (1D plot) or a list of exactly two
        feature names (2D contour plot).  Any other list prints an error
        message and nothing is plotted.

    For more on this, check
    https://www.kaggle.com/dansbecker/partial-plots?utm_medium=email&utm_source=mailchimp&utm_campaign=ml4insights
    '''
    is_list = isinstance(features, list)
    if is_list and len(features) != 2:
        print('Error: `features` must be a single feature name or a list '
              'of exactly two feature names.')
        return

    from matplotlib import pyplot as plt
    from pdpbox import pdp

    feature_names = [col for col in val_X.columns
                     if val_X[col].dtype == np.int64]

    if not is_list:
        # Create the data that we will plot
        pdp_feature = pdp.pdp_isolate(model=model,
                                      dataset=val_X,
                                      model_features=feature_names,
                                      feature=features)
        # Plot it (previously referred to an undefined name ``feature``).
        pdp.pdp_plot(pdp_feature, features)
    else:
        # Same idea as above, with pdp_interact / pdp_interact_plot.
        inter1 = pdp.pdp_interact(model=model,
                                  dataset=val_X,
                                  model_features=feature_names,
                                  features=features)
        # Previously referred to an undefined name ``features_to_plot``.
        pdp.pdp_interact_plot(pdp_interact_out=inter1,
                              feature_names=features,
                              plot_type='contour')
    plt.show()
def pdp_interact_plot(self, feature, var_name=None, sample = 10000,
                      which_classes = None, num_grid_points=[10, 10],
                      plot_types = None,
                      plot_params = {'cmap': ["#00cc00", "#002266"]}):
    """Show interaction PDPs of ``feature`` for ``self.md``.

    Args:
        feature: the features to interact.
        var_name: axis labels; falls back to ``feature`` when falsy.
        sample: forwarded to ``self.sample`` to obtain the dataset.
        which_classes: forwarded to ``pdp.pdp_interact_plot``.
        num_grid_points: forwarded to ``pdp.pdp_interact``.
        plot_types: ``None`` (draw both ``'contour'`` and ``'grid'``), a
            single plot-type string, or an iterable of plot-type strings.
        plot_params: forwarded to ``pdp.pdp_interact_plot``.
    """
    # Hand copies to pdpbox so the shared default objects in the signature
    # can never be modified by library code.
    if num_grid_points is not None:
        num_grid_points = list(num_grid_points)
    if plot_params is not None:
        plot_params = dict(plot_params)

    ft_plot = pdp.pdp_interact(
        model=self.md,
        dataset=self.sample(sample),
        model_features=self.features,
        features=feature,
        num_grid_points=num_grid_points,
        n_jobs=4)

    if plot_types is None:
        plot_types = ['contour', 'grid']
    elif isinstance(plot_types, str):
        plot_types = [plot_types]
    else:
        # A list/tuple of types used to be wrapped into a nested list.
        plot_types = list(plot_types)

    for plot_type in plot_types:
        pdp.pdp_interact_plot(
            pdp_interact_out = ft_plot,
            feature_names = var_name or feature,
            plot_type = plot_type,
            plot_pdp = True,
            which_classes = which_classes,
            plot_params = plot_params)
        plt.show()
def test_contour_1(self, pdp_interact_out):
    """Contour plot with ``plot_pdp=True`` returns the three nested PDP axes."""
    fig, axes = pdp_interact_plot(pdp_interact_out=pdp_interact_out,
                                  feature_names=['feat_67', 'feat_24'],
                                  plot_type='contour',
                                  x_quantile=True,
                                  plot_pdp=True,
                                  which_classes=[1])
    nested_keys = ['_pdp_inter_ax', '_pdp_x_ax', '_pdp_y_ax']

    assert isinstance(fig, matplotlib.figure.Figure)
    assert sorted(axes.keys()) == ['pdp_inter_ax', 'title_ax']
    assert sorted(axes['pdp_inter_ax'].keys()) == nested_keys
    # Public Axes class instead of the private
    # ``matplotlib.axes._subplots.Subplot``, which newer matplotlib lacks.
    assert isinstance(axes['title_ax'], matplotlib.axes.Axes)
    for key in nested_keys:
        assert isinstance(axes['pdp_inter_ax'][key], matplotlib.axes.Axes)
def plot_2d_pdp(model, X, y=None, X_unscaled=None, model_features=None,
                features=None, **kwargs):
    '''
    Plots a 2d pdp plot, optionally with both axes labelled in unscaled units.

    y: when given, the model is (re)fitted on (X, y) before plotting.
    X_unscaled: indexable by feature name; holds the unscaled values of X.
        When given, each tick value t is shown as t * std + mean of the
        matching unscaled column, with one decimal.
    kwargs: forwarded to pdp.pdp_interact_plot.

    Returns the (fig, ax) pair from pdp.pdp_interact_plot.
    '''
    if y is not None:
        model.fit(X, y)

    interaction = pdp.pdp_interact(model=model,
                                   dataset=X,
                                   model_features=model_features,
                                   features=features)
    fig, ax = pdp.pdp_interact_plot(interaction,
                                    feature_names=features,
                                    **kwargs)

    if X_unscaled is None:
        return fig, ax

    def make_unscaler(column):
        # Build a tick formatter that maps scaled ticks back to raw units.
        mean = X_unscaled[column].mean()
        std = X_unscaled[column].std()

        def unscale(value, pos):
            return '%.1f' % (value * std + mean)

        return unscale

    unscale_x = make_unscaler(features[0])
    unscale_y = make_unscaler(features[1])

    ax['pdp_inter_ax'].xaxis.set_major_formatter(
        mticker.FuncFormatter(unscale_x))
    ax['pdp_inter_ax'].yaxis.set_major_formatter(
        mticker.FuncFormatter(unscale_y))
    return fig, ax
def pdp_2d_pdp(term_type, train_data, fea_2d_1, fea_2d_2, fea_nam):
    """Save a contour PDP of two features for the module-level ``rfr`` model.

    Each feature gets 10 evenly spaced grid points between its minimum and
    maximum in ``train_data``.  The figure is written to
    ``./results/<term_type>_<fea_2d_1>-<fea_2d_2>_2d_pdp.png`` at 300 dpi
    and returned.
    """
    pair = [fea_2d_1, fea_2d_2]

    # One custom grid per feature: min..max in 10 steps.
    bounds = []
    for name in pair:
        column = train_data[name].values
        bounds.append((min(column), max(column)))
    grids = [np.linspace(low, high, 10) for low, high in bounds]

    inter_rf = pdp.pdp_interact(model=rfr,
                                dataset=train_data,
                                model_features=fea_nam,
                                cust_grid_points=grids,
                                features=pair)
    fig, axes = pdp.pdp_interact_plot(inter_rf,
                                      [fea_2d_1, fea_2d_2],
                                      x_quantile=False,
                                      plot_type='contour',
                                      plot_pdp=False)

    target = f"./results/{term_type}_{fea_2d_1}-{fea_2d_2}_2d_pdp.png"
    fig.savefig(target, dpi=300)
    return fig
def save_pdp_plot_2d(model, X_train, features, n_jobs, figure_saver=None):
    """Plot a 20x20-grid interaction PDP and optionally save it.

    ``model.n_jobs`` is set to ``n_jobs`` and the interaction is computed
    under a threading ``parallel_backend``.  When ``figure_saver`` is given,
    the figure is saved under the ``pdp_2d`` sub-directory, named after the
    features joined by ``"__"``.
    """
    model.n_jobs = n_jobs

    interact_kwargs = dict(
        model=model,
        dataset=X_train,
        model_features=X_train.columns,
        features=features,
        num_grid_points=[20, 20],
    )
    with parallel_backend("threading", n_jobs=n_jobs):
        pdp_interact_out = pdp.pdp_interact(**interact_kwargs)

    fig, axes = pdp.pdp_interact_plot(
        pdp_interact_out, features, x_quantile=True, figsize=(7, 8)
    )
    axes["pdp_inter_ax"].xaxis.set_tick_params(rotation=45)

    if figure_saver is None:
        return
    figure_name = "__".join(features)
    figure_saver.save_figure(fig, figure_name, sub_directory="pdp_2d")
def plot_modal():
    """Render the modal page with a two-feature PDP grid for one class.

    Reads ``model``, ``estimator``, ``f1``, ``f2`` and ``t1`` from the query
    string and ``filename``, ``target_ft`` and ``features`` from the session,
    loads the matching pickled pipeline, saves the plot under
    ``static/images/figs/`` and renders ``modal_plot.html``.

    Returns:
        The rendered template, or a ``(message, 400)`` response when ``f1``,
        ``f2`` or ``t1`` is missing.
    """
    path = os.path.join(app.config['UPLOAD_FOLDER'],
                        session.get("filename", "not set"))
    index = request.args.get('model', default=0, type=int)
    estim = request.args.get('estimator', default=None, type=str)
    target_ft = session.get('target_ft', 'not set')
    features = session.get('features', 'not set')
    f1 = request.args.get('f1', default=None, type=str)
    f2 = request.args.get('f2', default=None, type=str)
    t1 = request.args.get('t1', default=None, type=str)

    # These three are dereferenced below; without them the view used to
    # crash with AttributeError / TypeError.
    if f1 is None or f2 is None or t1 is None:
        return "Missing required query parameter(s): f1, f2 and t1", 400

    X, y, data = process_data(path, "csv", target_ft)
    # remove nans
    data = data.dropna()
    chosen_class = list(np.unique(y)).index(int(float(t1)))

    # NOTE(security): the file name is built from the request argument
    # ``estimator`` and the content is unpickled.  pickle.load executes
    # arbitrary code from the file, so ``estim`` should be validated against
    # the known estimator names before use -- left unchanged here.
    with open("tmp_files/model_{}_{}.pickle".format(estim, str(index)),
              'rb') as filehandler:
        pipe = pickle.load(filehandler)

    mod_path = "modal_" + str(f1.replace('.', '_')) + \
        "_" + str(f2.replace('.', '_'))
    pdp_V1_V2 = pdp.pdp_interact(model=pipe,
                                 dataset=data,
                                 model_features=features,
                                 features=[f1, f2],
                                 num_grid_points=None,
                                 percentile_ranges=[None, None])
    fig, axes = pdp.pdp_interact_plot(
        pdp_V1_V2, [f1, f2],
        plot_type='grid',
        x_quantile=True,
        ncols=2,
        plot_pdp=True,
        which_classes=[chosen_class],
        plot_params={
            "subtitle": "For Class {}, Label: {}".format(chosen_class, t1)
        })
    fig.savefig("static/images/figs/" + mod_path,
                bbox_inches="tight",
                transparent=True)
    # Release the rendered figure.  The previous ``plt.figure()`` opened a
    # new empty figure on every request and never closed either one.
    plt.close(fig)
    return render_template("modal_plot.html", plot_name=mod_path)
def construct_pdp_interact(model, feature_names,
                           dataset_x = dat_train_x,
                           dataset_y = dat_train_y,
                           num_grid_points = num_grid_points_int,
                           n_jobs = n_jobs,
                           model_features = dat_train_x.columns):
    """Build a labelled contour PDP for the two features in ``feature_names``.

    The dataset handed to pdpbox is ``dataset_x`` joined with ``dataset_y``.
    Axis labels and the title use the long names from the module-level
    ``varnames_long_dict``.  The keyword defaults are read from module-level
    names when the function is defined.  Returns the figure.
    """
    full_data = dataset_x.join(dataset_y)
    inter_current = pdp.pdp_interact(
        model = model,
        dataset = full_data,
        num_grid_points = num_grid_points,
        n_jobs = n_jobs,  ## needs to be 1 for XGBoost model!
        model_features = model_features,
        features = feature_names)

    fig, axes = pdp.pdp_interact_plot(
        inter_current,
        feature_names = feature_names,
        x_quantile = False,
        plot_type = 'contour',
        plot_pdp = False,
        plot_params = plot_params_pdp_int_default)

    # Human-readable names for the two axes.
    x_label = varnames_long_dict[feature_names[0]]
    y_label = varnames_long_dict[feature_names[1]]
    title = ('Number of bike rides per hour\n(Partial Dependence Plot) for\n'
             '{0} and {1}\n').format(x_label, y_label)

    inter_ax = axes["pdp_inter_ax"]
    inter_ax.set_xlabel(x_label)
    inter_ax.set_ylabel(y_label)
    inter_ax.set_title(title, y = 1)
    return fig
def pdp_interact_plot(model, dataset, model_features, feature1, feature2,
                      plot_type="grid", x_quantile=True, plot_pdp=False):
    """Compute and plot the interaction PDP of two features; return the figure.

    Thin wrapper: ``pdp.pdp_interact`` produces the data and
    ``pdp.pdp_interact_plot`` draws it.  The axes returned by pdpbox are
    discarded.
    """
    pair = [feature1, feature2]
    interaction = pdp.pdp_interact(
        model=model,
        dataset=dataset,
        model_features=model_features,
        features=pair,
    )
    plot_options = dict(
        plot_type=plot_type,
        x_quantile=x_quantile,
        plot_pdp=plot_pdp,
    )
    fig, _axes = pdp.pdp_interact_plot(
        pdp_interact_out=interaction,
        feature_names=list(pair),
        **plot_options,
    )
    return fig
model=model, dataset=X, model_features=x_cols, features=x_cols[:2], num_grid_points=[10, 10], percentile_ranges=[None, None], n_jobs=1, ) # %% pdp_interact_plot: grid fig, axes = pdp.pdp_interact_plot( pdp_interacted_tmp, feature_names=x_cols, plot_type='grid', x_quantile=True, ncols=2, plot_pdp=True, which_classes=[1, 2, 3], ) # %% pdp_interact_plot: contour try: fig, axes = pdp.pdp_interact_plot( pdp_interacted_tmp, feature_names=x_cols, plot_type='contour', x_quantile=True, # ncols=1, plot_pdp=True,
encoded = fig_to_base64(pdp_plot_feature) html_pdp = '<img class="img-fluid" src="data:image/png;base64, {}">'.format( encoded.decode('utf-8')) html_partial_plot += html_pdp # ------------------------------------------------------------- # 2D PARTIAL DEPENDENCE PLOTS # Similar to previous PDP plot except we use pdp_interact instead of pdp_isolate # and pdp_interact_plot instead of pdp_isolate_plot # features_to_plot = ['preg', 'skin'] inter1 = pdp.pdp_interact(model=loaded_model, dataset=dataframe_test, model_features=feature_names, features=features_to_plot2d) partial_plot = pdp.pdp_interact_plot( pdp_interact_out=inter1, feature_names=features_to_plot2d ) # plot_type='contour' plot_type='grid' encoded = fig_to_base64(partial_plot) html_partial_plot2d = '<img class="img-fluid" src="data:image/png;base64, {}">'.format( encoded.decode('utf-8')) # ------------------------------------------------------------- # SHAP PLOT data_for_prediction = dataframe_test.iloc[shap_row_to_show] explainer = shap.KernelExplainer(loaded_model.predict_proba, dataframe_test.values) shap_values = explainer.shap_values(data_for_prediction) shap.initjs() shap_plot = shap.force_plot(explainer.expected_value[1], shap_values[1], data_for_prediction,
#plt.title(f'Top {n} features pipeline5') plt.title(f'Top {n} features Gradient Boosting') importances4.sort_values()[-n:].plot.barh(color='grey'); !pip install pdpbox # Partial Dependence Plots with 2 features from pdpbox.pdp import pdp_interact, pdp_interact_plot features2 = ['Latitude', 'Longitude Difference to State Capital'] interaction = pdp_interact( model=gb, dataset=X_val, model_features=X_val.columns, features=features2 ) pdp_interact_plot(interaction, plot_type='grid', feature_names=features2); # A two feature partical dependence plot in 3D pdp = interaction.pdp.pivot_table( values='preds', columns=features2[0], index=features2[1] )[::-1] # Slice notation to reverse index order so y axis is ascending import plotly.graph_objs as go target = 'Value of d parameter' surface = go.Surface(x=pdp.columns, y=pdp.index, z=pdp.values)
# PDF between total of special request and is repeated guest # it shows that the numbers of unique grid points for each # total of special requestis 2 and repeated guest is 4 pdf_features = ['is_repeated_guest', 'total_of_special_requests'] booking_interaction = pdp_interact( model=Rand_forest, dataset=X_val, model_features=X_val.columns, features=pdf_features ) # this multiple classes which is total of special request and is_rpeated_quest # with numbers of grid points that 2:4 pdp_interact_plot(booking_interaction, plot_type='grid', feature_names=pdf_features); """### Shapley Values It is a good technique to show the insight of the model predictor and break down each model individually. """ # explain the individual observation # if I want to look for the first row from X_test # turn it into a datafrme first_row=X_test_encoded.iloc[[0]] first_row # what is the actual reservation status for the hotel booking # by the y_test for the first row which is checkout y_test.iloc[[0]]
# Best number of iterations: 32 # --------------------Model interpretation---------------- # Plotting feature importances gpb.plot_importance(bst) # Partial dependence plots from pdpbox import pdp # Single variable plots (takes a few seconds to compute) pdp_dist = pdp.pdp_isolate(model=bst, dataset=X_train, model_features=X_train.columns, feature='variable_2', num_grid_points=50) pdp.pdp_plot(pdp_dist, 'variable_2', plot_lines=True) # Two variable interaction plot inter_rf = pdp.pdp_interact(model=bst, dataset=X_train, model_features=X_train.columns, features=['variable_1','variable_2']) pdp.pdp_interact_plot(inter_rf, ['variable_1','variable_2'], x_quantile=True, plot_type='contour', plot_pdp=True)# ignore any error message # SHAP values and dependence plots # Note: you need shap version>=0.36.0 import shap shap_values = shap.TreeExplainer(bst).shap_values(X_test) shap.summary_plot(shap_values, X_test) shap.dependence_plot("variable_2", shap_values, X_test) # --------------------Comparison to alternative approaches---------------- results = pd.DataFrame(columns = ["RMSE","Time"], index = ["GPBoost", "Linear_ME","Boosting_Ign","Boosting_Cat","MERF"]) # 1. GPBoost gp_model = gpb.GPModel(group_data=group_train) start_time = time.time() # measure time
#Statistics of survivors based on Age and Pclass fig, axes, summary_df = info_plots.target_plot_interact( df=titanic_data, features=['Age', 'Pclass'], feature_names=['Age', 'Pclass'], target=titanic_target ) display(summary_df.head()) #Prediction of our model, impact if Age and Pclass fig, axes, summary_df = info_plots.actual_plot_interact( model=titanic_model, X=titanic_data[titanic_features], features=['Age', 'Pclass'], feature_names=['Age', 'Pclass'] ) display(summary_df.head()) #PDP for the interaction between Age and Pclass inter1 = pdp.pdp_interact( model=titanic_model, dataset=titanic_data, model_features=titanic_features, features=['Age', 'Pclass'] ) fig, axes = pdp.pdp_interact_plot( pdp_interact_out=inter1, feature_names=['age', 'Pclass'], plot_type='contour', x_quantile=True, plot_pdp=True ) #Let's study the link between Fare and Sex #Statistics of survivors based on Fare and Sex fig, axes, summary_df = info_plots.target_plot_interact( df=titanic_data, features=['Fare', 'Sex'], feature_names=['Fare', 'Sex'], target=titanic_target ) display(summary_df.head()) #Prediction of our model, impact if Fare and Gender fig, axes, summary_df = info_plots.actual_plot_interact( model=titanic_model, X=titanic_data[titanic_features], features=['Fare', 'Sex'], feature_names=['Fare', 'Sex'] ) display(summary_df.head()) #PDP for the interaction between Age and Pclass inter1 = pdp.pdp_interact(
# %% ################################## # Interaction Partial Dependency # ################################## inter1 = pdp.pdp_interact(model=rf_mod, dataset=train_X, model_features=train_X.columns, features=['Miles_traveled', 'Season_offense']) fig, ax = pdp.pdp_interact_plot( pdp_interact_out=inter1, feature_names=['Miles_traveled', 'Season_offense'], plot_type="grid", plot_params={ 'font_family': 'serif', 'title_fontsize': 15, 'fontsize': 15 }) pdp.plt.savefig('../09_figures/inter_offense.png') #%% inter2 = pdp.pdp_interact(model=rf_mod, dataset=train_X, model_features=train_X.columns, features=['Miles_traveled', 'Season_defense']) fig, ax = pdp.pdp_interact_plot(
X=df[X.columns], feature=_, feature_name=_, predict_kwds={} ) fig, axes, summary_df = info_plots.actual_plot_interact( model=baseline, X=df[X.columns], features=interactions_2way, feature_names=interactions_2way ) #PDP Plot : Grid Plot interactions = pdp.pdp_interact( model=baseline, dataset=df, model_features=X.columns, features=interactions_2way ) fig, axes = pdp.pdp_interact_plot(interactions,interactions_2way , plot_type='grid', x_quantile=True, plot_pdp=False) #SHAP shap.initjs() explainer=shap.TreeExplainer(baseline) shap_values=explainer.shap_values(x_train[_feat]) shap.summary_plot(shap_values,x_train[_feat]) def ABS_SHAP(df_shap,df): #import matplotlib as plt # Make a copy of the input data shap_v = pd.DataFrame(df_shap) feature_list = df.columns shap_v.columns = feature_list df_v = df.copy().reset_index().drop('index',axis=1)