def analyze_ice_fi(self):
    """Individual Conditional Expectation - Feature Interaction"""
    # pycebox likes the data to be in a DataFrame, so let's create one with our imputed data
    # we first need to impute the missing data
    train_X_imp = self.imputer.transform(self.X)
    train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)
    forty_ice_df = ice(data=train_X_imp_df, column='Forty',
                       predict=self.pipe.predict)
    # new colormap for the ICE plot
    cmap2 = plt.get_cmap('OrRd')
    # set color_by to Wt in order to color each curve by that player's weight
    ice_plot(forty_ice_df, linewidth=0.5, color_by='Wt', cmap=cmap2)
    # ice_plot doesn't return a colorbar, so we have to add one
    # hack to add in the colorbar taken from here:
    # https://stackoverflow.com/questions/8342549/matplotlib-add-colorbar-to-a-sequence-of-line-plots/11558629#11558629
    wt_vals = forty_ice_df.columns.get_level_values('Wt').values
    sm = plt.cm.ScalarMappable(cmap=cmap2,
                               norm=plt.Normalize(vmin=wt_vals.min(),
                                                  vmax=wt_vals.max()))
    # need to create a fake array for the ScalarMappable or else we get an error
    sm._A = []
    plt.colorbar(sm, label='Wt')
    plt.ylabel('Pred. AV %ile')
    plt.xlabel('Forty')
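# The analyze_ice_fi / analyze_ice methods in this section rely on attributes
# (self.X, self.features, self.imputer, self.pipe) that are not defined here.
# A minimal, hypothetical setup sketch: the class name, the imputer strategy and
# the model choice below are assumptions, not taken from the source.
import pandas as pd
import matplotlib.pyplot as plt
from pycebox.ice import ice, ice_plot
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


class IceAnalyzer:
    def __init__(self, X, y, features):
        self.X = X                    # DataFrame of raw (possibly missing) features
        self.features = features      # feature names, e.g. ['Forty', 'Wt', ...]
        self.imputer = SimpleImputer(strategy='median').fit(X)
        # fitted pipeline whose predictions feed the ICE curves
        self.pipe = make_pipeline(StandardScaler(), RandomForestRegressor())
        self.pipe.fit(self.imputer.transform(X), y)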
def plot_ice_grid(dict_of_ice_dfs, data_df, features,
                  ax_ylabel='', nrows=3, ncols=3,
                  figsize=(12, 12), sharex=False, sharey=True,
                  subplots_kws={}, rug_kws={'color': 'k'},
                  **ice_plot_kws):
    """A function that plots ICE plots for different features in a grid."""
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=figsize,
                             sharex=sharex, sharey=sharey,
                             **subplots_kws)
    # for each feature plot the ICE curves and add a rug at the bottom of the subplot
    # max value for the y-axis, taken from the first feature's ICE values
    # (in the original project this was 'water-depth-min', which spans the whole range)
    ymax = dict_of_ice_dfs[features[0]].values.max()
    for f, ax in zip(features, axes.flatten()):
        ice_plot(dict_of_ice_dfs[f], ax=ax, **ice_plot_kws)
        # add the rug
        sns.distplot(data_df[f], ax=ax, hist=False, kde=False,
                     rug=True, rug_kws=rug_kws)
        # ax.set_title('feature = ' + f)
        ax.set_ylabel(ax_ylabel)
        ax.set_ylim(0, ymax)
        sns.despine()
    # get rid of blank plots
    for i in range(len(features), nrows * ncols):
        axes.flatten()[i].axis('off')
    return fig
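# Usage sketch for plot_ice_grid (hypothetical): `pipe`, `train_X_imp_df` and
# `features` are assumed to be the fitted model, the imputed feature DataFrame
# and the feature-name list from the snippets above; this call is not taken
# verbatim from the source.
ice_dfs = {feat: ice(data=train_X_imp_df, column=feat, predict=pipe.predict)
           for feat in features}
fig = plot_ice_grid(ice_dfs, train_X_imp_df, features,
                    ax_ylabel='Pred. AV %ile',
                    plot_pdp=True, pdp_kwargs={'c': 'k', 'linewidth': 3},
                    linewidth=0.5, c='dimgray')
fig.tight_layout()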
def plot_data_vs_ice(pred_function, ylabel, X, feature_name, feature_label,
                     color_by=None, legend_key=None, alpha=0.15):
    """Plot the raw data (feature vs. prediction) above the ICE curves for one feature."""
    ice_df = ice(X, feature_name, pred_function, num_grid_points=None)

    fig, axs = plt.subplots(2, 1, sharex=False, sharey=True, figsize=(15, 20))
    fig.subplots_adjust(hspace=0.15, wspace=0)

    if color_by is None or legend_key is None:
        scatter = axs[0].scatter(X[feature_name], pred_function(X), alpha=alpha)
        ice_plot(ice_df, alpha=alpha, ax=axs[1])
    else:
        scatter = axs[0].scatter(X[feature_name], pred_function(X),
                                 c=X[color_by], alpha=alpha)
        # legend_key maps the index of each legend entry to its display label
        legend = axs[0].legend(*scatter.legend_elements(), loc='best')
        for s in legend_key.keys():
            legend.get_texts()[s].set_text(legend_key[s])
        ice_plot(ice_df, color_by=color_by, alpha=alpha, ax=axs[1])

    axs[0].set_xlabel(feature_label, fontsize=12)
    axs[0].set_ylabel(ylabel, fontsize=12)
    axs[0].set_title('Data', fontsize=16)
    axs[1].set_xlabel(feature_label, fontsize=12)
    axs[1].set_ylabel(ylabel, fontsize=12)
    axs[1].set_title('ICE Curves', fontsize=16)
    plt.show()
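# Usage sketch (hypothetical): `model` and `X_train` are assumed names for a
# fitted estimator and its training features; they are not defined in the
# snippets above.
plot_data_vs_ice(model.predict, 'Predicted churn probability',
                 X=X_train, feature_name='tenure',
                 feature_label='Tenure (months)')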
def plot_ice_grid(dict_of_ice_dfs, data_df, features,
                  ax_ylabel='', nrows=3, ncols=3,
                  figsize=(12, 12), sharex=False, sharey=True,
                  subplots_kws={}, rug_kws={'color': 'k'},
                  **ice_plot_kws):
    """A function that plots ICE plots for different features in a grid."""
    fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                             figsize=figsize,
                             sharex=sharex, sharey=sharey,
                             **subplots_kws)
    # for each feature plot the ICE curves and add a rug at the bottom of the subplot
    for f, ax in zip(features, axes.flatten()):
        ice_plot(dict_of_ice_dfs[f], ax=ax, **ice_plot_kws)
        # add the rug
        sns.distplot(data_df[f], ax=ax, hist=False, kde=False,
                     rug=True, rug_kws=rug_kws)
        autoscale_y(ax)
        ax.set_title('feature = ' + f)
        ax.set_ylabel(ax_ylabel)
        sns.despine()
    # get rid of blank plots
    for i in range(len(features), nrows * ncols):
        axes.flatten()[i].axis('off')
    return fig
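# `autoscale_y` is not defined in this snippet. A minimal sketch of what such a
# helper might do (an assumption, not the source's implementation): rescale the
# y-limits of an Axes to the line data that falls inside the current x-limits.
import numpy as np

def autoscale_y(ax, margin=0.1):
    lo, hi = np.inf, -np.inf
    xlo, xhi = ax.get_xlim()
    for line in ax.get_lines():
        x = np.asarray(line.get_xdata(), dtype=float)
        y = np.asarray(line.get_ydata(), dtype=float)
        mask = (x >= xlo) & (x <= xhi)
        if mask.any():
            lo = min(lo, np.nanmin(y[mask]))
            hi = max(hi, np.nanmax(y[mask]))
    if np.isfinite(lo) and np.isfinite(hi) and hi > lo:
        pad = margin * (hi - lo)
        ax.set_ylim(lo - pad, hi + pad)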
def analyze_ice(self):
    """Individual Conditional Expectation"""
    # pycebox likes the data to be in a DataFrame, so let's create one with our imputed data
    # we first need to impute the missing data
    train_X_imp = self.imputer.transform(self.X)
    train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)
    forty_ice_df = ice(data=train_X_imp_df, column='Forty',
                       predict=self.pipe.predict)
    ice_plot(forty_ice_df, c='dimgray', linewidth=0.3)
    plt.ylabel('Pred. AV %ile')
    plt.xlabel('Forty')
def plotIce(data, pr):
    '''
    Saves and plots ICE curves.

    :param data: pandas DataFrame with datasets, where each row represents a dataset
    :param pr: Predictor of the ML system; the name of the column holding the
               actual results is taken from pr.resultColumn
    '''
    pr.setReturnDistanceOfClass(True)
    resultColumnName = pr.resultColumn
    # round numerical columns and treat them as categorical strings before encoding
    for i in pr.listOfNumericalColumns:
        data[i] = data[i].astype(float).round(2).astype(str)
    data = pr.encode(data)
    columnCombinations = pr.unsortedColumnCombinations(data, resultColumnName)
    for columnCombination in columnCombinations:
        # only single columns (not tuples of columns) get an ICE plot
        if not isinstance(columnCombination, tuple):
            iceResult = pIce.ice(data, columnCombination, pr.predict,
                                 num_grid_points=None)
            ax = pIce.ice_plot(iceResult, frac_to_plot=1., plot_points=True,
                               point_kwargs=None, x_quantile=False,
                               plot_pdp=True, centered=False,
                               centered_quantile=0., color_by=None,
                               cmap=None, ax=None, pdp_kwargs=None)
            ax.set_ylabel("Distance to Hyperplane of true result")
            ax.set_xlabel(columnCombination)
            ax.set_title("ICE for " + columnCombination)
            lines = ax.lines
            for lineIndex in range(len(lines)):
                lines[lineIndex].set_label("Dataset " + str(lineIndex))
            lines[len(lines) - 1].set_label("Pdp")
            # ax.legend(loc='upper left', bbox_to_anchor=(1, 1))
            # draw all ICE curves thin and black, then highlight the PDP in red
            for line in ax.lines:
                line.set_color("k")
                line.set_linewidth(0.5)
            lines[-1].set_linewidth(1)
            lines[-1].set_color("r")
            xValues = pr.encodingDictionary[columnCombination]
            ax.set_xticks(np.arange(1, len(xValues), 1))
            ax.set_xticklabels(xValues[1:])
            ax.tick_params(axis='both', which='major', labelsize=6)
            ax.tick_params(axis='both', which='minor', labelsize=6)
            plt.xticks(rotation=90)
            saveName = "ice" + str(columnCombination)
            save(saveName, plt=plt)
# For the variable 'tenure', create its ICE curves.
# As mentioned earlier, we are using the predicted probability that a customer
# will churn as the target variable.
ice_tenure = ice(X_train2, 'tenure', regressor.predict, num_grid_points=72)
# 72 grid points because tenure can take 72 distinct values; the more grid
# points, the more accurate the curves.
ice_tenure.head()  # each column corresponds to a data point

# Data-point plot and ICE plot. Run the two parts together to obtain both plots.
fig, (data_ax, ice_ax) = plt.subplots(ncols=2, sharex=True, sharey=True,
                                      figsize=(16, 6))
data_ax.scatter(X_train2.tenure, y2_pred2, c='k', alpha=0.5)
data_ax.set_xlabel('$tenure$')
data_ax.set_ylabel('$churn$')
data_ax.set_title('Data')

# This is a first version of the ICE plot. It is too crowded and uncentered,
# which we will try to fix below.
ice_plot(ice_tenure, ax=ice_ax, plot_points=False, linewidth=0.2, plot_pdp=True)
# frac_to_plot=0.1 would plot only that fraction of the ICE curves.
ice_ax.set_xlabel('$tenure$')
ice_ax.set_ylabel('$churn$')
ice_ax.set_title('ICE curves')

# Play with the following parameters of ice and ice_plot:
# num_grid_points, frac_to_plot, centered and plot_points.

# New centered ICE plots, still for tenure and with only a fraction of the
# total number of instances being considered.
plt.figure()  # new figure so we don't draw over the previous axes
ice_plot(ice_tenure, plot_points=False, linewidth=0.2, plot_pdp=True,
         frac_to_plot=0.1)
plt.title('Uncentered ICE for a fraction of instances')

plt.figure()
ice_plot(ice_tenure, plot_points=False, linewidth=0.2, plot_pdp=True,
         frac_to_plot=0.1, centered=True)
plt.title('Centered ICE for a fraction of instances')
plt.show()

# PDP for 'tenure': created manually from the definition of pycebox.ice.pdp
# (the PDP is the mean of the ICE curves at each grid point)
plt.figure()
plt.plot(ice_tenure.index, ice_tenure.mean(axis=1))
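# Sketch of what centered=True does (the exact reference point is an assumption
# based on pycebox's default centered_quantile=0): subtract each curve's value
# at the lowest grid point so every ICE curve starts at 0 and differences in
# shape stand out.
ice_tenure_centered = ice_tenure - ice_tenure.iloc[0]
plt.figure()
plt.plot(ice_tenure_centered.index, ice_tenure_centered.values,
         c='dimgray', linewidth=0.2)
plt.title('Manually centered ICE curves for tenure')
plt.xlabel('$tenure$')
plt.ylabel('centered $churn$')
plt.show()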