def analyze_ice_grid(self):
    """Draw a grid of ICE plots, one panel per feature, for the training data.

    ``self.X`` is run through ``self.imputer`` and wrapped in a DataFrame
    whose columns are ``self.features``. ICE data is computed for every
    feature with ``self.estimator.predict`` and passed to ``plot_ice_grid``
    together with the styling for the ICE curves and the PDP overlay.
    """
    imputed_frame = pd.DataFrame(self.imputer.transform(self.X),
                                 columns=self.features)

    # One ICE DataFrame per feature, keyed by the feature name.
    ice_by_feature = {}
    for feature in self.features:
        ice_by_feature[feature] = ice(data=imputed_frame,
                                      column=feature,
                                      predict=self.estimator.predict)

    pdp_style = {'c': 'red', 'linewidth': 3}
    fig = plot_ice_grid(ice_by_feature, imputed_frame, self.features,
                        ax_ylabel='Pred. AV %ile',
                        alpha=0.3,
                        plot_pdp=True,
                        pdp_kwargs=pdp_style,
                        linewidth=0.5,
                        c='dimgray')

    fig.tight_layout()
    fig.suptitle('ICE plots (training data)')
    # Leave head-room above the grid for the figure title.
    fig.subplots_adjust(top=0.89)
def plot_data_vs_ice(pred_function, ylabel, X, feature_name, feature_label,
                     color_by=None, legend_key=None, alpha=0.15):
    """Show the raw predictions and the ICE curves for one feature.

    Two vertically stacked panels sharing the y axis are drawn: a scatter of
    ``pred_function(X)`` against ``X[feature_name]`` on top, and the ICE
    curves computed by ``ice`` for the same feature below. The figure is
    displayed with ``plt.show()``; nothing is returned.

    :param pred_function: callable applied to ``X`` (and handed to ``ice``)
        to obtain predictions.
    :param ylabel: y-axis label used on both panels.
    :param X: data indexable by column name (``X[feature_name]``).
    :param feature_name: column of ``X`` to analyse.
    :param feature_label: x-axis label used on both panels.
    :param color_by: column of ``X`` used to colour points and ICE curves.
    :param legend_key: mapping from legend entry position to replacement
        legend text.
    :param alpha: transparency for the scatter points and ICE curves.

    Colouring is applied only when BOTH ``color_by`` and ``legend_key`` are
    given; if either is ``None`` the plain, uncoloured plots are drawn.
    """
    ice_df = ice(X, feature_name, pred_function, num_grid_points=None)

    fig, axs = plt.subplots(2, 1, sharex=False, sharey=True, figsize=(15, 20))
    fig.subplots_adjust(hspace=0.15, wspace=0)
    data_ax, ice_ax = axs

    if color_by is None or legend_key is None:
        data_ax.scatter(X[feature_name], pred_function(X), alpha=alpha)
        ice_plot(ice_df, alpha=alpha, ax=ice_ax)
    else:
        scatter = data_ax.scatter(X[feature_name], pred_function(X),
                                  c=X[color_by], alpha=alpha)
        legend = data_ax.legend(*scatter.legend_elements(), loc='best')
        # Swap the automatically generated legend texts for the caller's.
        legend_texts = legend.get_texts()
        for position, text in legend_key.items():
            legend_texts[position].set_text(text)
        ice_plot(ice_df, color_by=color_by, alpha=alpha, ax=ice_ax)

    # Both panels share the same axis labels and differ only in their title.
    for ax, title in ((data_ax, 'Data'), (ice_ax, 'ICE Curves')):
        ax.set_xlabel(feature_label, fontsize=12)
        ax.set_ylabel(ylabel, fontsize=12)
        ax.set_title(title, fontsize=16)

    plt.show()
def analyze_ice_fi(self):
    """Individual Conditional Expectation - Feature Interaction.

    Plots the ICE curves of the 'Forty' feature for the imputed training
    data, colouring every curve by the 'Wt' level of the ICE columns, and
    adds a matching colorbar to the plot.
    """
    # pycebox likes the data to be in a DataFrame, so impute the missing
    # values first and wrap the result with the feature names.
    train_X_imp = self.imputer.transform(self.X)
    train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)

    forty_ice_df = ice(data=train_X_imp_df, column='Forty',
                       predict=self.pipe.predict)

    # New colormap for the ICE plot.
    cmap2 = plt.get_cmap('OrRd')

    # color_by='Wt' colours each curve by that player's weight. Keep the
    # axes ice_plot returns so the colorbar and labels target it explicitly.
    ax = ice_plot(forty_ice_df, linewidth=0.5, color_by='Wt', cmap=cmap2)

    # ice_plot doesn't return a colorbar, so build one by hand. Approach
    # taken from:
    # https://stackoverflow.com/questions/8342549/matplotlib-add-colorbar-to-a-sequence-of-line-plots/11558629#11558629
    wt_vals = forty_ice_df.columns.get_level_values('Wt').values
    sm = plt.cm.ScalarMappable(
        cmap=cmap2,
        norm=plt.Normalize(vmin=wt_vals.min(), vmax=wt_vals.max()))
    # The mappable needs an (empty) data array; use the public setter
    # instead of assigning the private ``_A`` attribute.
    sm.set_array([])
    # A stand-alone ScalarMappable is not attached to any axes, so say
    # which axes the colorbar should take its space from.
    plt.colorbar(sm, ax=ax, label='Wt')

    ax.set_ylabel('Pred. AV %ile')
    ax.set_xlabel('Forty')
def test_ice_two_samples_two_points():
    """Check ``ice.ice`` over 'x0' on 2x2 identity data with a row-product predictor."""
    def row_product(X):
        return X.prod(axis=1)

    frame = pd.DataFrame(np.eye(2), columns=['x0', 'x1'])
    actual = ice.ice(frame, 'x0', row_product)

    # Expected layout: rows indexed by the 'x0' grid, columns labelled by 'x1'.
    expected_values = np.array([[0, 0], [0, 1]])
    expected = pd.DataFrame(expected_values,
                            columns=pd.Series([0., 1.], name='x1'),
                            index=pd.Series([0., 1.], name='x0'))

    assert (actual == expected).all().all()
def test_ice_one_sample_one_point():
    """Check ``ice.ice`` over 'x1' on a single-row frame with a constant predictor."""
    def constant_prediction(X):
        return -1

    frame = pd.DataFrame(np.array([[0, 1]]), columns=['x0', 'x1'])
    actual = ice.ice(frame, 'x1', constant_prediction)

    # A single grid point (x1 == 1) and a single curve labelled by x0 == 0.
    expected = pd.DataFrame(np.array([-1]),
                            columns=pd.Series(0, name='x0'),
                            index=pd.Series(1, name='x1'))

    assert (actual == expected).all().all()
def plotIce(data, pr):
    '''
    Saves and plots ICE for every single-column entry returned by
    ``pr.unsortedColumnCombinations``; tuple entries are skipped.

    :param data: pandas dataframe with datasets where each row represents a dataset
    :param pr: Predictor of ML-System. Supplies the result column name
        (``pr.resultColumn``), the numerical columns, the encoder, the
        encoding dictionary and the ``predict`` function.

    Each plot is written via ``save`` under the name "ice<column>".
    '''
    pr.setReturnDistanceOfClass(True)
    resultColumnName = pr.resultColumn

    # NOTE(review): this overwrites the numerical columns of the object the
    # caller passed in (rounded to 2 decimals, stored as strings) before
    # `data` is rebound to the encoded frame -- confirm callers expect that.
    for i in pr.listOfNumericalColumns:
        data[i] = data[i].astype(float).round(2).astype(str)
    data = pr.encode(data)

    columnCombinations = pr.unsortedColumnCombinations(data, resultColumnName)
    for columnCombination in columnCombinations:
        if isinstance(columnCombination, tuple):
            continue

        iceResult = pIce.ice(data, columnCombination, pr.predict,
                             num_grid_points=None)
        ax = pIce.ice_plot(iceResult, frac_to_plot=1.,
                           plot_points=True, point_kwargs=None,
                           x_quantile=False, plot_pdp=True,
                           centered=False, centered_quantile=0.,
                           color_by=None, cmap=None,
                           ax=None, pdp_kwargs=None)
        ax.set_ylabel("Distance to Hyperplane of true result")
        ax.set_xlabel(columnCombination)
        ax.set_title("ICE for " + columnCombination)

        # Every line is drawn thin and black; the last line is then
        # relabelled and restyled as the PDP curve below.
        lines = ax.lines
        for lineIndex, line in enumerate(lines):
            line.set_label("Dataset " + str(lineIndex))
            line.set_color("k")
            line.set_linewidth(0.5)

        # Use the public setter: assigning a plain `linewidth` attribute
        # on a line has no effect on how it is drawn.
        pdpLine = lines[-1]
        pdpLine.set_label("Pdp")
        pdpLine.set_linewidth(1)
        pdpLine.set_color("r")

        # Tick positions start at 1 and use the encoding entries from
        # index 1 onwards as their labels.
        xValues = pr.encodingDictionary[columnCombination]
        ax.set_xticks(np.arange(1, len(xValues), 1))
        ax.set_xticklabels(xValues[1:])
        ax.tick_params(axis='both', which='major', labelsize=6)
        ax.tick_params(axis='both', which='minor', labelsize=6)
        plt.xticks(rotation=90)

        saveName = "ice" + str(columnCombination)
        save(saveName, plt=plt)
def test_ice_one_sample_one_point():
    """Check ``ice.ice`` over 'x1' on a single-row frame with a constant predictor.

    The expected columns are a MultiIndex with levels ('data_x1', 'x0').
    """
    def constant_prediction(X):
        return -1

    frame = pd.DataFrame(np.array([[0, 1]]), columns=['x0', 'x1'])
    actual = ice.ice(frame, 'x1', constant_prediction)

    column_labels = pd.MultiIndex.from_tuples([(1, 0)],
                                              names=['data_x1', 'x0'])
    expected = pd.DataFrame(np.array([-1]),
                            columns=column_labels,
                            index=pd.Series(1, name='x1'))

    assert (actual == expected).all().all()
def test_ice_two_samples_two_points():
    """Check ``ice.ice`` over 'x0' on 2x2 identity data with a row-product predictor.

    The expected columns are a MultiIndex with levels ('data_x0', 'x1').
    """
    def row_product(X):
        return X.prod(axis=1)

    frame = pd.DataFrame(np.eye(2), columns=['x0', 'x1'])
    actual = ice.ice(frame, 'x0', row_product)

    column_labels = pd.MultiIndex.from_tuples([(0, 1), (1, 0)],
                                              names=['data_x0', 'x1'])
    expected_values = np.array([[0, 0], [1, 0]])
    expected = pd.DataFrame(expected_values,
                            columns=column_labels,
                            index=pd.Series([0., 1.], name='x0'))

    assert (actual == expected).all().all()
def test_ice_num_grid_points():
    """Check ``ice.ice`` over 'x2' on 3x3 identity data with ``num_grid_points=5``.

    The predictor is a dot product with the column vector (1, 2, 3); the
    expected columns are a MultiIndex with levels ('data_x2', 'x0', 'x1').
    """
    weights = np.array([[1., 2., 3.]]).T

    def weighted_sum(X):
        return X.dot(weights)

    frame = pd.DataFrame(np.eye(3), columns=['x0', 'x1', 'x2'])
    actual = ice.ice(frame, 'x2', weighted_sum, num_grid_points=5)

    column_labels = pd.MultiIndex.from_tuples(
        [(0, 0, 1), (0, 1, 0), (1, 0, 0)],
        names=['data_x2', 'x0', 'x1'])
    expected_values = np.array([[2., 1., 0.],
                                [3.5, 2.5, 1.5],
                                [5., 4., 3.]])
    expected = pd.DataFrame(expected_values,
                            columns=column_labels,
                            index=pd.Series([0., 0.5, 1.], name='x2'))

    assert (actual == expected).all().all()
def test_ice_num_grid_points():
    """Check ``ice.ice`` over 'x2' on 3x3 identity data with ``num_grid_points=5``.

    The predictor is a dot product with the column vector (1, 2, 3); the
    expected columns are a MultiIndex with levels ('x0', 'x1').
    """
    weights = np.array([[1., 2., 3.]]).T

    def weighted_sum(X):
        return X.dot(weights)

    frame = pd.DataFrame(np.eye(3), columns=['x0', 'x1', 'x2'])
    actual = ice.ice(frame, 'x2', weighted_sum, num_grid_points=5)

    column_labels = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
                                              names=['x0', 'x1'])
    expected_values = np.array([[0., 2., 1.],
                                [1.5, 3.5, 2.5],
                                [3., 5., 4.]])
    expected = pd.DataFrame(expected_values,
                            columns=column_labels,
                            index=pd.Series([0., 0.5, 1.], name='x2'))

    assert (actual == expected).all().all()
def ICEPlot(data, model, features):
    """Plot a 5x4 grid of ICE curves and return the per-feature ICE data.

    ICE data is computed with ``model.predict`` for every entry of
    ``features`` and drawn with ``plot_ice_grid``. The returned dict maps
    each feature to the ICE result computed for it.
    """
    # One ICE result per feature, keyed by feature name.
    ice_by_feature = {}
    for feature in features:
        ice_by_feature[feature] = ice(data=data, column=feature,
                                      predict=model.predict)

    grid_options = {
        'ax_ylabel': 'Pred. Ray Num.',
        'nrows': 5,
        'ncols': 4,
        'alpha': 0.3,
        'plot_pdp': True,
        'pdp_kwargs': {'c': 'blue', 'linewidth': 2.0},
        'linewidth': 0.5,
        'c': 'dimgray',
    }
    fig = plot_ice_grid(ice_by_feature, data, features, **grid_options)

    fig.suptitle('ICE plot: Classification - all training data')
    # Leave head-room above the grid for the figure title.
    fig.subplots_adjust(top=0.89)

    return ice_by_feature
def analyze_ice(self):
    """Plot the ICE curves of the 'Forty' feature for the training data.

    ``self.X`` is imputed with ``self.imputer`` and wrapped in a DataFrame
    labelled with ``self.features`` (pycebox likes its input as a
    DataFrame); predictions come from ``self.pipe.predict``.
    """
    imputed_frame = pd.DataFrame(self.imputer.transform(self.X),
                                 columns=self.features)

    forty_curves = ice(data=imputed_frame,
                       column='Forty',
                       predict=self.pipe.predict)

    ice_plot(forty_curves, c='dimgray', linewidth=0.3)
    plt.ylabel('Pred. AV %ile')
    plt.xlabel('Forty')
def analyze_ice_gc(self):
    """Draw a 4x2 grid of centered ICE plots for the training data.

    ``self.X`` is imputed with ``self.imputer`` and wrapped in a DataFrame
    labelled with ``self.features``. ICE data is computed per feature with
    ``self.estimator.predict`` and plotted with ``centered=True``, without
    data points and with independent y axes per panel.
    """
    imputed_frame = pd.DataFrame(self.imputer.transform(self.X),
                                 columns=self.features)

    # One ICE DataFrame per feature, keyed by the feature name.
    ice_by_feature = {}
    for feature in self.features:
        ice_by_feature[feature] = ice(data=imputed_frame,
                                      column=feature,
                                      predict=self.estimator.predict)

    grid_options = {
        'ax_ylabel': 'Pred AV %ile (centered)',
        'alpha': .2,
        'plot_points': False,
        'plot_pdp': True,
        'pdp_kwargs': {"c": "red", "linewidth": 3},
        'linewidth': 0.5,
        'c': 'dimgray',
        'centered': True,
        'sharey': False,
        'nrows': 4,
        'ncols': 2,
        'figsize': (11, 16),
    }
    fig = plot_ice_grid(ice_by_feature, imputed_frame, self.features,
                        **grid_options)

    fig.tight_layout()
    fig.suptitle('Centered ICE plots (training data)')
    # Leave head-room above the grid for the figure title.
    fig.subplots_adjust(top=0.9)
if plot_points: ax.scatter(point_x, point_y, zorder=10, **(point_kwargs or {})) if plot_pdp: pdp_kwargs = pdp_kwargs or {} pdp_data = pdp(ice_data) ax.plot(x, pdp_data, **pdp_kwargs) return ax # For the variable TENURE, create its ICE function # As mentionned earlier, we are using the predicted probability that a customer will churn as target variable. ice_tenure = ice(X_train2, 'tenure', regressor.predict, num_grid_points=72) # Here 72 grid points because tenure can take 72 values. The more grid points, the more accurate it would be. ice_tenure.head() # Each column corresponds to a datapoint # Data points plots and ICE plots. Run 2 parts together to obtain both plots. fig, (data_ax, ice_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(16, 6)) data_ax.scatter(X_train2.tenure, y2_pred2, c='k', alpha=0.5); data_ax.set_xlabel('tenure$'); data_ax.set_ylabel('$churn$'); data_ax.set_title('Data'); # This is a first version of ICE. It is too crowded and uncentered, which we will try to modify. ice_plot(ice_tenure, ax=ice_ax, plot_points=False, linewidth=0.2, plot_pdp = True); # frac_to_plot = 0.1 ; the fraction of ICE curves to plot. ice_ax.set_xlabel('$tenure$'); ice_ax.set_ylabel('$churn$'); ice_ax.set_title('ICE curves'); # Play with the following parameters of ice and ice_plot: num_grid_points, frac_to_plot, centered and plot_points.