Beispiel #1
0
    def analyze_ice_fi(self):
        """Individual Conditional Expectation - Feature Interaction.

        Imputes ``self.X`` with ``self.imputer``, computes ICE curves for the
        'Forty' column with ``self.pipe.predict`` as the prediction function,
        and plots them with every curve coloured by its 'Wt' value. A
        colorbar for 'Wt' is added to the plot.
        """
        # ice() is handed a DataFrame, so impute the missing data first and
        # wrap the imputed array with the feature names as columns
        train_X_imp = self.imputer.transform(self.X)
        train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)

        forty_ice_df = ice(data=train_X_imp_df,
                           column='Forty',
                           predict=self.pipe.predict)

        # new colormap for ICE plot
        cmap2 = plt.get_cmap('OrRd')
        # set color_by to Wt, in order to color each curve by that player's weight
        ice_plot(forty_ice_df, linewidth=0.5, color_by='Wt', cmap=cmap2)

        # ice_plot doesn't add a colorbar so we have to build one ourselves
        # approach taken from here:
        # https://stackoverflow.com/questions/8342549/matplotlib-add-colorbar-to-a-sequence-of-line-plots/11558629#11558629
        wt_vals = forty_ice_df.columns.get_level_values('Wt').values
        sm = plt.cm.ScalarMappable(cmap=cmap2,
                                   norm=plt.Normalize(vmin=wt_vals.min(),
                                                      vmax=wt_vals.max()))
        # the mappable needs an (empty) data array before a colorbar can be
        # drawn from it; use the public setter rather than the private ``_A``
        sm.set_array([])
        # the mappable is not attached to any Axes, so tell colorbar which
        # Axes to take space from (newer Matplotlib refuses to guess)
        plt.colorbar(sm, ax=plt.gca(), label='Wt')
        plt.ylabel('Pred. AV %ile')
        plt.xlabel('Forty')
Beispiel #2
0
def plot_ice_grid(dict_of_ice_dfs, data_df, features, ax_ylabel='', nrows=3,
                  ncols=3, figsize=(12, 12), sharex=False, sharey=True,
                  subplots_kws=None, rug_kws=None, **ice_plot_kws):
    """Plot ICE curves for different features in a grid of subplots.

    Each feature gets one subplot holding its ICE curves plus a rug of the
    feature's values along the bottom. Every subplot's y-axis is fixed to
    ``(0, ymax)``, where ``ymax`` is the largest value in the ICE frame of
    the *first* feature. Subplots left over after the last feature are
    switched off.

    :param dict_of_ice_dfs: mapping from feature name to the ICE DataFrame
        that is passed to ``ice_plot``
    :param data_df: data indexed by feature name; ``data_df[f]`` supplies
        the rug values
    :param features: sequence of feature names to plot (must be non-empty,
        since the first entry determines ``ymax``)
    :param ax_ylabel: y-axis label applied to every subplot
    :param nrows: number of grid rows
    :param ncols: number of grid columns
    :param figsize: figure size forwarded to ``plt.subplots``
    :param sharex: forwarded to ``plt.subplots``
    :param sharey: forwarded to ``plt.subplots``
    :param subplots_kws: extra keyword arguments for ``plt.subplots``
        (defaults to none)
    :param rug_kws: keyword arguments for the rug (defaults to
        ``{'color': 'k'}``)
    :param ice_plot_kws: extra keyword arguments forwarded to ``ice_plot``
    :return: the created figure
    """
    # build the defaults per call instead of sharing one mutable dict
    # between all calls
    subplots_kws = {} if subplots_kws is None else subplots_kws
    rug_kws = {'color': 'k'} if rug_kws is None else rug_kws

    fig, _ = plt.subplots(nrows=nrows,
                          ncols=ncols,
                          figsize=figsize,
                          sharex=sharex,
                          sharey=sharey,
                          **subplots_kws)
    # plt.subplots returns a bare Axes (not an array) for a 1x1 grid, so
    # take the flat, row-major list of axes from the figure instead
    flat_axes = fig.axes

    # max value for y-axis based on water-depth-min which goes through the
    # whole range
    ymax = dict_of_ice_dfs[features[0]].values.max()

    # for each feature plot the ice curves and add a rug at the bottom of
    # the subplot
    for f, ax in zip(features, flat_axes):
        ice_plot(dict_of_ice_dfs[f], ax=ax, **ice_plot_kws)
        # add the rug
        sns.distplot(data_df[f], ax=ax, hist=False, kde=False,
                     rug=True, rug_kws=rug_kws)
        ax.set_ylabel(ax_ylabel)
        ax.set_ylim(0, ymax)
        sns.despine()

    # get rid of blank plots
    for ax in flat_axes[len(features):]:
        ax.axis('off')
    return fig
def plot_data_vs_ice(pred_function, ylabel, X, feature_name, feature_label, color_by=None, legend_key=None, alpha=0.15):
    """Show the predictions as a scatter plot above the matching ICE curves.

    The upper panel scatters ``pred_function(X)`` against ``X[feature_name]``;
    the lower panel draws the ICE curves computed for ``feature_name``.
    Colouring by ``color_by`` (with a relabelled legend) is applied only when
    both ``color_by`` and ``legend_key`` are given; otherwise both panels are
    drawn uncoloured.

    :param pred_function: callable applied to ``X`` to obtain predictions
    :param ylabel: y-axis label for both panels
    :param X: data passed to ``ice`` and indexed by column name
    :param feature_name: column whose ICE curves are plotted
    :param feature_label: x-axis label for both panels
    :param color_by: column used to colour points and curves
    :param legend_key: mapping from legend entry index to replacement text
    :param alpha: transparency for points and curves
    """
    ice_df = ice(X, feature_name, pred_function, num_grid_points=None)
    fig, (data_ax, ice_ax) = plt.subplots(2, 1, sharex=False, sharey=True,
                                          figsize=(15, 20))
    fig.subplots_adjust(hspace=0.15, wspace=0)

    coloured = color_by is not None and legend_key is not None
    if coloured:
        points = data_ax.scatter(X[feature_name],
                                 pred_function(X),
                                 c=X[color_by], alpha=alpha)
        handles, labels = points.legend_elements()
        legend = data_ax.legend(handles, labels, loc='best')
        # swap the automatic legend texts for the caller-supplied ones
        for position, text in legend_key.items():
            legend.get_texts()[position].set_text(text)
        ice_plot(ice_df, color_by=color_by, alpha=alpha, ax=ice_ax)
    else:
        data_ax.scatter(X[feature_name],
                        pred_function(X),
                        alpha=alpha)
        ice_plot(ice_df, alpha=alpha, ax=ice_ax)

    # both panels share labels and differ only in their title
    for panel, title in ((data_ax, 'Data'), (ice_ax, 'ICE Curves')):
        panel.set_xlabel(feature_label, fontsize=12)
        panel.set_ylabel(ylabel, fontsize=12)
        panel.set_title(title, fontsize=16)
    plt.show()
Beispiel #4
0
def plot_ice_grid(dict_of_ice_dfs, data_df, features, ax_ylabel='', nrows=3,
                  ncols=3, figsize=(12, 12), sharex=False, sharey=True,
                  subplots_kws=None, rug_kws=None, **ice_plot_kws):
    """Plot ICE curves for different features in a grid of subplots.

    Each feature gets one titled subplot holding its ICE curves plus a rug
    of the feature's values along the bottom. Subplots left over after the
    last feature are switched off.

    :param dict_of_ice_dfs: mapping from feature name to the ICE DataFrame
        that is passed to ``ice_plot``
    :param data_df: data indexed by feature name; ``data_df[f]`` supplies
        the rug values
    :param features: sequence of feature names (strings) to plot
    :param ax_ylabel: y-axis label applied to every subplot
    :param nrows: number of grid rows
    :param ncols: number of grid columns
    :param figsize: figure size forwarded to ``plt.subplots``
    :param sharex: forwarded to ``plt.subplots``
    :param sharey: forwarded to ``plt.subplots``
    :param subplots_kws: extra keyword arguments for ``plt.subplots``
        (defaults to none)
    :param rug_kws: keyword arguments for the rug (defaults to
        ``{'color': 'k'}``)
    :param ice_plot_kws: extra keyword arguments forwarded to ``ice_plot``
    :return: the created figure
    """
    # build the defaults per call instead of sharing one mutable dict
    # between all calls
    subplots_kws = {} if subplots_kws is None else subplots_kws
    rug_kws = {'color': 'k'} if rug_kws is None else rug_kws

    fig, _ = plt.subplots(nrows=nrows,
                          ncols=ncols,
                          figsize=figsize,
                          sharex=sharex,
                          sharey=sharey,
                          **subplots_kws)
    # plt.subplots returns a bare Axes (not an array) for a 1x1 grid, so
    # take the flat, row-major list of axes from the figure instead
    flat_axes = fig.axes

    # for each feature plot the ice curves and add a rug at the bottom of
    # the subplot
    for f, ax in zip(features, flat_axes):
        ice_plot(dict_of_ice_dfs[f], ax=ax, **ice_plot_kws)
        # add the rug
        sns.distplot(data_df[f], ax=ax, hist=False, kde=False,
                     rug=True, rug_kws=rug_kws)
        # autoscale_y is defined elsewhere; presumably it refits the y-axis
        # to the plotted data -- TODO confirm
        autoscale_y(ax)
        ax.set_title('feature = ' + f)
        ax.set_ylabel(ax_ylabel)
        sns.despine()

    # get rid of blank plots
    for ax in flat_axes[len(features):]:
        ax.axis('off')

    return fig
Beispiel #5
0
    def analyze_ice(self):
        """Individual Conditional Expectation.

        Imputes ``self.X`` with ``self.imputer``, computes ICE curves for the
        'Forty' column with ``self.pipe.predict`` as the prediction function,
        and draws them as thin grey lines.
        """
        # ice() is handed a DataFrame, so the imputed array is wrapped with
        # the feature names as column labels
        imputed_df = pd.DataFrame(self.imputer.transform(self.X),
                                  columns=self.features)

        curves = ice(data=imputed_df, column='Forty', predict=self.pipe.predict)

        ice_plot(curves, c='dimgray', linewidth=0.3)
        plt.xlabel('Forty')
        plt.ylabel('Pred. AV %ile')
def plotIce(data, pr):
    '''
    Plot and save one ICE figure for every single-column entry returned by
    ``pr.unsortedColumnCombinations``; tuple entries are skipped.

    Note: the columns listed in ``pr.listOfNumericalColumns`` are rewritten
    in place on ``data`` (rounded to two decimals and converted to strings)
    before ``pr.encode`` is applied.

    :param data: pandas dataframe with datasets where each row represents a dataset
    :param pr: Predictor of ML-System; supplies the result column name
        (``pr.resultColumn``), the encoding and the predict function
    saves and plots ICE
    '''
    pr.setReturnDistanceOfClass(True)
    resultColumnName = pr.resultColumn
    for i in pr.listOfNumericalColumns:
        data[i] = data[i].astype(float).round(2).astype(str)
    data = pr.encode(data)

    columnCombinations = pr.unsortedColumnCombinations(data, resultColumnName)
    for columnCombination in columnCombinations:
        if not isinstance(columnCombination, tuple):
            iceResult = pIce.ice(data,
                                 columnCombination,
                                 pr.predict,
                                 num_grid_points=None)

            ax = pIce.ice_plot(iceResult,
                               frac_to_plot=1.,
                               plot_points=True,
                               point_kwargs=None,
                               x_quantile=False,
                               plot_pdp=True,
                               centered=False,
                               centered_quantile=0.,
                               color_by=None,
                               cmap=None,
                               ax=None,
                               pdp_kwargs=None)
            ax.set_ylabel("Distance to Hyperplane of true result")
            ax.set_xlabel(columnCombination)
            ax.set_title("ICE for " + columnCombination)

            # style every curve as a thin black line, labelled by position
            lines = ax.lines
            for lineIndex, line in enumerate(lines):
                line.set_label("Dataset " + str(lineIndex))
                line.set_color("k")
                line.set_linewidth(0.5)
            # the last line is treated as the PDP: relabel it and make it
            # stand out (thicker and red) via the public setters
            pdpLine = lines[-1]
            pdpLine.set_label("Pdp")
            pdpLine.set_linewidth(1)
            pdpLine.set_color("r")
            #ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

            # tick labels come from the encoding dictionary; its first entry
            # is skipped -- NOTE(review): reason not visible here, confirm
            xValues = pr.encodingDictionary[columnCombination]
            ax.set_xticks(np.arange(1, len(xValues), 1))
            ax.set_xticklabels(xValues[1:])
            ax.tick_params(axis='both', which='major', labelsize=6)
            ax.tick_params(axis='both', which='minor', labelsize=6)
            plt.xticks(rotation=90)
            saveName = "ice" + str(columnCombination)
            save(saveName, plt=plt)


# For the variable TENURE, create its ICE function.
# As mentioned earlier, we are using the predicted probability that a customer
# will churn as target variable.
# 72 grid points are used because tenure can take 72 values. The more grid
# points, the more accurate it would be.
ice_tenure = ice(X_train2, 'tenure', regressor.predict, num_grid_points=72)
ice_tenure.head()  # Each column corresponds to a datapoint

# Data points plot and ICE plot side by side. Run the two parts together to
# obtain both plots.
fig, (data_ax, ice_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(16, 6))
data_ax.scatter(X_train2.tenure, y2_pred2, c='k', alpha=0.5)
# both '$' delimiters are needed for the label to be typeset like the
# other axis labels
data_ax.set_xlabel('$tenure$')
data_ax.set_ylabel('$churn$')
data_ax.set_title('Data')
# This is a first version of ICE. It is too crowded and uncentered, which we
# will try to modify below (e.g. with frac_to_plot, the fraction of ICE curves
# to plot).
ice_plot(ice_tenure, ax=ice_ax, plot_points=False, linewidth=0.2, plot_pdp=True)
ice_ax.set_xlabel('$tenure$')
ice_ax.set_ylabel('$churn$')
ice_ax.set_title('ICE curves')

# Play with the following parameters of ice and ice_plot: num_grid_points,
# frac_to_plot, centered and plot_points.

# New ICE plots, still for tenure and with only a fraction of the total number
# of instances being considered: first uncentered, then centered.
# (The function is ice_plot, as used above; the name is case-sensitive.)
ice_plot(ice_tenure, plot_points=False, linewidth=0.2, plot_pdp=True, frac_to_plot=0.1)
plt.title('Uncentered ICE for a fraction of instances')
ice_plot(ice_tenure, plot_points=False, linewidth=0.2, plot_pdp=True, frac_to_plot=0.1, centered=True)
plt.title('Centered ICE for a fraction of instances')
plt.show()

# PDP for 'tenure': created manually from definition of 'pycebox.ice.pdp'
# (the mean over all ICE curves at each grid point)
plt.plot(ice_tenure.index, ice_tenure.mean(axis=1))