Exemple #1
0
    def analyze_ice_grid(self):
        """Individual Conditional Expectation - Feature Interaction Grid.

        Imputes ``self.X`` with ``self.imputer``, computes one ICE frame per
        entry of ``self.features`` using ``self.estimator.predict`` and hands
        them to ``plot_ice_grid`` with the PDP overlay enabled.
        """
        # The ICE helpers receive a DataFrame labelled with the feature names.
        imputed_frame = pd.DataFrame(self.imputer.transform(self.X),
                                     columns=self.features)

        # Feature name -> ICE frame, one entry per plot of the grid.
        ice_by_feature = {}
        for feature in self.features:
            ice_by_feature[feature] = ice(data=imputed_frame,
                                          column=feature,
                                          predict=self.estimator.predict)

        pdp_style = {'c': 'red', 'linewidth': 3}
        grid_fig = plot_ice_grid(ice_by_feature,
                                 imputed_frame,
                                 self.features,
                                 ax_ylabel='Pred. AV %ile',
                                 alpha=0.3,
                                 plot_pdp=True,
                                 pdp_kwargs=pdp_style,
                                 linewidth=0.5,
                                 c='dimgray')
        grid_fig.tight_layout()
        grid_fig.suptitle('ICE plots (training data)')
        # Lower the top edge of the subplot area, presumably so the suptitle
        # does not overlap the grid.
        grid_fig.subplots_adjust(top=0.89)
def plot_data_vs_ice(pred_function, ylabel, X, feature_name, feature_label, color_by=None, legend_key=None, alpha=0.15):
    """Show predictions versus one feature, stacked above that feature's ICE curves.

    :param pred_function: callable applied to ``X`` (and passed to ``ice``) to obtain predictions
    :param ylabel: y-axis label used on both plots
    :param X: data indexed by column name; passed to ``ice`` and ``pred_function``
    :param feature_name: column of ``X`` plotted on the x-axis and analysed by ``ice``
    :param feature_label: x-axis label used on both plots
    :param color_by: column of ``X`` used for colouring; only honoured when
        ``legend_key`` is also given
    :param legend_key: mapping from legend entry position to the text shown there
    :param alpha: transparency of the scatter points and of the ICE curves
    """
    curves = ice(X, feature_name, pred_function, num_grid_points=None)
    fig, (data_ax, ice_ax) = plt.subplots(2, 1, sharex=False, sharey=True,
                                          figsize=(15, 20))
    fig.subplots_adjust(hspace=0.15, wspace=0)

    use_colors = color_by is not None and legend_key is not None
    if use_colors:
        points = data_ax.scatter(X[feature_name], pred_function(X),
                                 c=X[color_by], alpha=alpha)
        legend = data_ax.legend(*points.legend_elements(), loc='best')
        # Replace the automatically generated legend texts with the caller's.
        legend_texts = legend.get_texts()
        for position, text in legend_key.items():
            legend_texts[position].set_text(text)
        ice_plot(curves, color_by=color_by, alpha=alpha, ax=ice_ax)
    else:
        data_ax.scatter(X[feature_name], pred_function(X), alpha=alpha)
        ice_plot(curves, alpha=alpha, ax=ice_ax)

    # Both plots share the same axis labels; only the titles differ.
    for axis, title in ((data_ax, 'Data'), (ice_ax, 'ICE Curves')):
        axis.set_xlabel(feature_label, fontsize=12)
        axis.set_ylabel(ylabel, fontsize=12)
        axis.set_title(title, fontsize=16)
    plt.show()
Exemple #3
0
    def analyze_ice_fi(self):
        """Individual Conditional Expectation - Feature Interaction.

        Plots the ICE curves of the ``'Forty'`` feature, colouring each curve
        by the ``'Wt'`` level of the ICE frame's columns, and adds a colorbar
        spanning the observed ``'Wt'`` range.
        """
        # pycebox likes the data to be in a DataFrame so let's create one with
        # our imputed data; we first need to impute the missing data.
        train_X_imp = self.imputer.transform(self.X)
        train_X_imp_df = pd.DataFrame(train_X_imp, columns=self.features)

        forty_ice_df = ice(data=train_X_imp_df,
                           column='Forty',
                           predict=self.pipe.predict)

        # new colormap for ICE plot
        cmap2 = plt.get_cmap('OrRd')
        # set color_by to Wt, in order to color each curve by that player's weight
        ice_plot(forty_ice_df, linewidth=0.5, color_by='Wt', cmap=cmap2)
        # Remember the axes that were just drawn on: the colorbar below must be
        # told explicitly which axes to take space from.
        ax = plt.gca()

        # ice_plot doesn't return a colorbar so we have to add one
        # hack to add in colorbar taken from here:
        # https://stackoverflow.com/questions/8342549/matplotlib-add-colorbar-to-a-sequence-of-line-plots/11558629#11558629
        wt_vals = forty_ice_df.columns.get_level_values('Wt').values
        sm = plt.cm.ScalarMappable(cmap=cmap2,
                                   norm=plt.Normalize(vmin=wt_vals.min(),
                                                      vmax=wt_vals.max()))
        # Older matplotlib needs an array on the scalar mappable or the
        # colorbar raises; use the public setter instead of poking ``sm._A``.
        sm.set_array([])
        # ``sm`` is not attached to any axes, so pass ``ax`` explicitly; newer
        # matplotlib refuses to guess which axes to use.
        plt.colorbar(sm, ax=ax, label='Wt')
        ax.set_ylabel('Pred. AV %ile')
        ax.set_xlabel('Forty')
Exemple #4
0
def test_ice_two_samples_two_points():
    """ICE of ``x0`` on a 2x2 identity frame with a row-product predictor."""
    frame = pd.DataFrame(np.eye(2), columns=['x0', 'x1'])

    def row_product(X):
        return X.prod(axis=1)

    actual = ice.ice(frame, 'x0', row_product)

    expected = pd.DataFrame(
        np.array([[0, 0], [0, 1]]),
        columns=pd.Series([0., 1.], name='x1'),
        index=pd.Series([0., 1.], name='x0'),
    )

    # Element-wise comparison; every cell must match.
    matches = actual == expected
    assert matches.all().all()
Exemple #5
0
def test_ice_one_sample_one_point():
    """ICE of ``x1`` for a single row with a constant predictor."""
    frame = pd.DataFrame(np.array([[0, 1]]), columns=['x0', 'x1'])

    def always_minus_one(X):
        return -1

    actual = ice.ice(frame, 'x1', always_minus_one)

    expected = pd.DataFrame(
        np.array([-1]),
        columns=pd.Series(0, name='x0'),
        index=pd.Series(1, name='x1'),
    )

    # Element-wise comparison; every cell must match.
    matches = actual == expected
    assert matches.all().all()
Exemple #6
0
def test_ice_one_sample_one_point():
    """A one-row frame and a constant predictor yield a single ICE cell."""
    sample = np.array([[0, 1]])
    frame = pd.DataFrame(sample, columns=['x0', 'x1'])

    def constant_prediction(X):
        return -1

    result = ice.ice(frame, 'x1', constant_prediction)

    wanted_columns = pd.Series(0, name='x0')
    wanted_index = pd.Series(1, name='x1')
    wanted = pd.DataFrame(np.array([-1]),
                          columns=wanted_columns,
                          index=wanted_index)

    cellwise_equal = result == wanted
    assert cellwise_equal.all().all()
def plotIce(data, pr):
    '''
    Plots and saves one ICE plot for every non-tuple column combination.

    :param data: pandas dataframe with datasets where each row represents a dataset
    :param pr: Predictor of ML-System; also supplies the result column name
        (``pr.resultColumn``), the numerical columns to round, the encoder and
        the encoding dictionary used for the x tick labels

    The passed dataframe is not modified; rounding and encoding are applied to
    a copy. Each plot is handed to ``save`` under the name ``"ice" + column``.
    '''
    pr.setReturnDistanceOfClass(True)
    resultColumnName = pr.resultColumn

    # Work on a copy so that turning the numerical columns into strings below
    # does not leak back into the caller's dataframe.
    data = data.copy()
    for column in pr.listOfNumericalColumns:
        data[column] = data[column].astype(float).round(2).astype(str)
    data = pr.encode(data)

    columnCombinations = pr.unsortedColumnCombinations(data, resultColumnName)
    for columnCombination in columnCombinations:
        # Only single columns are plotted; tuple combinations are skipped.
        if isinstance(columnCombination, tuple):
            continue

        iceResult = pIce.ice(data,
                             columnCombination,
                             pr.predict,
                             num_grid_points=None)

        ax = pIce.ice_plot(iceResult,
                           frac_to_plot=1.,
                           plot_points=True,
                           point_kwargs=None,
                           x_quantile=False,
                           plot_pdp=True,
                           centered=False,
                           centered_quantile=0.,
                           color_by=None,
                           cmap=None,
                           ax=None,
                           pdp_kwargs=None)
        ax.set_ylabel("Distance to Hyperplane of true result")
        ax.set_xlabel(columnCombination)
        ax.set_title(f"ICE for {columnCombination}")

        # Style every curve as a thin black line ...
        lines = ax.lines
        for lineIndex, line in enumerate(lines):
            line.set_label("Dataset " + str(lineIndex))
            line.set_color("k")
            line.set_linewidth(0.5)
        # ... then restyle the last line, which this code treats as the PDP
        # curve (plot_pdp=True above). Use the public setter: assigning to a
        # plain ``linewidth`` attribute has no effect on a matplotlib line.
        pdpLine = lines[-1]
        pdpLine.set_label("Pdp")
        pdpLine.set_linewidth(1)
        pdpLine.set_color("r")
        #ax.legend(loc='upper left', bbox_to_anchor=(1, 1))

        # Label the ticks 1..len-1 with the encoded values, skipping the first
        # entry of the encoding dictionary.
        xValues = pr.encodingDictionary[columnCombination]
        ax.set_xticks(np.arange(1, len(xValues), 1))
        ax.set_xticklabels(xValues[1:])
        ax.tick_params(axis='both', which='major', labelsize=6)
        ax.tick_params(axis='both', which='minor', labelsize=6)
        plt.xticks(rotation=90)
        saveName = "ice" + str(columnCombination)
        save(saveName, plt=plt)
Exemple #8
0
def test_ice_one_sample_one_point():
    """Single row, constant predictor; columns are a (data_x1, x0) MultiIndex."""
    frame = pd.DataFrame(np.array([[0, 1]]), columns=['x0', 'x1'])

    def always_minus_one(X):
        return -1

    actual = ice.ice(frame, 'x1', always_minus_one)

    column_index = pd.MultiIndex.from_tuples([(1, 0)],
                                             names=['data_x1', 'x0'])
    expected = pd.DataFrame(
        np.array([-1]),
        columns=column_index,
        index=pd.Series(1, name='x1'),
    )

    # Element-wise comparison; every cell must match.
    matches = actual == expected
    assert matches.all().all()
Exemple #9
0
def test_ice_two_samples_two_points():
    """Two rows of the 2x2 identity, predictor is the product across each row."""
    identity = np.eye(2)
    frame = pd.DataFrame(identity, columns=['x0', 'x1'])

    def product_of_features(X):
        return X.prod(axis=1)

    result = ice.ice(frame, 'x0', product_of_features)

    wanted_values = np.array([[0, 0],
                              [0, 1]])
    wanted = pd.DataFrame(wanted_values,
                          columns=pd.Series([0., 1.], name='x1'),
                          index=pd.Series([0., 1.], name='x0'))

    cellwise_equal = result == wanted
    assert cellwise_equal.all().all()
Exemple #10
0
def test_ice_two_samples_two_points():
    """Two identity rows, row-product predictor, (data_x0, x1) MultiIndex columns."""
    frame = pd.DataFrame(np.eye(2), columns=['x0', 'x1'])

    def row_product(X):
        return X.prod(axis=1)

    actual = ice.ice(frame, 'x0', row_product)

    column_index = pd.MultiIndex.from_tuples([(0, 1), (1, 0)],
                                             names=['data_x0', 'x1'])
    expected = pd.DataFrame(
        np.array([[0, 0],
                  [1, 0]]),
        columns=column_index,
        index=pd.Series([0., 1.], name='x0'),
    )

    # Element-wise comparison; every cell must match.
    matches = actual == expected
    assert matches.all().all()
Exemple #11
0
def test_ice_num_grid_points():
    """ICE of ``x2`` on the 3x3 identity with a linear predictor and 5 grid points requested."""
    frame = pd.DataFrame(np.eye(3), columns=['x0', 'x1', 'x2'])

    def weighted_sum(X):
        # Linear model with weights (1, 2, 3), returned as a column vector.
        return X.dot(np.array([[1., 2., 3.]]).T)

    actual = ice.ice(frame, 'x2', weighted_sum, num_grid_points=5)

    column_index = pd.MultiIndex.from_tuples(
        [(0, 0, 1), (0, 1, 0), (1, 0, 0)],
        names=['data_x2', 'x0', 'x1'],
    )
    expected = pd.DataFrame(
        np.array([[2., 1., 0.],
                  [3.5, 2.5, 1.5],
                  [5., 4., 3.]]),
        columns=column_index,
        index=pd.Series([0., 0.5, 1.], name='x2'),
    )

    # Element-wise comparison; every cell must match.
    matches = actual == expected
    assert matches.all().all()
Exemple #12
0
def test_ice_num_grid_points():
    """Linear predictor on the 3x3 identity; columns are an (x0, x1) MultiIndex."""
    identity = np.eye(3)
    frame = pd.DataFrame(identity, columns=['x0', 'x1', 'x2'])

    def linear_model(X):
        # Weights (1, 2, 3) applied to each row, returned as a column vector.
        return X.dot(np.array([[1., 2., 3.]]).T)

    result = ice.ice(frame, 'x2', linear_model, num_grid_points=5)

    wanted_columns = pd.MultiIndex.from_tuples([(0, 0), (0, 1), (1, 0)],
                                               names=['x0', 'x1'])
    wanted_values = np.array([[0., 2., 1.],
                              [1.5, 3.5, 2.5],
                              [3., 5., 4.]])
    wanted = pd.DataFrame(wanted_values,
                          columns=wanted_columns,
                          index=pd.Series([0., 0.5, 1.], name='x2'))

    cellwise_equal = result == wanted
    assert cellwise_equal.all().all()
Exemple #13
0
def ICEPlot(data, model, features):
    """Draw a grid of ICE plots for ``features`` and return the ICE frames.

    One ICE frame is computed per feature with ``model.predict``; the frames
    are passed to ``plot_ice_grid`` with a 5x4 layout and the PDP overlay
    enabled.

    Returns the dict mapping each feature to its ICE frame.
    """
    # Feature name -> ICE frame, one entry per plot of the grid.
    ice_frames = {}
    for name in features:
        ice_frames[name] = ice(data=data, column=name, predict=model.predict)

    grid_options = {
        'ax_ylabel': 'Pred. Ray Num.',
        'nrows': 5,
        'ncols': 4,
        'alpha': 0.3,
        'plot_pdp': True,
        'pdp_kwargs': {'c': 'blue', 'linewidth': 2.0},
        'linewidth': 0.5,
        'c': 'dimgray',
    }
    figure = plot_ice_grid(ice_frames, data, features, **grid_options)
    # NOTE: tight_layout() is not applied to this figure.
    figure.suptitle('ICE plot: Classification - all training data')
    figure.subplots_adjust(top=0.89)

    return ice_frames
Exemple #14
0
    def analyze_ice(self):
        """Individual Conditional Expectation.

        Plots the ICE curves of the ``'Forty'`` feature, computed on the
        imputed ``self.X`` with ``self.pipe.predict``.
        """
        # pycebox is given a DataFrame, so the imputed values are wrapped in
        # one that carries the feature names.
        imputed_values = self.imputer.transform(self.X)
        imputed_frame = pd.DataFrame(imputed_values, columns=self.features)

        forty_curves = ice(
            data=imputed_frame,
            column='Forty',
            predict=self.pipe.predict,
        )

        ice_plot(forty_curves, c='dimgray', linewidth=0.3)
        plt.xlabel('Forty')
        plt.ylabel('Pred. AV %ile')
Exemple #15
0
    def analyze_ice_gc(self):
        """Individual Conditional Expectation - Centered Feature Interaction Grid.

        Imputes ``self.X`` with ``self.imputer``, computes one ICE frame per
        entry of ``self.features`` using ``self.estimator.predict`` and hands
        them to ``plot_ice_grid`` with ``centered=True`` on a 4x2 layout.
        """
        # The ICE helpers receive a DataFrame labelled with the feature names.
        imputed_frame = pd.DataFrame(self.imputer.transform(self.X),
                                     columns=self.features)

        # Feature name -> ICE frame, one entry per plot of the grid.
        ice_by_feature = {}
        for feature in self.features:
            ice_by_feature[feature] = ice(data=imputed_frame,
                                          column=feature,
                                          predict=self.estimator.predict)

        grid_options = {
            'ax_ylabel': 'Pred AV %ile (centered)',
            'alpha': .2,
            'plot_points': False,
            'plot_pdp': True,
            'pdp_kwargs': {"c": "red", "linewidth": 3},
            'linewidth': 0.5,
            'c': 'dimgray',
            'centered': True,
            'sharey': False,
            'nrows': 4,
            'ncols': 2,
            'figsize': (11, 16),
        }
        grid_fig = plot_ice_grid(ice_by_feature,
                                 imputed_frame,
                                 self.features,
                                 **grid_options)
        grid_fig.tight_layout()
        grid_fig.suptitle('Centered ICE plots (training data)')
        # Lower the top edge of the subplot area, presumably so the suptitle
        # does not overlap the grid.
        grid_fig.subplots_adjust(top=0.9)
    if plot_points:
        ax.scatter(point_x, point_y, zorder=10, **(point_kwargs or {}))

    if plot_pdp:
        pdp_kwargs = pdp_kwargs or {}
        pdp_data = pdp(ice_data)
        ax.plot(x, pdp_data, **pdp_kwargs)

    return ax



# For the variable TENURE, create its ICE function.
# As mentioned earlier, we are using the predicted probability that a customer will churn as target variable.
# Here 72 grid points are used because tenure can take 72 values. The more grid points, the more accurate it would be.
ice_tenure = ice(X_train2, 'tenure', regressor.predict, num_grid_points=72)
ice_tenure.head()  # Each column corresponds to a datapoint

# Data points plot and ICE plot. Run the 2 parts together to obtain both plots.
fig, (data_ax, ice_ax) = plt.subplots(ncols=2, sharex=True, sharey=True, figsize=(16, 6))
data_ax.scatter(X_train2.tenure, y2_pred2, c='k', alpha=0.5)
# Both '$' delimiters are needed for the label to be rendered as mathtext,
# matching the label of the ICE axis below.
data_ax.set_xlabel('$tenure$')
data_ax.set_ylabel('$churn$')
data_ax.set_title('Data')
# This is a first version of ICE. It is too crowded and uncentered, which we will try to modify.
# frac_to_plot (e.g. frac_to_plot=0.1) is the fraction of ICE curves to plot.
ice_plot(ice_tenure, ax=ice_ax, plot_points=False, linewidth=0.2, plot_pdp=True)
ice_ax.set_xlabel('$tenure$')
ice_ax.set_ylabel('$churn$')
ice_ax.set_title('ICE curves')

# Play with the following parameters of ice and ice_plot: num_grid_points, frac_to_plot, centered and plot_points.