Example #1
    def dependence3d(self, forest, train, feature_set):
        print "******************this is the output of dependences of features"
        fig = plt.figure()
        pdp, (x_axis, y_axis) = partial_dependence(forest,
                                                   feature_set,
                                                   X=train)
        XX, YY = np.meshgrid(x_axis, y_axis)
        Z = pdp[0].reshape(np.size(x_axis), np.size(y_axis)).T  # align with XX and YY
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX,
                               YY,
                               Z,
                               rstride=1,
                               cstride=1,
                               cmap=plt.cm.BuPu)
        # ax.set_xlabel(names[target_feature[0]])
        # ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        #  pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of house value on median age and '
                     'average occupancy')
        plt.subplots_adjust(top=0.9)

        plt.show()
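A note on the reshape step above: the legacy `partial_dependence` returns the two-feature surface flattened with the second axis varying fastest, so it has to be reshaped to (len(x_axis), len(y_axis)) and then transposed to line up with `np.meshgrid`. A minimal shape check of that convention (a sketch, assuming the deprecated `sklearn.ensemble.partial_dependence` API used throughout these examples):

import numpy as np

x_axis = np.linspace(0., 1., 4)          # grid for the first target feature
y_axis = np.linspace(0., 1., 3)          # grid for the second target feature
XX, YY = np.meshgrid(x_axis, y_axis)     # both have shape (3, 4)

pdp = np.zeros((1, x_axis.size * y_axis.size))   # legacy return shape: (1, 12)
Z = pdp[0].reshape(x_axis.size, y_axis.size).T   # (3, 4), matches XX and YY
assert Z.shape == XX.shape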
Example #2
def main():
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names
    print('_' * 80)
    print("Training GBRT...")

    clf = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=0.1,
                                    loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")
    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf,
                                       X_train,
                                       features,
                                       feature_names=names,
                                       n_jobs=3,
                                       grid_resolution=50)
    fig.suptitle(
        'Partial dependence of house value on nonlocation features for the California housing dataset'
    )
    plt.subplots_adjust(top=0.9)
    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()

    fig = plt.figure()
    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf,
                                               target_feature,
                                               X=X_train,
                                               grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')

    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle(
        'Partial dependence of house value on median age and average occupancy'
    )
    plt.subplots_adjust(top=0.9)
    plt.show()
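All of these snippets target the legacy `sklearn.ensemble.partial_dependence` module, which was deprecated in scikit-learn 0.21 and removed in 0.24. A rough modern equivalent of the 2-D computation above, as a sketch assuming scikit-learn >= 1.3 (the replacement lives in `sklearn.inspection` and returns a dict-like Bunch whose grid axes sit under the `grid_values` key):

import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import partial_dependence

cal_housing = fetch_california_housing()
clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                learning_rate=0.1, loss='huber',
                                random_state=1)
clf.fit(cal_housing.data, cal_housing.target)

# The new API takes X positionally and returns a Bunch;
# 'average' has shape (n_outputs, len(axis_0), len(axis_1)).
res = partial_dependence(clf, cal_housing.data, features=[(1, 5)],
                         grid_resolution=20)
x_axis, y_axis = res["grid_values"]
XX, YY = np.meshgrid(x_axis, y_axis)
Z = res["average"][0].T  # transpose so rows follow YY, as in the plots above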
Example #3
def compute_f_vals(gbm, model_inds, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr[:, inds])
    uncentd_f_vals = partial_dependence.partial_dependence(
        gbm, model_inds[(inds, )], grid=feat_vals)[0][0]
    mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals) / arr.shape[0]
    f_vals = uncentd_f_vals - mean_uncentd_f_val
    return dict(zip(map(tuple, feat_vals), f_vals))
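`unique_rows_with_counts` is not shown in this example; a minimal stand-in with the same contract (the unique rows of a 2-D array together with their multiplicities), assuming NumPy >= 1.13:

import numpy as np

def unique_rows_with_counts(arr):
    # Unique rows and how often each occurs, in matching order.
    return np.unique(arr, axis=0, return_counts=True)

With that, `f_vals` are the raw partial-dependence values centered so that their count-weighted mean over the training rows is zero.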
Example #4
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    X_ = np.asarray(X)
    grid = np.unique(X_[:, 0])
    pdp_2, axes = partial_dependence(clf, [0], grid=grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)
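The test relies on module-level fixtures; in scikit-learn's own test suite they are the toy sample below, whose first column has exactly the 4 unique values the comment refers to (reproduced here as an assumption, so the test can run standalone):

import numpy as np
from numpy.testing import assert_array_equal
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble.partial_dependence import partial_dependence

# toy sample
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]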
Example #6
def main():
    # fetch California housing dataset
    try:
        cal_housing = fetch_california_housing()
    except HTTPError:
        print("Failed downloading california housing data.")
        return

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
Example #7
def test_partial_dependence_regressor():
    # Test partial dependence for regressor
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    pdp, axes = partial_dependence(
        clf, [0], X=boston.data, grid_resolution=grid_resolution)

    assert pdp.shape == (1, grid_resolution)
    assert axes[0].shape[0] == grid_resolution
Example #8
    def score(self, X_test, y_test):

        self.baseline = [self.baseline] * len(y_test)

        baseline_RMSE = np.sqrt(mean_squared_error(y_test, self.baseline))
        print('Baseline Root Mean Squared Error:', round(baseline_RMSE, 0))

        model_RMSE = np.sqrt(mean_squared_error(y_test, self.y_pred))
        print('Model Root Mean Squared Error:', round(model_RMSE, 0))

        print('Relative change in RMSE:',
              ((model_RMSE - baseline_RMSE) / baseline_RMSE))

        ### PD PLOT
        importances = self.model.feature_importances_
        sorted_imps = sorted(importances)[::-1]
        indices = np.argsort(importances)[::-1]
        names = self.X.columns[indices]
        N_COLS = 3

        pd_plots = [
            partial_dependence(self.model,
                               target_feature,
                               X=self.X,
                               grid_resolution=50)
            for target_feature in indices
        ]
        pd_plots = list(
            zip(([pdp[0][0] for pdp in pd_plots]),
                ([pdp[1][0] for pdp in pd_plots])))

        fig, axes = plt.subplots(nrows=3,
                                 ncols=N_COLS,
                                 sharey=True,
                                 figsize=(12.0, 8.0))

        for i, (y_axis, x_axis) in enumerate(pd_plots[0:(3 * N_COLS)]):
            ax = axes[i // N_COLS, i % N_COLS]
            ax.plot(x_axis, y_axis, color="purple")
            ax.set_xlim([np.min(x_axis), np.max(x_axis)])
            text_x_pos = np.min(x_axis) + 0.05 * (np.max(x_axis) -
                                                  np.min(x_axis))
            ax.text(text_x_pos,
                    7.5,
                    "Feature Importance " + str(round(sorted_imps[i], 2)),
                    fontsize=12,
                    alpha=0.7)
            ax.set_xlabel(names[i])
            ax.grid()

        plt.suptitle(
            "Partial Dependence Plots (Ordered by Feature Importance)",
            fontsize=16)
        plt.tight_layout(rect=[0, 0.03, 1, 0.95])
Example #9
def multi_case_partial_dependence(df, cases, ests, stdzrs,
                n_oversamps, c_true, c_pred):
    y_true_l = []
    y_hat_l = []
    y_proba_l = []
    feats_l = []

    fig, ax = plt.subplots(1,1,figsize=(6,4))
    for case, est, stdzr, n, c_t, c_p in zip(cases, ests, stdzrs,
            n_oversamps, c_true, c_pred):
        data_df = df.copy() # copy to read all columns after dropping
        print('case: {}'.format(case[0]))

        # drop other binary and probability column
        c_drop = [c for c in list(df.columns) if case[1] in c]
        data_df.drop(c_drop, axis=1, inplace=True)

        # train test split in time
        X_train, y_train, X_test, y_test = train_test_split_time(data_df,
            '2016-06-01', case[0])
        names = list(X_train.columns)
        features = [11, 12, 13, 14, (9, 18)]
        # plot
        fig, axs = plot_partial_dependence(est, X_train, features,
                                           feature_names=names,
                                           n_jobs=3, grid_resolution=50)
        fig.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

        print('Custom 3d plot via ``partial_dependence``')
        fig = plt.figure()

        target_feature = (9, 18)
        pdp, axes = partial_dependence(est, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu, edgecolor='k')
        ax.set_xlabel(names[target_feature[0]])
        ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        #  pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)

        plt.show()
Example #10
def test_partial_dependence_multiclass():
    # Test partial dependence for multi-class classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)

    grid_resolution = 25
    n_classes = clf.n_classes_
    pdp, axes = partial_dependence(
        clf, [0], X=iris.data, grid_resolution=grid_resolution)

    assert pdp.shape == (n_classes, grid_resolution)
    assert len(axes) == 1
    assert axes[0].shape[0] == grid_resolution
Example #11
    def partial_dependency_uncertainty(self, features, grid, percentiles):
        # Returns standard deviation of partial dependency curve

        pdps_cv = np.zeros((len(grid), len(self.models)), dtype=np.float64)
        # logging.debug('grid: {}; shape: {}'.format(grid, grid.shape))
        for i, cv_model in enumerate(self.models):
            pdps = skl_e_pd.partial_dependence(cv_model,
                                               features,
                                               grid,
                                               percentiles=percentiles)
            pdps_cv[:, i] = pdps[0]

        stds = np.std(pdps_cv, axis=1)
        return stds
Example #12
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:,0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    X_ = np.asarray(X)
    grid = np.unique(X_[:, 0])
    pdp_2, axes = partial_dependence(clf, [0], grid=grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)

    # with trivial (no-op) sample weights
    clf.fit(X, y, sample_weight=np.ones(len(y)))

    pdp_w, axes_w = partial_dependence(clf, [0], X=X, grid_resolution=5)

    assert pdp_w.shape == (1, 4)
    assert axes_w[0].shape[0] == 4
    assert_allclose(pdp_w, pdp)

    # with non-trivial sample weights
    clf.fit(X, y, sample_weight=sample_weight)

    pdp_w2, axes_w2 = partial_dependence(clf, [0], X=X, grid_resolution=5)

    assert pdp_w2.shape == (1, 4)
    assert axes_w2[0].shape[0] == 4
    assert np.all(np.abs(pdp_w2 - pdp_w) / np.abs(pdp_w) > 0.1)
Example #14
def calc_avg_pdp(clf):
    from sklearn.ensemble.partial_dependence import partial_dependence
    import pandas

    pdps = np.empty(clf.fit_clfs.shape +
                    (clf.feature_importances.shape[2], 2, 100))

    for i in range(0, clf.mask_num):
        for j in range(0, clf.mask_num):

            if i == j:
                pdps[i, j] = None
            else:
                for feature in range(0, clf.feature_importances.shape[2]):
                    pdp, a = partial_dependence(clf.fit_clfs[i, j], [feature],
                                                X=clf.c_data[i, j][0])
                    pdps[i, j, feature] = [pdp[0], a[0]]

    clf.pdps = np.ma.masked_array(pdps, mask=pandas.isnull(pdps))
Example #15
    def plot_partial_dependence_with_unc(self,
                                         gbm,
                                         feature_idx,
                                         percentiles=(0.05, 0.95),
                                         absolute_yscale=False,
                                         absolute_yticks=True,
                                         **fig_params):
        outcome_mean = np.mean(self.train_y)
        fig = plt.figure(**fig_params)
        ax = fig.add_subplot(1, 1, 1)

        pdps, axes = skl_e_pd.partial_dependence(gbm, [feature_idx],
                                                 X=self.train_X,
                                                 percentiles=percentiles)

        if absolute_yscale or absolute_yticks:
            pdps = pdps + outcome_mean

            # plt.xticks(locs, labels)
        stds = self.partial_dependency_uncertainty([feature_idx],
                                                   grid=axes[0],
                                                   percentiles=percentiles)

        pdp_uncertainty_plot = ax.fill_between(axes[0],
                                               pdps[0] - stds,
                                               pdps[0] + stds,
                                               alpha=0.2,
                                               color=self.pdp_color)
        pdp_plot, = ax.plot(axes[0], pdps[0], lw=5, color=self.pdp_color)
        if absolute_yscale:
            c_ylim = ax.get_ylim()
            ax.set_ylim(0, c_ylim[1])
        # if offset_mean and not offset_mean_labels:
        #     # fig.canvas.draw()
        #     ax.set_yticklabels(ax.get_yticks())
        #     labels_both = ax.get_yticklabels(which='both')

        #     for l in labels_both:
        #         l.set_text('{:.2f}'.format(outcome_mean + float(l.get_text())))

        #     ax.set_yticklabels(labels_both)

        return fig, ax, pdp_plot, pdp_uncertainty_plot
Example #17
def test_partial_dependence_sample_weight():
    # Test near perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    N = 1000
    rng = np.random.RandomState(123456)
    mask = rng.randint(2, size=N, dtype=bool)

    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    grid = np.arange(0, 1, 0.01)
    pdp = partial_dependence(clf, [1], grid=grid)

    assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99
Example #18
    def plot_3d(self,
                feature_3d,
                top=0.9,
                grid_resolution=50,
                rstride=1,
                cstride=1,
                cmap="jet",
                edgecolor='k',
                elev=22,
                azim=122):
        fig = plt.figure()
        pdp, axes = partial_dependence(gbrt=self.model,
                                       target_variables=feature_3d,
                                       X=self.feature_df,
                                       grid_resolution=grid_resolution)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX,
                               YY,
                               Z,
                               rstride=rstride,
                               cstride=cstride,
                               cmap=cmap,
                               edgecolor=edgecolor)
        ax.set_xlabel(self.feature_list[feature_3d[0]])
        ax.set_ylabel(self.feature_list[feature_3d[1]])
        ax.set_zlabel('Partial dependence')

        ax.view_init(elev=elev, azim=azim)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of house value on median\n'
                     'age and average occupancy')
        plt.subplots_adjust(top=top)

        plt.savefig(os.path.join(self.output_path, self.fig_file))
        plt.close()
Example #19
    priors = joblib.load(project_dir + '/data/raw/%s_priors' % prior_group)
    stacked_clf = joblib.load(target_dir + prior_group + '_' + method +
                              '_final_stacked_clf_' + str(k) + '.npy')
    X_train = joblib.load(target_dir + prior_group + '_' + method +
                          '_final_predictions_' + str(k) + '.npy')

    for x in [9]:
        for y in [25]:
            target_feature = (x, y)
            fig = plt.figure()
            names = [priors[target_feature[0]], priors[target_feature[1]]]
            print(
                'Convenience plot with ``partial_dependence_plots`` for %s and %s'
                % (names[0], names[1]))
            pdp, axes = partial_dependence(stacked_clf,
                                           target_feature,
                                           X=X_train,
                                           grid_resolution=50)
            XX, YY = np.meshgrid(axes[0], axes[1])
            Z = pdp[0].reshape(list(map(np.size, axes))).T
            ax = Axes3D(fig)
            surf = ax.plot_surface(XX,
                                   YY,
                                   Z,
                                   rstride=1,
                                   cstride=1,
                                   cmap=plt.cm.BuPu)
            ax.set_xlabel(names[0], fontsize=12)
            ax.set_ylabel(names[1], fontsize=12)
            ax.set_zlabel('Partial dependence', fontsize=12)
            ax.view_init(elev=12, azim=-142)
            plt.xticks([0, 0.5, 1])
Example #20
print()

features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print()
fig = pl.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                           X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=pl.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
pl.colorbar(surf)
pl.suptitle('Partial dependence of house value on median age and '
            'average occupancy')
pl.subplots_adjust(top=0.9)

pl.show()
Example #21
fig, axs = plot_partial_dependence(clf, X_train,
                                   features,
                                   feature_names=names,
                                   n_jobs=3,
                                   grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print()
fig = pl.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf,
                                           target_feature,
                                           X=X_train,
                                           grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=pl.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
pl.colorbar(surf)
pl.suptitle('Partial dependence of house value on median age and '
            'average occupancy')
pl.subplots_adjust(top=0.9)
Example #22
def main():
    # omit gender and children (as does the reference paper)
    X = df[["yearsmarried", "age", "religiousness", "occupation",
            "rating"]].values
    y = df['affairs'].values

    # shuffle data
    order = np.argsort(np.random.random(y.shape))
    X = X[order]
    y = y[order]

    names = ["yearsmarried", "age", "religiousness", "occupation", "rating"]

    print("features to be plotted on first graph: " + str(names))
    clf = GradientBoostingRegressor(n_estimators=100,
                                    max_depth=4,
                                    learning_rate=0.1,
                                    loss='huber',
                                    random_state=1)
    clf.fit(X, y)

    print('Convenience plot with ``partial_dependence_plots``')

    # features = [0, 5, 1, 2, (5, 1)]
    features = [0, 1, 2, 3, 4, (0, 1)]
    fig, axs = plot_partial_dependence(clf,
                                       X,
                                       features,
                                       feature_names=names,
                                       n_jobs=-1,
                                       grid_resolution=100,
                                       n_cols=3)
    fig.set_size_inches(10.5, 7.5)
    fig.suptitle('Partial dependence for amount of affairs\n'
                 'for the Affairs dataset.')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (0, 1)
    pdp, axes = partial_dependence(clf,
                                   target_feature,
                                   X=X,
                                   grid_resolution=100)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX,
                           YY,
                           Z,
                           rstride=1,
                           cstride=1,
                           cmap=plt.cm.BuPu,
                           edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of amount of affairs\n'
                 'for the amount of years married and the age.')
    plt.subplots_adjust(top=0.9)

    plt.show()
Example #23
####################################################################
# Inspect feature 0, 3, 5, 6, and the interaction between 5 and 0, and 5
# and 3
target_features = [0, 3, 5, 6, (5, 0), (5, 3)]
fig, axs = plot_partial_dependence(clf, X_train, target_features,
                                    feature_names=boston.feature_names,
                                    grid_resolution=30)
plt.tight_layout()

####################################################################
# Lower-level partial_dependence function
# ----------------------------------------

target_feature = (5, 0)
from sklearn.ensemble.partial_dependence import partial_dependence
partial_deps, grid = partial_dependence(clf, target_feature,
                                        X=X_train, grid_resolution=50)
import numpy as np
# The 2D coordinate grid (for plotting)
XX, YY = np.meshgrid(grid[0], grid[1])

# Reshape the partial deps on the grid
Z = partial_deps[0].reshape(list(map(np.size, grid))).T

####################################################################
# 3D plotting
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                        cmap=plt.cm.BuPu, edgecolor='k')
Example #24
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,
                                 max_depth=3, random_state=0).fit(X_train[X_train_relevant.columns], y_train)

#%%
names = X_train_relevant.columns
features = [0, 1, (1, 2)]
fig, axs = plot_partial_dependence(clf,
                                   X_train[X_train_relevant.columns],
                                   features,
                                   feature_names=names)


#%%

target_feature = (0, 1)
pdp, axes = partial_dependence(clf, target_feature,
                               X=X_train[X_train_relevant.columns], grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median\n'
             'age and average occupancy')
plt.subplots_adjust(top=0.9)
Example #25
    def ml_GradientBoostingClassifier2(self):
        # This example shows how to obtain partial dependence plots from a GradientBoostingRegressor trained on the California housing dataset.
        cal_housing = fetch_california_housing()

        # split 80/20 train-test
        X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                            cal_housing.target,
                                                            test_size=0.2,
                                                            random_state=1)
        names = cal_housing.feature_names

        print("Training GBRT...")
        clf = GradientBoostingRegressor(n_estimators=100,
                                        max_depth=4,
                                        learning_rate=0.1,
                                        loss='huber',
                                        random_state=1)
        clf.fit(X_train, y_train)
        print(" done.")

        print('Convenience plot with ``partial_dependence_plots``')

        features = [0, 5, 1, 2, (5, 1)]
        fig, axs = plot_partial_dependence(clf,
                                           X_train,
                                           features,
                                           feature_names=names,
                                           n_jobs=3,
                                           grid_resolution=50)
        fig.suptitle(
            'Partial dependence of house value on nonlocation features\n'
            'for the California housing dataset')
        plt.subplots_adjust(
            top=0.9)  # tight_layout causes overlap with suptitle

        print('Custom 3d plot via ``partial_dependence``')
        fig = plt.figure()

        target_feature = (1, 5)
        pdp, axes = partial_dependence(clf,
                                       target_feature,
                                       X=X_train,
                                       grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX,
                               YY,
                               Z,
                               rstride=1,
                               cstride=1,
                               cmap=plt.cm.BuPu,
                               edgecolor='k')
        ax.set_xlabel(names[target_feature[0]])
        ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        #  pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of house value on median\n'
                     'age and average occupancy')
        plt.subplots_adjust(top=0.9)

        plt.show()
Example #26
    The number of columns in the grid plot (default: 3).
percentiles : (low, high), default=(0.05, 0.95)
    The lower and upper percentile used to create the extreme values
    for the PDP axes.
grid_resolution : int, default=100
    The number of equally spaced points on the axes.
"""
my_plots = plot_partial_dependence(gbrt=my_model,
                                   features=[0, 1, 2],
                                   X=imputed_X,
                                   feature_names=cols_to_use,
                                   grid_resolution=100,
                                   n_cols=2)

partial_dependence?
"""
plot yourself with seaborn...

partial_dependence(gbrt, target_variables, grid=None, 
                   X=None, percentiles=(0.05, 0.95), 
                   grid_resolution=100)
"""
p_dep = partial_dependence(gbrt=my_model,
                           target_variables=[0, 1, 2],
                           X=imputed_X,
                           percentiles=(0.05, 0.95))

p_dep[0][0]  # y or PDP (joint over the three variables)
p_dep[1][0]  # x for Distance
p_dep[1][1]  # x for Land
p_dep[1][2]  # x for BuildingArea
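Per the docstring's suggestion to plot the output yourself: note that passing three `target_variables` in one call returns a single joint surface over the 3-D grid, so for one curve per feature it is simpler to call the function once per variable. A sketch, assuming `my_model`, `imputed_X`, and the feature names from the comments above:

import matplotlib.pyplot as plt

labels = ['Distance', 'Land', 'BuildingArea']  # names taken from the comments
fig, axes = plt.subplots(1, 3, figsize=(12, 4), sharey=True)
for i, ax in enumerate(axes):
    y_vals, x_vals = partial_dependence(gbrt=my_model, target_variables=[i],
                                        X=imputed_X)
    ax.plot(x_vals[0], y_vals[0])  # one grid axis, one PDP curve
    ax.set_xlabel(labels[i])
axes[0].set_ylabel('Partial dependence')
plt.show()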
Example #27
# 25 = 26 - 1: the cardinality of y reduced by one, since its values start at 0
label = len(ytrain.value_counts())
pdp_fig, pdp_axs = partial_dependence.plot_partial_dependence(gbc, Xtrain, feature_numbers, feature_names, label)
plt.subplots_adjust(top=1.5)  # tight_layout causes overlap with suptitle



#%%
from mpl_toolkits.mplot3d import Axes3D

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()

target_feature = (3,7)
pdp, axes = partial_dependence.partial_dependence(gbc, target_feature, X=Xtrain, grid_resolution=100)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(feature_names[target_feature[0]])
ax.set_ylabel(feature_names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.subplots_adjust(top=0.9)

plt.show()
Example #28
    def make_pdp(i, mean=0.0):
        pdp = partial_dependence(self.model, i, X=X, grid_resolution=100)
        feature = self.feature_names[i]
        feature_bins = [denorm.denormalize_feature_value(feature, x)
                        for x in list(pdp[1][0])]
        data = [x - mean for x in pdp[0]]
        return {"feature": feature, "featureBins": feature_bins, "data": data}
Example #29
    def partial_dependency_categorical_plot(self,
                                            gbm,
                                            feature_label,
                                            ax_limits_per_feature=None,
                                            overlay_box_plots=False,
                                            add_means=True,
                                            absolute_yscale=False,
                                            absolute_yticks=True,
                                            **fig_params):
        """Plots partial dependency for a categorical variable.

        arguments:
            absolute_yscale - absolute_scale (starts at zero) on y axis
            absolute_yticks - flag to subtract mean from y axis pabels
        Todo: split this methods into 2-3 smaller ones
        """
        legends = []
        legend_labels = []
        deltas = []
        deltas_unc = []
        labels = []
        means = []
        means_unc = []
        raw_data = []
        counts = []
        raw_stds = []
        width = 0.8
        if add_means:
            width = 0.35
        if ax_limits_per_feature is None:
            ax_limits_per_feature = {}
        outcome_mean = np.mean(self.train_y)
        for feature_index in self.categorical_features_ind[feature_label]:
            y, x = skl_e_pd.partial_dependence(gbm, [feature_index],
                                               X=self.train_X)

            stds = self.partial_dependency_uncertainty([feature_index],
                                                       grid=x[0],
                                                       percentiles=(0.0, 1.0))
            if len(x) == 0 or len(y) == 0:
                logging.debug(
                    'no results for feature_index {}'.format(feature_index))
            else:
                try:
                    delta = y[0][np.where(x[0] == 1)[0][0]]\
                            - y[0][np.where(x[0] == 0)[0][0]]
                    # if absolute_yscale or absolute_yticks:
                    #     logging.debug('original delta: {}'.format(delta))
                    #     delta += outcome_mean
                    #     logging.debug('original delta with mean: {}'.format(delta))
                    deltas.append(delta)
                    labels.append(self.feature_labels[feature_index].replace(
                        feature_label + '_', ''))
                    deltas_unc.append(np.sqrt(np.sum(stds**2)))

                    train_X_idx = self.train_X[
                        self.feature_labels[feature_index]] == 1
                    raw_data_subset = self.train_y[train_X_idx]
                    # if not absolute_yscale and not absolute_yticks:
                    #     raw_data_subset -= outcome_mean
                    raw_data_weights = self.train_weights[train_X_idx]
                    raw_data.append(raw_data_subset)
                    means.append(
                        np.average(raw_data_subset, weights=raw_data_weights))
                    means_unc.append(
                        self.mean_uncertainty(raw_data_subset,
                                              weights=raw_data_weights))
                    counts.append(int(np.sum(raw_data_weights)))
                    raw_stds.append(
                        pde.std(raw_data_subset, weights=raw_data_weights))
                except Exception as e:
                    logging.error('Cannot get partial dependence delta for \
feature index "{}" due to "{}"'.format(feature_index, e))
        idxs = np.argsort(deltas)
        plot_x = np.arange(idxs.shape[0]) + 0.5
        plot_deltas = np.array(deltas)[idxs]
        fig = plt.figure(**fig_params)
        ax = fig.add_subplot(1, 1, 1)
        if absolute_yscale or not absolute_yticks:
            bar_bottom = 0
        else:
            bar_bottom = outcome_mean
        logging.debug('bar bottom: {}, plot_deltas: {}'.format(
            bar_bottom, plot_deltas))
        pdp_bars = ax.bar(plot_x,
                          plot_deltas,
                          width=width,
                          align='center',
                          color=self.pdp_color,
                          bottom=bar_bottom)

        legends.append(pdp_bars)
        legend_labels.append('Partial dependences with uncertainties')
        ax.errorbar(plot_x,
                    plot_deltas + bar_bottom,
                    yerr=np.array(deltas_unc)[idxs],
                    color='grey',
                    fmt='o')

        if add_means:
            means_y = np.array(means)[idxs]
            if not absolute_yticks:
                means_y -= outcome_mean
            logging.debug('bar bottom: {}, means_y: {}'.format(
                bar_bottom, means_y))
            means_bars = ax.bar(plot_x + width,
                                means_y - bar_bottom,
                                width=width,
                                align='center',
                                alpha=0.5,
                                color=self.means_color,
                                bottom=bar_bottom)
            ax.errorbar(plot_x + width,
                        means_y,
                        yerr=np.array(means_unc)[idxs],
                        color='grey',
                        fmt='o')
            legends.append(means_bars)
            legend_labels.append('Raw averages with uncertainties')
            # ax.legend(['Raw averages with uncertainties'])
        ax.grid(alpha=0.3)

        if overlay_box_plots:
            self.box_plots_raw_data(raw_data, plot_x)

        # store x,y limits based on primary data
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()

        counts_bars = self.overlay_counts_histogram(ax, plot_x, counts, xlim,
                                                    ylim)
        if counts_bars is not None:
            legends.append(counts_bars)
            legend_labels.append('Counts')
        plt.xticks(plot_x, np.array(labels)[idxs], rotation='vertical')
        plt.xlabel('{}'.format(feature_label))
        plt.ylabel(self.outcome_ylabel(absolute_yticks))
        plt.title('Partial dependence on {} ({} trees)'.format(
            feature_label, gbm.n_estimators))
        ylim = ax_limits_per_feature.get('ylim')
        if ylim is not None:
            ax.set_ylim(tuple(ylim))
        xlim = ax_limits_per_feature.get('xlim')
        if xlim is not None:
            ax.set_xlim(tuple(xlim))
        logging.debug(
            'creating legend with handles: "{}", labels: "{}"'.format(
                legends, legend_labels))
        try:
            ax.legend(handles=legends,
                      labels=legend_labels,
                      bbox_to_anchor=(1, 0.5))
        except Exception as e:
            logging.error(
                'Cannot add legend for feature "{}" due to "{}"'.format(
                    feature_label, e))
        # plt.legend(
        #     handles=legends[:1],
        #     loc='center left',
        #     bbox_to_anchor=(1, 0.5))

        if self.show_plots:
            plt.show()
        # plt.show()
        self.save_fig(fig, 'partial_dependence_{}.png'.format(feature_label))

        df = pd.DataFrame(
            [labels, deltas, deltas_unc, means, means_unc, counts, raw_stds],
            index=[
                feature_label, 'partial dependence delta',
                'partial dependence delta uncertainty', 'mean',
                'mean uncertainty', 'sample size', 'standard deviation'
            ]).T.sort_values(by='partial dependence delta')
        df.to_excel(
            self.destination_dir +
            'partial_dependence_with_stats_{}.xlsx'.format(feature_label))
Example #30
print('Convenience plot with ``partial_dependence_plots``')

features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features,
                                   feature_names=names,
                                   n_jobs=1, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()

target_feature = (1, 5)
pdp, axes = partial_dependence(clf, target_feature,
                               X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median\n'
             'age and average occupancy')
plt.subplots_adjust(top=0.9)
Example #31
fig, axs = plot_partial_dependence(
    my_model,
    features=[0, 2],  # column numbers of plots we want to show
    X=train_X,  # raw predictors data.
    feature_names=names,  # labels on graphs
    grid_resolution=10)  # number of values to plot on x axis

fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the Melbourne housing dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()

target_feature = (0, 2)
pdp, axes = partial_dependence(my_model,
                               target_feature,
                               X=train_X,
                               grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX,
                       YY,
                       Z,
                       rstride=1,
                       cstride=1,
                       cmap=plt.cm.BuPu,
                       edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
Example #32
def brt_train_plot_func(X, y, feats, train_model=False):
    """
    Function to train the BRT model and plots to make inference
    """

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.15,
                                                        random_state=1)

    #y_train = y_train.as_matrix().ravel()
    #y_test = y_test.as_matrix().ravel()  # only needed when y comes from a pandas DataFrame

    param_grid_brt = {
        'learning_rate': np.logspace(-4, 0, 50),
        'max_depth': range(2, 8),
        'min_samples_leaf': range(3, 10)
    }
    #from sklearn.metrics import mean_squared_error, make_scorer
    #param_grid_brt = {'learning_rate': np.logspace(-4,0,3),'max_depth': [2,6],'min_samples_leaf': [3,10]}
    clf = GradientBoostingRegressor(n_estimators=500)
    #cross-validation grid to search the best parameters

    #str_in = raw_input("(T)raining or (U)sed selected (Default: U): ")

    if train_model:
        print "Training model"
        #mse_scorer = make_scorer(mean_squared_error,greater_is_better = False)
        brt_complete = GridSearchCV(clf,
                                    param_grid_brt,
                                    n_jobs=-1,
                                    verbose=True,
                                    cv=10)
        brt_complete.fit(X_train, y_train)
        brt = brt_complete.best_estimator_
    else:
        brt = GradientBoostingRegressor(n_estimators=2000,
                                        learning_rate=0.0008,
                                        max_depth=4,
                                        min_samples_leaf=5)
        brt.fit(X_train, y_train)

    #str_in = raw_input("Descomp-(T)raining or (U)sed selected (Default: U): ")
    #
    #if str_in == 'T':
    #    print "Training descomp model"
    #    brt_descomp_complete = GridSearchCV(clf_descomp, param_grid_brt,n_jobs = -1,verbose = True,cv = 10)
    #    brt_descomp_complete.fit(X_descomp_train,y_descomp_train)
    #    brt_descomp = brt_descomp_complete.best_estimator_
    #else:
    #    brt_descomp = GradientBoostingRegressor(n_estimators=2000,learning_rate=0.006,max_depth = 4,min_samples_leaf=5)
    #    brt_descomp.fit(X_descomp_train,y_descomp_train)

    plt.close('all')
    #  ####### IAM %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    #relative importance

    feature_importance = brt.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    #plt.sca(axs[5])
    #plt.cla()
    #feats = np.array(features)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feats[sorted_idx], fontsize=20)
    plt.title("TS Group 3", fontsize=20)
    plt.xlabel('Relative Importance (%)', fontsize=20)
    plt.subplots_adjust(top=0.9, left=0.18, bottom=0.15)
    #partial dependence plot

    #mse
    from sklearn.metrics import mean_squared_error, r2_score

    y_pred = brt.predict(X_test)

    print "MSE", mean_squared_error(y_test, y_pred)
    print 'R2', r2_score(y_test, y_pred)

    #plot for IAM
    #plt.figure()
    #4 features AVNN, age, sex, ci
    #features = ['SDNN','HRVTriangIndex','SDSD','AVNN','logIndex','RMSSD','ci','sex','age']
    #target_features = [features[3],features[-1],features[-2],features[-3]]
    target_features_idx = [0, 4, 7, 3, 9, (0, 4)]
    fig_hrt, axs = plot_partial_dependence(brt,
                                           X_train,
                                           target_features_idx,
                                           feature_names=feats,
                                           n_jobs=-1,
                                           grid_resolution=80)
    fig_hrt.suptitle('TS Group 3 = f(HRV)', fontsize=20)
    plt.subplots_adjust(top=0.9, hspace=0.4, wspace=0.5)
    for a in range(5):
        axs[a].set_ylabel(
            "TS", fontsize=20)  # tight_layout causes overlap with suptitle
        axs[a].set_xlabel(feats[target_features_idx[a]], fontsize=20)
    axs[5].set_xlabel(feats[target_features_idx[5][0]], fontsize=20)
    axs[5].set_ylabel(feats[target_features_idx[5][1]], fontsize=20)
    plt.show()

    target_features_idx = [8, 7]
    fig_hrt, axs = plot_partial_dependence(brt,
                                           X_train,
                                           target_features_idx,
                                           feature_names=feats,
                                           n_jobs=-1,
                                           grid_resolution=80)
    fig_hrt.suptitle('TS Group 3 = f(HRV)', fontsize=20)
    plt.subplots_adjust(top=0.9, left=0.12)
    for a in range(2):
        axs[a].set_ylabel(
            "TS partial dependence",
            fontsize=20)  # tight_layout causes overlap with suptitle
        axs[a].set_xlabel(feats[target_features_idx[a]], fontsize=20)
        axs[a].set_ylim(-2.5, 1.5)
    plt.show()

    fig = plt.figure()

    target_feature = (7, 3)
    pdp, (x_axis, y_axis) = partial_dependence(brt,
                                               target_feature,
                                               X=X_train,
                                               grid_resolution=80)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(feats[target_feature[0]], fontsize=18)
    ax.set_ylabel(feats[target_feature[1]], fontsize=18)
    ax.set_zlabel('$TS$', fontsize=18)
    #  pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('$TS = f(Scl,TINN)$', fontsize=18)
Example #33
# features = [0, 5, 1, 2, (5, 1)]
# features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms', ('AveOccup', 'HouseAge')]
# fig, axs = plot_partial_dependence(clf, X_train, features, feature_names=names,
#                                    n_jobs=1, grid_resolution=50)
# fig.suptitle('Partial dependence of house value on nonlocation features\n'
#              'for the California housing dataset')
# pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print()
fig = pl.figure()

target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf,
                                           target_feature,
                                           X=X_train,
                                           grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=pl.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
#  pretty init view
ax.view_init(elev=22, azim=122)
pl.colorbar(surf)
pl.suptitle('Partial dependence of house value on median age and '
            'average occupancy')
pl.subplots_adjust(top=0.9)
Example #34
#def plot_feature_importances(model):
#    plt.figure(figsize=(8,6))
#    n_features = len(names)
#    plt.barh(range(n_features), model.feature_importances_, align='center')
#    plt.yticks(np.arange(n_features), names)
#    plt.xlabel("Feature importance")
#    plt.ylabel("Feature")
#    plt.ylim(-1, n_features)
#
#plot_feature_importances(gbc)
#
#plt.show()

pd.DataFrame(np.vstack(gbc.feature_importances_).T, columns=names).T.sort_values(by=0).plot(kind='barh')

pdep = partial_dependence(gbc, features, X=X_train, grid_resolution=100)

columns_new = ['Risk Probabilities', names[feat_num]]
fico = pd.DataFrame(np.vstack(pdep).T, columns=columns_new)

fico = fico[fico[names[feat_num]] > 600]

fico.plot(names[feat_num], 'Risk Probabilities')

# ROC Curve Plot
logit_roc_auc = roc_auc_score(Y_test, gbc.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, gbc.predict_proba(X_test)[:,1])
plt.figure()
plt.plot(fpr, tpr, label='Gradient Boosting Classifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
Example #36
params = {
    'learning_rate': 0.01,
    'loss': 'ls'
}

gbr = GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)
pd.crosstab(y_test,
            gbr.predict(X_test).round(),
            rownames=['Actual'],
            colnames=['Predicted'])

pd.DataFrame({
    'Variable': X_test.columns,
    'Importance': gbr.feature_importances_
}).sort_values('Importance', ascending=False)

fig, axs = plot_partial_dependence(
    gbr,
    X=X_test,
    features=['Parhelion Patrol', 'Rubblebelt Boar', 'Hammer Dropper'],
    feature_names=feature_list,
    n_jobs=1,
    grid_resolution=10)

allpd = {}

for i in range(len(feature_list) - 1):
    key, values = partial_dependence(gbr, target_variables=i, X=X_test)
    allpd.update(dict(zip([feature_list[i]], key.tolist())))

df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in allpd.items()]))
Example #37
"""
1.11.4.7.2. Partial dependence

"""

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.ensemble.partial_dependence import plot_partial_dependence

X, y = make_hastie_10_2(random_state=0)
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                 max_depth=1, random_state=0).fit(X, y)
# features = [0, 1, (0, 1)]
# fig, axs = plot_partial_dependence(clf, X, features)


# from sklearn.datasets import load_iris
# iris = load_iris()
# mc_clf = GradientBoostingClassifier(n_estimators=10,
#     max_depth=1).fit(iris.data, iris.target)
# features = [3, 2, (3, 2)]
# fig, axs = plot_partial_dependence(mc_clf, X, features, label=0) 
from sklearn.ensemble.partial_dependence import plot_partial_dependence

from sklearn.ensemble.partial_dependence import partial_dependence

pdp, axes = partial_dependence(clf, [0], X=X)
print(pdp)