def compute_f_vals(gbm, model_inds, arr, inds):
    feat_vals, feat_val_counts = unique_rows_with_counts(arr[:, inds])
    uncentd_f_vals = partial_dependence.partial_dependence(
        gbm, model_inds[(inds,)], grid=feat_vals)[0][0]
    mean_uncentd_f_val = np.dot(feat_val_counts, uncentd_f_vals) / arr.shape[0]
    f_vals = uncentd_f_vals - mean_uncentd_f_val
    return dict(zip(map(tuple, feat_vals), f_vals))
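# `unique_rows_with_counts` is not defined in this snippet; a minimal
# sketch of what it plausibly does (unique rows of a 2-D array plus how
# often each row occurs), assuming a NumPy array input:
import numpy as np

def unique_rows_with_counts(arr):
    # Unique rows along axis 0, with per-row occurrence counts.
    rows, counts = np.unique(arr, axis=0, return_counts=True)
    return rows, counts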
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:, 0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    X_ = np.asarray(X)
    grid = np.unique(X_[:, 0])
    pdp_2, axes = partial_dependence(clf, [0], grid=grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)
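# The module-level fixture these tests assume (a sketch; the original
# test module defines its own X, y). Note X[:, 0] has only four unique
# values (-2, -1, 1, 2), which explains the 4 grid points asserted above.
X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1]]
y = [-1, -1, -1, 1, 1, 1]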
def main():
    # fetch California housing dataset
    try:
        cal_housing = fetch_california_housing()
    except HTTPError:
        print("Failed downloading california housing data.")
        return

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print('_' * 80)
    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print("done.")

    print('_' * 80)
    print('Convenience plot with ``partial_dependence_plots``')
    print()  # was a bare Python 2 ``print``

    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('_' * 80)
    print('Custom 3d plot via ``partial_dependence``')
    print()

    fig = plt.figure()
    target_feature = (1, 5)
    pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                               X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)

    plt.show()
def test_partial_dependence_regressor():
    # Test partial dependence for regressor
    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(boston.data, boston.target)

    grid_resolution = 25
    pdp, axes = partial_dependence(clf, [0], X=boston.data,
                                   grid_resolution=grid_resolution)

    assert pdp.shape == (1, grid_resolution)
    assert axes[0].shape[0] == grid_resolution
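# Assumed fixture for the regressor test (a sketch): the Boston housing
# dataset, as shipped with the scikit-learn versions this legacy API
# belongs to (load_boston was removed from scikit-learn in 1.2).
from sklearn.datasets import load_boston
boston = load_boston()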
def score(self, X_test, y_test):
    self.baseline = [self.baseline] * len(y_test)
    baseline_MSE = np.sqrt(mean_squared_error(y_test, self.baseline))
    print('Baseline Root Mean Squared Error:', round(baseline_MSE, 0))
    model_MSE = np.sqrt(mean_squared_error(y_test, self.y_pred))
    print('Model Root Mean Squared Error:', round(model_MSE, 0))
    print('Reduction in RMSE: %', ((model_MSE - baseline_MSE) / baseline_MSE))

    ### PD PLOT
    importances = self.model.feature_importances_
    sorted_imps = sorted(importances)[::-1]
    indices = np.argsort(importances)[::-1]
    names = self.X.columns[indices]
    N_COLS = 3
    pd_plots = [partial_dependence(self.model, target_feature,
                                   X=self.X, grid_resolution=50)
                for target_feature in indices]
    pd_plots = list(zip([pdp[0][0] for pdp in pd_plots],
                        [pdp[1][0] for pdp in pd_plots]))
    fig, axes = plt.subplots(nrows=3, ncols=N_COLS, sharey=True,
                             figsize=(12.0, 8.0))

    for i, (y_axis, x_axis) in enumerate(pd_plots[0:(3 * N_COLS)]):
        ax = axes[i // N_COLS, i % N_COLS]
        ax.plot(x_axis, y_axis, color="purple")
        ax.set_xlim([np.min(x_axis), np.max(x_axis)])
        text_x_pos = np.min(x_axis) + 0.05 * (np.max(x_axis) - np.min(x_axis))
        ax.text(text_x_pos, 7.5,
                "Feature Importance " + str(round(sorted_imps[i], 2)),
                fontsize=12, alpha=0.7)
        ax.set_xlabel(names[i])
        ax.grid()

    plt.suptitle("Partial Dependence Plots (Ordered by Feature Importance)",
                 fontsize=16)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
def multi_case_partial_dependence(df, cases, ests, stdzrs,
                                  n_oversamps, c_true, c_pred):
    y_true_l = []
    y_hat_l = []
    y_proba_l = []
    feats_l = []
    fig, ax = plt.subplots(1, 1, figsize=(6, 4))
    for case, est, stdzr, n, c_t, c_p in zip(cases, ests, stdzrs,
                                             n_oversamps, c_true, c_pred):
        data_df = df.copy()  # copy to read all columns after dropping
        print('case: {}'.format(case[0]))

        # drop other binary and probability column
        c_drop = [c for c in list(df.columns) if case[1] in c]
        data_df.drop(c_drop, axis=1, inplace=True)

        # train test split in time
        X_train, y_train, X_test, y_test = train_test_split_time(
            data_df, '2016-06-01', case[0])

        names = list(X_train.columns)
        features = [11, 12, 13, 14, (9, 18)]

        # plot
        fig, axs = plot_partial_dependence(est, X_train, features,
                                           feature_names=names,
                                           n_jobs=3, grid_resolution=50)
        fig.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

        print('Custom 3d plot via ``partial_dependence``')
        fig = plt.figure()

        target_feature = (9, 18)
        pdp, axes = partial_dependence(est, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu, edgecolor='k')
        ax.set_xlabel(names[target_feature[0]])
        ax.set_ylabel(names[target_feature[1]])
        ax.set_zlabel('Partial dependence')
        # pretty init view
        ax.view_init(elev=22, azim=122)
        plt.colorbar(surf)
        plt.suptitle('Partial dependence of features\n'
                     'for {} model'.format(case[0]))
        plt.subplots_adjust(top=0.9)
        plt.show()
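# `train_test_split_time` is project-specific and not shown here. A
# minimal sketch under the assumption that the DataFrame carries a
# DatetimeIndex and `target_col` names the label column (matching the
# return order used above):
def train_test_split_time(df, split_date, target_col):
    train = df[df.index < split_date]
    test = df[df.index >= split_date]
    X_train, y_train = train.drop(target_col, axis=1), train[target_col]
    X_test, y_test = test.drop(target_col, axis=1), test[target_col]
    return X_train, y_train, X_test, y_test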
def test_partial_dependence_multiclass():
    # Test partial dependence for multi-class classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(iris.data, iris.target)

    grid_resolution = 25
    n_classes = clf.n_classes_
    pdp, axes = partial_dependence(clf, [0], X=iris.data,
                                   grid_resolution=grid_resolution)

    assert pdp.shape == (n_classes, grid_resolution)
    assert len(axes) == 1
    assert axes[0].shape[0] == grid_resolution
def partial_dependency_uncertainty(self, features, grid, percentiles):
    # Returns standard deviation of partial dependency curve
    pdps_cv = np.zeros((len(grid), len(self.models)), dtype=np.float64)
    # logging.debug('grid: {}; shape: {}'.format(grid, grid.shape))
    for i, cv_model in enumerate(self.models):
        pdps = skl_e_pd.partial_dependence(cv_model, features, grid,
                                           percentiles=percentiles)
        pdps_cv[:, i] = pdps[0]
    stds = np.std(pdps_cv, axis=1)
    return stds
def test_partial_dependence_classifier():
    # Test partial dependence for classifier
    clf = GradientBoostingClassifier(n_estimators=10, random_state=1)
    clf.fit(X, y)

    pdp, axes = partial_dependence(clf, [0], X=X, grid_resolution=5)

    # only 4 grid points instead of 5 because only 4 unique X[:, 0] vals
    assert pdp.shape == (1, 4)
    assert axes[0].shape[0] == 4

    # now with our own grid
    X_ = np.asarray(X)
    grid = np.unique(X_[:, 0])
    pdp_2, axes = partial_dependence(clf, [0], grid=grid)

    assert axes is None
    assert_array_equal(pdp, pdp_2)

    # with trivial (no-op) sample weights
    clf.fit(X, y, sample_weight=np.ones(len(y)))
    pdp_w, axes_w = partial_dependence(clf, [0], X=X, grid_resolution=5)
    assert pdp_w.shape == (1, 4)
    assert axes_w[0].shape[0] == 4
    assert_allclose(pdp_w, pdp)

    # with non-trivial sample weights
    clf.fit(X, y, sample_weight=sample_weight)
    pdp_w2, axes_w2 = partial_dependence(clf, [0], X=X, grid_resolution=5)
    assert pdp_w2.shape == (1, 4)
    assert axes_w2[0].shape[0] == 4
    assert np.all(np.abs(pdp_w2 - pdp_w) / np.abs(pdp_w) > 0.1)
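# `sample_weight` above is another module-level fixture not shown in this
# excerpt; any non-uniform weight vector over the six toy samples would
# exercise the branch, e.g. (hypothetical values):
sample_weight = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])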
def calc_avg_pdp(clf):
    from sklearn.ensemble.partial_dependence import partial_dependence
    import pandas
    pdps = np.empty(clf.fit_clfs.shape +
                    (clf.feature_importances.shape[2], 2, 100))
    for i in range(0, clf.mask_num):
        for j in range(0, clf.mask_num):
            if i == j:
                pdps[i, j] = None  # stored as NaN; masked out below
            else:
                for feature in range(0, clf.feature_importances.shape[2]):
                    pdp, a = partial_dependence(clf.fit_clfs[i, j], [feature],
                                                X=clf.c_data[i, j][0])
                    pdps[i, j, feature] = [pdp[0], a[0]]
    clf.pdps = np.ma.masked_array(pdps, mask=pandas.isnull(pdps))
def plot_partial_dependence_with_unc(self, gbm, feature_idx,
                                     percentiles=(0.05, 0.95),
                                     absolute_yscale=False,
                                     absolute_yticks=True,
                                     **fig_params):
    outcome_mean = np.mean(self.train_y)
    fig = plt.figure(**fig_params)
    ax = fig.add_subplot(1, 1, 1)
    pdps, axes = skl_e_pd.partial_dependence(gbm, [feature_idx],
                                             X=self.train_X,
                                             percentiles=percentiles)
    if absolute_yscale or absolute_yticks:
        pdps = pdps + outcome_mean
    # plt.xticks(locs, labels)
    stds = self.partial_dependency_uncertainty([feature_idx], grid=axes[0],
                                               percentiles=percentiles)
    pdp_uncertainty_plot = ax.fill_between(axes[0], pdps[0] - stds,
                                           pdps[0] + stds, alpha=0.2,
                                           color=self.pdp_color)
    pdp_plot, = ax.plot(axes[0], pdps[0], lw=5, color=self.pdp_color)
    if absolute_yscale:
        c_ylim = ax.get_ylim()
        ax.set_ylim(0, c_ylim[1])
    # if offset_mean and not offset_mean_labels:
    #     # fig.canvas.draw()
    #     ax.set_yticklabels(ax.get_yticks())
    #     labels_both = ax.get_yticklabels(which='both')
    #     for l in labels_both:
    #         l.set_text('{:.2f}'.format(outcome_mean + float(l.get_text())))
    #     ax.set_yticklabels(labels_both)
    return fig, ax, pdp_plot, pdp_uncertainty_plot
def dependence3d(self, forest, train, feature_set, names):
    # `names` added as a parameter: the original body referenced the
    # undefined globals `names` and `target_feature`.
    print("****************** this is the output of dependences of features")
    fig = plt.figure()
    pdp, (x_axis, y_axis) = partial_dependence(forest, feature_set, X=train)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape)
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(names[feature_set[0]])
    ax.set_ylabel(names[feature_set[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median age and '
                 'average occupancy')
    plt.subplots_adjust(top=0.9)
    plt.show()
def test_partial_dependence_sample_weight():
    # Test near perfect correlation between partial dependence and diagonal
    # when sample weights emphasize y = x predictions
    N = 1000
    rng = np.random.RandomState(123456)
    mask = rng.randint(2, size=N, dtype=bool)

    x = rng.rand(N)
    # set y = x on mask and y = -x outside
    y = x.copy()
    y[~mask] = -y[~mask]
    X = np.c_[mask, x]
    # sample weights to emphasize data points where y = x
    sample_weight = np.ones(N)
    sample_weight[mask] = 1000.

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    grid = np.arange(0, 1, 0.01)
    pdp = partial_dependence(clf, [1], grid=grid)

    assert np.corrcoef(np.ravel(pdp[0]), grid)[0, 1] > 0.99
def plot_3d(self, feature_3d, top=0.9, grid_resolution=50, rstride=1,
            cstride=1, cmap="jet", edgecolor='k', elev=22, azim=122):
    fig = plt.figure()
    pdp, axes = partial_dependence(gbrt=self.model,
                                   target_variables=feature_3d,
                                   X=self.feature_df,
                                   grid_resolution=grid_resolution)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=rstride, cstride=cstride,
                           cmap=cmap, edgecolor=edgecolor)
    ax.set_xlabel(self.feature_list[feature_3d[0]])
    ax.set_ylabel(self.feature_list[feature_3d[1]])
    ax.set_zlabel('Partial dependence')
    ax.view_init(elev=elev, azim=azim)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy')
    plt.subplots_adjust(top=top)
    plt.savefig(os.path.join(self.output_path, self.fig_file))
    plt.close()
priors = joblib.load(project_dir + '/data/raw/%s_priors' % prior_group)
stacked_clf = joblib.load(target_dir + prior_group + '_' + method +
                          '_final_stacked_clf_' + str(k) + '.npy')
X_train = joblib.load(target_dir + prior_group + '_' + method +
                      '_final_predictions_' + str(k) + '.npy')
for x in [9]:
    for y in [25]:
        target_feature = (x, y)
        fig = plt.figure()
        names = [priors[target_feature[0]], priors[target_feature[1]]]
        print('Convenience plot with ``partial_dependence_plots`` '
              'for %s and %s' % (names[0], names[1]))
        pdp, axes = partial_dependence(stacked_clf, target_feature,
                                       X=X_train, grid_resolution=50)
        XX, YY = np.meshgrid(axes[0], axes[1])
        Z = pdp[0].reshape(list(map(np.size, axes))).T
        ax = Axes3D(fig)
        surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                               cmap=plt.cm.BuPu)
        ax.set_xlabel(names[0], fontsize=12)
        ax.set_ylabel(names[1], fontsize=12)
        ax.set_zlabel('Partial dependence', fontsize=12)
        ax.view_init(elev=12, azim=-142)
        plt.xticks([0, 0.5, 1])
print()
features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features,
                                   feature_names=names,
                                   n_jobs=3, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print()

fig = pl.figure()
target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                           X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=pl.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
pl.colorbar(surf)
pl.suptitle('Partial dependence of house value on median age and '
            'average occupancy')
pl.subplots_adjust(top=0.9)
pl.show()
def main():
    # omit gender and children (as does the reference paper)
    X = df[["yearsmarried", "age", "religiousness",
            "occupation", "rating"]].values
    y = df['affairs'].values

    # shuffle data
    order = np.argsort(np.random.random(y.shape))
    X = X[order]
    y = y[order]

    names = ["yearsmarried", "age", "religiousness", "occupation", "rating"]
    print("features to be plotted on first graph: " + str(names))

    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X, y)

    print('Convenience plot with ``partial_dependence_plots``')
    # features = [0, 5, 1, 2, (5, 1)]
    features = [0, 1, 2, 3, 4, (0, 1)]
    fig, axs = plot_partial_dependence(clf, X, features,
                                       feature_names=names,
                                       n_jobs=-1, grid_resolution=100,
                                       n_cols=3)
    fig.set_size_inches(10.5, 7.5)
    fig.suptitle('Partial dependence for amount of affairs\n'
                 'for the Affairs dataset.')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()
    target_feature = (0, 1)
    pdp, axes = partial_dependence(clf, target_feature, X=X,
                                   grid_resolution=100)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of amount of affairs\n'
                 'for the amount of years married and the age.')
    plt.subplots_adjust(top=0.9)
    plt.show()
####################################################################
# Inspect feature 0, 3, 5, 6, and the interaction between 5 and 0, and 5
# and 3
target_features = [0, 3, 5, 6, (5, 0), (5, 3)]
fig, axs = plot_partial_dependence(clf, X_train, target_features,
                                   feature_names=boston.feature_names,
                                   grid_resolution=30)
plt.tight_layout()

####################################################################
# Lower-level partial_dependence function
# ----------------------------------------
target_feature = (5, 0)
from sklearn.ensemble.partial_dependence import partial_dependence
partial_deps, grid = partial_dependence(clf, target_feature,
                                        X=X_train, grid_resolution=50)

import numpy as np
# The 2D coordinate grid (for plotting)
XX, YY = np.meshgrid(grid[0], grid[1])
# Reshape the partial deps on the grid
Z = partial_deps[0].reshape(list(map(np.size, grid))).T

####################################################################
# 3D plotting
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
clf = GradientBoostingClassifier(n_estimators=1000, learning_rate=0.1,
                                 max_depth=3, random_state=0).fit(
                                     X_train[X_train_relevant.columns],
                                     y_train)
#%%
names = X_train_relevant.columns
features = [0, 1, (1, 2)]
fig, axs = plot_partial_dependence(clf, X_train[X_train_relevant.columns],
                                   features, feature_names=names)
#%%
target_feature = (0, 1)
pdp, axes = partial_dependence(clf, target_feature,
                               X=X_train[X_train_relevant.columns],
                               grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median\n'
             'age and average occupancy')
plt.subplots_adjust(top=0.9)
def ml_GradientBoostingClassifier2(self):
    # This example shows how to obtain partial dependence plots from a
    # GradientBoostingRegressor trained on the California housing dataset.
    cal_housing = fetch_california_housing()

    # split 80/20 train-test
    X_train, X_test, y_train, y_test = train_test_split(cal_housing.data,
                                                        cal_housing.target,
                                                        test_size=0.2,
                                                        random_state=1)
    names = cal_housing.feature_names

    print("Training GBRT...")
    clf = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    clf.fit(X_train, y_train)
    print(" done.")

    print('Convenience plot with ``partial_dependence_plots``')
    features = [0, 5, 1, 2, (5, 1)]
    fig, axs = plot_partial_dependence(clf, X_train, features,
                                       feature_names=names,
                                       n_jobs=3, grid_resolution=50)
    fig.suptitle('Partial dependence of house value on nonlocation features\n'
                 'for the California housing dataset')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()
    target_feature = (1, 5)
    pdp, axes = partial_dependence(clf, target_feature,
                                   X=X_train, grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].reshape(list(map(np.size, axes))).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy')
    plt.subplots_adjust(top=0.9)
    plt.show()
"""
The number of columns in the grid plot (default: 3).
percentiles : (low, high), default=(0.05, 0.95)
    The lower and upper percentile used to create the extreme values
    for the PDP axes.
grid_resolution : int, default=100
    The number of equally spaced points on the axes.
"""
my_plots = plot_partial_dependence(gbrt=my_model,
                                   features=[0, 1, 2],
                                   X=imputed_X,
                                   feature_names=cols_to_use,
                                   grid_resolution=100,
                                   n_cols=2)

partial_dependence?  # IPython help

"""
plot yourself with seaborn...
partial_dependence(gbrt, target_variables, grid=None, X=None,
                   percentiles=(0.05, 0.95), grid_resolution=100)
"""
p_dep = partial_dependence(gbrt=my_model, target_variables=[0, 1, 2],
                           X=imputed_X, percentiles=(0.05, 0.95))
p_dep[0][0]  # y, the PDP values
p_dep[1][0]  # x for Distance
p_dep[1][1]  # x for Land
p_dep[1][2]  # x for BuildingArea
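# A minimal sketch of plotting these yourself with matplotlib (seaborn
# works the same way on the returned arrays). Note that passing several
# target_variables at once computes one joint PDP over their cartesian
# grid, so for one curve per feature it is safer to query each feature
# separately:
import matplotlib.pyplot as plt
for i in range(3):
    y_pd, x_axes = partial_dependence(gbrt=my_model, target_variables=[i],
                                      X=imputed_X)
    plt.plot(x_axes[0], y_pd[0], label=cols_to_use[i])
plt.xlabel('feature value')
plt.ylabel('partial dependence')
plt.legend()
plt.show()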
# 25 = 26 - 1: the cardinality of y reduced by one, because the labels
# start from 0
label = len(ytrain.value_counts())
pdp_fig, pdp_axs = partial_dependence.plot_partial_dependence(
    gbc, Xtrain, feature_numbers, feature_names, label)
plt.subplots_adjust(top=1.5)  # tight_layout causes overlap with suptitle
#%%
from mpl_toolkits.mplot3d import Axes3D

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()

target_feature = (3, 7)
pdp, axes = partial_dependence.partial_dependence(gbc, target_feature,
                                                  X=Xtrain,
                                                  grid_resolution=100)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(feature_names[target_feature[0]])
ax.set_ylabel(feature_names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.subplots_adjust(top=0.9)
plt.show()
def make_pdp(i, mean=0.0):
    pdp = partial_dependence(self.model, i, X=X, grid_resolution=100)
    feature = self.feature_names[i]
    feature_bins = [denorm.denormalize_feature_value(feature, x)
                    for x in list(pdp[1][0])]
    data = list([x - mean for x in pdp[0]])
    return {"feature": feature, "featureBins": feature_bins, "data": data}
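# `denorm.denormalize_feature_value` is project-specific and not shown
# here. A minimal sketch, assuming per-feature (mean, std) pairs were
# used to normalize the inputs (hypothetical shim):
class denorm:
    feature_stats = {}  # hypothetical: {"feature_name": (mean, std)}

    @staticmethod
    def denormalize_feature_value(feature, x):
        # Map a normalized grid value back to the original feature scale.
        mean, std = denorm.feature_stats.get(feature, (0.0, 1.0))
        return x * std + mean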
def partial_dependency_catagorical_plot(self, gbm, feature_label,
                                        ax_limits_per_feature=None,
                                        overlay_box_plots=False,
                                        add_means=True,
                                        absolute_yscale=False,
                                        absolute_yticks=True,
                                        **fig_params):
    """Plots partial dependency for a categorical variable.

    arguments:
    absolute_yscale - absolute scale (starts at zero) on y axis
    absolute_yticks - flag to subtract mean from y axis labels

    Todo: split this method into 2-3 smaller ones
    """
    legends = []
    legend_labels = []
    deltas = []
    deltas_unc = []
    labels = []
    means = []
    means_unc = []
    raw_data = []
    counts = []
    raw_stds = []
    width = 0.8
    if add_means:
        width = 0.35
    if ax_limits_per_feature is None:
        ax_limits_per_feature = {}
    outcome_mean = np.mean(self.train_y)
    for feature_index in self.categorical_features_ind[feature_label]:
        y, x = skl_e_pd.partial_dependence(gbm, [feature_index],
                                           X=self.train_X)
        stds = self.partial_dependency_uncertainty([feature_index],
                                                   grid=x[0],
                                                   percentiles=(0.0, 1.0))
        if len(x) == 0 or len(y) == 0:
            logging.debug(
                'no results for feature_index {}'.format(feature_index))
        else:
            try:
                delta = y[0][np.where(x[0] == 1)[0][0]] \
                    - y[0][np.where(x[0] == 0)[0][0]]
                # if absolute_yscale or absolute_yticks:
                #     logging.debug('original delta: {}'.format(delta))
                #     delta += outcome_mean
                #     logging.debug('delta with mean: {}'.format(delta))
                deltas.append(delta)
                labels.append(self.feature_labels[feature_index].replace(
                    feature_label + '_', ''))
                deltas_unc.append(np.sqrt(np.sum(stds ** 2)))
                train_X_idx = self.train_X[
                    self.feature_labels[feature_index]] == 1
                raw_data_subset = self.train_y[train_X_idx]
                # if not absolute_yscale and not absolute_yticks:
                #     raw_data_subset -= outcome_mean
                raw_data_weights = self.train_weights[train_X_idx]
                raw_data.append(raw_data_subset)
                means.append(
                    np.average(raw_data_subset, weights=raw_data_weights))
                means_unc.append(
                    self.mean_uncertainty(raw_data_subset,
                                          weights=raw_data_weights))
                counts.append(int(np.sum(raw_data_weights)))
                raw_stds.append(
                    pde.std(raw_data_subset, weights=raw_data_weights))
            except Exception as e:
                logging.error('Cannot get partial dependence delta for '
                              'feature index "{}" due to "{}"'.format(
                                  feature_index, e))

    idxs = np.argsort(deltas)
    plot_x = np.arange(idxs.shape[0]) + 0.5
    plot_deltas = np.array(deltas)[idxs]
    fig = plt.figure(**fig_params)
    ax = fig.add_subplot(1, 1, 1)
    if absolute_yscale or not absolute_yticks:
        bar_bottom = 0
    else:
        bar_bottom = outcome_mean
    logging.debug('bar bottom: {}, plot_deltas: {}'.format(
        bar_bottom, plot_deltas))
    pdp_bars = ax.bar(plot_x, plot_deltas, width=width, align='center',
                      color=self.pdp_color, bottom=bar_bottom)
    legends.append(pdp_bars)
    legend_labels.append('Partial dependences with uncertainties')
    ax.errorbar(plot_x, plot_deltas + bar_bottom,
                yerr=np.array(deltas_unc)[idxs], color='grey', fmt='o')
    if add_means:
        means_y = np.array(means)[idxs]
        if not absolute_yticks:
            means_y -= outcome_mean
        logging.debug('bar bottom: {}, means_y: {}'.format(
            bar_bottom, means_y))
        means_bars = ax.bar(plot_x + width, means_y - bar_bottom,
                            width=width, align='center', alpha=0.5,
                            color=self.means_color, bottom=bar_bottom)
        ax.errorbar(plot_x + width, means_y,
                    yerr=np.array(means_unc)[idxs], color='grey', fmt='o')
        legends.append(means_bars)
        legend_labels.append('Raw averages with uncertainties')
        # ax.legend(['Raw averages with uncertainties'])
    ax.grid(alpha=0.3)
    if overlay_box_plots:
        self.box_plots_raw_data(raw_data, plot_x)
    # store x,y limits based on primary data
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    counts_bars = self.overlay_counts_histogram(ax, plot_x, counts,
                                                xlim, ylim)
    if counts_bars is not None:
        legends.append(counts_bars)
        legend_labels.append('Counts')
    plt.xticks(plot_x, np.array(labels)[idxs], rotation='vertical')
    plt.xlabel('{}'.format(feature_label))
    plt.ylabel(self.outcome_ylabel(absolute_yticks))
    plt.title('Partial dependence on {} ({} trees)'.format(
        feature_label, gbm.n_estimators))
    ylim = ax_limits_per_feature.get('ylim')
    if ylim is not None:
        ax.set_ylim(tuple(ylim))
    xlim = ax_limits_per_feature.get('xlim')
    if xlim is not None:
        ax.set_xlim(tuple(xlim))  # was set_ylim: copy-paste bug
    logging.debug('creating legend with handles: "{}", labels: "{}"'.format(
        legends, legend_labels))
    try:
        ax.legend(handles=legends, labels=legend_labels,
                  bbox_to_anchor=(1, 0.5))
    except Exception as e:
        logging.error('Cannot add legend for feature "{}" due to "{}"'.format(
            feature_label, e))
    # plt.legend(
    #     handles=legends[:1],
    #     loc='center left',
    #     bbox_to_anchor=(1, 0.5))
    if self.show_plots:
        plt.show()
    # plt.show()
    self.save_fig(fig, 'partial_dependence_{}.png'.format(feature_label))
    df = pd.DataFrame(
        [labels, deltas, deltas_unc, means, means_unc, counts, raw_stds],
        index=[feature_label, 'partial dependence delta',
               'partial dependence delta uncertainty', 'mean',
               'mean uncertainty', 'sample size', 'standard deviation'
               ]).T.sort_values(by='partial dependence delta')
    df.to_excel(self.destination_dir +
                'partial_dependence_with_stats_{}.xlsx'.format(feature_label))
print('Convenience plot with ``partial_dependence_plots``')
features = [0, 5, 1, 2, (5, 1)]
fig, axs = plot_partial_dependence(clf, X_train, features,
                                   feature_names=names,
                                   n_jobs=1, grid_resolution=50)
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the California housing dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()
target_feature = (1, 5)
pdp, axes = partial_dependence(clf, target_feature,
                               X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
plt.colorbar(surf)
plt.suptitle('Partial dependence of house value on median\n'
             'age and average occupancy')
plt.subplots_adjust(top=0.9)
fig, axs = plot_partial_dependence(
    my_model,
    features=[0, 2],      # column numbers of plots we want to show
    X=train_X,            # raw predictors data
    feature_names=names,  # labels on graphs
    grid_resolution=10)   # number of values to plot on x axis
fig.suptitle('Partial dependence of house value on nonlocation features\n'
             'for the Melbourne housing dataset')
plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('Custom 3d plot via ``partial_dependence``')
fig = plt.figure()
target_feature = (0, 2)
pdp, axes = partial_dependence(my_model, target_feature,
                               X=train_X, grid_resolution=50)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].reshape(list(map(np.size, axes))).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
def brt_train_plot_func(X, y, feats, train_model=False):
    """Function to train the BRT model and plots to make inference"""
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.15,
                                                        random_state=1)
    # y_train = y_train.as_matrix().ravel()
    # y_test = y_test.as_matrix().ravel()  # only when coming from a pandas dataframe
    param_grid_brt = {
        'learning_rate': np.logspace(-4, 0, 50),
        'max_depth': range(2, 8),
        'min_samples_leaf': range(3, 10)
    }
    # from sklearn.metrics import mean_squared_error, make_scorer
    # param_grid_brt = {'learning_rate': np.logspace(-4, 0, 3),
    #                   'max_depth': [2, 6], 'min_samples_leaf': [3, 10]}
    clf = GradientBoostingRegressor(n_estimators=500)
    # cross-validation grid to search the best parameters
    # str_in = raw_input("(T)raining or (U)sed selected (Default: U): ")
    if train_model:
        print("Training model")
        # mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)
        brt_complete = GridSearchCV(clf, param_grid_brt, n_jobs=-1,
                                    verbose=True, cv=10)
        brt_complete.fit(X_train, y_train)
        brt = brt_complete.best_estimator_
    else:
        brt = GradientBoostingRegressor(n_estimators=2000,
                                        learning_rate=0.0008,
                                        max_depth=4, min_samples_leaf=5)
        brt.fit(X_train, y_train)
    # str_in = raw_input("Descomp-(T)raining or (U)sed selected (Default: U): ")
    # if str_in == 'T':
    #     print("Training descomp model")
    #     brt_descomp_complete = GridSearchCV(clf_descomp, param_grid_brt,
    #                                         n_jobs=-1, verbose=True, cv=10)
    #     brt_descomp_complete.fit(X_descomp_train, y_descomp_train)
    #     brt_descomp = brt_descomp_complete.best_estimator_
    # else:
    #     brt_descomp = GradientBoostingRegressor(n_estimators=2000,
    #                                             learning_rate=0.006,
    #                                             max_depth=4,
    #                                             min_samples_leaf=5)
    #     brt_descomp.fit(X_descomp_train, y_descomp_train)

    plt.close('all')

    ####### IAM
    # relative importance
    feature_importance = brt.feature_importances_
    # make importances relative to max importance
    feature_importance = 100.0 * (feature_importance /
                                  feature_importance.max())
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + .5
    # plt.sca(axs[5])
    # plt.cla()
    # feats = np.array(features)
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, feats[sorted_idx], fontsize=20)
    plt.title("TS Group 3", fontsize=20)
    plt.xlabel('Relative Importance (%)', fontsize=20)
    plt.subplots_adjust(top=0.9, left=0.18, bottom=0.15)

    # partial dependence plot
    # mse
    from sklearn.metrics import mean_squared_error, r2_score
    y_pred = brt.predict(X_test)
    print("MSE", mean_squared_error(y_test, y_pred))
    print('R2', r2_score(y_test, y_pred))

    # plot for IAM
    # plt.figure()
    # 4 features AVNN, age, sex, ci
    # features = ['SDNN', 'HRVTriangIndex', 'SDSD', 'AVNN', 'logIndex',
    #             'RMSSD', 'ci', 'sex', 'age']
    # target_features = [features[3], features[-1], features[-2], features[-3]]
    target_features_idx = [0, 4, 7, 3, 9, (0, 4)]
    fig_hrt, axs = plot_partial_dependence(brt, X_train, target_features_idx,
                                           feature_names=feats,
                                           n_jobs=-1, grid_resolution=80)
    fig_hrt.suptitle('TS Group 3 = f(HRV)', fontsize=20)
    plt.subplots_adjust(top=0.9, hspace=0.4, wspace=0.5)
    for a in range(5):
        # tight_layout causes overlap with suptitle
        axs[a].set_ylabel("TS", fontsize=20)
        axs[a].set_xlabel(feats[target_features_idx[a]], fontsize=20)
    axs[5].set_xlabel(feats[target_features_idx[5][0]], fontsize=20)
    axs[5].set_ylabel(feats[target_features_idx[5][1]], fontsize=20)
    plt.show()

    target_features_idx = [8, 7]
    fig_hrt, axs = plot_partial_dependence(brt, X_train, target_features_idx,
                                           feature_names=feats,
                                           n_jobs=-1, grid_resolution=80)
    fig_hrt.suptitle('TS Group 3 = f(HRV)', fontsize=20)
    plt.subplots_adjust(top=0.9, left=0.12)
    for a in range(2):
        # tight_layout causes overlap with suptitle
        axs[a].set_ylabel("TS partial dependence", fontsize=20)
        axs[a].set_xlabel(feats[target_features_idx[a]], fontsize=20)
        axs[a].set_ylim(-2.5, 1.5)
    plt.show()

    fig = plt.figure()
    target_feature = (7, 3)
    pdp, (x_axis, y_axis) = partial_dependence(brt, target_feature,
                                               X=X_train, grid_resolution=80)
    XX, YY = np.meshgrid(x_axis, y_axis)
    Z = pdp.T.reshape(XX.shape).T
    ax = Axes3D(fig)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu)
    ax.set_xlabel(feats[target_feature[0]], fontsize=18)
    ax.set_ylabel(feats[target_feature[1]], fontsize=18)
    ax.set_zlabel('$TS$', fontsize=18)
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('$TS = f(Scl,TINN)$', fontsize=18)
# def plot_feature_importances(model):
#     plt.figure(figsize=(8, 6))
#     n_features = len(names)
#     plt.barh(range(n_features), model.feature_importances_, align='center')
#     plt.yticks(np.arange(n_features), names)
#     plt.xlabel("Feature importance")
#     plt.ylabel("Feature")
#     plt.ylim(-1, n_features)
#
# plot_feature_importances(gbc)
# plt.show()

pd.DataFrame(np.vstack(gbc.feature_importances_).T,
             columns=names).T.sort_values(by=0).plot(kind='barh')

pdep = partial_dependence(gbc, features, X=X_train, grid_resolution=100)
columns_new = ['Risk Probabilities', names[feat_num]]
fico = pd.DataFrame(np.vstack(pdep).T, columns=columns_new)
fico = fico[fico[names[feat_num]] > 600]
fico.plot(names[feat_num], 'Risk Probabilities')

# ROC Curve Plot
logit_roc_auc = roc_auc_score(Y_test, gbc.predict(X_test))
fpr, tpr, thresholds = roc_curve(Y_test, gbc.predict_proba(X_test)[:, 1])
plt.figure()
plt.plot(fpr, tpr,
         label='Gradient Boosting Classifier (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
# features = [0, 5, 1, 2, (5, 1)]
# features = ['MedInc', 'AveOccup', 'HouseAge', 'AveRooms',
#             ('AveOccup', 'HouseAge')]
# fig, axs = plot_partial_dependence(clf, X_train, features,
#                                    feature_names=names,
#                                    n_jobs=1, grid_resolution=50)
# fig.suptitle('Partial dependence of house value on nonlocation features\n'
#              'for the California housing dataset')
# pl.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

print('_' * 80)
print('Custom 3d plot via ``partial_dependence``')
print()

fig = pl.figure()
target_feature = (1, 5)
pdp, (x_axis, y_axis) = partial_dependence(clf, target_feature,
                                           X=X_train, grid_resolution=50)
XX, YY = np.meshgrid(x_axis, y_axis)
Z = pdp.T.reshape(XX.shape).T
ax = Axes3D(fig)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=pl.cm.BuPu)
ax.set_xlabel(names[target_feature[0]])
ax.set_ylabel(names[target_feature[1]])
ax.set_zlabel('Partial dependence')
# pretty init view
ax.view_init(elev=22, azim=122)
pl.colorbar(surf)
pl.suptitle('Partial dependence of house value on median age and '
            'average occupancy')
pl.subplots_adjust(top=0.9)
pl.show()
params = {
    'learning_rate': 0.01,
    'loss': 'ls'
}
gbr = GradientBoostingRegressor(**params)
gbr.fit(X_train, y_train)

pd.crosstab(y_test, gbr.predict(X_test).round(),
            rownames=['Actual'], colnames=['Predicted'])

pd.DataFrame({
    'Variable': X_test.columns,
    'Importance': gbr.feature_importances_
}).sort_values('Importance', ascending=False)

fig, axs = plot_partial_dependence(
    gbr, X=X_test,
    features=['Parhelion Patrol', 'Rubblebelt Boar', 'Hammer Dropper'],
    feature_names=feature_list, n_jobs=1, grid_resolution=10)

allpd = {}
for i in range(len(feature_list) - 1):
    key, values = partial_dependence(gbr, target_variables=i, X=X_test)
    allpd.update(dict(zip([feature_list[i]], key.tolist())))
df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in allpd.items()]))
""" 1.11.4.7.2. Partial dependence """ # from sklearn.datasets import make_hastie_10_2 # from sklearn.ensemble import GradientBoostingClassifier # from sklearn.ensemble.partial_dependence import plot_partial_dependence # X, y = make_hastie_10_2(random_state=0) clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, # max_depth=1, random_state=0).fit(X, y) # features = [0, 1, (0, 1)] # fig, axs = plot_partial_dependence(clf, X, features) # from sklearn.datasets import load_iris # iris = load_iris() # mc_clf = GradientBoostingClassifier(n_estimators=10, # max_depth=1).fit(iris.data, iris.target) # features = [3, 2, (3, 2)] # fig, axs = plot_partial_dependence(mc_clf, X, features, label=0) from sklearn.ensemble.partial_dependence import plot_partial_dependence from sklearn.ensemble.partial_dependence import partial_dependence pdp, axes = partial_dependence(clf, [0], X=X) print pdp