def test_partial_dependence_unknown_feature(estimator, features):
    """Check that ``partial_dependence`` rejects the given ``features`` value.

    The estimator is fitted on a synthetic classification dataset and the
    call is expected to raise a ``ValueError`` whose message matches
    ``'all features must be in'``.
    """
    data, target = make_classification(random_state=0)
    estimator.fit(data, target)

    expected_message = 'all features must be in'
    with pytest.raises(ValueError, match=expected_message):
        partial_dependence(estimator, data, [features])
def test_multiclass_multioutput(Estimator):
    """Make sure an error is raised for multiclass-multioutput classifiers."""
    # Build a multiclass-multioutput dataset: the same 3-class target is
    # stacked twice so that the target has two output columns.
    samples, labels = make_classification(n_classes=3,
                                          n_clusters_per_class=1,
                                          random_state=0)
    stacked_labels = np.array([labels, labels]).T

    model = Estimator()
    model.fit(samples, stacked_labels)

    expected_message = "Multiclass-multioutput estimators are not supported"
    with pytest.raises(ValueError, match=expected_message):
        partial_dependence(model, samples, [0])
def test_warning_recursion_non_constant_init():
    """Recursion method with a non-constant ``init`` estimator must warn.

    A gradient boosting classifier is given a ``DummyClassifier`` as its
    ``init`` parameter; calling ``partial_dependence`` with
    ``method='recursion'`` is expected to emit a ``UserWarning``. The call
    is made twice and the warning is required on each call.
    """
    expected_warning = (
        'Using recursion method with a non-constant init predictor')

    gbc = GradientBoostingClassifier(init=DummyClassifier(), random_state=0)
    gbc.fit(X, y)

    for _ in range(2):
        with pytest.warns(UserWarning, match=expected_warning):
            partial_dependence(gbc, X, [0], method='recursion')
def test_output_shape(Estimator, method, data, grid_resolution, features):
    """Check that ``partial_dependence`` output shapes are consistent.

    Intended to cover different kinds of estimators:
    - classifiers with binary and multiclass settings
    - regressors
    - multi-task regressors

    ``data`` is a ``((X, y), n_targets)`` pair. ``n_targets`` corresponds to
    the number of classes (1 for binary classification) or the number of
    tasks / outputs in multi-task settings; it equals 1 for classical
    regression data.
    """
    model = Estimator()
    (X, y), n_targets = data
    model.fit(X, y)

    averaged, grid_axes = partial_dependence(
        model, X=X, features=features, method=method,
        grid_resolution=grid_resolution)

    n_requested = len(features)
    # one leading axis for the targets, then one grid axis per feature
    assert averaged.shape == (n_targets,) + (grid_resolution,) * n_requested
    assert grid_axes is not None
    assert np.asarray(grid_axes).shape == (n_requested, grid_resolution)
def test_partial_dependence_easy_target(est, power):
    """Partial dependence must reflect a simple single-feature target.

    The target only depends on one feature, as that feature raised to
    ``power``. After fitting ``est``, the partial dependence curve for that
    feature is regressed on polynomial features of the grid values and the
    resulting r-squared must exceed 0.99.
    """
    rng = np.random.RandomState(0)
    n_samples, n_columns = 100, 5
    target_variable = 2

    X = rng.normal(size=(n_samples, n_columns))
    y = X[:, target_variable] ** power
    est.fit(X, y)

    averaged_predictions, values = partial_dependence(
        est, features=[target_variable], X=X, grid_resolution=1000)

    grid = values[0].reshape(-1, 1)
    curve = averaged_predictions[0]

    # add polynomial features if needed
    design = PolynomialFeatures(degree=power).fit_transform(grid)
    fitted = LinearRegression().fit(design, curve)

    assert r2_score(curve, fitted.predict(design)) > .99
def test_recursion_decision_function(target_feature):
    """Recursion and brute methods must agree on ``decision_function``.

    The recursion method (which implicitly uses ``decision_function``) is
    compared with the brute method called with
    ``response_method='decision_function'``.
    """
    X, y = make_classification(n_classes=2, n_clusters_per_class=1,
                               random_state=1)
    # make sure the init estimator predicts 0 anyway
    assert np.mean(y) == .5

    est = GradientBoostingClassifier(random_state=0, loss='deviance')
    est.fit(X, y)

    averaged = {}
    for method in ('recursion', 'brute'):
        averaged[method], _ = partial_dependence(
            est, X, [target_feature],
            response_method='decision_function', method=method)

    assert_allclose(averaged['recursion'], averaged['brute'], atol=1e-7)
def test_partial_dependence_sample_weight():
    """Partial dependence must follow the heavily weighted ``y = x`` samples.

    Tests near perfect correlation between the partial dependence and the
    diagonal when sample weights emphasize ``y = x`` predictions.
    Non-regression test for #13193.
    """
    N = 1000
    rng = np.random.RandomState(123456)
    mask = rng.randint(2, size=N, dtype=bool)
    x = rng.rand(N)

    # y = x where the mask is set and y = -x elsewhere
    y = np.where(mask, x, -x)
    X = np.c_[mask, x]

    # sample weights emphasize the data points where y = x
    sample_weight = np.where(mask, 1000., 1.)

    clf = GradientBoostingRegressor(n_estimators=10, random_state=1)
    clf.fit(X, y, sample_weight=sample_weight)

    pdp, values = partial_dependence(clf, X, features=[1])

    correlation = np.corrcoef(pdp, values)[0, 1]
    assert correlation > 0.99
def test_partial_dependence_error(estimator, params, err_msg):
    """Check that invalid keyword arguments raise the expected ``ValueError``.

    ``params`` is expanded as keyword arguments to ``partial_dependence``
    and the raised message must match ``err_msg``.
    """
    data, target = make_classification(random_state=0)
    estimator.fit(data, target)

    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, data, **params)
def test_partial_dependence_unfitted_estimator(estimator):
    """An estimator that was never fitted must be rejected.

    Uses the module-level ``X`` as input data.
    """
    expected_message = "'estimator' parameter must be a fitted estimator"
    with pytest.raises(ValueError, match=expected_message):
        partial_dependence(estimator, X, [0])
def plot_partial_dependence_bootstrap(model, X_train, y_train, features,
                                      feature_name, n_boot=5,
                                      random_state=None):
    """Plot partial dependence curves estimated on bootstrap resamples.

    ``n_boot`` clones of ``model`` are fitted, each on a bootstrap resample
    (drawn with replacement) of the training data. For every entry of
    ``features`` one axis is drawn:

    - when ``partial_dependence`` returns two grids (a pair of features), a
      contour plot of the mean over the bootstrap estimators;
    - otherwise one dashed line per bootstrap estimator plus their mean in
      red.

    Parameters
    ----------
    model : indexable estimator
        ``model[0]`` is used as the preprocessing step and ``est[-1]`` as
        the final predictor of each clone (presumably a scikit-learn
        ``Pipeline`` -- confirm against callers).
    X_train : object supporting ``.iloc`` and ``.shape``
        Training features (presumably a pandas DataFrame).
    y_train : array-like
        Training target, indexed with an integer index array.
    features : sequence
        Feature specifications forwarded one by one to
        ``partial_dependence``.
    feature_name : indexable
        Maps a feature specification to the label shown on the axis.
    n_boot : int, default=5
        Number of bootstrap estimators.
    random_state : int, RandomState instance or None
        Forwarded to ``check_random_state``.

    Notes
    -----
    ``model[0]`` is re-fitted on ``X_train`` as a side effect. The
    preprocessing does not depend on the feature being plotted, so it is
    computed once rather than once per feature.
    """
    rng = check_random_state(random_state)
    n_samples = X_train.shape[0]

    # fit a model for each bootstrap sample
    all_estimators = [clone(model) for _ in range(n_boot)]
    for est in all_estimators:
        bootstrap_idx = rng.choice(np.arange(n_samples), size=n_samples,
                                   replace=True)
        est.fit(X_train.iloc[bootstrap_idx], y_train[bootstrap_idx])

    # prepare the plotting: at most n_fig axes per row
    n_fig = 3
    n_rows = -(-len(features) // n_fig)  # ceiling division
    n_cols = len(features) if n_rows == 1 else n_fig
    fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols,
                            figsize=(n_cols * 5, n_rows * 5))

    # loop-invariant: identical for every feature
    X_train_preprocessed = model[0].fit_transform(X_train)

    for feat, ax in zip(features, np.ravel(axs)):
        # compute the partial dependence for each model
        avg_preds_bootstrap = []
        for est in all_estimators:
            avg_preds, values = partial_dependence(
                est[-1], X_train_preprocessed, feat, grid_resolution=20)
            avg_preds_bootstrap.append(avg_preds)

        mean_avg_preds = np.mean(avg_preds_bootstrap, axis=0)

        if len(values) == 2:
            # contour plot of the mean of the average predictions
            Z_level = np.linspace(mean_avg_preds.min(),
                                  mean_avg_preds.max(), 8)
            XX, YY = np.meshgrid(values[0], values[1])
            Z = mean_avg_preds[0].T
            CS = ax.contour(XX, YY, Z, levels=Z_level, linewidths=0.5,
                            colors='k')
            ax.contourf(XX, YY, Z, levels=Z_level, vmax=Z_level[-1],
                        vmin=Z_level[0], alpha=0.75)
            ax.clabel(CS, fmt='%2.2f', colors='k', fontsize=10, inline=True)
            ax.set_xlabel(feature_name[feat[0]])
            ax.set_ylabel(feature_name[feat[1]])
        else:
            # plot all average predictions and their mean
            for preds in avg_preds_bootstrap:
                ax.plot(values[0], preds[0], '--k', linewidth=1, alpha=0.5)
            ax.plot(values[0], mean_avg_preds[0], 'r', alpha=0.8,
                    label='Average')
            ax.set_xlabel(feature_name[feat])
            ax.set_ylabel('WAGE')
            ax.legend()

    plt.tight_layout()
def plot_oneway_partial_dependence(GBR_models_split_lags, keys=None,
                                   lags=None, grid_resolution=20):
    """Plot one-way partial dependence per feature, averaged over splits.

    For every lag and every feature key, ``partial_dependence`` is evaluated
    on the test rows of each split model that has the feature; the mean and
    standard deviation over the splits are stored and plotted (mean line
    with a +/- 2 std band), one facet per key and one line style / color per
    lag.

    Parameters
    ----------
    GBR_models_split_lags : dict
        Maps ``'lag_<int>'`` to a dict of split models. Each split model is
        expected to expose ``X_pred`` (DataFrame of features) and
        ``df_norm['TrainIsTrue']``.
    keys : iterable of str or None
        Feature names to plot. When None, the union of all ``X_pred``
        columns over the selected lags is used, excluding the mask column
        names listed in ``masks``.
    lags : list of int or None
        Lags to plot. When None, the first three lags found in
        ``GBR_models_split_lags`` are used.
    grid_resolution : int, default=20
        Forwarded to ``partial_dependence``.

    Returns
    -------
    df_lags : pandas.DataFrame
        Indexed by (lag, key, grid point) with columns ``y_mean``,
        ``y_std``, ``x_vals`` and ``count splits``.
    fig : matplotlib figure
        The figure of the seaborn ``FacetGrid``.

    Notes
    -----
    Relies on the module-level ``line_styles`` and ``colors_datasets``.
    Each per-key frame is stored under its own key before concatenation, so
    the labels of the resulting index always match the data; labelling the
    frames with a separately sorted key array would mislabel them whenever
    ``keys`` is not already sorted.
    """
    #%%
    sns.set_style("whitegrid")
    sns.set_style(rc={'axes.edgecolor': 'black'})

    if lags is None:
        lag_keys = GBR_models_split_lags.keys()
        lags = [int(l.split('_')[1]) for l in lag_keys][:3]

    if keys is None:
        keys = set()
        for lag in lags:
            # collect the feature names of every split model at this lag
            for regressor in GBR_models_split_lags[f'lag_{lag}'].values():
                keys.update(list(regressor.X_pred.columns))
        masks = ['TrainIsTrue', 'x_fit', 'x_pred', 'y_fit', 'y_pred']
        keys = [k for k in keys if k not in masks]

    df_lags = []
    for lag in lags:
        # get models at lag
        GBR_models_split = GBR_models_split_lags[f'lag_{lag}']

        frames_by_key = {}
        for key in keys:
            y = []
            x = []
            for regressor in GBR_models_split.values():
                X_pred = regressor.X_pred
                columns = list(X_pred.columns)
                if key not in columns:
                    continue
                index = columns.index(key)
                TrainIsTrue = regressor.df_norm['TrainIsTrue']
                # rows of X_pred that are not flagged as training rows
                TestIsTrue = TrainIsTrue.loc[X_pred.index] == False
                X_test = X_pred[TestIsTrue]
                _y, _x = partial_dependence(
                    regressor, X=X_test, features=[index],
                    grid_resolution=grid_resolution)
                y.append(_y[0])
                x.append(_x[0])

            if len(y) == 0:
                # no split model at this lag has this feature
                continue

            # statistics over the splits in which the key is present
            y_mean = np.array(y).mean(0)
            y_std = np.std(y, 0).ravel()
            x_vals = np.mean(x, 0)
            count_splits = np.repeat(np.array(y).shape[0], y_mean.shape)
            data = np.concatenate(
                [y_mean[:, None], y_std[:, None],
                 x_vals[:, None], count_splits[:, None]],
                axis=1)
            frames_by_key[key] = pd.DataFrame(
                data,
                columns=['y_mean', 'y_std', 'x_vals', 'count splits'])

        # sorted labels, each paired with its own frame
        ordered_keys = sorted(frames_by_key)
        df_keys = pd.concat([frames_by_key[k] for k in ordered_keys],
                            keys=ordered_keys)
        df_lags.append(df_keys)
    df_lags = pd.concat(df_lags, keys=lags)

    # =========================================================================
    # Plotting
    # =========================================================================
    #%%
    col_wrap = 4
    g = sns.FacetGrid(pd.DataFrame(data=keys), col=0, col_wrap=col_wrap,
                      aspect=1.5, sharex=False)
    custom_lines = []
    _legend = []
    for l, lag in enumerate(lags):
        style = line_styles[l]
        color = colors_datasets[l]
        custom_lines.append(
            Line2D([0], [0], linestyle=style, color=color, lw=4,
                   markersize=10))
        _legend.append(f'lag {lag}')

        for i, key in enumerate(keys):
            ax = g.axes[i]
            df_plot = df_lags.loc[lag, key]
            y_mean = df_plot['y_mean']
            y_std = 2 * df_plot['y_std']
            x_vals = df_plot['x_vals']
            ax.fill_between(x_vals, y_mean - y_std, y_mean + y_std,
                            color=color, linestyle=style, alpha=0.2)
            ax.plot(x_vals, y_mean, color=color, linestyle=style)
            ax.set_title(key)
            if i == 0:
                # the legend lives on the first facet only
                ax.legend(custom_lines, _legend, handlelength=3)
    return df_lags, g.fig
def test_partial_dependence_X_list(estimator):
    """Check that array-like objects (here a plain list) are accepted."""
    data, target = make_classification(random_state=0)
    estimator.fit(data, target)

    data_as_list = list(data)
    partial_dependence(estimator, data_as_list, [0])
def test_interpret(self):
    """Compare ``self.pdp.interpret`` with scikit-learn's partial dependence.

    A random forest is trained on iris and scikit-learn's
    ``partial_dependence`` gives the reference values. Predictions that were
    previously saved to ``./misc/predictions.pickle`` are then replayed
    through a mocked model, first as one flattened response and then as one
    response per call (``one_api_call=False``), and the result of
    ``self.pdp.interpret`` must equal the reference in both cases.
    """
    iris = load_iris()
    X, y = iris.data, iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=42)
    classifier = RandomForestClassifier(random_state=42)
    classifier.fit(X_train, y_train)

    pdp, axes = partial_dependence(classifier, X, [0],
                                   response_method='predict_proba',
                                   percentiles=(0.05, 0.95),
                                   grid_resolution=100,
                                   method='brute')

    # ensure the original pdp implemented by scikit-learn works as expected
    assert pytest.approx(pdp[0][0]) == 0.34126667
    assert pytest.approx(pdp[0][-1]) == 0.29406667

    # load predictions that have been saved from the exact model trained
    # above and run through serving engine v1.0.2
    # NOTE: pickle must only be used on trusted files such as this fixture.
    with open("./misc/predictions.pickle", "rb") as handle:
        predictions = pickle.load(handle)

    # simulate the responses from the serving engine: a single call that
    # returns all predictions flattened into one list
    model = Mock()
    model.output_name = 'output_probability'
    flattened = [item for sublist in predictions for item in sublist]
    model.estimate = Mock(
        side_effect=[{'output_probability': flattened}])

    sim_pdp, sim_axes = self.pdp.interpret(model=model,
                                           X=X,
                                           features=[0],
                                           percentiles=(0.05, 0.95),
                                           grid_resolution=100)

    # check that pdp values computed from predictions of the serving engine
    # match scikit-learn's pdp implementation
    assert pdp[0][0] == sim_pdp[0][0]
    assert (pdp == sim_pdp).all()
    assert (axes[0] == sim_axes[0]).all()

    # Try with multiple api calls: one mocked response per saved batch
    model.estimate = Mock(side_effect=[
        {'output_probability': predictions[i]}
        for i in range(len(predictions))
    ])

    sim_pdp, sim_axes = self.pdp.interpret(model=model,
                                           X=X,
                                           features=[0],
                                           percentiles=(0.05, 0.95),
                                           grid_resolution=100,
                                           one_api_call=False)

    # check that pdp values computed from predictions of the serving engine
    # match scikit-learn's pdp implementation
    assert pdp[0][0] == sim_pdp[0][0]
    assert (pdp == sim_pdp).all()
    assert (axes[0] == sim_axes[0]).all()
# Predict on the test inputs and map the predictions back to the original
# target scale.
y_predict = scaler_Y.inverse_transform(
    net.predict(scaler_X.transform(x_test)))

# Line chart of the samples against the predictions, rendered to HTML.
line = Line()
line.add_xaxis(range(1, len(test_Y) + 1))
line.add_yaxis('samples', test_Y.reshape(-1,),
               label_opts=opts.LabelOpts(is_show=False))
line.add_yaxis('predict', y_predict,
               label_opts=opts.LabelOpts(is_show=False))
line.set_global_opts(title_opts=opts.TitleOpts(title="line demo"))
line.render('./html/line.html')

# 3D surface
target_feature = (1, 2)
pdp, axes = partial_dependence(net,
                               scaler_X.transform(x_train),
                               target_feature,
                               grid_resolution=30)
XX, YY = np.meshgrid(axes[0], axes[1])
Z = pdp[0].T
names = ['Num', 'x2', 'x4', 'temp']

fig = plt.figure()
ax = Axes3D(fig)
# Recent Matplotlib releases no longer attach an Axes3D to its figure
# automatically; adding it explicitly works on old and new versions.
fig.add_axes(ax)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(names[target_feature[0]])
# MAGIC %md #### Read from db # COMMAND ---------- predDF_final_done = spark.read.format("jdbc").option("url", "jdbc:mysql://sx2200-gr5069.ccqalx6jsr2n.us-east-1.rds.amazonaws.com/sx2200") \ .option("driver", "com.mysql.jdbc.Driver").option("dbtable", "test_lr_preds") \ .option("user", "admin").option("password", "Xs19980312!").load() # COMMAND ---------- # MAGIC %md #### Marginal Effects # COMMAND ---------- from sklearn.inspection import plot_partial_dependence, partial_dependence from sklearn.datasets import make_friedman1 from sklearn.linear_model import LinearRegression from sklearn.ensemble import GradientBoostingRegressor %matplotlib inline # COMMAND ---------- factors = X_train[['race_count','lag1_avg']] # plot the partial dependence (marginal effect) plot_partial_dependence(logreg, X_train, factors) # get the partial dependence (marginal effect) partial_dependence(logreg, X_train_s, [0]) # COMMAND ---------- # From the plots, the marginal effect of race_count and lag1_avg are shown. The relationship between races completed and whether or not the constructor would win a season is a linear relation with positive slope, and the line is quite cliffy. The relation between average points earned in last season and the constructor championship is also positive, while the slope of this curve is smaller than the slope of race_count.
def plot_pdp(clf,
             X,
             feature,
             scaler=None,
             column_names=None,
             query=None,
             xlabel=None,
             show_deciles=True,
             show_distplot=False,
             y=None,
             pardep_kws={},
             plt_kws={},
             distplot_kws={},
             ax=None):
    """
    Plot the partial dependence on `feature` for samples satisfying `query`.

    Parameters
    ----------
    clf : model compatible with sklearn.inspection.partial_dependence
        must also provide `get_booster().feature_names`
    X : pd.DataFrame or 2D array of numbers
        data used to calculate the partial dependence;
        both the training and the hold-out set (or both combined)
        are reasonable for this purpose
    feature : str
        name of the feature to compute the pdp w.r.t.
    scaler : sklearn-compatible scaler e.g. StandardScaler
        scaler used to scale the training data; `X` is then taken to be
        scaled data, and `var_` / `mean_` of the scaler are used to map the
        grid back to original units
    column_names : iterable of strings
        names of the columns in the dataset, if None (default) then X
        has to be a dataframe with correct columns;
        other args such as `feature` or `query` rely on it
    query : str
        selection criteria, only samples passing it will be used to
        compute the pdp; has to be valid input for pd.DataFrame.query()
    xlabel : str or None
        xlabel, if None (default) `feature` will be used
    show_deciles : bool
        whether small vertical lines (seaborn's rugs) corresponding to the
        deciles of the selected `feature` values should be shown,
        selected i.e. passing `query`
    show_distplot : bool
        whether the distribution of `feature` should be plotted below the
        pdp; `y` is forwarded to the distribution helper (per the original
        documentation it is then grouped by y=0/1)
    y : array of numbers
        sample labels, used to split the distplot,
        used only if show_distplot is True
    pardep_kws : dict
        passed to sklearn.inspection.partial_dependence (never mutated)
    plt_kws : dict
        passed to plt.plot (never mutated)
    distplot_kws : dict
        passed to the distribution helper, has some defaults - see code
        (never mutated; a merged copy is built)
    ax : matplotlib.axes._subplots.AxesSubplot object or None
        axes to plot on
        default=None, meaning axes are created inside the function

    Returns
    -------
    ax
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(7, 5))
    if column_names is None:
        column_names = X.columns

    # work in original (unscaled) units for querying and labelling
    X_orig = scaler.inverse_transform(X) if scaler else X
    df = pd.DataFrame(X_orig)
    df.columns = column_names
    if query:
        df = df.query(query)

    # Model input: scaled again when a scaler is given. Without a scaler a
    # copy is taken so that renaming the model-side columns below cannot
    # rename the columns of `df`, which is still indexed by `feature` later.
    df_xgb = pd.DataFrame(scaler.transform(df)) if scaler else df.copy()

    feat_idx = list(column_names).index(feature)
    if feature not in clf.get_booster().feature_names:
        # the booster does not know the real names: use positional f<i>
        df_xgb.columns = [f'f{i}' for i in range(df_xgb.shape[1])]
        feat_name_xgb = f'f{feat_idx}'
    else:
        df_xgb.columns = df.columns
        feat_name_xgb = feature

    # rows with a missing value for the feature are left out
    part_dep, feat_vals = partial_dependence(
        clf,
        df_xgb[df_xgb[feat_name_xgb].notna()],
        features=[feat_name_xgb],
        **pardep_kws)
    part_dep, feat_vals = np.array(part_dep[0]), np.array(feat_vals[0])

    if scaler:
        # undo standardisation of the grid: x * std + mean
        feat_vals_orig = feat_vals * np.sqrt(
            scaler.var_[feat_idx]) + scaler.mean_[feat_idx]
    else:
        feat_vals_orig = feat_vals

    ax.plot(feat_vals_orig, part_dep, lw=3, **plt_kws)
    # only ever widen the x-range so that existing content stays visible
    ax.set_xlim(left=min(ax.get_xlim()[0], min(feat_vals_orig)),
                right=max(ax.get_xlim()[1], max(feat_vals_orig)))

    vals = df[feature]
    if show_deciles:
        xlim = ax.get_xlim()
        deciles = np.nanpercentile(vals, np.arange(0, 101, 10))
        sns.rugplot(deciles, ax=ax)
        # restore the limits set before the rug was drawn
        ax.set_xlim(xlim)

    if show_distplot:
        distplot_default_kws = dict(bins=np.linspace(*ax.get_xlim(), 100),
                                    distplot_y_frac=0.8)
        # passed `distplot_kws` overwrites defaults
        distplot_kws = {**distplot_default_kws, **distplot_kws}
        _add_distplot(ax, vals, y=y, **distplot_kws)

    ax.set_xlabel(xlabel if xlabel else feature)
    ax.set_ylabel('partial dependence')
    return ax
def get(request):
    """Train a random forest on the requested dataset and return a summary.

    For GET requests the query parameters ``name`` (dataset name),
    ``treeNum`` (number of trees) and ``treeDeep`` (maximum depth) are read.
    The dataset ``<name>1.csv`` is loaded from the data directory, a
    ``RandomForestClassifier`` is fitted, and the response contains the
    records (with predictions, class probabilities and 2-D t-SNE
    coordinates), feature importances, per-feature min / max, AUC and the
    confusion matrix.

    Side effects: the module-level globals ``totaldata``, ``estimator``,
    ``mdiFeature``, ``feature`` and ``pdpData`` are overwritten.

    Returns
    -------
    JsonResponse
    """
    # function-scope import: only needed to sanitise the requested name
    import os

    global totaldata, estimator, mdiFeature, feature, pdpData

    data_dir = 'D:/PyCharm 2020.2.3/djangoProject/djangoProject/data/'

    filename = "titanic_train.csv"
    treeNum = 0
    treeDeep = 0
    if request.method == 'GET':
        filename = request.GET.get('name')
        treeNum = request.GET.get('treeNum')
        treeDeep = request.GET.get('treeDeep')
    print(f"{filename} {treeNum} {treeDeep}")

    # The name comes from the request: keep only its last path component so
    # that it cannot point outside the data directory.
    data = pd.read_csv(data_dir + os.path.basename(filename) + '1.csv')

    dp = None
    if filename == 'german':
        y_train = data.pop('Creditability')
        dp = pd.read_csv(data_dir + 'germandis.csv')
    else:
        y_train = data.pop('survived')

    # random forest model parameters
    rfc = RandomForestClassifier(max_depth=int(treeDeep),
                                 n_estimators=int(treeNum),
                                 random_state=60)

    totaldata = data.copy()
    # list of feature names (taken before any result column is added)
    featureList = totaldata.columns.values.tolist()

    # training
    rfc.fit(data, y_train)
    estimator = rfc

    importancenData = permutation_importance(rfc, data, y_train,
                                             n_repeats=100, random_state=16)
    mdiFeature = rfc.feature_importances_.tolist()
    feature = importancenData.importances_mean.tolist()
    print("排列重要性")
    print(feature)
    print("mdi重要性")
    print(mdiFeature)

    # compute the partial dependence of every feature
    pdpData = []
    for index, value in enumerate(data.columns.values):
        pdp, axes = partial_dependence(rfc, data, index)
        pdpData.append({"name": value,
                        "axes": axes[0].tolist(),
                        "pdp": pdp[0].tolist()})

    # t-SNE dimensionality reduction
    # NOTE(review): `dp` is only loaded for the 'german' dataset; for any
    # other dataset it is still None here -- confirm the intended input.
    tsne = TSNE(learning_rate=100.0)
    array = tsne.fit_transform(dp).tolist()

    # predictions and predicted probabilities; they must be computed before
    # the result columns are added to `data`
    y_pre = rfc.predict(data)
    predict_prob = rfc.predict_proba(data)
    data['predict0'] = [row[0] for row in predict_prob]
    data['predict1'] = [row[1] for row in predict_prob]

    # add the predicted and the true values to the data
    data['predict'] = y_pre
    data['true'] = y_train

    data['x'] = [point[0] for point in array]
    data['y'] = [point[1] for point in array]
    data.insert(0, 'id', list(range(len(data))))

    # metrics are computed once and reused for logging and the response
    auc = metrics.roc_auc_score(y_train, y_pre)
    confusion = metrics.confusion_matrix(y_train, y_pre, labels=None,
                                         sample_weight=None)
    print("AUC Score (Train): %f" % auc)
    print(confusion)

    da = data.to_dict(orient='records')

    # builtin min / max are kept on purpose (not Series.min / .max)
    featureListMin = [min(data[name]) for name in featureList]
    featureListMax = [max(data[name]) for name in featureList]

    return JsonResponse({'data': da,
                         'featureList': featureList,
                         'featureImportance': feature,
                         'mdiFeatureImportance': mdiFeature,
                         'featureListMin': featureListMin,
                         'featureListMax': featureListMax,
                         'auc': auc,
                         'confusionMatrix': confusion.tolist()},
                        safe=False)
) # 2 ランダムフォレストによる学習 ------------------------------------------------- # モデル構築 # --- インスタンスの生成 # --- 学習 rf = RandomForestRegressor(n_jobs=-1, random_state=42) rf.fit(X_train, y_train) # 3 PDとICEによる解釈 ------------------------------------------------------------- # PDとICEの計算 ice = partial_dependence(estimator=rf, X=X_test, features=["RM"], kind="both") ice # プロット定義 def plot_ice(): fig, ax = plt.subplots(figsize=(8, 4)) plot_partial_dependence(estimator=rf, X=X_test, features=["RM"], kind="both", ax=ax) fig.show() # プロット作成 plot_ice()
def main():
    """Draw partial dependence plots for the California housing dataset.

    Fits an ``MLPRegressor`` and a ``GradientBoostingRegressor`` on the
    centered target, draws the partial dependence plots of each with
    ``plot_partial_dependence``, then builds a custom 3D surface from the
    raw ``partial_dependence`` output of the gradient boosting model.
    """
    cal_housing = fetch_california_housing()
    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    # Recent Matplotlib releases no longer attach an Axes3D to its figure
    # automatically; adding it explicitly works on old and new versions.
    fig.add_axes(ax)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
# Getting scores for each column as partial dependence value
from sklearn.inspection import partial_dependence

# List that stores the maximum dependency value for each column,
# as [max value, column name] pairs.
listt = []

for i, column_name in enumerate(data.columns.values):
    b = partial_dependence(clf, features=[i], X=data, percentiles=(0, 1))
    listt.append([b[0].max(), column_name])

# Write the list to a file, sorted by partial dependence value
my_df = pd.DataFrame(listt, columns=['PDValues', 'ColumnName'])
my_df = my_df.sort_values(by='PDValues')
my_df.to_csv('PDValuesAdaBoost.csv', index=False)
print("Columns Significance Saved to PDValuesAdaBoost.csv")

# Now we go for Precision Recall Curve metrics
precision, recall, thresholds = metrics.precision_recall_curve(ytest, ypred)
def main():
    """Draw partial dependence plots for the California housing dataset.

    Fits an ``MLPRegressor`` and a ``GradientBoostingRegressor`` on the
    centered target, draws the partial dependence plots of each with
    ``plot_partial_dependence``, then builds a custom 3D surface from the
    raw ``partial_dependence`` output of the gradient boosting model.
    """
    cal_housing = fetch_california_housing()
    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    # Recent Matplotlib releases no longer attach an Axes3D to its figure
    # automatically; adding it explicitly works on old and new versions.
    fig.add_axes(ax)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
# age.
#
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the 2 features interaction,
# this time in 3 dimensions.

import numpy as np
from mpl_toolkits.mplot3d import Axes3D

fig = plt.figure()

features = ('AveOccup', 'HouseAge')
pdp = partial_dependence(est, X_train, features=features, kind='average',
                         grid_resolution=20)
XX, YY = np.meshgrid(pdp["values"][0], pdp["values"][1])
Z = pdp.average[0].T
ax = Axes3D(fig)
# Recent Matplotlib releases no longer attach an Axes3D to its figure
# automatically; adding it explicitly works on old and new versions.
fig.add_axes(ax)
surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                       cmap=plt.cm.BuPu, edgecolor='k')
ax.set_xlabel(features[0])
ax.set_ylabel(features[1])
ax.set_zlabel('Partial dependence')
] plot_partial_dependence(est, X_train, features, n_jobs=3, grid_resolution=20) print("done in {:.3f}s".format(time() - tic)) fig = plt.gcf() fig.suptitle('Partial dependence of house value on non-location features\n' 'for the California housing dataset, with Gradient Boosting') fig.subplots_adjust(wspace=0.4, hspace=0.3) ############################################################################## # 3D interaction plots (2D PDP) fig = plt.figure() features = ('AveOccup', 'HouseAge') pdp, axes = partial_dependence(est, X_train, features=features, grid_resolution=20) XX, YY = np.meshgrid(axes[0], axes[1]) Z = pdp[0].T ax = Axes3D(fig) surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu, edgecolor='k') ax.set_xlabel(features[0]) ax.set_ylabel(features[1]) ax.set_zlabel('Partial dependence') # pretty init view
def main():
    """Compare partial dependence plots of three regressors.

    Trains an ``SNN_Regressor`` (through ``Trainer.train`` with cool /
    prune / grow triggers), an ``MLPRegressor`` and a
    ``GradientBoostingRegressor`` on the centered California housing
    target, draws the partial dependence plots of each, then builds a
    custom 3D surface from the raw ``partial_dependence`` output of the
    gradient boosting model.
    """
    cal_housing = fetch_california_housing()
    X, y = cal_housing.data, cal_housing.target
    names = cal_housing.feature_names

    # Center target to avoid gradient boosting init bias: gradient boosting
    # with the 'recursion' method does not account for the initial estimator
    # (here the average target, by default)
    y -= y.mean()

    print("Training SNN_Regressor...")
    est = SNN_Regressor(8, 1, 10, 10,
                        hiddenAct=Activation.Tanh(),
                        error=Error.Mse(),
                        update=Update.RmsProp(0.001, rateDecay=0.9))
    # (first element, callback) pairs handed to Trainer.train as triggers
    t = [
        (3, lambda e: e.cool()),  # cool
        (6, lambda e: Trainer.prune(e, X, y)),  # prune
        # ( 18, lambda e: e.cool() ), # cool
        (9, lambda e: Trainer.grow(
            e, max(1, 1 + int(np.log(e.hiddenSize_ + 1))))),  # grow
        # ( 11, lambda e: e.cool() ), # cool
    ]
    growLoss = Trainer.train(est, X, y, batch=1, maxIter=100, triggers=t)
    est.maxIter_ = 1000

    plt.semilogy(growLoss, label='Grow')
    plt.legend()
    # plt.show()
    # pdb.set_trace()

    print("SNN weights:", est.weight_)
    print("SNN dweight:", est.dWeight_)
    print("SNN nHidden:", est.hiddenSize_)

    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with SNN_Regressor...')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training MLPRegressor...")
    est = MLPRegressor(activation='logistic')
    est.fit(X, y)
    print('MLP Loss: ', np.average(Error.Mse().f(y, est.predict(X))))
    print('Computing partial dependence plots...')
    # We don't compute the 2-way PDP (5, 1) here, because it is a lot slower
    # with the brute method.
    features = [0, 5, 1, 2]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with MLPRegressor')
    plt.subplots_adjust(top=0.9)  # tight_layout causes overlap with suptitle

    print("Training GradientBoostingRegressor...")
    est = GradientBoostingRegressor(n_estimators=100, max_depth=4,
                                    learning_rate=0.1, loss='huber',
                                    random_state=1)
    est.fit(X, y)
    print('Computing partial dependence plots...')
    features = [0, 5, 1, 2, (5, 1)]
    plot_partial_dependence(est, X, features, feature_names=names,
                            n_jobs=3, grid_resolution=50)
    fig = plt.gcf()
    fig.suptitle('Partial dependence of house value on non-location features\n'
                 'for the California housing dataset, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    print('Custom 3d plot via ``partial_dependence``')
    fig = plt.figure()

    target_feature = (1, 5)
    pdp, axes = partial_dependence(est, X, target_feature,
                                   grid_resolution=50)
    XX, YY = np.meshgrid(axes[0], axes[1])
    Z = pdp[0].T
    ax = Axes3D(fig)
    # Recent Matplotlib releases no longer attach an Axes3D to its figure
    # automatically; adding it explicitly works on old and new versions.
    fig.add_axes(ax)
    surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1,
                           cmap=plt.cm.BuPu, edgecolor='k')
    ax.set_xlabel(names[target_feature[0]])
    ax.set_ylabel(names[target_feature[1]])
    ax.set_zlabel('Partial dependence')
    # pretty init view
    ax.view_init(elev=22, azim=122)
    plt.colorbar(surf)
    plt.suptitle('Partial dependence of house value on median\n'
                 'age and average occupancy, with Gradient Boosting')
    plt.subplots_adjust(top=0.9)

    plt.show()
def test_partial_dependence_error(estimator, params, err_msg):
    """Check that invalid keyword arguments raise the expected ``ValueError``.

    ``params`` is expanded as keyword arguments to ``partial_dependence``
    and the raised message must match ``err_msg``.
    """
    data, target = make_classification(random_state=0)
    estimator.fit(data, target)

    with pytest.raises(ValueError, match=err_msg):
        partial_dependence(estimator, data, **params)
#
# 3D interaction plots
# --------------------
#
# Let's make the same partial dependence plot for the 2 features interaction,
# this time in 3 dimensions.
import numpy as np
from mpl_toolkits.mplot3d import Axes3D

from sklearn.inspection import partial_dependence

fig = plt.figure()

features = ("AveOccup", "HouseAge")
pdp = partial_dependence(
    est,
    X_train,
    features=features,
    kind="average",
    grid_resolution=10,
)
XX, YY = np.meshgrid(pdp["values"][0], pdp["values"][1])
Z = pdp.average[0].T

# The 3D axes are created on the figure and then added to it explicitly.
ax = Axes3D(fig)
fig.add_axes(ax)

surf = ax.plot_surface(
    XX,
    YY,
    Z,
    rstride=1,
    cstride=1,
    cmap=plt.cm.BuPu,
    edgecolor="k",
)
ax.set_xlabel(features[0])
def test_partial_dependence_unfitted_estimator(estimator):
    """An estimator that was never fitted must be rejected.

    Uses the module-level ``X`` as input data.
    """
    expected_message = "'estimator' parameter must be a fitted estimator"
    with pytest.raises(ValueError, match=expected_message):
        partial_dependence(estimator, X, [0])
def test_partial_dependence_X_list(estimator):
    """Check that ``partial_dependence`` accepts ``X`` given as a list."""
    samples, labels = make_classification(random_state=0)
    estimator.fit(samples, labels)

    # Hand the data over as a plain list to exercise the array-like input
    # path; the call only has to complete without raising.
    samples_as_list = list(samples)
    partial_dependence(estimator, samples_as_list, [0])
ens.score(X_val, y_val) #Once you are confident about your final model, measure its performance on the test set to estimate the generalization error #Model interpretability #Feature importance import eli5 from eli5.sklearn import PermutationImportance perm = PermutationImportance(model, random_state=101).fit(X_val, y_val) eli5.show_weights(perm, feature_names=X_val.columns.tolist()) #Partial dependence plot #New integration in sklearn, might not work with older versions from sklearn.inspection import partial_dependence, plot_partial_dependence partial_dependence(model, X_train, features=['feature', ('feat1', 'feat2')]) plot_partial_dependence(model, X_train, features=['feature', ('feat1', 'feat2')]) #With external module for legacy editions from pdpbox import pdp, get_dataset, info_plots #Create the data that we will plot pdp_goals = pdp.pdp_isolate(model=model, dataset=X_val, model_features=X_val.columns, feature='Goals Scored') #plot it pdp.pdp_plot(pdp_goals, 'Goals Scored') plt.show()
def _pdp_curve(pardep):
    """Return ``(grid, averaged)`` for the first feature/output of a PDP result."""
    if isinstance(pardep, dict):
        # Newer scikit-learn returns a dict-like Bunch; the grid entry was
        # renamed from 'values' to 'grid_values'.
        grid_key = 'grid_values' if 'grid_values' in pardep else 'values'
        return pardep[grid_key][0], pardep['average'][0]
    # Older scikit-learn returns an (averaged_predictions, values) tuple.
    return pardep[1][0], pardep[0][0]


def _percent_tick_labels(axis):
    """Return the y tick labels of *axis* rewritten as whole percentages."""
    percent_labels = []
    for tick in axis.get_yticklabels():
        text = tick.get_text()
        if not text:
            # A tick without rendered text has nothing to convert.
            percent_labels.append(text)
            continue
        # Matplotlib renders negative numbers with U+2212 (MINUS SIGN),
        # which float() does not parse.
        value = float(text.replace('\u2212', '-'))
        # Round before converting: plain truncation turns 0.29 * 100
        # (28.999999999999996) into 28.
        percent_labels.append('{}%'.format(int(round(value * 100))))
    return percent_labels


def plot_pdp(model, x, feature, target=False, return_pd=False, y_pct=True,
             figsize=(10, 9), norm_hist=True, dec=.5):
    """
    Plot the partial dependence of a model on one feature using sklearn,
    overlaid with a histogram of the distribution of the observations.

    Parameters:
        model: Fitted estimator, passed to ``partial_dependence``.
        x: Data passed to ``partial_dependence``; must also support
            ``x[feature]`` (e.g. a DataFrame) for the histogram.
        feature: Feature to plot; also used as the x-axis label.
        target: Name of the target shown in the y label and title.
            When falsy, the title uses 'Target Variable'.
        return_pd (bool): If True, return the raw ``partial_dependence``
            result.
        y_pct (bool): If True, show y ticks as percentages (left axis only
            when the partial dependence lies within [0, 1]; right axis only
            when ``norm_hist`` is also True).
        figsize (tuple): Figure size passed to ``plt.subplots``.
        norm_hist (bool): Passed as ``density`` to the histogram.
        dec (float): Height of the dashed 'Decision Boundary' line.

    Returns:
        The object returned by ``partial_dependence`` when ``return_pd`` is
        True, otherwise None. The figure is shown with ``plt.show()``.
    """
    # Get partial dependence
    pardep = partial_dependence(model, x, [feature])
    grid, averaged = _pdp_curve(pardep)

    # Get min & max values
    xmin, xmax = grid.min(), grid.max()
    ymin, ymax = averaged.min(), averaged.max()

    # Create figure
    fig, ax1 = plt.subplots(figsize=figsize)
    ax1.grid(alpha=.5, linewidth=1)

    # Plot partial dependence
    color = 'tab:blue'
    ax1.plot(grid, averaged, color=color)
    ax1.tick_params(axis='y', labelcolor=color)
    ax1.set_xlabel(feature, fontsize=14)

    tar_ylabel = ': {}'.format(target) if target else ''
    ax1.set_ylabel('Partial Dependence{}'.format(tar_ylabel),
                   color=color, fontsize=14)

    tar_title = target if target else 'Target Variable'
    ax1.set_title('Relationship Between {} and {}'.format(feature, tar_title),
                  fontsize=16)

    if y_pct and ymin >= 0 and ymax <= 1:
        # Display yticks on ax1 as percentages. The canvas is drawn first so
        # that the tick label texts exist and can be read back.
        fig.canvas.draw()
        ax1.set_yticklabels(_percent_tick_labels(ax1))

    # Plot line for decision boundary
    ax1.hlines(dec, xmin=xmin, xmax=xmax, color='black', linewidth=2,
               linestyle='--', label='Decision Boundary')
    ax1.legend()

    # Histogram of the feature on a second y axis sharing the same x axis
    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.hist(x[feature], bins=80, range=(xmin, xmax), alpha=.25,
             color=color, density=norm_hist)
    ax2.tick_params(axis='y', labelcolor=color)
    ax2.set_ylabel('Distribution', color=color, fontsize=14)

    if y_pct and norm_hist:
        # Display yticks on ax2 as percentages
        fig.canvas.draw()
        ax2.set_yticklabels(_percent_tick_labels(ax2))

    plt.show()

    if return_pd:
        return pardep
# two features: for an average occupancy greater than two, the house price is # nearly independent of the house age, whereas for values less than two there # is a strong dependence on age. ############################################################################## # 3D interaction plots # -------------------- # # Let's make the same partial dependence plot for the 2 features interaction, # this time in 3 dimensions. fig = plt.figure() target_feature = (1, 5) pdp, axes = partial_dependence(est, X_train, target_feature, grid_resolution=20) XX, YY = np.meshgrid(axes[0], axes[1]) Z = pdp[0].T ax = Axes3D(fig) surf = ax.plot_surface(XX, YY, Z, rstride=1, cstride=1, cmap=plt.cm.BuPu, edgecolor='k') ax.set_xlabel(names[target_feature[0]]) ax.set_ylabel(names[target_feature[1]]) ax.set_zlabel('Partial dependence') # pretty init view
def PartialDependencePlots(estimator, X, features, feature_labels, nrows=None,
                           ncols=4, figsize=None, sharey=True, conf_int=True,
                           show=True, save=False, plot_dir='Output/Plots',
                           title='PDP', save_params={}, pdp_params={},
                           plot_params={}, plot_ci_params={}):
    """
    Plot one partial dependence curve per feature on a grid of subplots.

    If ``show`` is True, ``save`` is False and the plot file already exists,
    nothing is computed: the saved image is displayed and returned instead.

    INPUT:
        - estimator -> A fitted sklearn estimator. (object)
            Look at sklearn.inspection.partial_dependence doc. for more info.
        - X -> Feature matrix. (array-like or dataframe)
            Look at sklearn.inspection.partial_dependence doc. for more info.
        - features -> Features for which the partial dependence should be
            computed, one subplot each.
            (int, string or list of ints, strings)
        - feature_labels -> Labels of the features, used as x-axis labels.
            (string or list of strings)
        - nrows -> Number of rows of the figure object. (int)
        - ncols -> Number of columns of the figure object. (int)
        - figsize -> Size of the figure object.
            (tuple of int: (width, height))
        - sharey -> Choose whether or not axes in the figure should share
            the y axis values. (bool)
        - conf_int -> Choose whether or not to shade a band of +/- one
            standard deviation of the individual curves around the average
            curve. (bool)
        - show -> Choose whether or not to display the plot. (bool)
        - save -> Choose whether or not to save the plot. (bool)
        - plot_dir -> Plot saving directory. (path string)
        - title -> Name of the plot file without file extension. (string)
        - save_params -> Parameters for the saving operation; merged over
            {'format': 'jpg'}. (dict)
        - pdp_params -> Parameters for
            sklearn.inspection.partial_dependence; merged over
            {'kind': 'both', 'grid_resolution': 100}. (dict)
        - plot_params -> Parameters for matplotlib.pyplot.plot. (dict)
        - plot_ci_params -> Parameters for
            matplotlib.axes.Axes.fill_between; merged over
            {'alpha': 0.2, 'color': '#66C2D7'}. (dict)

    OUTPUT:
        - fig -> Figure object. (matplotlib figure object)
        - ax, axs -> Axes of the current figure: the single image axes when
            an existing file is displayed, otherwise the axes returned by
            plt.subplots. (matplotlib axes object(s))
    """
    # Merge the user's choices over the defaults. The dict defaults in the
    # signature are only read, never mutated, so sharing them across calls
    # is harmless.
    save_kwargs = {'format': 'jpg', **save_params}
    pdp_kwargs = {'kind': 'both', 'grid_resolution': 100, **pdp_params}
    line_kwargs = dict(plot_params)
    band_kwargs = {'alpha': 0.2, 'color': '#66C2D7', **plot_ci_params}

    # If features and feature_labels contain only a single item, make them
    # lists (MakeList is a helper defined elsewhere in this file).
    features = MakeList(features)
    feature_labels = MakeList(feature_labels)
    n = len(features)

    # Define number of rows and columns and figsize of the figure object
    # containing the plot(s) (ArrangePlots is defined elsewhere).
    nrows, ncols, figsize = ArrangePlots(n, nrows, ncols, figsize)

    # Output file path
    file_dir = os.path.join(plot_dir, f"{title}.{save_kwargs['format']}")

    # If the plot file already exists and must not be overwritten, then
    # display it instead of recomputing anything.
    if show and os.path.exists(file_dir) and not save:
        fig, ax = plt.subplots(figsize=figsize)
        ax.imshow(plt.imread(file_dir), aspect='equal')
        plt.axis('off')
        return fig, ax

    # Otherwise the plot must be created or overwritten.
    fig, axs = plt.subplots(nrows, ncols, sharey=sharey, figsize=figsize)
    # plt.subplots may return a single Axes or an array; flatten to a 1-D
    # array in row-major order either way.
    all_axes = np.array(axs).ravel()

    # Go through each feature
    for idx, (feat, label, ax) in enumerate(
            zip(features, feature_labels, all_axes)):
        # Compute partial dependence values
        pdp = partial_dependence(estimator=estimator, X=X, features=feat,
                                 **pdp_kwargs)
        # The grid entry was renamed from 'values' to 'grid_values' in
        # newer scikit-learn; support both.
        grid_key = 'grid_values' if 'grid_values' in pdp else 'values'
        grid = pdp[grid_key][0]
        average = pdp['average'][0]

        ax.plot(grid, average, **line_kwargs)

        if conf_int and 'individual' in pdp:
            # Spread of the individual curves at each grid point
            spread = pdp['individual'][0].std(axis=0)
            ax.fill_between(grid, average + spread, average - spread,
                            **band_kwargs)

        ax.set_xlabel(label)
        # Axes are visited in row-major order, so the first column is every
        # ncols-th axes. This avoids Axes.is_first_col(), which newer
        # Matplotlib no longer provides.
        if idx % ncols == 0:
            ax.set_ylabel('Target')

    # Remove any axes left without a feature
    for spare_ax in all_axes[n:]:
        fig.delaxes(spare_ax)

    fig.tight_layout()

    # Save the plot if needed
    if save:
        fig.savefig(file_dir, **save_kwargs)

    # Prevent display of the plot if needed
    if not show:
        plt.close(fig)

    return fig, axs