def xgbmodelfit(alg, datamatrix, cvfolds):
    """Tune n_estimators for an XGBoost regressor via CV, refit, and report RMSE.

    Runs ``xgb.cv`` with early stopping to find the optimal boosting-round
    count, sets it on ``alg``, refits on the full data, prints train/test CV
    RMSE plus whole-data RMSE, and plots the CV curves and feature importances.

    Relies on module-level globals: ``X``, ``y`` (training data), ``cols``
    (feature names), ``xgb``, and the plotting helpers
    ``plot_cv_traintestscores`` / ``plot_cv_testscores`` / ``plot_FI_tree``.

    Parameters
    ----------
    alg : xgboost.XGBRegressor
        Estimator whose ``n_estimators`` is tuned in place.
    datamatrix : xgboost.DMatrix
        Training data for ``xgb.cv``.
    cvfolds :
        ``folds`` argument forwarded to ``xgb.cv``.

    Returns
    -------
    The fitted estimator with the tuned ``n_estimators``.
    """
    xgb_param = alg.get_xgb_params()
    cvresult = xgb.cv(
        xgb_param,
        datamatrix,
        num_boost_round=alg.get_params()['n_estimators'],
        folds=cvfolds,
        metrics='rmse',
        early_stopping_rounds=50,
    )
    # Early stopping truncates the result frame; its row count is the
    # optimal number of boosting rounds.
    n = cvresult.shape[0]
    alg.set_params(n_estimators=n)
    alg.fit(X, y)
    # BUG FIX: sklearn's root_mean_squared_error takes (y_true, y_pred);
    # the original passed (estimator, X, y), which is a TypeError / wrong
    # result. Compute predictions explicitly instead.
    rmse = root_mean_squared_error(y, alg.predict(X))
    print("optimal n_estimator is %d" % n)
    print("With optimal n_estimator, mean CV test RMSE is %.4f"
          % cvresult['test-rmse-mean'][n - 1])
    print("With optimal n_estimator, mean CV train RMSE is %.4f"
          % cvresult['train-rmse-mean'][n - 1])
    print("RMSE of xgb entire data is %.4f" % (rmse))
    # Build the x-axis once instead of re-materializing [i for i in range(n)]
    # for every plot call.
    rounds = list(range(n))
    # Full curve, a zoom on rounds 40-50, and the tail past round 50.
    plot_cv_traintestscores(cvresult['train-rmse-mean'],
                            cvresult['test-rmse-mean'],
                            rounds, 'n_estimators')
    plot_cv_traintestscores(cvresult['train-rmse-mean'][40:50],
                            cvresult['test-rmse-mean'][40:50],
                            rounds[40:50], 'n_estimators')
    plot_cv_traintestscores(cvresult['train-rmse-mean'][50:],
                            cvresult['test-rmse-mean'][50:],
                            rounds[50:], 'n_estimators')
    plot_cv_testscores(cvresult['test-rmse-mean'][50:],
                       rounds[50:], 'n_estimators')
    plot_FI_tree(alg, cols, 20)
    # NOTE(review): the original ended with a bare `feat_imp[0:20]`
    # expression -- a notebook display leftover with no effect inside a
    # function; removed as dead code.
    return alg
def simpleRF(X, y, test_frac, random_state=None):
    """Fit a default RandomForestRegressor on a train split and report fit quality.

    Splits the data, fits the forest on the training portion, prints R^2 and
    RMSE for both portions, and plots the top-20 feature importances via the
    module-level ``plot_FI_tree`` helper (which also reads the global ``cols``
    feature-name list).

    Parameters
    ----------
    X, y : array-like
        Features and target.
    test_frac : float
        Fraction of the data held out as the test set.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` so the split (and hence the
        printed scores) can be reproduced. Default ``None`` keeps the
        original non-deterministic behaviour.

    Returns
    -------
    The fitted RandomForestRegressor.
    """
    rf_model = ensemble.RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_frac, random_state=random_state)
    rf_model.fit(X_train, y_train)
    print("R2 on training data %.2f" % rf_model.score(X_train, y_train))
    print("RMSE on training data %2.3f"
          % np.sqrt(mean_squared_error(y_train, rf_model.predict(X_train))))
    print("R2 on test data %.2f" % rf_model.score(X_test, y_test))
    print("RMSE on test data %2.3f"
          % np.sqrt(mean_squared_error(y_test, rf_model.predict(X_test))))
    plot_FI_tree(rf_model, cols, topn=20)
    # NOTE(review): the original ended with a bare `FI[0:20]` expression --
    # a notebook display leftover with no effect inside a function; removed.
    return rf_model
plt.figure().set_size_inches(8, 6) for ridx, depth in enumerate(max_depths): if ridx >= 3: plt.plot(n_estimators, test_rmse[ridx, :], label="max_depth: " + str(depth)) plt.xlabel('# of trees') plt.ylabel('Mean RMSE on test set (5-fold CV)') plt.legend(loc='upper left') return tuneRF #RF0 = simpleRF(X,y) RF1 = tune_Nestimator(X, y) FI = plot_FI_tree(RF1, cols, topn=20) FI[0:20] RF2 = tune_maxdepth(X, y) FI = plot_FI_tree(RF2, cols, topn=20) FI[0:20] tuneRF3 = tune_Nestimators_maxdepth(X, y) FI = plot_FI_tree(tuneRF3.best_estimator_, cols, topn=20) FI[0:20] RF3 = tune_min_samples_leaf(X, y) RF4 = tune_min_samples_split(X, y) RF5 = tune_max_features(X, y) # make predictions with best model, most complex model, simplest model