def test5():
    random_state = 420
    n_estimators = 380  # best value found in test4
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state)
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)  # cross-validation scheme
    axis = np.linspace(0.75, 1, 25)
    rs = []    # mean R^2 scores
    vars = []  # variances of the CV scores
    ges = []   # controllable part of the generalization error
    for i in axis:
        reg = XGBR(n_estimators=n_estimators, random_state=random_state, subsample=i)
        cvs = cross_val_score(reg, Xtrain, ytrain, cv=cv)
        rs.append(cvs.mean())
        vars.append(cvs.var())
        ges.append((1 - cvs.mean()) ** 2 + cvs.var())
    max_rs = axis[rs.index(max(rs))]
    min_vars = axis[vars.index(min(vars))]
    min_ges = axis[ges.index(min(ges))]
    print(axis)
    print(rs)
    print(axis[rs.index(max(rs))], max(rs), vars[rs.index(max(rs))])
    print(axis[vars.index(min(vars))], rs[vars.index(min(vars))], min(vars))
    print(axis[ges.index(min(ges))], rs[ges.index(min(ges))], vars[ges.index(min(ges))])
    plt.figure(figsize=(20, 5))
    plt.plot(axis, np.array(rs) + np.array(vars), c='red', linestyle='-.')
    plt.plot(axis, rs, c='black', label='XGB')
    plt.plot(axis, np.array(rs) - np.array(vars), c='red', linestyle='-.')
    plt.legend()
    plt.show()
    # evaluate the three candidate subsample values on the test set
    for subsample in (max_rs, min_vars, min_ges):
        time0 = time()
        print(XGBR(n_estimators=n_estimators, random_state=random_state,
                   subsample=subsample).fit(Xtrain, ytrain).score(Xtest, ytest))
        print(time() - time0)
def get_errors(Xtrain, Xtest, ytrain, ytest, counts, Xtype=True):
    # For each value in `counts`, fit an XGB regressor (varying either
    # n_estimators or max_depth) and record the test-set MSE.
    errors = []
    for cnt in counts:
        if Xtype:
            xgb_reg = XGBR(n_estimators=cnt).fit(Xtrain, ytrain)
        else:
            xgb_reg = XGBR(max_depth=cnt).fit(Xtrain, ytrain)
        mse = MSE(ytest, xgb_reg.predict(Xtest))
        errors.append(mse)
    return errors
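# A minimal usage sketch for get_errors (the split variables follow the sibling
# snippets; the grid and plotting choices here are illustrative assumptions):
counts = range(10, 310, 50)
errors = get_errors(Xtrain, Xtest, ytrain, ytest, counts, Xtype=True)
plt.plot(counts, errors, label='test MSE vs n_estimators')
plt.legend()
plt.show()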
def xgbcv(num_round, subsample, eta, max_depth):
    # Objective function for hyperparameter search: builds an XGB regressor
    # from a (possibly float-valued) search point and returns its mean CV score.
    val = cross_val_score(
        XGBR(n_estimators=int(num_round),
             subsample=float(subsample),
             learning_rate=min(eta, 0.999),
             max_depth=int(max_depth),
             random_state=2),
        X, y, cv=5
    ).mean()
    return val
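# The float arguments and int() casts suggest this function feeds a Bayesian
# optimizer. A minimal sketch using the bayes_opt package (the bounds and
# iteration counts are assumptions, not taken from the source):
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgbcv,
    pbounds={'num_round': (50, 400), 'subsample': (0.5, 1.0),
             'eta': (0.01, 0.3), 'max_depth': (2, 10)},
    random_state=2)
optimizer.maximize(init_points=5, n_iter=25)
print(optimizer.max)  # best score and the parameters that produced it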
def test3():
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
    estimator = XGBR(n_estimators=100, random_state=0)
    cv = KFold(n_splits=5, shuffle=True, random_state=0)  # cross-validation scheme
    plot_learning_curve(estimator, 'XGB', Xtrain, ytrain, ax=None, cv=cv)
    plt.show()
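# plot_learning_curve is imported elsewhere in these snippets (FeatureTools /
# function) but never defined here. A minimal sketch matching its call
# signature, built on sklearn's learning_curve — an assumption, not the
# original helper:
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ax=None, cv=None, n_jobs=None):
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs)
    if ax is None:
        ax = plt.gca()
    ax.set_title(title)
    ax.set_xlabel('Training examples')
    ax.set_ylabel('Score')
    ax.plot(train_sizes, train_scores.mean(axis=1), 'o-', c='red', label='Training score')
    ax.plot(train_sizes, test_scores.mean(axis=1), 'o-', c='green', label='Test score')
    ax.legend(loc='best')
    return ax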
def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)  # n_estimators: how many trees to build
    reg.predict(Xtest)
    print(reg.score(Xtest, Ytest))
    print(MSE(Ytest, reg.predict(Xtest)))  # mean squared error
    print(reg.feature_importances_)  # contribution of each feature
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())  # mean cross-validation score
def test1():
    data = load_boston()
    X = data.data
    y = data.target
    print(X.shape)
    print(data.data, data.target)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
    reg = XGBR(n_estimators=100, random_state=0).fit(Xtrain, ytrain)
    print(reg.score(Xtest, ytest))  # R^2
    print(y.mean())
    print(MSE(ytest, reg.predict(Xtest)))
    print(reg.feature_importances_)
    print(data.feature_names[np.argsort(-reg.feature_importances_)])
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []   # 1 minus bias
    var = []  # variance of the CV scores
    ge = []   # controllable part of the generalization error
    for i in axisx:
        reg = XGBR(n_estimators=i)  # n_estimators: how many trees to build
        # cross_val_score treats larger values as better, hence
        # scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5,
                      scoring='neg_mean_squared_error').mean())
    print(axisx[rs.index(max(rs))], max(rs))
    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # n_estimators: how many trees to build
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)  # 5-fold cross-validation
        rs.append(cvresult.mean())  # 1 minus bias
        var.append(cvresult.var())  # record the variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())  # controllable part of the generalization error
    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # When the controllable part of the generalization error is smallest,
    # print the R^2 and the generalization error
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
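# Why (1 - mean)^2 + var: the generalization error decomposes into
# bias^2 + variance + noise. With R^2 as the CV metric, (1 - mean R^2) stands
# in for the bias and the variance of the fold scores for the variance term;
# the noise term is irreducible and dropped. A tiny worked example
# (the fold scores below are made up purely for illustration):
cv_scores = np.array([0.82, 0.86, 0.84, 0.88, 0.85])
bias_sq = (1 - cv_scores.mean()) ** 2   # (1 - 0.85)^2 = 0.0225
variance = cv_scores.var()              # 0.0004
print(bias_sq + variance)               # controllable generalization error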
def test7():
    random_state = 420
    n_estimators = 380  # best value found in test4
    subsample = 0.9
    learning_rate = 0.1
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state)
    for booster in ['gbtree', 'gblinear', 'dart']:
        reg = XGBR(n_estimators=n_estimators, random_state=random_state,
                   subsample=subsample, learning_rate=learning_rate,
                   booster=booster)
        reg.fit(Xtrain, ytrain)
        print(booster, reg.score(Xtest, ytest))
def test2():
    data = load_boston()
    X = data.data
    y = data.target
    xgbr = XGBR(n_estimators=100, random_state=0)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=0)
    xgbr_score = cross_val_score(xgbr, Xtrain, ytrain, cv=5).mean()
    print(xgbr_score)
    xgbr_score = cross_val_score(xgbr, Xtrain, ytrain, cv=5,
                                 scoring='neg_mean_squared_error').mean()
    print(xgbr_score)
    print(sorted(sklearn.metrics.SCORERS.keys()))
    rfr = RFR(n_estimators=100, random_state=0)
    rfr_score = cross_val_score(rfr, Xtrain, ytrain, cv=5).mean()
    print(rfr_score)
    rfr_score = cross_val_score(rfr, Xtrain, ytrain, cv=5,
                                scoring='neg_mean_squared_error').mean()
    print(rfr_score)
    lr = LR()
    lr_score = cross_val_score(lr, Xtrain, ytrain, cv=5).mean()
    print(lr_score)
    lr_score = cross_val_score(lr, Xtrain, ytrain, cv=5,
                               scoring='neg_mean_squared_error').mean()
    print(lr_score)
:binary:logistic   log loss, for binary classification
:reg:linear        mean squared error, for regression
:binary:hinge      SVM hinge loss, for binary classification
:multi:softmax     for multiclass classification

Works well for regression.
reg_alpha   coefficient of the L1 penalty, range [0, +inf)
reg_lambda  coefficient of the L2 penalty, range [0, +inf)
'''
sk_xgb_model = XGBR(
    n_estimators=20,
    random_state=420,
    booster='gblinear',
    objective='reg:linear',
    reg_lambda=0.3,
    gamma=0.4  # stops a tree from growing further (curbing overfitting):
               # a node splits only if the gain exceeds gamma, range [0, +inf)
    # ,max_depth=
)
for i in range(5, 10):
    # returns training sizes, training scores, test scores
    train_size, train_score, test_score = learning_curve(sk_xgb_model, X, Y, cv=i)
    plt.scatter(train_size, [s.mean() for s in train_score], marker='s')
    plt.plot(train_size, [s.mean() for s in train_score], label='train')
    plt.scatter(train_size, [s.mean() for s in test_score], marker='s')
    plt.plot(train_size, [s.mean() for s in test_score], label='test')
    plt.title('cv = {}'.format(i))
    plt.legend()
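# A minimal sketch of using one of the classification objectives listed above
# with the sklearn wrapper (the dataset choice here is an illustrative
# assumption, not from the source):
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import load_breast_cancer

Xc, yc = load_breast_cancer(return_X_y=True)
clf = XGBC(n_estimators=100, objective='binary:logistic', random_state=420)
clf.fit(Xc, yc)
print(clf.score(Xc, yc))  # mean accuracy on the training data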
data

# In[3]:
x = data.data
y = data.target
print(x.shape)
print(y.shape)

# In[4]:
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

# In[5]:
reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)

# In[6]:
# Score on the test set; by default this returns R^2
reg.score(xtest, ytest)

# In[7]:
# Mean squared error
MSE(ytest, reg.predict(xtest))

# In[8]:
y.mean()
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()  # the Boston dataset is very simple, yet it touches on many of the issues involved
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
axisx = range(10, 1010, 50)  # assumed grid, matching the sibling snippets
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=0)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=5)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c='red', label='XGB')
plt.legend()
plt.show()
rs = np.array(rs)
from xgboost import XGBRegressor as XGBR
import xgboost as xgb
from sklearn.model_selection import cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
import pickle

data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # classic sklearn-style predict interface
score = reg.score(Xtest, Ytest)  # can you guess which evaluation metric this returns?
MSE(Ytest, reg.predict(Xtest))

dtrain = xgb.DMatrix(Xtrain, Ytrain)
param = {
    'silent': True,
    'objective': 'reg:linear',
    "subsample": 1,
    "eta": 0.05,
    "gamma": 20,
    "lambda": 3.5,
    "alpha": 0.2,
    "max_depth": 4,
    "colsample_bytree": 0.4,
    "colsample_bylevel": 0.6,
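    # The original snippet is truncated here; the closing brace and training
    # call below are a minimal sketch of how such a param dict is typically
    # used with the native API (num_round is an assumed value):
}
num_round = 180
bst = xgb.train(param, dtrain, num_boost_round=num_round)
preds = bst.predict(xgb.DMatrix(Xtest))
print(r2_score(Ytest, preds))
print(MSE(Ytest, preds))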
# In[]:
# Test restoring the price from the scaled prediction:
train_data_4_min_price = 2.3978952727983707
train_data_4_max_price = 10.676669748432332
predict_result['predict_minmax'] = predict_result['predict'] * (
    train_data_4_max_price - train_data_4_min_price) + train_data_4_min_price
predict_result['predict_minmax_log'] = np.exp(predict_result['predict_minmax'])
predict_result['predict_final'] = np.round(predict_result['predict_minmax_log'])

# In[]:
# 1.2. Sklearn API:
bst_skl = XGBR(n_estimators=250, random_state=420, silent=True,
               objective="reg:squarederror", learning_rate=0.13, gamma=20)
bst_skl.fit(X_data, Y_data)

# In[]:
print(r2_score(Y_data, bst_skl.predict(X_data)))  # 0.8946348682692177
print(MSE(Y_data, bst_skl.predict(X_data)))  # 0.00200635456280437
print(MAE(Y_data, bst_skl.predict(X_data)))  # 0.031795600421283834

# In[]:
predict_result_skl = bst_skl.predict(X_test)
predict_result_skl = pd.DataFrame(predict_result_skl, columns=['predict'])

# In[]:
# My Stacking:
# the individual base models used in the model ensemble
clfs = {
def test6():
    random_state = 420
    n_estimators = 380  # best value found in test4
    subsample = 0.9
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3,
                                                    random_state=random_state)
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)  # cross-validation scheme
    axis = np.arange(0.05, 1, 0.05)
    rs = []    # mean R^2 scores
    vars = []  # variances of the CV scores
    ges = []   # controllable part of the generalization error
    for i in axis:
        reg = XGBR(n_estimators=n_estimators, random_state=random_state,
                   subsample=subsample, learning_rate=i)
        regassess(reg, Xtrain, ytrain, cv=cv,
                  scoring=['r2', 'neg_mean_squared_error'], show=True)
        # regassess returns per-metric means only, so recompute the fold
        # scores to get a variance as well
        cvs = cross_val_score(reg, Xtrain, ytrain, cv=cv)
        print(cvs)
        rs.append(cvs.mean())
        vars.append(cvs.var())
        ges.append((1 - cvs.mean()) ** 2 + cvs.var())
    max_rs = axis[rs.index(max(rs))]
    min_vars = axis[vars.index(min(vars))]
    min_ges = axis[ges.index(min(ges))]
    print(axis)
    print(rs)
    print(axis[rs.index(max(rs))], max(rs), vars[rs.index(max(rs))])
    print(axis[vars.index(min(vars))], rs[vars.index(min(vars))], min(vars))
    print(axis[ges.index(min(ges))], rs[ges.index(min(ges))], vars[ges.index(min(ges))])
    plt.figure(figsize=(20, 5))
    plt.plot(axis, np.array(rs) + np.array(vars), c='red', linestyle='-.')
    plt.plot(axis, rs, c='black', label='XGB')
    plt.plot(axis, np.array(rs) - np.array(vars), c='red', linestyle='-.')
    plt.legend()
    plt.show()
    # evaluate the three candidate learning rates on the test set
    for learning_rate in (max_rs, min_vars, min_ges):
        time0 = time()
        print(XGBR(n_estimators=n_estimators, random_state=random_state,
                   subsample=subsample,
                   learning_rate=learning_rate).fit(Xtrain, ytrain).score(Xtest, ytest))
        print(time() - time0)
from time import time
import datetime
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.datasets import load_breast_cancer

data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # classic sklearn-style predict interface
score = reg.score(Xtest, Ytest)  # can you guess which evaluation metric this returns?
MSE(Ytest, reg.predict(Xtest))
#
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# param = {"reg_alpha": np.arange(0, 5, 0.05), "reg_lambda": np.arange(0, 2, 0.05)}
# gscv = GridSearchCV(reg, param_grid=param, scoring="neg_mean_squared_error", cv=cv)
# time0 = time()
# gscv.fit(Xtrain, Ytrain)
# print(datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))
# gscv.best_params_
# gscv.best_score_
# preds = gscv.predict(Xtest)
#
# r2_score(Ytest, preds)
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as MSE, SCORERS

data = load_boston()
X = data.data
Y = data.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
sk_xgb_model = XGBR(n_estimators=100, random_state=0).fit(X_train, Y_train)
pre1 = sk_xgb_model.predict(X_test)
score1 = sk_xgb_model.score(X_test, Y_test)
mse = MSE(y_true=Y_test, y_pred=pre1)  # aliased import keeps the metric function from being shadowed
important = sk_xgb_model.feature_importances_
print('pre: ', pre1)
print('score1: ', score1)
print('mse: ', mse)
print('important: ', important)
print('mean: ', Y.mean())
print(SCORERS.keys())  # all available scoring metrics
X_embedded_2 = sfmf.transform(train_X)
print(train_X.columns[X_embedded_2_index])  # original column names of the features kept after selection
# Here I only want to keep a limited number of features. A threshold of 0.005 is
# very high for data with 780 features, since on average each feature only gets
# about 0.001 = 1/780 of the feature_importances_.
# The dimensionality of the model has clearly been reduced.
'''
# In[]:
# XGBoost:
# I. Choosing n_estimators:
# Sklearn API
# 1.1. Learning curve over sample size: checks for overfitting
#      (a single split; the machine can't handle full cross-validation)
cv = ShuffleSplit(n_splits=1, test_size=.2, random_state=0)
ft.plot_learning_curve(XGBR(n_estimators=100, random_state=420, silent=True, objective="reg:squarederror"),
                       "XGB", train_X, train_y, ax=None, cv=cv)
plt.show()
# In[]
# 1.2. Learning curve for variance and generalization error:
#      (a single split; the machine can't handle full cross-validation)
cv = ShuffleSplit(n_splits=1, test_size=.2, random_state=0)
axisx = range(100, 300, 50)
ft.learning_curve_r2_customize(axisx, train_X, train_y, cv)
'''
n_estimators at max R2: 250.000000; max R2: 0.947486; R2 variance at max R2: 0.000000
n_estimators at min R2 variance: 100.000000; R2 at min R2 variance: 0.940488; min R2 variance: 0.000000
n_estimators at min controllable generalization error: 250.000000; R2 there: 0.947486; R2 variance there: 0.000000; min controllable generalization error: 0.002758
'''
# Chosen n_estimators: 250
i = 20
t = arg[:i]
X = X[:, np.array(t)]
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]
X_train, X_test, Y_train, Y_test = TTS(X, y, test_size=0.2, random_state=seed)
x = np.arange(-1, 2, step=0.001)  # diagonal reference line for the scatter plots
y = x

### XGBoost
fig = plt.figure(figsize=(8, 16))
ax = fig.subplots(3, 2)
reg = XGBR(silent=True, n_estimators=200, max_depth=3, learning_rate=0.26,
           reg_lambda=0.09).fit(X_train, Y_train)
xgb_pre = reg.predict(X_test)
xgb_pre_tr = reg.predict(X_train)
xgb_avg = CVS(reg, X_train, Y_train, scoring="neg_mean_absolute_error", cv=5).mean()
xgb_mse = metrics.mean_squared_error(Y_test, xgb_pre)
xgb_r2 = metrics.explained_variance_score(Y_test, xgb_pre)  # (y_true, y_pred) order
xgb_mae = metrics.mean_absolute_error(Y_test, xgb_pre)
print("xgb_r2", xgb_r2)
# print("xgb_mse", xgb_mse)
print("xgb_mae", xgb_mae)
# plt.subplot(121)
# plt.figure(figsize=(10,8))
# ax1.text(x=1.36, y=0, s="R^2=0.987", fontdict=font)
# ax[0,0].text(x=1.36, y=-0.3, s="MSE=0.0043", fontdict=font)
ax[0, 0].text(x=1.36, y=-0.3, s="RMSE=0.075", fontdict=font)
ax[0, 0].text(x=1.36, y=-0.6, s="MAE=0.052", fontdict=font)
# @Project :
# @Software: PyCharm
"""
A regular validation curve judges only by the bias; in practice the
influence of the variance has to be taken into account as well.
"""
from xgboost import XGBRegressor as XGBR
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
import numpy as np
import matplotlib.pyplot as plt

# data split and CV scheme (assumed, following the sibling snippets)
data = load_boston()
Xtrain, Xtest, Ytrain, Ytest = TTS(data.data, data.target, test_size=0.3, random_state=420)
cv = KFold(n_splits=5, shuffle=True, random_state=420)

axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
rs = np.array(rs)
var = np.array(var) * 0.01
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="black", label="XGB")
# add the variance band
plt.plot(axisx, rs + var, c="red", linestyle='-.')
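# The snippet stops after the upper band; a minimal sketch of the matching
# lower band and display, following the same pattern used by the sibling
# snippets in this file:
plt.plot(axisx, rs - var, c="red", linestyle='-.')
plt.legend()
plt.show()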
# plt.figure(figsize=(20,5))
# plt.plot(axisx,rs,c="red",label="XGB")
# plt.legend()
# plt.show()


def regassess(reg, xtrain, ytrain, cv, scoring=["r2"], show=True):
    # Cross-validate `reg` under each scoring metric and return the means.
    score = []
    for i in range(len(scoring)):
        s = CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("score", i, s)
        score.append(s)
    return score


from time import time

for i in [0, 0.2, 0.5, 1]:
    reg = XGBR(n_estimators=180, random_state=420, learning_rate=i)
    print("learning_rate", i)
    t1 = time()
    regassess(reg, xtrain, ytrain, 4, scoring=["r2", "neg_mean_squared_error"])
    # print("cost", time() - t1)
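# With the cv argument now wired through, a KFold splitter can also be passed
# directly (a usage sketch; the splitter settings are assumptions):
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=420)
reg = XGBR(n_estimators=180, random_state=420)
regassess(reg, xtrain, ytrain, cv, scoring=["r2", "neg_mean_squared_error"])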
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
import FeatureTools as ft

# In[]:
data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)  # train
y_predict = reg.predict(Xtest)
print(reg.score(Xtest, Ytest))  # R^2 metric: 0.9197580267581366
print(np.mean(y))  # 22.532806324110677
print(MSE(Ytest, y_predict))  # 7.466827353555599
print(MSE(Ytest, y_predict) / np.mean(y))  # MSE is roughly 1/3 of the label mean; not a great result

# In[]:
# One advantage of tree models: feature importance scores are available, so
# embedded feature selection (SelectFromModel) can be applied
temparr = reg.feature_importances_

# In[]:
# Cross-validation:
reg = XGBR(n_estimators=100)
# Less rigorous: use the full dataset (acceptable when data is scarce)
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()
x = data.data
y = data.target
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)
reg.score(xtest, ytest)
err = MSE(ytest, reg.predict(xtest))
ipt = reg.feature_importances_
# print("err", err)
# print("ipt", ipt)
reg = XGBR(n_estimators=100)
an = CVS(reg, xtrain, ytrain, cv=5).mean()
print("an", an)
an2 = CVS(reg, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
from xgboost import XGBRegressor as XGBR
from adult_uci_info import Adult
import numpy as np
from scorer import Scorer

xgb = XGBR(max_depth=4, learning_rate=0.15, n_estimators=150, silent=True,
           objective='reg:gamma')
uci = Adult()
train_x, train_y, test_x, test_y = uci()
xgb.fit(train_x, train_y)
y_pre = xgb.predict(test_x)
scorer = Scorer(y_pre, test_y)
mape, rmse = scorer()
print("rmse %f" % rmse, "mape %f" % mape)
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split
from sklearn.metrics import mean_squared_error as MSE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from time import time
from function import plot_learning_curve

boston = load_boston()
X, y = boston.data, boston.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Build the gradient boosted tree model
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)
# Predict
predict = xgbr.predict(x_test)
# Compute the mean squared error
print(MSE(y_test, xgbr.predict(x_test)))
# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()
# The plot shows that with very little data the model overfits; as the amount
# of data grows, its generalization ability keeps improving.
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy as np

data = load_boston()
X = data.data
Y = data.target
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.75, random_state=42)
# xgb
xgb_reg = XGBR(n_estimators=100).fit(X_train, Y_train)
print('Training score (R^2): %.9f' % xgb_reg.score(X_train, Y_train))
print('Test MSE: %.9f' % MSE(Y_test, xgb_reg.predict(X_test)))
print('Cross-validation score: %.9f' % cross_val_score(xgb_reg, X_train, Y_train, cv=5).mean())
import FeatureTools as ft

# In[]:
data = load_boston()
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
# B. Weak-learner hyperparameters:
# 4. booster (choice of weak learner)
for booster in ["gbtree", "gblinear", "dart"]:
    reg = XGBR(n_estimators=260, learning_rate=0.25, random_state=420,
               booster=booster, silent=True).fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))
# The linear weak learner gblinear performs worst: the Boston dataset is not a
# linear dataset (the features X are not linearly related to the target Y)

# In[]:
# 5. objective (loss function)
# Sklearn XGB: objective defaults to reg:linear
reg = XGBR(n_estimators=270, subsample=0.75, learning_rate=0.13,
           random_state=420).fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))
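# Note: newer xgboost releases rename the squared-error objective to
# "reg:squarederror" and keep "reg:linear" only as a deprecated alias (the
# sibling snippets above already use the new name). A minimal check:
reg = XGBR(n_estimators=270, subsample=0.75, learning_rate=0.13,
           objective="reg:squarederror", random_state=420).fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))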