Example #1
def test5():
    random_state = 420
    n_estimators = 380  # 380 was the best value found in test4
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=random_state)
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)  # cross-validation splitter
    axis = np.linspace(0.75, 1, 25)
    rs = []  # mean R^2 scores
    vars = []  # variances
    ges = []  # controllable part of the generalization error
    for i in axis:
        reg = XGBR(n_estimators=n_estimators,
                   random_state=random_state,
                   subsample=i)
        cvs = cross_val_score(reg, Xtrain, ytrain, cv=cv)
        rs.append(cvs.mean())
        vars.append(cvs.var())
        ges.append((1 - cvs.mean())**2 + cvs.var())
    max_rs = axis[rs.index(max(rs))]
    min_vars = axis[vars.index(min(vars))]
    min_ges = axis[ges.index(min(ges))]
    print(axis)
    print(rs)
    print(axis[rs.index(max(rs))], max(rs), vars[rs.index(max(rs))])
    print(axis[vars.index(min(vars))], rs[vars.index(min(vars))], min(vars))
    print(axis[ges.index(min(ges))], rs[ges.index(min(ges))],
          vars[ges.index(min(ges))])
    plt.figure(figsize=(20, 5))
    plt.plot(axis, np.array(rs) + np.array(vars), c='red', linestyle='-.')
    plt.plot(axis, rs, c='black', label='XGB')
    plt.plot(axis, np.array(rs) - np.array(vars), c='red', linestyle='-.')
    plt.legend()
    plt.show()
    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=max_rs).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)

    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=min_vars).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)

    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=min_ges).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)
Example #2
def get_errors(X_train, X_test, y_train, y_test, counts, Xtype=True):
    errors = []
    for cnt in counts:
        if (Xtype):
            xgb_reg = XGBR(n_estimators=cnt).fit(X_train, y_train)
        else:
            xgb_reg = XGBR(max_depth=cnt).fit(X_train, y_train)
        mse = MSE(y_test, xgb_reg.predict(X_test))
        errors.append(mse)
    return errors
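
# A minimal usage sketch (illustrative, not from the original snippet): it assumes X_train,
# X_test, y_train, y_test come from a Boston-style train/test split and that matplotlib is
# imported as plt, as in the other examples; the counts range is only an example.
counts = range(10, 310, 50)
errors = get_errors(X_train, X_test, y_train, y_test, counts, Xtype=True)
plt.plot(counts, errors, c='red', label='XGB test MSE')
plt.legend()
plt.show()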
Example #3
def xgbcv(num_round, subsample, eta, max_depth):
    # Objective for hyperparameter search: mean cross-validated score of the model.
    val = cross_val_score(
        XGBR(n_estimators=int(num_round),
             subsample=float(subsample),
             learning_rate=min(eta, 0.999),
             max_depth=int(max_depth),
             random_state=2),
        X, y, cv=5
    ).mean()
    return val
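
# A hedged usage sketch (assumption, not part of the original snippet): an objective function
# of this shape is typically handed to a Bayesian optimizer such as bayes_opt; it assumes X
# and y are already defined, and the bounds below are purely illustrative.
from bayes_opt import BayesianOptimization

optimizer = BayesianOptimization(
    f=xgbcv,
    pbounds={'num_round': (50, 400),
             'subsample': (0.5, 1.0),
             'eta': (0.01, 0.3),
             'max_depth': (3, 8)},
    random_state=2)
optimizer.maximize(init_points=5, n_iter=20)
print(optimizer.max)  # best score and parameter set found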
Example #4
def test3():
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
    estimator = XGBR(n_estimators=100, random_state=0)
    cv = KFold(n_splits=5, shuffle=True, random_state=0)  # cross-validation splitter
    plot_learning_curve(estimator, 'XGB', Xtrain, ytrain, ax=None, cv=cv)
    plt.show()
Example #5
def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)    # n_estimators: how many trees to build
    reg.predict(Xtest)

    print(reg.score(Xtest, Ytest))
    print(MSE(Ytest, reg.predict(Xtest)))    # mean squared error

    print(reg.feature_importances_)    # contribution of each feature
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())    # mean cross-validation score
Example #6
def test1():
    data = load_boston()
    X = data.data
    y = data.target
    print(X.shape)
    print(data.data, data.target)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
    reg = XGBR(n_estimators=100, random_state=0).fit(Xtrain, ytrain)
    print(reg.score(Xtest, ytest))  # R^2
    print(y.mean())
    print(MSE(ytest, reg.predict(Xtest)))
    print(reg.feature_importances_)
    print(data.feature_names[np.argsort(-reg.feature_importances_)])
Example #7
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []    # mean score (1 minus bias)
    var = []    # record the variance
    ge = []    # controllable part of the generalization error
    for i in axisx:
        reg = XGBR(n_estimators=i)  # n_estimators: how many trees to build
        # Higher is better by default, hence scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean())

    print(axisx[rs.index(max(rs))], max(rs))
    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
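    # Illustrative addition (assumes numpy is imported as np, as in the other snippets):
    # scoring='neg_mean_squared_error' returns negated MSE, so the best value can be
    # converted back to an RMSE for easier reading.
    print('best RMSE: %.4f' % np.sqrt(-max(rs)))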
Example #8
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # n_estimators: how many trees to build
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)    # 5-fold cross-validation
        rs.append(cvresult.mean())  # mean score (1 minus bias)
        var.append(cvresult.var())  # record the variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())  # controllable part of the generalization error

    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # When the controllable part of the generalization error is smallest, print R^2 and the generalization error
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
Example #9
def test7():
    random_state = 420
    n_estimators = 380  # 380 was the best value found in test4
    subsample = 0.9
    learning_rate = 0.1

    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=random_state)

    for booster in ['gbtree', 'gblinear', 'dart']:
        reg = XGBR(n_estimators=n_estimators,
                   random_state=random_state,
                   subsample=subsample,
                   learning_rate=learning_rate,
                   booster=booster)
        reg.fit(Xtrain, ytrain)
        print(booster, reg.score(Xtest, ytest))
Example #10
def test2():
    data = load_boston()
    X = data.data
    y = data.target
    xgbr = XGBR(n_estimators=100, random_state=0)
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
    xgbr_score = cross_val_score(xgbr, Xtrain, ytrain, cv=5).mean()
    print(xgbr_score)
    xgbr_score = cross_val_score(xgbr,
                                 Xtrain,
                                 ytrain,
                                 cv=5,
                                 scoring='neg_mean_squared_error').mean()
    print(xgbr_score)
    print(sorted(sklearn.metrics.SCORERS.keys()))

    rfr = RFR(n_estimators=100, random_state=0)
    rfr_score = cross_val_score(rfr, Xtrain, ytrain, cv=5).mean()
    print(rfr_score)
    rfr_score = cross_val_score(rfr,
                                Xtrain,
                                ytrain,
                                cv=5,
                                scoring='neg_mean_squared_error').mean()
    print(rfr_score)

    lr = LR()
    lr_score = cross_val_score(lr, Xtrain, ytrain, cv=5).mean()
    print(lr_score)
    lr_score = cross_val_score(lr,
                               Xtrain,
                               ytrain,
                               cv=5,
                               scoring='neg_mean_squared_error').mean()
    print(lr_score)
Example #11
                        :binary:logistic  log loss, used for binary classification
                        :reg:linear       mean squared error (used for regression)
                        :binary:hinge     hinge loss of a support vector machine, used for binary classification
                        :multi:softmax    used for multi-class classification

works well for regression
    reg_alpha   coefficient of the L1 penalty, range [0, +inf)
    reg_lambda  coefficient of the L2 penalty, range [0, +inf)

'''

sk_xgb_model = XGBR(
    n_estimators=20,
    random_state=420,
    booster='gblinear',
    objective='reg:linear',
    reg_lambda=0.3,
    gamma=0.4  # keeps trees from growing further and overfitting; a split is only made if the structural gain exceeds gamma, range [0, +inf)
    # ,max_depth=
)
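
# A hedged sketch (assumption, mirroring the commented-out grid search further down in this
# file): reg_alpha and reg_lambda can be tuned with GridSearchCV over the ranges described in
# the docstring above; assumes numpy is imported as np and X, Y are the feature/target arrays.
from sklearn.model_selection import GridSearchCV

param = {"reg_alpha": np.arange(0, 5, 0.05), "reg_lambda": np.arange(0, 2, 0.05)}
gscv = GridSearchCV(sk_xgb_model, param_grid=param,
                    scoring="neg_mean_squared_error", cv=5)
gscv.fit(X, Y)
print(gscv.best_params_, gscv.best_score_)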
for i in range(5, 10):
    train_size, train_score, test_score = learning_curve(
        sk_xgb_model, X, Y, cv=i)  # returns train sizes, train scores, test scores

    plt.scatter(train_size, [s.mean() for s in train_score], marker='s')
    plt.plot(train_size, [s.mean() for s in train_score], label='train')

    plt.scatter(train_size, [s.mean() for s in test_score], marker='s')
    plt.plot(train_size, [s.mean() for s in test_score], label='test')
    plt.title('cv = {}'.format(i))
    plt.legend()
    plt.show()
data

# In[3]:

x = data.data
y = data.target
print(x.shape)
print(y.shape)

# In[4]:

xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

# In[5]:

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)

# In[6]:

# Score on the test set; by default this returns the R^2 metric
reg.score(xtest, ytest)

# In[7]:

# Mean squared error
MSE(ytest, reg.predict(xtest))

# In[8]:

y.mean()
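
# In[ ]:

# Illustrative check (not in the original notebook): the .score value above should agree with
# r2_score from sklearn.metrics computed on the same predictions.
from sklearn.metrics import r2_score
print(r2_score(ytest, reg.predict(xtest)))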
Example #13
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()
# The Boston dataset is very simple, but the problems it raises are numerous

X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

axisx = range(10, 1010, 50)  # candidate n_estimators values (assumed, matching the earlier examples)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=0)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=5)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2 + cvresult.var())
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))],
      min(ge))

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c='red', label='XGB')
plt.legend()
plt.show()

rs = np.array(rs)
Example #14
from xgboost import XGBRegressor as XGBR
import xgboost as xgb
from sklearn.model_selection import cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score
from sklearn.datasets import load_boston
import pickle

data = load_boston()

X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # predict via the traditional sklearn-style interface
score = reg.score(Xtest, Ytest)  # can you work out which evaluation metric this should return?
MSE(Ytest, reg.predict(Xtest))

dtrain = xgb.DMatrix(Xtrain, Ytrain)
param = {
    'silent': True,
    'objective': 'reg:linear',
    "subsample": 1,
    "eta": 0.05,
    "gamma": 20,
    "lambda": 3.5,
    "alpha": 0.2,
    "max_depth": 4,
    "colsample_bytree": 0.4,
    "colsample_bylevel": 0.6,
Example #15
# In[]:
# Test inverting the price transform back to the original scale:
train_data_4_min_price = 2.3978952727983707
train_data_4_max_price = 10.676669748432332

predict_result['predict_minmax'] = predict_result['predict'] * (
    train_data_4_max_price - train_data_4_min_price) + train_data_4_min_price
predict_result['predict_minmax_log'] = np.exp(predict_result['predict_minmax'])
predict_result['predict_final'] = np.round(
    predict_result['predict_minmax_log'])

# In[]:
# 1.2 The sklearn API:
bst_skl = XGBR(n_estimators=250,
               random_state=420,
               silent=True,
               objective="reg:squarederror",
               learning_rate=0.13,
               gamma=20)
bst_skl.fit(X_data, Y_data)
# In[]:
print(r2_score(Y_data, bst_skl.predict(X_data)))  # 0.8946348682692177
print(MSE(Y_data, bst_skl.predict(X_data)))  # 0.00200635456280437
print(MAE(Y_data, bst_skl.predict(X_data)))  # 0.031795600421283834
# In[]:
predict_result_skl = bst_skl.predict(X_test)
predict_result_skl = pd.DataFrame(predict_result_skl, columns=['predict'])

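# In[]:
# Illustrative continuation (assumption, reusing the inverse transform from the cell above):
# map the sklearn predictions back to the original price scale.
predict_result_skl['predict_minmax'] = predict_result_skl['predict'] * (
    train_data_4_max_price - train_data_4_min_price) + train_data_4_min_price
predict_result_skl['predict_final'] = np.round(
    np.exp(predict_result_skl['predict_minmax']))
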
# In[]:
# My Stacking:
# The individual base models used in the stacking ensemble
clfs = {
Example #16
def test6():
    random_state = 420
    n_estimators = 380  # 380 was the best value found in test4
    subsample = 0.9
    data = load_boston()
    X = data.data
    y = data.target
    Xtrain, Xtest, ytrain, ytest = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=random_state)
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)  # cross-validation splitter
    axis = np.arange(0.05, 1, 0.05)
    rs = []  # mean R^2 scores
    vars = []  # variances
    ges = []  # controllable part of the generalization error
    for i in axis:
        reg = XGBR(n_estimators=n_estimators,
                   random_state=random_state,
                   subsample=subsample,
                   learning_rate=i)
        cvs = regassess(reg,
                        Xtrain,
                        ytrain,
                        cv=cv,
                        scoring=['r2', 'neg_mean_squared_error'],
                        show=True)[0]
        print(cvs)
        rs.append(cvs.mean())
        vars.append(cvs.var())
        ges.append((1 - cvs.mean())**2 + cvs.var())
    max_rs = axis[rs.index(max(rs))]
    min_vars = axis[vars.index(min(vars))]
    min_ges = axis[ges.index(min(ges))]
    print(axis)
    print(rs)
    print(axis[rs.index(max(rs))], max(rs), vars[rs.index(max(rs))])
    print(axis[vars.index(min(vars))], rs[vars.index(min(vars))], min(vars))
    print(axis[ges.index(min(ges))], rs[ges.index(min(ges))],
          vars[ges.index(min(ges))])
    plt.figure(figsize=(20, 5))
    plt.plot(axis, np.array(rs) + np.array(vars), c='red', linestyle='-.')
    plt.plot(axis, rs, c='black', label='XGB')
    plt.plot(axis, np.array(rs) - np.array(vars), c='red', linestyle='-.')
    plt.legend()
    plt.show()
    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=subsample,
             learning_rate=max_rs).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)

    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=subsample,
             learning_rate=min_vars).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)

    time0 = time()
    print(
        XGBR(n_estimators=n_estimators,
             random_state=random_state,
             subsample=subsample,
             learning_rate=min_ges).fit(Xtrain, ytrain).score(Xtest, ytest))
    print(time() - time0)
Example #17
from time import time
import datetime
from xgboost import XGBRegressor as XGBR
from sklearn.model_selection import GridSearchCV, train_test_split as TTS
from sklearn.metrics import r2_score, mean_squared_error as MSE
from sklearn.datasets import load_boston, load_breast_cancer

data = load_boston()

X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)
reg.predict(Xtest)  # predict via the traditional sklearn-style interface
score = reg.score(Xtest, Ytest)  # can you work out which evaluation metric this should return?
MSE(Ytest, reg.predict(Xtest))
#
# cv = KFold(n_splits=5, shuffle=True, random_state=42)
# param = {"reg_alpha": np.arange(0, 5, 0.05), "reg_lambda": np.arange(0, 2, 0.05)}
# gscv = GridSearchCV(reg, param_grid=param, scoring="neg_mean_squared_error", cv=cv)
# time0 = time()
# gscv.fit(Xtrain, Ytrain)
# print(datetime.datetime.fromtimestamp(time() - time0).strftime("%M:%S:%f"))
# gscv.best_params_
# gscv.best_score_
# preds = gscv.predict(Xtest)
#
# r2_score(Ytest, preds)
Example #18
import xgboost as xgb
from xgboost import XGBRegressor as XGBR
from xgboost import XGBClassifier as XGBC
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.linear_model import LinearRegression as LR
from sklearn.metrics import mean_squared_error as mse, SCORERS

data = load_boston()

X = data.data
Y = data.target

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

sk_xgb_model = XGBR(n_estimators=100, random_state=0).fit(X_train, Y_train)

pre1 = sk_xgb_model.predict(X_test)
score1 = sk_xgb_model.score(X_test, Y_test)
mse = mse(y_true=Y_test, y_pred=pre1)
important = sk_xgb_model.feature_importances_

print('pre:  ', pre1)
print('score1:  ', score1)
print('mse:  ', mse)
print('important:  ', important)
print('mean:   ', Y.mean())

print(SCORERS.keys())  # all available scoring metrics
X_embedded_2 = sfmf.transform(train_X)
print(train_X.columns[X_embedded_2_index])  # original column names of the features kept after selection
# Here I only want to keep a limited number of features. A threshold of 0.005 is a very high
# threshold for data with 780 features, because on average each feature only gets about
# 0.001 = 1/780 of the feature_importances_.
# The dimensionality of the model has clearly been reduced.
'''


# In[]:
# XGBoost:
# 1. Choosing n_estimators:
# The sklearn API
# 1.1 Learning curve over sample size: check for overfitting (a single CV split; the machine cannot handle more)
cv = ShuffleSplit(n_splits=1, test_size=.2, random_state=0)

ft.plot_learning_curve(XGBR(n_estimators=100, random_state=420, silent=True, objective="reg:squarederror"),
                       "XGB", train_X, train_y, ax=None, cv=cv)
plt.show()

# In[]
# 1.2 Learning curves for variance and generalization error  (a single CV split; the machine cannot handle more)
cv = ShuffleSplit(n_splits=1, test_size=.2, random_state=0)

axisx = range(100,300,50)
ft.learning_curve_r2_customize(axisx, train_X, train_y, cv)
'''
n_estimators at the maximum R2: 250.000000; maximum R2: 0.947486; R2 variance at the maximum R2: 0.000000
n_estimators at the minimum R2 variance: 100.000000; R2 at the minimum R2 variance: 0.940488; minimum R2 variance: 0.000000
n_estimators at the minimum controllable generalization error: 250.000000; R2 there: 0.947486; R2 variance there: 0.000000; minimum controllable generalization error: 0.002758
'''
# Chosen value for n_estimators: 250
Example #20
i = 20
t = arg[:i]
X = X[:, np.array(t)]
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]
X_train, X_test, Y_train, Y_test = TTS(X, y, test_size=0.2, random_state=seed)

x = np.arange(-1, 2, step=0.001)
y = x


### Xgboost
fig = plt.figure(figsize=(8, 16))
ax = fig.subplots(3, 2)
reg = XGBR(silent=True, n_estimators=200, max_depth=3, learning_rate=0.26, reg_lambda=0.09).fit(X_train, Y_train)
xgb_pre = reg.predict(X_test)
xgb_pre_tr = reg.predict(X_train)
xgb_avg = CVS(reg, X_train, Y_train, scoring="neg_mean_absolute_error", cv=5).mean()
xgb_mse = metrics.mean_squared_error(Y_test, xgb_pre)
xgb_r2 = metrics.explained_variance_score(Y_test, xgb_pre)
xgb_mae = metrics.mean_absolute_error(Y_test, xgb_pre)
print("xgb_r2", xgb_r2)
#print("xgb_mse", xgb_mse)
print("xgb_mae", xgb_mae)
#plt.subplot(121)
#plt.figure(figsize=(10,8))
#ax1.text(x=1.36,y=0,s="R^2=0.987",fontdict=font)
#ax[0,0].text(x=1.36,y=-0.3,s="MSE=0.0043",fontdict=font)
ax[0,0].text(x=1.36,y=-0.3,s="RMSE=0.075",fontdict=font)
ax[0,0].text(x=1.36,y=-0.6,s="MAE=0.052",fontdict=font)
Example #21
# @Project :
# @Software: PyCharm
"""
在常规的validation curve中只通过偏差值来进行判断,实际中往往要记入方差的影响
"""
from xgboost import XGBRegressor as XGBR
from sklearn.model_selection import cross_val_score as CVS
import numpy as np
import matplotlib.pyplot as plt

# Data, split, and cv splitter (assumed, mirroring the other examples in this file, since the
# loop below needs Xtrain, Ytrain and cv):
from sklearn.datasets import load_boston
from sklearn.model_selection import KFold, train_test_split as TTS

data = load_boston()
Xtrain, Xtest, Ytrain, Ytest = TTS(data.data, data.target, test_size=0.3, random_state=420)
cv = KFold(n_splits=5, shuffle=True, random_state=420)

axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=420)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=cv)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2 + cvresult.var())

print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))],
      min(ge))
rs = np.array(rs)
var = np.array(var) * 0.01
plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c="black", label="XGB")
#添加方差线
plt.plot(axisx, rs + var, c="red", linestyle='-.')
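# Assumed continuation (mirroring the matching plots in the other examples): lower variance
# band, legend and display.
plt.plot(axisx, rs - var, c="red", linestyle='-.')
plt.legend()
plt.show()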
Example #22
# plt.figure(figsize=(20,5))
# plt.plot(axisx,rs,c="red",label="XGB")
# plt.legend()
# plt.show()

def regassess(reg, xtrain, ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        s = CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("score", i, s)
        score.append(s)
    return score
from time import time
for i in [0, 0.2, 0.5, 1]:
    reg = XGBR(n_estimators=180, random_state=420, learning_rate=i)
    print("learning_rate", i)
    t1 = time()
    regassess(reg, xtrain, ytrain, 4, scoring=["r2", "neg_mean_squared_error"])
    # print("cost", time() - t1)
Example #23
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime
import FeatureTools as ft

# In[]:
data = load_boston()
X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)  # train
y_predict = reg.predict(Xtest)
print(reg.score(Xtest, Ytest))  # R^2 metric  0.9197580267581366
print(np.mean(y))  # 22.532806324110677
print(MSE(Ytest, y_predict))  # 7.466827353555599
print(MSE(Ytest, y_predict) / np.mean(y))  # the MSE is roughly 1/3 of the mean of y, so the result is not that good

# In[]:
# One advantage of tree models: the importance scores can be inspected, and embedded feature selection (SelectFromModel) can be applied
temparr = reg.feature_importances_
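
# In[]:
# A hedged sketch (assumption, not part of the original notebook): feeding the fitted model
# into SelectFromModel for embedded feature selection; the threshold is illustrative only.
from sklearn.feature_selection import SelectFromModel

sfm = SelectFromModel(reg, threshold=0.01, prefit=True)
Xtrain_selected = sfm.transform(Xtrain)
print(Xtrain_selected.shape)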

# In[]:
# Cross-validation:
reg = XGBR(n_estimators=100)

# Not rigorous: cross-validation on the full dataset (if the amount of data is small, use the full dataset)
Example #24
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split as TTS
from sklearn.metrics import mean_squared_error as MSE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from time import time
import datetime

data = load_boston()

x = data.data
y = data.target

xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)

reg.predict(xtest)
reg.score(xtest, ytest)

err = MSE(ytest, reg.predict(xtest))
ipt = reg.feature_importances_

# print("err",err)
# print("ipt",ipt)

reg = XGBR(n_estimators=100)
an = CVS(reg, xtrain, ytrain, cv=5).mean()
print("an", an)

an2 = CVS(reg, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
Example #25
from xgboost import XGBRegressor as XGBR
from adult_uci_info import Adult
import numpy as np
from scorer import Scorer
xgb = XGBR(max_depth=4,
           learning_rate=0.15,
           n_estimators=150,
           silent=True,
           objective='reg:gamma')
uci = Adult()
train_x, train_y, test_x, test_y = uci()
xgb.fit(train_x, train_y)
y_pre = xgb.predict(test_x)
scorer = Scorer(y_pre, test_y)
mape, rmse = scorer()
print("rmse %f" % rmse, "mape %f" % mape)
Example #26
from sklearn.model_selection import KFold, cross_val_score as CVS, train_test_split
from sklearn.metrics import mean_squared_error as MSE
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
from time import time
from function import plot_learning_curve

boston = load_boston()
X, y = boston.data, boston.target

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Build the gradient boosted tree model
xgbr = XGBR(n_estimators=100)
xgbr.fit(x_train, y_train)
# Prediction results
predict = xgbr.predict(x_test)


# Compute the mean squared error
print(MSE(y_test, xgbr.predict(x_test)))

# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()

# From the plot, the model overfits when the amount of data is very small; as the amount of data increases, the model's generalization ability keeps improving.
Example #27
from xgboost import XGBRegressor as XGBR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import numpy as np

data = load_boston()
X = data.data
Y = data.target

X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.75,
                                                    random_state=42)
# xgb
xgb_reg = XGBR(n_estimators=100).fit(X_train, Y_train)
print('Training R^2: %.9f' % xgb_reg.score(X_train, Y_train))
print('Test MSE: %.9f' % MSE(Y_test, xgb_reg.predict(X_test)))
print('Cross-validation score: %.9f' % cross_val_score(xgb_reg, X_train, Y_train, cv=5).mean())
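
# Illustrative comparison (not in the original snippet): the RandomForestRegressor imported
# above as RFR can be evaluated the same way as a baseline.
rfr = RFR(n_estimators=100).fit(X_train, Y_train)
print('RF training R^2: %.9f' % rfr.score(X_train, Y_train))
print('RF test MSE: %.9f' % MSE(Y_test, rfr.predict(X_test)))
print('RF cross-validation score: %.9f' % cross_val_score(rfr, X_train, Y_train, cv=5).mean())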
Example #28
import FeatureTools as ft

# In[]:
data = load_boston()
X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

# In[]:
# B. Weak-learner hyperparameters:
# 4. booster (choice of weak learner)
for booster in ["gbtree", "gblinear", "dart"]:
    reg = XGBR(n_estimators=260,
               learning_rate=0.25,
               random_state=420,
               booster=booster,
               silent=True).fit(Xtrain, Ytrain)
    print(booster)
    print(reg.score(Xtest, Ytest))

# The linear weak learner gblinear performs worst: this suggests the Boston dataset is not a linear dataset (the features X and the target y are not linearly related)

# In[]:
# 5. objective (loss function)
# sklearn's XGB: objective defaults to reg:linear
reg = XGBR(n_estimators=270,
           subsample=0.75,
           learning_rate=0.13,
           random_state=420).fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))
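
# In[]:
# A hedged sketch (assumption): the same model with the objective passed explicitly, using the
# reg:squarederror spelling that newer xgboost releases expect instead of reg:linear.
reg = XGBR(n_estimators=270,
           subsample=0.75,
           learning_rate=0.13,
           random_state=420,
           objective="reg:squarederror").fit(Xtrain, Ytrain)
print(reg.score(Xtest, Ytest))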