def main():
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\Asian Religion.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\arcene_train.txt")

    minmax_scale(X, copy=False)
    #Normalizing_Estmation(X, y, types=types)

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neighbors
    nnc = NearestNeighborClassifier()
    knc = TemplateClassifier()

    begin = time.time()
    max_mean1 = CVS(nnc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)
    print(max_mean1)

    begin = time.time()
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)
    print(max_mean1, max_mean2)
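# The snippets in this file share a common set of imports and aliases. The
# following is a minimal sketch of what they appear to assume: everything shown
# comes from the standard library, NumPy and scikit-learn. ToFormNumpy,
# Normalizing_Estmation, Lagranj, NearestNeighborClassifier and
# TemplateClassifier look like project-specific helpers, so the module name
# used for them below is hypothetical.
import time
import numpy as np
from sklearn.model_selection import KFold, cross_val_score as CVS
from sklearn.preprocessing import minmax_scale
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
# from uzclassifiers import ToFormNumpy, Normalizing_Estmation, Lagranj, \
#     NearestNeighborClassifier, TemplateClassifier  # hypothetical local module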
def regassess(reg, xtrain, ytrain, cv, scoring=['r2'], show=True):
    score = []
    for i in range(len(scoring)):
        if show:
            print("{}:{:.2f}".format(
                scoring[i],
                CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()))
        score.append(
            CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean())
    return score
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        if show:
            print("{}:{:.2f}".format(
                scoring[i],  # name of the evaluation metric
                CVS(reg, Xtrain, Ytrain, cv=cv, scoring=scoring[i]).mean()))
        score.append(
            CVS(reg, Xtrain, Ytrain, cv=cv, scoring=scoring[i]).mean())
    return score
def main():
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")
    #X, types, y = ToFormNumpy("D:\\german.txt")   # 71.2
    #X, types, y = ToFormNumpy("D:\\german1.txt")  # 91.7
    #X, types, y = ToFormNumpy("D:\\german2.txt")  # 91.7
    #X, types, y = ToFormNumpy("D:\\german3.txt")  # 94.4
    #X, types, y = ToFormNumpy("D:\\german4.txt")  # 95.4
    #X, types, y = ToFormNumpy("D:\\german5.txt")  # 97.7
    X, types, y = ToFormNumpy("D:\\german6.txt")   # 98.1
    #X, types, y = ToFormNumpy("D:\\german7.txt")  # 97.3
    #X, types, y = ToFormNumpy("D:\\german8.txt")  # 94.5
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\german.txt")

    #y[y == 2] = 1
    _, ln = np.unique(y, return_counts=True)

    #minmax_scale(X, copy=False)
    #Normalizing_Estmation(X, y)

    # Cross Validation
    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neural network
    mlp = MLPClassifier(hidden_layer_sizes=(100, 200))

    # kNN, Euclidean metric
    n_neighbors = 2 * min(ln) - 3
    #knc = KNeighborsClassifier(n_neighbors=n_neighbors)
    knc = KNeighborsClassifier(n_neighbors=1)

    # SVM
    svc = SVC()

    #print("MLP")
    max_mean1 = CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("KNN")
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("SVM")
    max_mean3 = CVS(svc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(max_mean1, max_mean2, max_mean3)
def main():
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")
    X, types, y = ToFormNumpy(r"D:\Nuu\AI\Selections\Amazon_initial_50_30_10000\data.txt")

    metric = 1

    minmax_scale(X, copy=False)

    #w = Lagranj_nd(X, y)
    w = Lagranj(X)

    # Grow the feature mask until at least 661 features are selected
    value = w.max()
    cond = w == value
    while len(cond[cond == True]) < 661:
        value = np.max(w[w < value])
        cond = w >= value
    X_Test = X[:, w >= value]

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    svm = SVC(kernel="linear")
    #svm.fit(X_Test, y)

    nn = MLPClassifier()
    nn.fit(X_Test, y)

    max_mean = CVS(nn, X_Test, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(max_mean)
def regassess(reg, xtrain, ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        # Use the cv passed in by the caller (the original hard-coded cv=5 here)
        s = CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("score", i, s)
        score.append(s)
    return score
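# A usage sketch for regassess (not part of the original snippets): it scores a
# regressor on several scikit-learn metrics at once and returns the per-metric
# cross-validation means. XGBR is the XGBRegressor alias used elsewhere in this
# file; the toy data set below is only for illustration.
from xgboost import XGBRegressor as XGBR
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split as TTS

X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=0)
Xtr, Xte, Ytr, Yte = TTS(X_demo, y_demo, test_size=0.3, random_state=0)

# Prints and returns the mean 5-fold CV score for each requested metric
regassess(XGBR(n_estimators=50), Xtr, Ytr, cv=5,
          scoring=["r2", "neg_mean_squared_error"])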
def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)

    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)  # number of trees to build
    reg.predict(Xtest)
    print(reg.score(Xtest, Ytest))
    print(MSE(Ytest, reg.predict(Xtest)))         # mean squared error
    print(reg.feature_importances_)               # contribution of each feature
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())  # mean cross-validation score
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)

    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # number of trees to build
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)     # 5-fold cross validation
        rs.append(cvresult.mean())                    # 1 minus the bias
        var.append(cvresult.var())                    # record the variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())  # controllable part of the generalization error

    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # At the point where the controllable part of the generalization error is
    # smallest, print R^2 and the generalization error
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))],
          var[ge.index(min(ge))], min(ge))
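# Note on the ge quantity computed above: with cross-validated R^2 scores
# r_1, ..., r_k, these snippets use
#
#   bias     ~ 1 - mean(r_i)
#   variance ~ var(r_i)
#   generalization error (controllable part) ~ bias^2 + variance
#
# so n_estimators is chosen where (1 - mean)^2 + var is smallest rather than
# where the mean score alone is largest.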
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)

    axisx = range(10, 1010, 50)
    rs = []   # 1 minus the bias
    var = []  # record the variance
    ge = []   # controllable part of the generalization error
    for i in axisx:
        reg = XGBR(n_estimators=i)  # number of trees to build
        # For scorers, larger is always better, hence scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5,
                      scoring='neg_mean_squared_error').mean())
    print(axisx[rs.index(max(rs))], max(rs))

    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
def main():
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")

    minmax_scale(X, copy=False)

    #X, types, y = ToFormNumpy(r"D:\Nuu\Data mining\Articles\PCA operator\Computing\Lagranj\Spame\data\own\(4595, 57).txt")

    k = 5
    k_fold = KFold(n_splits=k, shuffle=True, random_state=42)

    mlp = MLPClassifier(hidden_layer_sizes=(50, 200),
                        activation='relu',
                        max_iter=1000,
                        alpha=1e-5,
                        solver='adam',
                        verbose=False,
                        tol=1e-4,
                        random_state=1,
                        learning_rate_init=.1)

    max_mean = sum(CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy')) / k

    print('Score = ', max_mean)
def CalculatePrediction(self):
    try:
        X = self.X.copy()
        y = self.y.copy()

        # Normalize
        if self.chbIsNormalize.isChecked():
            m1 = X.max(axis=0)
            m2 = X.min(axis=0)
            X[:, m1 != m2] = (X[:, m1 != m2] - m2[m1 != m2]) / (m1[m1 != m2] - m2[m1 != m2])

        """
        # clear noisy objects
        if self.chbIsNoisy.isChecked():
            noisy = find_noisy(X, y)
            X = X[noisy == False]
            y = y[noisy == False]
        """

        # Cross Validation
        k = self.spNumberOfKFold.value()
        k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

        nn = NearestNeighborClassifier_(noisy=self.chbIsNoisy.isChecked())

        result = CVS(nn, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

        # "Sirpanuvchi nazorat bahosi" (Uzbek): cross-validation score
        self.lbTextOfResult.setText(
            "Sirpanuvchi nazorat bahosi: {:.4%}".format(result))
    except Exception as exc:
        # "Cross validationda xatolik bor" (Uzbek): there is an error in cross validation
        QMessageBox.about(self, "Cross validationda xatolik bor: ", str(exc))
def main():
    path = r"D:\Nuu\AI\Selections\leukemia\leukemia_small.csv"

    X, types, y = ReadFromCSVWithHeaderClass(path)

    minmax_scale(X, copy=False)
    #minmax_scale(X, copy=False)

    """
    w = Lagranj_nd(X)
    value = w.max()
    cond = w == value
    while len(cond[cond == True]) < 661:
        value = np.max(w[w < value])
        cond = w >= value
    print(len(cond[cond == True]))

    X_Test = X[:, w >= value]
    types_Test = types[w >= value]

    metric = 1
    noisy = find_noisy(X_Test, y, types=types_Test, metric=metric)
    cond = np.logical_not(noisy)
    X_Test = X_Test[cond]
    y_Test = y[cond]
    print(X.shape)
    """

    noisy = find_noisy(X, y, types=types)
    cond = np.logical_not(noisy)
    X = X[cond]
    y = y[cond]

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    """
    # Neighbors
    nnc = NearestNeighborClassifier()
    nnc_ = NearestNeighborClassifier_()
    knc = KNeighborsClassifier(n_neighbors=30)

    begin = time.time()
    max_mean1 = 0
    #max_mean1 = CVS(nnc, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    max_mean2 = 0
    max_mean2 = CVS(nnc_, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    begin = time.time()
    max_mean3 = 0
    max_mean3 = CVS(knc, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    print(max_mean1, max_mean2, max_mean3)
    """

    nnc = NearestNeighborClassifier_()
    nnc.fit(X, y)

    svm = SVC(kernel="linear")
    #svm.fit(X, y)

    nn = MLPClassifier(hidden_layer_sizes=(100, 200))
    #nn.fit(X, y)

    max_mean = CVS(nnc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(max_mean)
y_predict = reg.predict(Xtest)
print(reg.score(Xtest, Ytest))             # R^2 score: 0.9197580267581366
print(np.mean(y))                          # 22.532806324110677
print(MSE(Ytest, y_predict))               # 7.466827353555599
print(MSE(Ytest, y_predict) / np.mean(y))  # MSE is roughly 1/3 of the mean of y, which is not a great result

# In[]:
# One advantage of tree models: the feature importance scores can be inspected
# and used for embedded feature selection (SelectFromModel)
temparr = reg.feature_importances_

# In[]:
# Cross validation:
reg = XGBR(n_estimators=100)

# Less rigorous: the full data set (use this when the data set is small)
CVS(reg, X, y, cv=5).mean()  # R^2: cross_val_score defaults to the model's own default metric

# In[]:
# More rigorous: split into training and test sets
CVS(reg, Xtrain, Ytrain, cv=5).mean()  # default metric is R^2; a metric can be given explicitly, e.g. scoring='neg_mean_squared_error'

# In[]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[]:
# Compare against a random forest and a linear regression
rfr = RFR(n_estimators=100)
CVS(rfr, Xtrain, Ytrain, cv=5).mean()  # 0.7975497480638329

# In[]:
X = X[:, np.array(t)]
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]
X_train, X_test, Y_train, Y_test = TTS(X, y, test_size=0.2, random_state=seed)

x = np.arange(-1, 2, step=0.001)
y = x

### XGBoost
fig = plt.figure(figsize=(8, 16))
ax = fig.subplots(3, 2)

reg = XGBR(silent=True, n_estimators=200, max_depth=3,
           learning_rate=0.26, reg_lambda=0.09).fit(X_train, Y_train)
xgb_pre = reg.predict(X_test)
xgb_pre_tr = reg.predict(X_train)
xgb_avg = CVS(reg, X_train, Y_train, scoring="neg_mean_absolute_error", cv=5).mean()
# scikit-learn metrics expect (y_true, y_pred); MSE and MAE are symmetric,
# but explained_variance_score is not, so pass the true labels first
xgb_mse = metrics.mean_squared_error(Y_test, xgb_pre)
xgb_r2 = metrics.explained_variance_score(Y_test, xgb_pre)
xgb_mae = metrics.mean_absolute_error(Y_test, xgb_pre)
print("xgb_r2", xgb_r2)
#print("xgb_mse", xgb_mse)
print("xgb_mae", xgb_mae)

#plt.subplot(121)
#plt.figure(figsize=(10,8))
#ax1.text(x=1.36, y=0, s="R^2=0.987", fontdict=font)
#ax[0,0].text(x=1.36, y=-0.3, s="MSE=0.0043", fontdict=font)
ax[0, 0].text(x=1.36, y=-0.3, s="RMSE=0.075", fontdict=font)
ax[0, 0].text(x=1.36, y=-0.6, s="MAE=0.052", fontdict=font)
ax[0, 0].set_title("Xgboost")
ax[0, 0].plot(x, y, linewidth=3.81)
ax[0, 0].scatter(Y_train, xgb_pre_tr, color='blue', s=15)
print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5

start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()

accuracy = scores.mean() * 100
print(f"SVC has mean accuracy of {accuracy:.3f}%\n" +
      f"Cross Validation took {(end-start)*1000:.3f}ms")

# ### Calculate F1-Score of the model

# In[13]:

from sklearn.metrics import f1_score as F1

f1score = F1(Y_test, pred, average='weighted')
print(f"SVC has F1-Score = {f1score * 100:.3f}%")
def main():
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")

    y[y == 2] = 1
    _, ln = np.unique(y, return_counts=True)
    #print(ln)

    #minmax_scale(X, copy=False)
    Normalizing_Estmation(X, y)

    indx = clearNoisy(X, y)
    X = X[indx]
    y = y[indx]
    #print(X.shape)
    #print(y.shape)
    #return None

    selection_Name = r'\Gasterology2'
    preproccesing_name = r'own'
    path = r"D:\Nuu\Data mining\Articles\Cross Validation\Computing" + selection_Name + \
           r"\res " + preproccesing_name + ".txt"
    file = open(path, 'w')

    # Cross Validation
    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neural network
    mlp = MLPClassifier(hidden_layer_sizes=(100, 200), activation='logistic')

    # kNN, Euclidean metric
    n_neighbors = 2 * min(ln) - 3
    knc = KNeighborsClassifier(n_neighbors=n_neighbors, p=2)

    # SVM
    svc = SVC(kernel="linear", degree=5)

    # RDF
    rdf = RandomForestClassifier(max_depth=1000)

    #print("MLP")
    max_mean1 = CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("KNN")
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("SVM")
    max_mean3 = CVS(svc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(X.shape[1], max_mean1, max_mean2, max_mean3)

    # 25
    w = Lagranj(X, y, types)
    while X.shape[1] > 2:
        # Cross Validation
        k = 5
        k_fold = KFold(n_splits=k, shuffle=True, random_state=42)

        # Neural network
        mlp = MLPClassifier(hidden_layer_sizes=(50, 200),
                            activation='relu',
                            max_iter=1000,
                            alpha=1e-5,
                            solver='adam',
                            verbose=False,
                            tol=1e-8,
                            random_state=1,
                            learning_rate_init=.1)

        # kNN, Euclidean metric
        n_neighbors = 2 * min(ln) - 3
        knc = KNeighborsClassifier(n_neighbors=n_neighbors, p=2)

        # SVM
        svc = SVC(gamma='scale')

        max_mean1 = sum(CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy')) / k
        max_mean2 = sum(CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy')) / k
        max_mean3 = sum(CVS(svc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy')) / k

        print(X.shape[1], max_mean1, max_mean2, max_mean3)

        file.write(str(X.shape[1]) + "\t" +
                   str(max_mean1) + "\t" +
                   str(max_mean2) + "\t" +
                   str(max_mean3) + "\n")

        # Drop the feature(s) with the smallest Lagranj weight and repeat
        cond = w != w.min()
        X = X[:, cond]
        w = w[cond]

    file.close()
# The MSE is roughly one third of the mean of y, so the result is only mediocre

# In[9]:

# Tree models expose feature importance scores, which can be used for
# embedded feature selection (SelectFromModel)
reg.feature_importances_

# # Comparison using cross validation

# In[10]:

reg = XGBR(n_estimators=100)  # pass an untrained model into cross validation

# In[11]:

print(CVS(reg, xtrain, ytrain, cv=5))
# 1: mean() averages the five cross-validation scores
# 2: since XGBR defaults to the R^2 metric, cross validation also returns R^2
CVS(reg, xtrain, ytrain, cv=5).mean()

# In[12]:

CVS(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[13]:

# List all scoring metrics available in sklearn
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[14]:
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)
reg.predict(xtest)
reg.score(xtest, ytest)
err = MSE(ytest, reg.predict(xtest))
ipt = reg.feature_importances_
# print("err", err)
# print("ipt", ipt)

reg = XGBR(n_estimators=100)
an = CVS(reg, xtrain, ytrain, cv=5).mean()
print("an", an)
an2 = CVS(reg, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
print("an2", an2)

rfr = RFR(n_estimators=100)
a = CVS(rfr, xtrain, ytrain, cv=5).mean()
neg_mean_square = CVS(rfr, xtrain, ytrain, scoring="neg_mean_squared_error").mean()
print("a,", a, neg_mean_square)

lr = LinearRegression()
b = CVS(lr, xtrain, ytrain, cv=5).mean()
bb = CVS(lr, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
print("b", b, bb)
model_lgb.fit(x_train, y_train)
pickle.dump(model_lgb, open('model/model_lgb.model', "wb"))
# print(CVS(model_lgb, x_train, y_train, cv=5, scoring='neg_mean_squared_error').mean())  # -0.010733531287803806

model_cat = CatBoostClassifier(iterations=220,
                               learning_rate=0.02,
                               depth=4,
                               loss_function='Logloss',
                               eval_metric='Logloss')
print(CVS(model_cat, x_train, y_train, cv=5,
          scoring='neg_mean_squared_error').mean())  # -0.011547344110854504
model_cat.fit(x_train, y_train)
pickle.dump(model_cat, open('model/model_cat.model', "wb"))

print("==================================================")
print("training done!")

model_xg = pickle.load(open('model/model_xg.model', 'rb'))
model_lgb = pickle.load(open('model/model_lgb.model', 'rb'))
model_cat = pickle.load(open('model/model_cat.model', 'rb'))

print("==================================================")
print("ensemble predicting ...")
from time import time
import datetime

data = load_boston()  # the Boston data set is very simple, but it raises many questions
X = data.data
y = data.target
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

axisx = range(10, 1010, 50)  # n_estimators grid (assumed; matches the other learning-curve snippets)
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=0)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=5)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2 + cvresult.var())

print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c='red', label='XGB')
plt.legend()
plt.show()

rs = np.array(rs)
var = np.array(var) * 0.01
# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30),
                    'XGBR', X, y, ax=None, cv=cv)
plt.show()
# The plot shows that with very little data the model overfits; as the amount
# of data grows, its ability to generalize keeps improving.

# Start tuning: first draw the learning curve over n_estimators to see where it is optimal
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    xgbr = XGBR(n_estimators=i, random_state=30)
    cvresult = CVS(xgbr, X, y, cv=cv)
    # record 1 - bias
    rs.append(cvresult.mean())
    # record the variance
    var.append(cvresult.var())
    # record the generalization error
    ge.append((1 - cvresult.mean())**2 + cvresult.var())

# Print the parameter value with the highest R^2, together with its variance
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
# Print the parameter value with the lowest variance, together with its R^2
print(axisx[var.index(min(var))], min(var), rs[var.index(min(var))])
# Print the parameter value with the lowest generalization error
print(axisx[np.argmin(ge)], rs[ge.index(min(ge))], var[ge.index(min(ge))])

plt.plot(axisx, rs, color="r", label="XGBR")
plt.legend()
plt.show()
ax2.plot(X[:, 0], y, 'o', c='k')
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")
plt.tight_layout()
plt.show()

pred, score, var = [], [], []
binsrange = [2, 5, 10, 15, 20, 30]
for i in binsrange:
    enc = KBinsDiscretizer(n_bins=i, encode="onehot")
    X_binned = enc.fit_transform(X)
    line_binned = enc.transform(line)
    LinearR_ = LinearRegression()
    cvresult = CVS(LinearR_, X_binned, y, cv=5)
    score.append(cvresult.mean())
    var.append(cvresult.var())
    pred.append(LinearR_.fit(X_binned, y).score(line_binned, np.sin(line)))

plt.figure(figsize=(6, 5))
plt.plot(binsrange, pred, c="orange", label="test")
plt.plot(binsrange, score, c="k", label="full data")
plt.plot(binsrange, score + np.array(var) * 0.5, c="red", linestyle="--", label="var")
plt.plot(binsrange, score - np.array(var) * 0.5, c="red", linestyle="--")
plt.legend()
plt.show()
# Plot the learning curve over the training-set size
train_size, train_scores, test_scores = learning_curve(xgbr, X=X, y=y,
                                                       cv=cv, random_state=12)
sns.lineplot(train_size, train_scores.mean(axis=1), marker='o', color='red')
sns.lineplot(train_size, test_scores.mean(axis=1), marker='o', color='green')

# Plot the learning curve over the number of base estimators
x_axis = range(10, 1010, 50)
cv = KFold(n_splits=5, shuffle=True, random_state=45)
v_bias = []
v_vars = []
v_error = []
for i in x_axis:
    xgbr = XGBRegressor(n_estimators=i)
    score = CVS(xgbr, X, y, cv=cv)
    v_bias.append(score.mean())
    v_vars.append(score.var())
    v_error.append((1 - score.mean()) ** 2 + score.var())

print(max(v_bias), x_axis[v_bias.index(max(v_bias))], v_error[v_bias.index(max(v_bias))])
print(min(v_vars), x_axis[v_vars.index(min(v_vars))], v_error[v_vars.index(min(v_vars))])
print(min(v_error), x_axis[v_error.index(min(v_error))], v_bias[v_error.index(min(v_error))])

# Plot the bias and variance learning curves
sns.lineplot(x_axis, v_bias, marker='o', color='red')
sns.lineplot(x_axis, np.array(v_bias) - np.array(v_vars), marker='+', color='green')
sns.lineplot(x_axis, np.array(v_bias) + np.array(v_vars), marker='+', color='green')