Example #1
def main():

    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\Asian Religion.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\arcene_train.txt")

    minmax_scale(X, copy=False)
    #Normalizing_Estmation(X, y, types=types)

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neighbors
    nnc = NearestNeighborClassifier()

    knc = TemplateClassifier()

    begin = time.time()
    max_mean1 = CVS(nnc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    print(max_mean1)

    begin = time.time()
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    print(max_mean1, max_mean2)
def regassess(reg, xtrain, ytrain, cv, scoring=['r2'], show=True):
    score = []
    for i in range(len(scoring)):
        # compute each metric's mean cross-validation score once
        s = CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("{}:{:.2f}".format(scoring[i], s))
        score.append(s)
    return score
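A minimal usage sketch for regassess, hedged: the synthetic data, the XGBRegressor settings, and the metric list below are illustrative choices, not part of the original example.

from sklearn.model_selection import cross_val_score as CVS
from sklearn.datasets import make_regression
from xgboost import XGBRegressor as XGBR

X_demo, y_demo = make_regression(n_samples=200, n_features=10, random_state=0)  # illustrative data
reg_demo = XGBR(n_estimators=100)
# prints and returns the mean CV score for each metric name
scores = regassess(reg_demo, X_demo, y_demo, cv=5, scoring=["r2", "neg_mean_squared_error"])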
Example #3
def regassess(reg, Xtrain, Ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        # scoring[i] is the name of the evaluation metric
        s = CVS(reg, Xtrain, Ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("{}:{:.2f}".format(scoring[i], s))
        score.append(s)
    return score
Example #4
def main():

    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")
    #X, types, y = ToFormNumpy("D:\\german.txt") #71.2
    #X, types, y = ToFormNumpy("D:\\german1.txt") #91.7
    #X, types, y = ToFormNumpy("D:\\german2.txt") #91.7
    #X, types, y = ToFormNumpy("D:\\german3.txt") #94.4
    #X, types, y = ToFormNumpy("D:\\german4.txt") #95.4
    #X, types, y = ToFormNumpy("D:\\german5.txt") #97.7
    X, types, y = ToFormNumpy("D:\\german6.txt")  #98.1
    #X, types, y = ToFormNumpy("D:\\german7.txt") #97.3
    #X, types, y = ToFormNumpy("D:\\german8.txt") #94.5
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\german.txt")

    #y[y == 2] = 1

    _, ln = np.unique(y, return_counts=True)

    #minmax_scale(X, copy=False)
    #Normalizing_Estmation(X, y)

    # Cross Validation
    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neural network
    mlp = MLPClassifier(hidden_layer_sizes=(100, 200))

    # Knn
    n_neighbors = 2 * min(ln) - 3
    # metric: Euclidean
    #knc = KNeighborsClassifier(n_neighbors=n_neighbors)
    knc = KNeighborsClassifier(n_neighbors=1)

    #SVM
    svc = SVC()

    #print("MLP")
    max_mean1 = CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("KNN")
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("SVM")
    max_mean3 = CVS(svc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(max_mean1, max_mean2, max_mean3)
Example #5
def main():
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")
    X, types, y = ToFormNumpy(r"D:\Nuu\AI\Selections\Amazon_initial_50_30_10000\data.txt")

    metric = 1

    minmax_scale(X, copy=False)

    #w = Lagranj_nd(X, y)
    w = Lagranj(X)
    # lower the threshold until at least 661 features are selected
    value = w.max()
    cond = w == value
    while np.count_nonzero(cond) < 661:
        value = np.max(w[w < value])
        cond = w >= value

    X_Test = X[:, w >= value]

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    svm = SVC(kernel="linear")

    #svm.fit(X_Test, y)

    nn = MLPClassifier()
    nn.fit(X_Test, y)

    max_mean = CVS(nn, X_Test, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    print(max_mean)
Example #6
def regassess(reg, xtrain, ytrain, cv, scoring=["r2"], show=True):
    score = []
    for i in range(len(scoring)):
        s = CVS(reg, xtrain, ytrain, cv=cv, scoring=scoring[i]).mean()
        if show:
            print("score", i, s)
        score.append(s)
    return score
Example #7
def xgboost_demo():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    reg = XGBR(n_estimators=100).fit(Xtrain, Ytrain)    # n_estimators: how many trees to build
    reg.predict(Xtest)

    print(reg.score(Xtest, Ytest))
    print(MSE(Ytest, reg.predict(Xtest)))    # check the mean squared error

    print(reg.feature_importances_)    # contribution of each feature
    print(CVS(reg, Xtrain, Ytrain, cv=5).mean())    # mean cross-validation score
Example #8
def draw_curve_2():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []
    var = []
    ge = []
    for i in axisx:
        reg = XGBR(n_estimators=i, random_state=420)  # n_estimators: how many trees to build
        cvresult = CVS(reg, Xtrain, Ytrain, cv=5)    # 5-fold cross-validation
        rs.append(cvresult.mean())  # record the mean score (1 - bias)
        var.append(cvresult.var())  # record the variance
        ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())  # controllable part of the generalization error

    # print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
    # print R^2 and the generalization error where its controllable part is smallest
    print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))], min(ge))
Example #9
def draw_curve():
    data = load_boston()
    x = data.data
    y = data.target
    Xtrain, Xtest, Ytrain, Ytest = TTS(x, y, test_size=0.3, random_state=430)
    axisx = range(10, 1010, 50)
    rs = []    # mean scores (1 - bias)
    var = []    # variances
    ge = []    # controllable part of the generalization error
    for i in axisx:
        reg = XGBR(n_estimators=i)  # n_estimators: how many trees to build
        # scorers follow "greater is better", hence scoring='neg_mean_squared_error'
        rs.append(CVS(reg, Xtrain, Ytrain, cv=5, scoring='neg_mean_squared_error').mean())

    print(axisx[rs.index(max(rs))], max(rs))
    plt.figure(figsize=(20, 5))
    plt.plot(axisx, rs, c='red', label='XGB')
    plt.legend()
    plt.show()
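A hedged aside on the scoring choice used in draw_curve: scikit-learn scorers follow a greater-is-better convention, so 'neg_mean_squared_error' returns negated MSE; to report an RMSE you negate the mean and take the square root. The regressor and synthetic data below are illustrative stand-ins, not the Boston split used above.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score as CVS
from xgboost import XGBRegressor as XGBR

X_demo, y_demo = make_regression(n_samples=300, n_features=13, noise=10, random_state=0)
neg_mse = CVS(XGBR(n_estimators=100), X_demo, y_demo, cv=5, scoring='neg_mean_squared_error')
rmse = np.sqrt(-neg_mse.mean())  # negate the (negative) MSE before taking the square root
print(rmse)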
Example #10
def main():
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    minmax_scale(X, copy=False)

    #X, types, y = ToFormNumpy(r"D:\Nuu\Data mining\Articles\PCA operator\Computing\Lagranj\Spame\data\own\(4595, 57).txt")

    k = 5

    k_fold = KFold(n_splits=k, shuffle=True, random_state=42)

    mlp = MLPClassifier(hidden_layer_sizes=(50, 200),
                        activation='relu',
                        max_iter=1000,
                        alpha=1e-5,
                        solver='adam',
                        verbose=False,
                        tol=1e-4,
                        random_state=1,
                        learning_rate_init=.1)

    max_mean = sum(CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy')) / k

    print('Score = ', max_mean)
Example #11
    def CalculatePrediction(self):
        try:
            X = self.X.copy()
            y = self.y.copy()

            # Normalize
            if self.chbIsNormalize.isChecked():
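                # scale only the columns whose max differs from min, so constant features do not cause division by zero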
                m1 = X.max(axis=0)
                m2 = X.min(axis=0)
                X[:, m1 != m2] = (X[:, m1 != m2] -
                                  m2[m1 != m2]) / (m1[m1 != m2] - m2[m1 != m2])
            """
            # clear noisy objects
            if self.chbIsNoisy.isChecked():
                noisy = find_noisy(X, y)

                X = X[noisy == False]
                y = y[noisy == False]

            """

            # Cross Validation
            k = self.spNumberOfKFold.value()
            k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

            nn = NearestNeighborClassifier_(noisy=self.chbIsNoisy.isChecked())

            result = CVS(nn, X, y, cv=k_fold, n_jobs=4,
                         scoring='accuracy').mean()

            self.lbTextOfResult.setText(
                "Sirpanuvchi nazorat bahosi: {:.4%}".format(result))

        except Exception as exc:
            QMessageBox.about(self,
                              "Cross validationda xatolik  xatolik bor: ",
                              str(exc))
Example #12
def main():

    path = r"D:\Nuu\AI\Selections\leukemia\leukemia_small.csv"

    X, types, y = ReadFromCSVWithHeaderClass(path)

    minmax_scale(X, copy=False)
    #minmax_scale(X, copy=False)
    """w = Lagranj_nd(X)

    value = w.max()
    cond = w == value
    while len(cond[cond == True]) < 661:
        value = np.max(w[w < value])
        cond = w >= value


    print(len(cond[cond == True]))

    X_Test = X[:, w >= value]
    types_Test = types[w >= value]

    metric = 1

    noisy = find_noisy(X_Test, y, types=types_Test, metric=metric)

    cond = np.logical_not(noisy)

    X_Test = X_Test[cond]
    y_Test = y[cond]

    print(X.shape)

"""

    noisy = find_noisy(X, y, types=types)

    cond = np.logical_not(noisy)
    X = X[cond]
    y = y[cond]

    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)
    """
    # Neighbors
    nnc = NearestNeighborClassifier()

    nnc_ = NearestNeighborClassifier_()

    knc = KNeighborsClassifier(n_neighbors=30)


    begin = time.time()
    max_mean1 = 0
    #max_mean1 = CVS(nnc, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    max_mean2 = 0
    max_mean2 = CVS(nnc_, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    begin = time.time()
    max_mean3 = 0
    max_mean3 = CVS(knc, X_Test, y_Test, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    end = time.time()
    print("Time: ", (end - begin) * 1000)

    print(max_mean1, max_mean2, max_mean3)
"""

    nnc = NearestNeighborClassifier_()
    nnc.fit(X, y)

    svm = SVC(kernel="linear")
    #svm.fit(X, y)

    nn = MLPClassifier(hidden_layer_sizes=(100, 200))
    #nn.fit(X, y)

    max_mean = CVS(nnc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(max_mean)
Example #13
y_predict = reg.predict(Xtest)
print(reg.score(Xtest, Ytest))  # R^2 score  0.9197580267581366
print(np.mean(y))  # 22.532806324110677
print(MSE(Ytest, y_predict))  # 7.466827353555599
print(MSE(Ytest, y_predict) / np.mean(y))  # MSE is roughly 1/3 of the mean of y, so the result is not great

# In[]:
# One advantage of tree models: feature importance scores are available, so embedded feature selection (SelectFromModel) can be used
temparr = reg.feature_importances_
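A hedged sketch of the embedded selection mentioned in the comment above: SelectFromModel wraps the booster and keeps features whose importance clears a threshold. The 'median' threshold and the reuse of Xtrain/Ytrain and the XGBR alias from this example are illustrative choices.

from sklearn.feature_selection import SelectFromModel

selector = SelectFromModel(XGBR(n_estimators=100), threshold='median')  # keep features above the median importance
Xtrain_embedded = selector.fit_transform(Xtrain, Ytrain)
print(Xtrain_embedded.shape)  # fewer columns than Xtrain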

# In[]:
# Cross validation:
reg = XGBR(n_estimators=100)

# Less rigorous: use the full dataset (fine when there is little data)
CVS(reg, X, y, cv=5).mean()  # R^2: cross_val_score defaults to the model's own default metric
# In[]:
# Rigorous: use only the training split
# default metric is R^2; a metric can be given explicitly, e.g. scoring='neg_mean_squared_error' (negative MSE)
CVS(reg, Xtrain, Ytrain, cv=5).mean()

# In[]:
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[]:
# Compare with random forest and linear regression
rfr = RFR(n_estimators=100)
CVS(rfr, Xtrain, Ytrain, cv=5).mean()  # 0.7975497480638329

# In[]:
X = X[:, np.array(t)]
X = preprocessing.scale(X)
y = datatotal.iloc[:, 26]
X_train, X_test, Y_train, Y_test = TTS(X, y, test_size=0.2, random_state=seed)

x = np.arange(-1, 2, step=0.001)
y = x


###Xgboost
fig = plt.figure(figsize=(8,16))
ax = fig.subplots(3,2)
reg=XGBR(silent=True,n_estimators=200,max_depth=3,learning_rate=0.26,reg_lambda=0.09).fit(X_train,Y_train)
xgb_pre=reg.predict(X_test)
xgb_pre_tr=reg.predict(X_train)
xgb_avg=CVS(reg,X_train,Y_train,scoring="neg_mean_absolute_error",cv=5).mean()
xgb_mse=metrics.mean_squared_error(xgb_pre,Y_test)
xgb_r2=metrics.explained_variance_score(xgb_pre,Y_test)
xgb_mae=metrics.mean_absolute_error(xgb_pre,Y_test)
print("xgb_r2",xgb_r2)
#print("xgb_mse",xgb_mse)
print("xgb_mae",xgb_mae)
#plt.subplot(121)
#plt.figure(figsize=(10,8))
#ax1.text(x=1.36,y=0,s="R^2=0.987",fontdict=font)
#ax[0,0].text(x=1.36,y=-0.3,s="MSE=0.0043",fontdict=font)
ax[0,0].text(x=1.36,y=-0.3,s="RMSE=0.075",fontdict=font)
ax[0,0].text(x=1.36,y=-0.6,s="MAE=0.052",fontdict=font)
ax[0,0].set_title("Xgboost")
ax[0,0].plot(x,y,linewidth=3.81)
ax[0,0].scatter(Y_train,xgb_pre_tr,color = 'blue', s = 15)
print("Classification Report:\n", CR(Y_test, pred, zero_division=0))

# ### Cross Validation

# In[12]:

from sklearn.model_selection import StratifiedKFold as SKF
from sklearn.model_selection import cross_val_score as CVS

model = SVC(kernel='rbf', C=13, gamma=0.325)
folds = 5

start = T()
cross_val = SKF(n_splits=folds, shuffle=True, random_state=4)
scores = CVS(model, X, Y, scoring='accuracy', cv=cross_val)
end = T()

accuracy = scores.mean() * 100
print(f"SVC has mean accuracy of {accuracy:.3f}%\n" +
      f"Cross Validation took {(end-start)*1000:.3f}ms")

# ### Calculate F1-Score of the model

# In[13]:

from sklearn.metrics import f1_score as F1

f1score = F1(Y_test, pred, average='weighted')
print(f"SVC has F1-Score = {f1score * 100:.3f}%")
Example #16
def main():

    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\IT_BORI_42_6.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\giper_my.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\spame.txt")
    X, types, y = ToFormNumpy("D:\\tanlanmalar\\gasterlogy1394.txt")
    #X, types, y = ToFormNumpy("D:\\tanlanmalar\\MATBIO_MY.txt")

    y[y == 2] = 1

    _, ln = np.unique(y, return_counts=True)

    #print(ln)

    #minmax_scale(X, copy=False)
    Normalizing_Estmation(X, y)

    indx = clearNoisy(X, y)

    X = X[indx]
    y = y[indx]

    #print(X.shape)
    #print(y.shape)

    #return None

    selection_Name = r'\Gasterology2'
    preproccesing_name = r'own'

    path = r"D:\Nuu\Data mining\Articles\Cross Validation\Computing" + selection_Name + \
           r"\res " + preproccesing_name + ".txt"

    file = open(path, 'w')

    # Cross Validation
    k = 10
    k_fold = KFold(n_splits=k, shuffle=True, random_state=None)

    # Neural network
    mlp = MLPClassifier(hidden_layer_sizes=(100, 200), activation='logistic')

    # Knn
    n_neighbors = 2 * min(ln) - 3
    # metric: Euclidean
    knc = KNeighborsClassifier(n_neighbors=n_neighbors, p=2)

    #SVM
    svc = SVC(kernel="linear", degree=5)

    # RDF
    rdf = RandomForestClassifier(max_depth=1000)

    #print("MLP")
    max_mean1 = CVS(mlp, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("KNN")
    max_mean2 = CVS(knc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()
    #print("SVM")
    max_mean3 = CVS(svc, X, y, cv=k_fold, n_jobs=4, scoring='accuracy').mean()

    print(X.shape[1], max_mean1, max_mean2, max_mean3)
    # 25
    w = Lagranj(X, y, types)

    while X.shape[1] > 2:
        # Cross Validation
        k = 5
        k_fold = KFold(n_splits=k, shuffle=True, random_state=42)

        # Neural network
        mlp = MLPClassifier(hidden_layer_sizes=(50, 200),
                            activation='relu',
                            max_iter=1000,
                            alpha=1e-5,
                            solver='adam',
                            verbose=False,
                            tol=1e-8,
                            random_state=1,
                            learning_rate_init=.1)

        # Knn
        n_neighbors = 2 * min(ln) - 3
        # metric: Euclidean
        knc = KNeighborsClassifier(n_neighbors=n_neighbors, p=2)

        # SVM
        svc = SVC(gamma='scale')

        max_mean1 = sum(CVS(mlp, X, y, cv=k_fold, n_jobs=4,
                            scoring='accuracy')) / k
        max_mean2 = sum(CVS(knc, X, y, cv=k_fold, n_jobs=4,
                            scoring='accuracy')) / k
        max_mean3 = sum(CVS(svc, X, y, cv=k_fold, n_jobs=4,
                            scoring='accuracy')) / k

        print(X.shape[1], max_mean1, max_mean2, max_mean3)
        file.write(
            str(X.shape[1]) + "\t" + str(max_mean1) + "\t" + str(max_mean2) +
            "\t" + str(max_mean3) + "\n")

        cond = w != w.min()
        X = X[:, cond]
        w = w[cond]

    file.close()
# The MSE is roughly one third of the mean of y, so the result is only so-so

# In[9]:

# Tree models expose feature importance scores, so embedded feature selection (SelectFromModel) can be used
reg.feature_importances_

# # Compare using cross validation

# In[10]:

reg = XGBR(n_estimators=100)  # pass an untrained model into cross validation

# In[11]:

print(CVS(reg, xtrain, ytrain, cv=5))
# 1: mean() averages the scores from the 5 cross-validation folds
# 2: since reg (XGB) defaults to the R^2 metric, cross validation also returns R^2 scores
CVS(reg, xtrain, ytrain, cv=5).mean()

# In[12]:

CVS(reg, xtrain, ytrain, cv=5, scoring='neg_mean_squared_error').mean()

# In[13]:

# List all scoring metrics available in sklearn
import sklearn
sorted(sklearn.metrics.SCORERS.keys())

# In[14]:
Example #18
xtrain, xtest, ytrain, ytest = TTS(x, y, test_size=0.3, random_state=420)

reg = XGBR(n_estimators=100).fit(xtrain, ytrain)

reg.predict(xtest)
reg.score(xtest, ytest)

err = MSE(ytest, reg.predict(xtest))
ipt = reg.feature_importances_

# print("err",err)
# print("ipt",ipt)

reg = XGBR(n_estimators=100)
an = CVS(reg, xtrain, ytrain, cv=5).mean()
print("an", an)

an2 = CVS(reg, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
print("an2", an2)

rfr = RFR(n_estimators=100)
a = CVS(rfr, xtrain, ytrain, cv=5).mean()
neg_mean_square = CVS(rfr, xtrain, ytrain,
                      scoring="neg_mean_squared_error").mean()
print("a,", a, neg_mean_square)

lr = LinearRegression()
b = CVS(lr, xtrain, ytrain, cv=5).mean()
bb = CVS(lr, xtrain, ytrain, cv=5, scoring="neg_mean_squared_error").mean()
print("b", b, bb)
Example #19
    model_lgb.fit(x_train, y_train)

    pickle.dump(model_lgb, open('model/model_lgb.model', "wb"))

    # print(CVS(model_lgb, x_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()) #-0.010733531287803806

    model_cat = CatBoostClassifier(iterations=220,
                                   learning_rate=0.02,
                                   depth=4,
                                   loss_function='Logloss',
                                   eval_metric='Logloss')

    print(
        CVS(model_cat,
            x_train,
            y_train,
            cv=5,
            scoring='neg_mean_squared_error').mean())  # -0.011547344110854504
    model_cat.fit(x_train, y_train)
    pickle.dump(model_cat, open('model/model_cat.model', "wb"))

    print("==================================================")
    print("training done!")

    model_xg = pickle.load(open('model/model_xg.model', 'rb'))
    model_lgb = pickle.load(open('model/model_lgb.model', 'rb'))
    model_cat = pickle.load(open('model/model_cat.model', 'rb'))

    print("==================================================")
    print("ensemble predicting ...")
from time import time
import datetime

data = load_boston()
# The Boston dataset is very simple, but it raises many issues

X = data.data
y = data.target

Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

axisx = range(10, 1010, 50)  # assumed range, matching the earlier learning-curve examples
rs = []
var = []
ge = []
for i in axisx:
    reg = XGBR(n_estimators=i, random_state=0)
    cvresult = CVS(reg, Xtrain, Ytrain, cv=5)
    rs.append(cvresult.mean())
    var.append(cvresult.var())
    ge.append((1 - cvresult.mean())**2 + cvresult.var())
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
print(axisx[var.index(min(var))], rs[var.index(min(var))], min(var))
print(axisx[ge.index(min(ge))], rs[ge.index(min(ge))], var[ge.index(min(ge))],
      min(ge))

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c='red', label='XGB')
plt.legend()
plt.show()

rs = np.array(rs)
var = np.array(var) * 0.01
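The ×0.01 rescaling above is typically done so the variance band fits on the same axis as the scores. A hedged sketch of how such a band is usually drawn, mirroring the band-plotting pattern used later in these examples:

plt.figure(figsize=(20, 5))
plt.plot(axisx, rs, c='black', label='XGB')
plt.plot(axisx, rs + var, c='red', linestyle='-.')  # upper band: mean + scaled variance
plt.plot(axisx, rs - var, c='red', linestyle='-.')  # lower band: mean - scaled variance
plt.legend()
plt.show()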
Example #21
# Plot the learning curve
cv = KFold(n_splits=5, shuffle=True, random_state=32)
plot_learning_curve(XGBR(n_estimators=100, random_state=30), 'XGBR', X, y, ax=None, cv=cv)
plt.show()

# The plot shows that with very little data the model overfits; as the amount of data grows, the model's generalization ability keeps improving.


# Start tuning: first plot the learning curve for n_estimators to see where it is optimal
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
  xgbr = XGBR(n_estimators=i, random_state=30)
  cvresult = CVS(xgbr, X, y, cv=cv)
  # record the mean score (1 - bias)
  rs.append(cvresult.mean())
  # record the variance
  var.append(cvresult.var())
  # record the generalization error
  ge.append((1-cvresult.mean())**2 + cvresult.var())
# print the parameter value with the highest R2, and its variance
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
# print the parameter value with the lowest variance, and its R2
print(axisx[var.index(min(var))], min(var), rs[var.index(min(var))])
# print the parameter value with the lowest generalization error
print(axisx[np.argmin(ge)], rs[ge.index(min(ge))], var[ge.index(min(ge))])
plt.plot(axisx, rs, color="r", label="XGBR")
plt.legend()
plt.show()
ax2.plot(X[:, 0], y, 'o', c='k')
ax2.legend(loc="best")
ax2.set_xlabel("Input feature")
ax2.set_title("Result after discretization")
plt.tight_layout()
plt.show()

pred, score, var = [], [], []
binsrange = [2, 5, 10, 15, 20, 30]
for i in binsrange:
    enc = KBinsDiscretizer(n_bins=i, encode="onehot")
    X_binned = enc.fit_transform(X)
    line_binned = enc.transform(line)
    LinearR_ = LinearRegression()
    cvresult = CVS(LinearR_, X_binned, y, cv=5)
    score.append(cvresult.mean())
    var.append(cvresult.var())
    pred.append(LinearR_.fit(X_binned, y).score(line_binned, np.sin(line)))
plt.figure(figsize=(6, 5))
plt.plot(binsrange, pred, c="orange", label="test")
plt.plot(binsrange, score, c="k", label="full data")
plt.plot(binsrange,
         score + np.array(var) * 0.5,
         c="red",
         linestyle="--",
         label="var")
plt.plot(binsrange, score - np.array(var) * 0.5, c="red", linestyle="--")
plt.legend()
plt.show()
Example #23
# Plot the learning curve over the training-set size
train_size, train_scores, test_scores = learning_curve(xgbr, X=X, y=y, cv=cv, random_state=12)
sns.lineplot(train_size, train_scores.mean(axis=1), marker='o', color='red')
sns.lineplot(train_size, test_scores.mean(axis=1), marker='o', color='green')


# Plot the learning curve over the number of base estimators
x_axis = range(10, 1010, 50)
cv = KFold(n_splits=5, shuffle=True, random_state=45)
v_bias = []
v_vars = []
v_error = []
for i in x_axis:
    xgbr = XGBRegressor(n_estimators=i)
    score = CVS(xgbr, X, y, cv=cv)
    v_bias.append(score.mean())
    v_vars.append(score.var())
    v_error.append((1 - score.mean()) ** 2 + score.var())

print(max(v_bias), x_axis[v_bias.index(max(v_bias))], v_error[v_bias.index(max(v_bias))])

print(min(v_vars), x_axis[v_vars.index(min(v_vars))], v_error[v_vars.index(min(v_vars))])

print(min(v_error), x_axis[v_error.index(min(v_error))], v_bias[v_error.index(min(v_error))])

# Plot the bias/variance learning curves
sns.lineplot(x_axis, v_bias, marker='o', color='red')
sns.lineplot(x_axis, np.array(v_bias) - np.array(v_vars), marker='+', color='green')
sns.lineplot(x_axis, np.array(v_bias) + np.array(v_vars), marker='+', color='green')