Example #1
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_diabetes
from xgboost import XGBRFRegressor

# 1. Data -- the snippet starts mid-script; loading the diabetes dataset
# here is an assumption made to keep the example runnable.
datasets = load_diabetes()
x = datasets.data
y = datasets.target
print("init x.shape:", x.shape)

# 1.1 Data preprocessing (train_test_split)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=44,
                                                    shuffle=True,
                                                    test_size=0.2)

# 2. Model (XGBRFRegressor)
model = XGBRFRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluation -- for a regressor, score() returns R^2, not accuracy
acc = model.score(x_test, y_test)
print("acc:", acc)
print(model.feature_importances_)


# Helper that keeps only the features whose importance is >= low_value
def earseLowFI_index(fi_arr, low_value, input_arr):
    input_arr = input_arr.T  # transpose so rows index features
    temp = []
    for i in range(fi_arr.shape[0]):
        if fi_arr[i] >= low_value:  # keep sufficiently important features
            temp.append(input_arr[i, :])
    temp = np.array(temp)
    temp = temp.T  # transpose back to (samples, features)
    return temp
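
# Usage sketch (not in the original; the 0.05 cutoff is an assumed value):
# prune low-importance features with the helper above and retrain.
x_train2 = earseLowFI_index(model.feature_importances_, 0.05, x_train)
x_test2 = earseLowFI_index(model.feature_importances_, 0.05, x_test)
model2 = XGBRFRegressor(max_depth=4)
model2.fit(x_train2, y_train)
print("R^2 after pruning:", model2.score(x_test2, y_test))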

Example #2
from xgboost import XGBClassifier, XGBRFRegressor
from sklearn.model_selection import train_test_split  # missing in the original
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes

x, y = load_diabetes(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    random_state=42,
                                                    shuffle=True,
                                                    train_size=0.8)

# Baseline model to record the default score
model1 = XGBRFRegressor()
model1.fit(x_train, y_train)

default_score = model1.score(x_test, y_test)

# Second model, used to inspect the feature importances
model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

# Threshold at the 70th percentile of the descending-sorted importances:
# features below this value are in the bottom ~30% and get flagged.
index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

delete_list = []
for i in model.feature_importances_:
    if i < index7:
        print(i, "제거 ")
        delete_list.append(model.feature_importances_.tolist().index(i))

# print(delete_list)
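
# Sketch (not in the original): drop the flagged columns with np.delete,
# retrain, and compare against default_score computed above.
x_train_sel = np.delete(x_train, delete_list, axis=1)
x_test_sel = np.delete(x_test, delete_list, axis=1)
model2 = XGBRFRegressor()
model2.fit(x_train_sel, y_train)
print("default R^2:", default_score,
      "pruned R^2:", model2.score(x_test_sel, y_test))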
Example #3
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, plot_importance
import matplotlib.pyplot as plt

# x, y = ...  (the data-loading lines are cut off in the original snippet)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    train_size=0.8,
                                                    shuffle=True,
                                                    random_state=66)

# These are roughly the only hyperparameters you need to tune
n_estimators = 1000  # the number of trees in the forest
learning_rate = 1  # learning rate
colsample_bytree = None  # column subsample per tree; in practice 0.6-0.9 (or 1) is used
colsample_bylevel = 0.9  # [default: 1]: subsample and colsample_bytree already limit how many observations and features each tree sees, so it is debatable whether additionally setting colsample_bylevel adds much.
max_depth = 29  # [default: 6]: used to curb overfitting; pick it via CV, typically a value between 3 and 10.
n_jobs = -1

# Use CV to pick these values
# XGBoost is very fast, and it handles missing values, so imputation is optional

model = XGBRFRegressor(max_depth=max_depth,
                       learning_rate=learning_rate,
                       n_estimators=n_estimators,
                       colsample_bylevel=colsample_bylevel,
                       colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score() is the evaluation step (R^2)
print('score:', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()
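
# The comments above recommend CV for picking max_depth etc.; a minimal
# GridSearchCV sketch (the grid values below are assumptions, not from the original):
from sklearn.model_selection import GridSearchCV
param_grid = {'max_depth': [3, 6, 10],
              'learning_rate': [0.1, 0.5, 1.0]}
search = GridSearchCV(XGBRFRegressor(n_jobs=-1), param_grid, cv=5)
search.fit(x_train, y_train)
print('best params:', search.best_params_, 'best CV R^2:', search.best_score_)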

Example #4
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split  # missing in the original
from sklearn.datasets import load_boston  # missing in the original (removed in scikit-learn >= 1.2)
from xgboost import XGBRFRegressor  # missing in the original
import numpy as np
import pandas as pd

x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    shuffle=True,
                                                    train_size=0.8,
                                                    random_state=66)

model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

thresholds = np.sort(model.feature_importances_)  # sort the feature importances ascending
print(thresholds)

for thresh in thresholds:
    # Each pass raises the importance threshold, removing one more feature
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)  # train set with the reduced feature set

    selection_model = XGBRFRegressor(n_jobs=-1)  # build the model
    selection_model.fit(select_x_train, y_train)  # fit it

    select_x_test = selection.transform(x_test)  # test set with the reduced feature set
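
    # The original snippet is cut off here; a typical completion (assumed):
    score = selection_model.score(select_x_test, y_test)
    print('thresh=%.3f, n=%d, R2: %.4f'
          % (thresh, select_x_train.shape[1], score))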
Example #5
# The original snippet is cut off above; it assumes something like:
#   import numpy as np
#   import matplotlib.pyplot as plt
#   from xgboost import XGBRegressor as XGBR
#   from sklearn.model_selection import cross_val_score as CVS, KFold
#   X, y = ...; cv = KFold(n_splits=5, shuffle=True)
# Start tuning: plot a learning curve over n_estimators to find the optimum
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    xgbr = XGBR(n_estimators=i, random_state=30)
    cvresult = CVS(xgbr, X, y, cv=cv)
    # record the mean CV R^2 (read as 1 - bias)
    rs.append(cvresult.mean())
    # record the variance
    var.append(cvresult.var())
    # record the generalization error: bias^2 + variance
    ge.append((1 - cvresult.mean()) ** 2 + cvresult.var())
# Print the parameter value with the highest R^2, plus the variance at that point
print(axisx[rs.index(max(rs))], max(rs), var[rs.index(max(rs))])
# Print the parameter value with the lowest variance, plus the R^2 at that point
print(axisx[var.index(min(var))], min(var), rs[var.index(min(var))])
# Print the parameter value with the lowest generalization error
print(axisx[np.argmin(ge)], rs[ge.index(min(ge))], var[ge.index(min(ge))])
plt.plot(axisx, rs, color="r", label="XGBR")
plt.legend()
plt.show()

# Observation: the generalization error is smallest at n_estimators = 150

# Verify
xgbr = XGBR(n_estimators=100, random_state=420).fit(x_train, y_train)
print(xgbr.score(x_test, y_test))
xgbr = XGBR(n_estimators=150, random_state=420).fit(x_train, y_train)
print(xgbr.score(x_test, y_test))
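
# Optional sketch (not in the original): the variance and generalization-error
# curves computed above can be plotted the same way as the R^2 curve.
plt.plot(axisx, var, color="g", label="variance")
plt.plot(axisx, ge, color="b", label="generalization error")
plt.legend()
plt.show()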
Example #6
# print(y2_pred)
# print(y2_pred.shape)

# y3_pred = model3.predict(test)
# print(y3_pred)
# print(y3_pred.shape)

# y4_pred = model4.predict(test)
# print(y4_pred)
# print(y4_pred.shape)



# acc1 = model1.score(x_test, y1_test)
# acc2 = model2.score(x_test, y2_test)
acc3 = model3.score(x_test, y3_test)  # model3 and y3_test come from the truncated part of this snippet
# acc4 = model4.score(x_test, y4_test)

import warnings  # missing in the original
warnings.filterwarnings('ignore')
# print(acc1)
# print(acc2)
print(acc3)
# print(acc4)

# print("최적의 매개 변수 :  ", model.best_params_)
warnings.filterwarnings('ignore')
# thresholds = np.sort(model.feature_importances_)

# print(thresholds)

# for thresh in thresholds:  # drop the unimportant columns one at a time
Example #7
# xgb, predictions, metrics, etc. are defined in the truncated part of this snippet
print('R^2-Coefficient of Determination value', xgb.score(X_test, y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()


Example #8
from sklearn import metrics
from sklearn.metrics import r2_score  # used below; missing in the original

# scores, rmse, mse, mae are accumulator lists defined in the truncated part
# of this snippet; initialized here so the block runs standalone (assumption)
scores, rmse, mse, mae = [], [], [], []

xgbrfr = XGBRFRegressor(random_state=133).fit(X_train, y_train)
predictions = xgbrfr.predict(X_test)
scores.append(xgbrfr.score(X_test, y_test))
rmse.append(np.sqrt(metrics.mean_squared_error(y_test, predictions)))
mse.append(metrics.mean_squared_error(y_test, predictions))
mae.append(metrics.mean_absolute_error(y_test, predictions))
print('R^2-Coefficient of Determination value', xgbrfr.score(X_test, y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()
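
# The accumulator lists above suggest this block originally ran inside a loop,
# e.g. over random seeds; a minimal sketch of such an outer loop (assumption):
for seed in (1, 7, 42, 133):
    m = XGBRFRegressor(random_state=seed).fit(X_train, y_train)
    scores.append(m.score(X_test, y_test))
print('mean R^2 over seeds:', np.mean(scores))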