y = datasets.target
print("init x.shape:", x.shape)

# 1.1 Data preprocessing (train_test_split): hold out 20% of the rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=44, shuffle=True, test_size=0.2)

# 2. Model (XGBRFRegressor)
model = XGBRFRegressor(max_depth=4)
model.fit(x_train, y_train)

# 4. Evaluation
# NOTE(review): for a regressor, score() presumably reports R^2 rather than
# accuracy -- confirm. The "acc" label is runtime output and is left as is.
acc = model.score(x_test, y_test)
print("acc:", acc)
print(model.feature_importances_)


# Helper that cuts away features whose importance is too low.
def earseLowFI_index(fi_arr, low_value, input_arr):
    """Return ``input_arr`` restricted to the sufficiently important features.

    Args:
        fi_arr: 1-D sequence of feature importances; entry ``i`` belongs to
            index ``i`` of the last axis of ``input_arr``.
        low_value: Threshold; a feature is kept when its importance is
            greater than or equal to this value.
        input_arr: Array whose last axis enumerates the features.

    Returns:
        A new array holding only the kept features, in their original order.
        When no feature reaches the threshold the result keeps the leading
        axes and has a zero-length last axis (the previous loop-based version
        collapsed this case to a shape ``(0,)`` array, losing the sample axis).

    Raises:
        IndexError: If ``fi_arr`` selects an index beyond the last axis of
            ``input_arr``.
    """
    # Positions of the features that reach the threshold, in ascending order.
    keep = np.flatnonzero(np.asarray(fi_arr) >= low_value)
    # Integer-array indexing on the last axis copies the selected columns.
    return np.asarray(input_arr)[..., keep]
from xgboost import XGBClassifier, XGBRFRegressor
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_diabetes

x, y = load_diabetes(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, shuffle=True, train_size=0.8)

# Baseline: default-parameter model scored on the held-out split.
model1 = XGBRFRegressor()
model1.fit(x_train, y_train)
default_score = model1.score(x_test, y_test)

# Second default-parameter model whose importances drive the column pruning.
model = XGBRFRegressor()
model.fit(x_train, y_train)
print(model.feature_importances_)

# Threshold: the importance found at position int(0.7 * n_features) of the
# importances sorted in descending order.
index7 = np.sort(model.feature_importances_)[::-1][int(
    0.7 * len(model.feature_importances_))]

# Collect the column positions whose importance is below the threshold.
# enumerate() yields each column's own position; looking the value up with
# list.index() returned the first match, so tied importances (for example
# several zero-importance features) all recorded the same column.
delete_list = []
for column, importance in enumerate(model.feature_importances_):
    if importance < index7:
        print(importance, "제거 ")
        delete_list.append(column)
# print(delete_list)
y, train_size=0.8, shuffle=True, random_state=66) # 이 정도만 조작해 주면 됨 n_estimators = 1000 # The number of trees in the forest. learning_rate = 1 # 학습률 colsample_bytree = None # 트리의 샘플 / 실전0.6 ~ 0.9 사이 씀 / 실전 1씀 colsample_bylevel = 0.9 # [기본설정값: 1]: subsample, colsample_bytree 두 초모수 설정을 통해서 이미 의사결정나무 모형 개발에 사용될 변수갯수와 관측점 갯수를 사용했는데 추가로 colsample_bylevel을 지정하는 것이 특별한 의미를 갖는지 의문이 듦. max_depth = 29 # [기본설정값: 6]: 과적합 방지를 위해서 사용되는데 역시 CV를 사용해서 적절한 값이 제시되어야 하고 보통 3-10 사이 값이 적용된다. n_jobs = -1 # CV 써라 # XGB 속도가 굉장히 빠름, 전처리 결측치 제거 안해줘도 됨 model = XGBRFRegressor(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, colsample_bylevel=colsample_bylevel, colsample_bytree=colsample_bytree) model.fit(x_train, y_train) score = model.score(x_test, y_test) # score는 evaluate print('점수 :', score) # print(model.feature_importances_) plot_importance(model) # plt.show()
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd

x, y = load_boston(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, shuffle=True, train_size=0.8, random_state=66
)

# Reference model trained on every feature.
model = XGBRFRegressor(n_jobs=-1)
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
print('R2', score)

# Every importance value, in ascending order, is tried as a selection threshold.
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    # Selector built on the already-fitted model (prefit=True); each pass uses
    # the next, higher threshold (original note: removes features one by one).
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    # Reduced-feature versions of the train and test matrices.
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    # Fresh model fitted on the reduced training matrix.
    selection_model = XGBRFRegressor(n_jobs=-1)
    selection_model.fit(select_x_train, y_train)
# Start tuning: draw the learning curve over n_estimators to see where the
# model performs best.
axisx = range(100, 300, 10)
rs = []
var = []
ge = []
for i in axisx:
    xgbr = XGBR(n_estimators=i, random_state=30)
    cvresult = CVS(xgbr, X, y, cv=cv)
    fold_mean = cvresult.mean()
    fold_var = cvresult.var()
    # Record "1 - bias" (the mean cross-validation score).
    rs.append(fold_mean)
    # Record the variance of the cross-validation scores.
    var.append(fold_var)
    # Record the generalization error: (1 - mean score)^2 + variance.
    ge.append((1 - fold_mean) ** 2 + fold_var)

# Parameter value with the highest R2, that R2, and the variance at that point.
top_r2 = max(rs)
top_r2_at = rs.index(top_r2)
print(axisx[top_r2_at], top_r2, var[top_r2_at])

# Parameter value with the lowest variance, that variance, and the R2 there.
lowest_var = min(var)
lowest_var_at = var.index(lowest_var)
print(axisx[lowest_var_at], lowest_var, rs[lowest_var_at])

# Parameter value with the lowest generalization error, plus R2 and variance.
lowest_ge_at = ge.index(min(ge))
print(axisx[np.argmin(ge)], rs[lowest_ge_at], var[lowest_ge_at])

plt.plot(axisx, rs, color="r", label="XGBR")
plt.legend()
plt.show()

# Original author's observation: the generalization error is smallest when
# n_estimators is 150.
# Verification: compare the hold-out scores for 100 and 150 trees.
for n_trees in (100, 150):
    xgbr = XGBR(n_estimators=n_trees, random_state=420).fit(x_train, y_train)
    print(xgbr.score(x_test, y_test))
# print(y2_pred) # print(y2_pred.shape) # y3_pred = model3.predict(test) # print(y3_pred) # print(y3_pred.shape) # y4_pred = model4.predict(test) # print(y4_pred) # print(y4_pred.shape) # acc1 = model1.score(x_test, y1_test) # acc2 = model2.score(x_test, y2_test) acc3 = model3.score(x_test, y3_test) # acc4 = model4.score(x_test, y4_test) warnings.filterwarnings('ignore') # print(acc1) # print(acc2) print(acc3) # print(acc4) # print("최적의 매개 변수 : ", model.best_params_) warnings.filterwarnings('ignore') # thresholds = np.sort(model.feature_importances_) # print(thresholds) # for thresh in thresholds: #중요하지 않은 컬럼들을 하나씩 지워나간다.
print('R^2-Coefficient of Determination value', xgb.score(X_test, y_test))
print('MAE:', metrics.mean_absolute_error(y_test, predictions))
# The MSE is computed once and reused for the RMSE line.
xgb_mse_value = metrics.mean_squared_error(y_test, predictions)
print('MSE:', xgb_mse_value)
print('RMSE:', np.sqrt(xgb_mse_value))

# Measured-vs-predicted scatter with a dashed reference diagonal spanning the
# range of y.
fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))
plt.show()

from sklearn import metrics

# Same evaluation for the XGBoost random-forest regressor; its results are
# also appended to the running scores / rmse / mse / mae lists.
xgbrfr = XGBRFRegressor(random_state=133).fit(X_train, y_train)
predictions = xgbrfr.predict(X_test)

rf_r2_value = xgbrfr.score(X_test, y_test)
scores.append(rf_r2_value)
rmse.append(np.sqrt(mean_squared_error(y_test, predictions)))
rf_mse_value = metrics.mean_squared_error(y_test, predictions)
mse.append(rf_mse_value)
rf_mae_value = metrics.mean_absolute_error(y_test, predictions)
mae.append(rf_mae_value)

print('R^2-Coefficient of Determination value', rf_r2_value)
print('MAE:', rf_mae_value)
print('MSE:', rf_mse_value)
print('RMSE:', np.sqrt(rf_mse_value))

fig, ax = plt.subplots()
ax.scatter(y_test, predictions)
y_low, y_high = y.min(), y.max()
ax.plot([y_low, y_high], [y_low, y_high], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
ax.set_title('R2: ' + str(r2_score(y_test, predictions)))