def build_model(X_train, y_train, X_valid, y_valid):
    best_params = {
        'base_score': 2,
        'colsample_bylevel': 0.75,
        'colsample_bynode': 0.57,
        'colsample_bytree': 0.95,
        'gamma': 0.25,
        'learning_rate': 1.7,
        'max_depth': 18,
        'min_child_weight': 0.025,
        'n_estimators': 353,
        'n_jobs': -1,
        'num_class': 3,
        'num_parallel_tree': 105,
        'objective': 'multi:softmax',
        'random_state': 42,
        'subsample': 0.8,
        'verbosity': 0,
        'reg_alpha': 0.05,
        'reg_lambda': 1,
        'rate_drop': 0.5  # DART-booster parameter; it has no effect on the random-forest booster
    }
    best_xgb = XGBRFClassifier(**best_params)
    # print_evaluation and early_stop are the legacy callbacks from xgboost.callback
    # (deprecated in newer xgboost releases).
    best_xgb.fit(X_train, y_train,
                 eval_set=[(X_train, y_train), (X_valid, y_valid)],
                 eval_metric=['merror'],
                 early_stopping_rounds=50,
                 callbacks=[print_evaluation(period=5),
                            early_stop(stopping_rounds=15)],
                 verbose=False)
    return best_xgb
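# A minimal usage sketch for build_model, assuming the legacy xgboost callback API
# (from xgboost import XGBRFClassifier; from xgboost.callback import print_evaluation, early_stop).
# The iris data and the split below are illustrative assumptions, not part of the original code.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
model = build_model(X_train, y_train, X_valid, y_valid)
print("validation accuracy:", model.score(X_valid, y_valid))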
def test_xg_XGBRFClassifier():
    print("Testing xgboost, XGBRFClassifier...")
    # Note: only works with binary outcomes!
    mod = XGBRFClassifier()
    X, y = iris_data
    ybin = np.where(y <= 1, 0, 1)
    mod.fit(X, ybin)
    docs = {'name': "XGBRFClassifier test"}
    fv = X[0, :]
    upload(mod, fv, docs)
def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist', 'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover',
                             'total_gain', 'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Shared interface for the XGBoost and sklearn gradient-boosting tree classifiers."""
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### XGBoost
    is_xgboost = False
    if framework == 'sklearn':
        XGB = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as XGB
            is_xgboost = True
        except ImportError:
            warn('Run `pip install xgboost` to get a significantly '
                 'faster GradientBoostingTree')
            XGB = GradientBoostingClassifier
    ### fine-tune the keywords for sklearn
    if not is_xgboost:
        org = dict(kw)
        spec = inspect.getfullargspec(XGB.__init__)
        kw = dict()
        for k in spec.args + spec.kwonlyargs:
            if k in org:
                kw[k] = org[k]
    ### training
    tree = XGB(**kw)
    tree.fit(X, y)
    return tree
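# A minimal call sketch for fast_gbtree_classifier, assuming the Objectives type alias and
# the supporting imports (numpy, inspect, warnings.warn, typing, sklearn) are defined
# alongside the function. The breast-cancer data is illustrative only.
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)

# framework='auto' prefers the xgboost backend when it is installed and otherwise falls back
# to sklearn's GradientBoostingClassifier with the incompatible keywords filtered out.
clf = fast_gbtree_classifier(X, y, n_estimators=200, max_depth=3, framework='auto')
print(type(clf).__name__, clf.score(X, y))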
def xgrfboost_classification(train, target, n_estimators=100, max_depth=8,
                             random_state=17, learning_rate=0.1,
                             colsample_bytree=0.9, colsample_bynode=0.9,
                             colsample_bylevel=0.9, importance_type='split',
                             reg_alpha=2, reg_lambda=2):
    '''XGRFBoost Classification

    Params:
    train - training set to fit on
    target - target values to predict
    n_estimators - number of trees (default 100)
    max_depth - maximum depth a tree can grow to (default 8)
    random_state - an arbitrary number so the same results are reproduced on different
                   machines with the same params (default 17)
    learning_rate - size of the step taken towards the local minimum
    colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the features to use
                   per tree, per node, and per level
    importance_type - metric used to rank features (default 'split')
    reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''
    from xgboost import XGBRFClassifier
    model = XGBRFClassifier(n_estimators=n_estimators, max_depth=max_depth,
                            random_state=random_state, learning_rate=learning_rate,
                            colsample_bytree=colsample_bytree,
                            colsample_bynode=colsample_bynode,
                            colsample_bylevel=colsample_bylevel,
                            importance_type=importance_type,
                            reg_alpha=reg_alpha, reg_lambda=reg_lambda)
    model.fit(train, target)
    print("Training Completed .....")
    return model
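# A minimal usage sketch for xgrfboost_classification; the iris data, the split, and the
# accuracy check below are illustrative assumptions, not part of the original helper.
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
x_tr, x_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=17)
clf = xgrfboost_classification(x_tr, y_tr, n_estimators=200, max_depth=6)
print("test accuracy:", clf.score(x_te, y_te))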
# These are about the only parameters that need to be adjusted.
n_estimators = 1000      # the number of trees in the forest
learning_rate = 1        # learning rate
colsample_bytree = None  # feature fraction per tree / in practice 0.6-0.9 (or 1) is used
colsample_bylevel = 0.9  # [default: 1] subsample and colsample_bytree already control how many
                         # observations and features each tree uses, so it is debatable whether
                         # additionally setting colsample_bylevel adds much
max_depth = 29           # [default: 6] used to prevent overfitting; pick a value via CV, usually 3-10
n_jobs = -1              # use CV

# XGB is very fast, and missing values do not have to be removed during preprocessing.
model = XGBRFClassifier(max_depth=max_depth, learning_rate=learning_rate,
                        n_estimators=n_estimators, colsample_bylevel=colsample_bylevel,
                        colsample_bytree=colsample_bytree)
model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score is the evaluation step
print('score :', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()

# XGBRFClassifier score : 0.9666666666666667
# XGBClassifier  score : 0.8666666666666667
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)
model.fit(x_train, y_train, verbose=True, eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)])
# available eval metrics: rmse, mae, logloss, error, auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))
import joblib
## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    shuffle=True, random_state=66)

## modeling
model = XGBRFClassifier(n_estimators=300,   # number of rounds reported by verbose, comparable to epochs
                        learning_rate=0.1)

model.fit(x_train, y_train, verbose=True,
          eval_metric=['error', 'auc'],
          eval_set=[(x_train, y_train), (x_test, y_test)])
#         early_stopping_rounds=100)
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8),
# auc (area under the curve; closely related to accuracy)

results = model.evals_result()
print("eval's result : ", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
# print("r2 Score : %.2f%%" % (r2 * 100))
print("acc : ", acc)

thresholds = np.sort(model.feature_importances_)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
print(f"\nBefore Standard Scaler, x.head() :- \n{x.head()}")
x = sc.fit_transform(x)
print(f"\nAfter Standard Scaler, x :- \n{x}")

from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from xgboost import XGBRFClassifier
xgboost = XGBRFClassifier()
xgboost.fit(x_train, y_train)
y_pred = xgboost.predict(x_test)
print(f"xgboost.score( x_test, y_test ) = {xgboost.score(x_test, y_test) * 100}%")

import matplotlib.pyplot as plt
plt.plot(x_test, y_test, label='Actual', marker='*', color='blue', linestyle='')
plt.plot(x_test,
print(thresholds)

for thresh in thresholds:
    # Drop the least important columns one threshold at a time.
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)
    print(selection_x_train.shape)

    selection_model = XGBRFClassifier(objective="multi:softprob", n_jobs=-1)
    selection_model.fit(selection_x_train, y_train,
                        eval_metric=['merror', 'mlogloss'],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)])

    y_pred = selection_model.predict(selection_x_test)
    acc = accuracy_score(y_test, y_pred)
    # print("R2:", r2)

    for i in thresholds:
        pickle.dump(model,
                    open("./model/sample/xgb_save/iris.pickle{}.dat".format(
                        selection_x_train.shape[1]), "wb"))

    print("Thresh=%.3f, n=%d, acc: %.2f%%" %
          (thresh, selection_x_train.shape[1], acc * 100.0))
def for_model_input(d, test=False):
    if test:
        y = None
        X = d.values
    else:
        y = np.ravel(d[['Survived']].values)
        X = d.drop(columns=['Survived']).values
    X = preprocessing.scale(X)
    return (X, y)


(Xtrain, ytrain) = for_model_input(trainset)
knn_imputer = KNNImputer()
Xtrain = knn_imputer.fit_transform(Xtrain)

boosted_model = XGBRFClassifier()
boosted_model.fit(Xtrain, ytrain)
boosted_scores = cross_val_score(boosted_model, Xtrain, ytrain, cv=5)
print("Gradient-Boosting Model CV scores:\n", boosted_scores, np.mean(boosted_scores))

(Xtest, _) = for_model_input(testset, test=True)
Xtest = knn_imputer.fit_transform(Xtest)
predictions_boosted = boosted_model.predict(Xtest)  # + 1) / 2
predictions_boosted = predictions_boosted.astype('int64')

pred_boosted_df = pandas.DataFrame(predictions_boosted, columns=['Survived'])
fin_ans_boosted = pandas.DataFrame(testset['PassengerId']).join(pred_boosted_df)
with open('predictions_xgboost_rf.csv', 'w') as f:
    f.write(fin_ans_boosted.to_csv(index=False))
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRFClassifier

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

# Apply a Yeo-Johnson power transform to the two skewed clinical columns.
t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)

X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)

# Persist the fitted model and reload it to confirm the round trip.
pickle.dump(xrclf, open('xrclf.pkl', 'wb'))
clf = pickle.load(open('xrclf.pkl', 'rb'))
print(clf.score(x_test, y_test))