Example No. 1
# NOTE: this snippet relies on the legacy xgboost callback API
# (print_evaluation / early_stop were removed in xgboost 1.6)
from xgboost import XGBRFClassifier
from xgboost.callback import print_evaluation, early_stop


def build_model(X_train, y_train, X_valid, y_valid):
    best_params = {
        'base_score': 2,
        'colsample_bylevel': 0.75,
        'colsample_bynode': 0.57,
        'colsample_bytree': 0.95,
        'gamma': 0.25,
        'learning_rate': 1.7,
        'max_depth': 18,
        'min_child_weight': 0.025,
        'n_estimators': 353,
        'n_jobs': -1,
        'num_class': 3,
        'num_parallel_tree': 105,
        'objective': 'multi:softmax',
        'random_state': 42,
        'subsample': 0.8,
        'verbosity': 0,
        'reg_alpha': 0.05,
        'reg_lambda': 1,
        'rate_drop': 0.5  # dart-only parameter; ignored by the gbtree-based random forest
    }
    best_xgb = XGBRFClassifier(**best_params)

    best_xgb.fit(X_train, y_train,
                 eval_set=[(X_train, y_train),
                           (X_valid, y_valid)],
                 eval_metric=['merror'],
                 early_stopping_rounds=50,
                 # note: the early_stop() callback duplicates early_stopping_rounds;
                 # whichever limit triggers first stops training
                 callbacks=[print_evaluation(period=5),
                            early_stop(stopping_rounds=15)],
                 verbose=False,)
    return best_xgb
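
A minimal usage sketch for build_model; the data below is an assumption (any labeled 3-class dataset works, since the parameter dictionary sets num_class=3), and it requires an xgboost version old enough for the legacy callback API:

from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split

# load_wine is just a convenient 3-class stand-in for the original data
X, y = load_wine(return_X_y=True)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

model = build_model(X_train, y_train, X_valid, y_valid)
print(model.predict(X_valid)[:10])  # multi:softmax returns class labels directly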
Example No. 2
import numpy as np
from xgboost import XGBRFClassifier


def test_xg_XGBRFClassifier():
    print("Testing xgboost, XGBRFClassifier...")
    # Note, only works with binary outcomes!
    mod = XGBRFClassifier()
    X, y = iris_data  # iris_data: an (X, y) tuple provided elsewhere in the test module
    ybin = np.where(y <= 1, 0, 1)
    mod.fit(X, ybin)
    docs = {'name': "XGBRFClassifier test"}
    fv = X[0, :]
    upload(mod, fv, docs)  # upload() is a helper from the surrounding project
Example No. 3
import inspect
from typing import Any, Literal, Optional
from warnings import warn

import numpy as np
from sklearn.ensemble import GradientBoostingClassifier

# 'Objectives' is assumed to be a project-specific alias for the xgboost objective strings
Objectives = str


def fast_gbtree_classifier(
    X,
    y,
    *,
    learning_rate: float = 1.0,
    n_estimators: int = 100,
    subsample: float = 0.8,
    max_depth: Optional[int] = None,
    reg_alpha: Optional[float] = None,  # L1
    reg_lambda: Optional[float] = 1e-05,  # L2
    gamma: Optional[float] = None,
    missing: Optional[Any] = np.nan,
    objective: Objectives = 'binary:logistic',
    grow_policy: Literal['depthwise', 'lossguide'] = 'depthwise',
    tree_method: Literal['auto', 'exact', 'approx', 'hist',
                         'gpu_hist'] = 'auto',
    importance_type: Literal['gain', 'weight', 'cover', 'total_gain',
                             'total_cover'] = 'gain',
    random_state: int = 1,
    n_jobs: Optional[int] = None,
    framework: Literal['auto', 'xgboost', 'sklearn'] = 'auto',
    **kwargs,
) -> GradientBoostingClassifier:
    """Shared interface for XGBoost and sklearn Gradient Boosting Tree Classifier"""
    kw = dict(locals())
    kwargs = kw.pop('kwargs')
    X = kw.pop('X')
    y = kw.pop('y')
    kw.update(kwargs)
    framework = kw.pop('framework')
    ### XGBOOST
    is_xgboost = False
    if framework == 'sklearn':
        XGB = GradientBoostingClassifier
    else:
        try:
            from xgboost import XGBRFClassifier as XGB
            is_xgboost = True
        except ImportError as e:
            warn('Run `pip install xgboost` to get a significantly '
                 'faster GradientBoostingTree')
            XGB = GradientBoostingClassifier
    ### fine-tune the keywords for sklearn
    if not is_xgboost:
        org = dict(kw)
        spec = inspect.getfullargspec(XGB.__init__)
        kw = dict()
        for k in spec.args + spec.kwonlyargs:
            if k in org:
                kw[k] = org[k]
    ### training
    tree = XGB(**kw)
    tree.fit(X, y)
    return tree
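
A quick usage sketch for fast_gbtree_classifier, assuming scikit-learn's breast-cancer data; when xgboost is not installed the call falls back (with a warning) to sklearn's GradientBoostingClassifier:

from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
clf = fast_gbtree_classifier(X, y, n_estimators=200, random_state=0)
print(type(clf).__name__, clf.score(X, y))  # training accuracy, just as a sanity check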
Example No. 4
    def xgrfboost_classification(train,
                                 target,
                                 n_estimators=100,
                                 max_depth=8,
                                 random_state=17,
                                 learning_rate=0.1,
                                 colsample_bytree=0.9,
                                 colsample_bynode=0.9,
                                 colsample_bylevel=0.9,
                                 importance_type='split',
                                 reg_alpha=2,
                                 reg_lambda=2):
        '''XGRFBoost Classification
           Params :-
           train - training features to fit on
           target - target values to predict
           n_estimators - number of trees in the forest (default 100)
           max_depth - maximum depth a tree can grow to (default 8)
           random_state - an arbitrary seed so the same results are reproduced on different machines with the same params (default 17)
           learning_rate - step size taken towards the local minimum
           colsample_bytree, colsample_bynode, colsample_bylevel - fraction of the features to use per tree, per node and per level
           importance_type - how feature importance is computed (default 'split'; note that xgboost itself expects 'gain', 'weight', 'cover', 'total_gain' or 'total_cover')
           reg_alpha, reg_lambda - L1 and L2 regularisation respectively'''

        from xgboost import XGBRFClassifier
        model = XGBRFClassifier(n_estimators=n_estimators,
                                max_depth=max_depth,
                                random_state=random_state,
                                learning_rate=learning_rate,
                                colsample_bytree=colsample_bytree,
                                colsample_bynode=colsample_bynode,
                                colsample_bylevel=colsample_bylevel,
                                importance_type=importance_type,
                                reg_alpha=reg_alpha,
                                reg_lambda=reg_lambda)
        model.fit(train, target)
        print("Training Completed .....")

        return model
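
A short, hedged usage example for the helper above, assuming it is available at module level (the wine dataset is a placeholder, not the original author's data):

from sklearn.datasets import load_wine

features, labels = load_wine(return_X_y=True)
clf = xgrfboost_classification(features, labels, n_estimators=200, max_depth=6)
print(clf.predict(features[:5]))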
Example No. 5
# Tweaking just these parameters is usually enough
n_estimators = 1000  # The number of trees in the forest.
learning_rate = 1  # learning rate
colsample_bytree = None  # column fraction per tree / in practice 0.6-0.9 is used, or simply 1
colsample_bylevel = 0.9  # [default: 1]: subsample and colsample_bytree already control how many observations and features each tree uses, so it is questionable whether additionally setting colsample_bylevel adds much.
max_depth = 29  # [default: 6]: used to prevent overfitting; an appropriate value should be found via CV, usually between 3 and 10.
n_jobs = -1

# Use cross-validation
# XGBoost is very fast, and missing values do not have to be removed during preprocessing

model = XGBRFClassifier(max_depth=max_depth,
                        learning_rate=learning_rate,
                        n_estimators=n_estimators,
                        colsample_bylevel=colsample_bylevel,
                        colsample_bytree=colsample_bytree)

model.fit(x_train, y_train)

score = model.score(x_test, y_test)  # score() acts as evaluate
print('score :', score)

# print(model.feature_importances_)
plot_importance(model)
# plt.show()

# XGBRFClassifier score : 0.9666666666666667

# XGBClassifier score : 0.8666666666666667
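
The snippet above assumes x_train/x_test and the plotting helpers are defined elsewhere; a minimal setup that would make it self-contained might look like this (the iris split is an assumption, not the original data):

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from xgboost import XGBRFClassifier, plot_importance

x, y = load_iris(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8, random_state=42)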
Example No. 6
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import accuracy_score, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRFRegressor, XGBRFClassifier

#
x, y = load_breast_cancer(return_X_y=True)

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.8)

model = XGBRFClassifier(n_estimators=1000, learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric="error",
          eval_set=[(x_train, y_train), (x_test, y_test)])

#rmse,mae,logloss,error,auc

results = model.evals_result()
print("eval:", results)

y_pred = model.predict(x_test)
acc = accuracy_score(y_test, y_pred)
print("acc:", acc)

# import pickle
# pickle.dump(model, open("./model/sample/xgb_save/cancer.pickle.dat", "wb"))

import joblib
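
The example stops right after importing joblib; a plausible continuation, sketched here purely as an assumption (the file path is invented), would persist the fitted model and reload it:

joblib.dump(model, "./model/sample/xgb_save/cancer.joblib.dat")
loaded = joblib.load("./model/sample/xgb_save/cancer.joblib.dat")
print("reloaded acc:", accuracy_score(y_test, loaded.predict(x_test)))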
Example No. 7
## train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

## Modeling
model = XGBRFClassifier(
    n_estimators=300,  # number of trees; one verbose line per round, similar to epochs
    learning_rate=0.1)

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=['error', 'auc'],
          eval_set=[(x_train, y_train), (x_test, y_test)])
#   early_stopping_rounds = 100)
# eval_metric options: rmse, mae, logloss, error (an error of 0.2 means an accuracy of 0.8), auc (a close relative of accuracy/precision)

results = model.evals_result()
print("eval's result : ", results)

y_pred = model.predict(x_test)

acc = accuracy_score(y_test, y_pred)
# print("r2 Score : %.2f%%" %(r2 * 100))
print("acc : ", acc)

thresholds = np.sort(model.feature_importances_)
Example No. 8
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

print(f"\nBefore Standard Scaler, x.head() :- \n{ x.head() }")
x = sc.fit_transform(x)
print(f"\nAfter Standard Scaler, x :- \n{ x }")

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

from xgboost import XGBRFClassifier
xgboost = XGBRFClassifier()

xgboost.fit(x_train, y_train)
y_pred = xgboost.predict(x_test)

print(
    f"xgboost.score( x_test, y_test ) = { xgboost.score( x_test, y_test ) * 100 }%"
)

import matplotlib.pyplot as plt

plt.plot(x_test,
         y_test,
         label='Actual',
         marker='*',
         color='blue',
         linestyle='')
plt.plot(x_test,
Example No. 9
print(thresholds)

for thresh in thresholds:  # drop the least important columns one at a time
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)

    print(selection_x_train.shape)

    selection_model = XGBRFClassifier(objective="multi:softprob", n_jobs=-1)

    selection_model.fit(selection_x_train,
                        y_train,
                        eval_metric=['merror', 'mlogloss'],
                        eval_set=[(selection_x_train, y_train),
                                  (selection_x_test, y_test)])

    y_pred = selection_model.predict(selection_x_test)

    acc = accuracy_score(y_test, y_pred)
    #print("R2:",r2)
    # save the model once per threshold, keyed by the number of selected features
    pickle.dump(
        model,
        open(
            "./model/sample/xgb_save/iris.pickle{}.dat".format(
                selection_x_train.shape[1]), "wb"))
    print("Thresh=%.3f, n=%d, acc: %.2f%%" %
          (thresh, selection_x_train.shape[1], acc * 100.0))
Example No. 10
import numpy as np
import pandas
from sklearn import preprocessing
from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_val_score
from xgboost import XGBRFClassifier


def for_model_input(d, test=False):
    # signature inferred from the calls below; earlier preprocessing steps are omitted in this excerpt
    if test:
        y = None
        X = d.values
    else:
        y = np.ravel(d[['Survived']].values)
        X = d.drop(columns=['Survived']).values
    X = preprocessing.scale(X)
    return (X, y)


(Xtrain, ytrain) = for_model_input(trainset)  # trainset/testset: DataFrames assumed to be loaded elsewhere
knn_imputer = KNNImputer()
Xtrain = knn_imputer.fit_transform(Xtrain)

boosted_model = XGBRFClassifier()
boosted_model.fit(Xtrain, ytrain)
boosted_scores = cross_val_score(boosted_model, Xtrain, ytrain, cv=5)

print("Gradient-Boosting Model CV scores:\n", boosted_scores,
      np.mean(boosted_scores))

(Xtest, _) = for_model_input(testset, test=True)
Xtest = knn_imputer.transform(Xtest)  # reuse the imputer fitted on the training data
predictions_boosted = boosted_model.predict(Xtest)  # + 1) / 2
predictions_boosted = predictions_boosted.astype('int64')
pred_boosted_df = pandas.DataFrame(predictions_boosted, columns=['Survived'])
fin_ans_boosted = pandas.DataFrame(
    testset['PassengerId']).join(pred_boosted_df)
with open('predictions_xgboost_rf.csv', 'w') as f:
    f.write((fin_ans_boosted.to_csv(index=False)))
Example No. 11
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from xgboost import XGBRFClassifier

df = pd.read_csv("heart_failure_clinical_records_dataset.csv")

t = np.array(list(df['creatinine_phosphokinase'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
creatinine_phosphokinase = pt.fit_transform(t)
df['creatinine_phosphokinase'] = creatinine_phosphokinase

t = np.array(list(df['serum_creatinine'])).reshape(-1, 1)
pt = PowerTransformer(method="yeo-johnson")
serum_creatinine = pt.fit_transform(t)
df['serum_creatinine'] = serum_creatinine

df.drop(columns=['sex', 'diabetes'], inplace=True)
X = df.iloc[:, 0:10].values
Y = df['DEATH_EVENT'].values

x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=6)

xrclf = XGBRFClassifier()
xrclf.fit(x_train, y_train)

pickle.dump(xrclf, open('xrclf.pkl', 'wb'))

clf = pickle.load(open('xrclf.pkl', 'rb'))
print(clf.score(x_test, y_test))