def LGBM(num_l, n_est, min_child):
    st.title("Модель прогнозирования - LightGBM")
    if st.checkbox("Показать код"):
        st.code(
            """lgbm = LGBMClassifier( num_leaves=num_l, n_estimators=n_est, min_child_samples=min_child)
    lgbm.fit(X_train,y_train)
    lgbm_pred=lgbm.predict(X_test)
    acc_lgbm=round(lgbm.score(X_train,y_train),10)
    st.text(acc_lgbm)""")

    if st.checkbox("Использовать несбалансированные данные"):
        lgbm = LGBMClassifier(num_leaves=num_l,
                              n_estimators=n_est,
                              min_child_samples=min_child)
        lgbm.fit(X_train, y_train)
        lgbm_pred = lgbm.predict(X_test)
        acc_lgbm = round(lgbm.score(X_train, y_train), 10)
        st.text(acc_lgbm)
    else:
        lgbm = LGBMClassifier(num_leaves=num_l,
                              n_estimators=n_est,
                              min_child_samples=min_child)
        lgbm.fit(X_train_res, y_train_res)
        lgbm_pred = lgbm.predict(X_test0)
        acc_lgbm = round(lgbm.score(X_train0, y_train0), 10)
        st.text(acc_lgbm)
Example #2
def lightGbmModel(X_train, Y_train):
    # use LightGBM
    #!conda install -c conda-forge lightgbm
    from lightgbm import LGBMClassifier
    lightgbm = LGBMClassifier()
    lightgbm.fit(X_train, Y_train)
    print('\nLight GBM Training Score:', lightgbm.score(X_train, Y_train))
    return lightgbm, lightgbm.score(X_train, Y_train)
Example #3
def evaluate_lgbm(trainX, trainy, testX, testy, params):
    sc = StandardScaler()
    trainX = sc.fit_transform(trainX)
    testX = sc.transform(testX)
    model = LGBMClassifier(**params)
    model.fit(trainX, trainy)
    test_acc = model.score(testX, testy)
    pred = model.predict_proba(testX)
    return model, test_acc, pred
def get_lgbm_score(X_train,y_train,X_test,y_test):
    lgbm_default = LGBMClassifier()
    lgbm_cross = LGBMClassifier()
    np.random.seed(200)
    cross_score = np.mean(cross_val_score(lgbm_cross, X_train, y_train, cv=5))

    lgbm_default.fit(X_train, y_train)
    score_lgbm = lgbm_default.score(X_test, y_test)

    neptune.log_metric('lgbm', score_lgbm)
    neptune.log_metric('lgbm_cross_score', cross_score)
    return score_lgbm
Example #5
def Test():
    train = pd.read_csv('./csvfile/cardio.csv')

    train["plus"] = train["smoke"] * train["alco"]

    train["age_year"] = round(train["age_year"], 0).astype(np.int64)
    train["BMI"] = round(
        train["weight"] / (train["height"] * train["height"] / 10000),
        2).astype(np.float64)
    train = train.dropna(axis=0)

    train = train[(train.BMI <= 50) & (train.BMI >= 10)]
    y = train["cardio"]
    print(train.shape)
    print(y.shape)
    train = train.drop(["id", "age_days", "cardio"], 1)
    print(train.shape)

    print(train["BMI"].max())
    print(train["BMI"].min())

    train.loc[(train["ap_hi"] >= 140) & (train["ap_hi"] < 200),
              'ap_hi'] = 3  #high
    train.loc[(train["ap_hi"] < 90) & (train["ap_hi"] >= 60),
              'ap_hi'] = 1  #low
    train.loc[(train["ap_hi"] < 140) & (train["ap_hi"] >= 90),
              'ap_hi'] = 2  #normal

    train = train.drop(["weight", "height", "ap_lo"], 1)
    rf = LGBMClassifier(n_estimators=200,
                        num_leaves=25,
                        colsample_bytree=0.6,
                        subsample=0.6)

    xf_train, xf_test, yf_train, yf_test = train_test_split(train,
                                                            y,
                                                            test_size=0.2,
                                                            random_state=1)

    rf.fit(xf_train, yf_train)

    odd = round(rf.score(xf_test, yf_test) * 100, 1)
    print(odd)

    print(train.shape)
    importances_df = pd.DataFrame(rf.feature_importances_).rename(
        {0: "importances"}, axis=1)
    importances_df["columns"] = xf_train.columns
    importances_df = importances_df.sort_values("importances", ascending=False)
    importances_df["importances"] = (
        importances_df["importances"] /
        importances_df["importances"].values.sum()) * 100
    print(importances_df.head(10))
Example #6
def lgbm_classifier(x_trn: pd.DataFrame, y_trn: np.ndarray,
                    x_val: pd.DataFrame, y_val: np.ndarray) -> tuple:
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()

    model = LGBMClassifier(boosting_type='gbdt',
                           objective='binary',
                           metric='binary_logloss',
                           n_estimators=400,
                           learning_rate=0.05,
                           min_child_samples=16,
                           is_unbalance=True,
                           num_iterations=700,
                           n_jobs=-1,
                           random_state=7)
    _ = model.fit(x_trn, y_trn)

    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)

    clf_report = classification_report(y_val, model.predict(x_val))
    ck_score = cohen_kappa_score(y_val, model.predict(x_val))

    return model, training_score, validation_score, clf_report, ck_score
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Generated model params and results\n params:" +
        str(model.get_params()) + "\nscore " +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Search grid best params and results\n params:" +
        str(grid.best_params_) + "\nscore " + str(grid.best_score_))

    return model, metrics
def filter_LBGM_importance(dataframe, target, threshold=1):
    from lightgbm import LGBMClassifier
    from sklearn import preprocessing
    categorical_feats = dataframe.select_dtypes('object').columns.tolist()

    for col in categorical_feats:
        lb = preprocessing.LabelEncoder()
        lb.fit(list(dataframe[col].values.astype('str')))
        dataframe[col] = lb.transform(list(
            dataframe[col].values.astype('str')))

    valid_size = int(dataframe.shape[0] / 4)
    valid = dataframe.sample(valid_size)
    train = dataframe.drop(valid.index, axis=0)
    train_x = train.drop([target], axis=1)
    train_y = train[target]
    valid_x = valid.drop([target], axis=1)
    valid_y = valid[target]

    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.95,
        subsample=0.9,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.025,
        min_child_weight=40,
        #        importance_type='split',
        silent=-1,
        verbose=-1,
    )

    clf.fit(train_x,
            train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=200)
    #    oof_preds = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

    feats = train_x.columns.tolist()
    importance_df = pd.DataFrame()
    importance_df["feature"] = feats
    importance_df["importance"] = clf.feature_importances_
    importance_df.sort_values('importance', inplace=True, ascending=False)
    less_important_features = importance_df.loc[
        importance_df['importance'] < threshold, 'feature']
    dataframe.drop(less_important_features, axis=1, inplace=True)
    score = clf.score(train_x, train_y)
    trace('filter_LBGM_importance')
    trace(importance_df)
    trace('category features')
    trace(categorical_feats)
    trace('score')
    trace(score)
    trace('drop features')
    trace(less_important_features)
    return dataframe
# The snippet is truncated above; the constructor opening is reconstructed from the
# clf_lgbm usage below.
clf_lgbm = LGBMClassifier(num_leaves=15,
                          colsample_bytree=.8,
                          subsample=.8,
                          max_depth=7,
                          reg_alpha=.1,
                          reg_lambda=.1,
                          min_split_gain=.01)

clf_lgbm.fit(X_train,
             Y_train,
             eval_set=[(X_train, Y_train)],
             eval_metric='auc',
             verbose=0,
             early_stopping_rounds=30)

acc_clf_lgbm = round(clf_lgbm.score(X_train, Y_train) * 100, 2)
acc_clf_lgbm

# In[ ]:

Y_pred = random_forest.predict(X_test)

# In[ ]:

my_submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
})
my_submission.to_csv('new_submission.csv', index=False)

# In[ ]:
Example #10
lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)

lgbm.fit(x_train,
         y_train,
         verbose=True,
         eval_metric=["logloss", "rmse"],
         eval_set=[(x_train, y_train), (x_test, y_test)],
         early_stopping_rounds=20)

#rmse,mae,logloss,error,auc

y_pre = lgbm.predict(x_test)

r2 = r2_score(y_test, y_pre)
score = lgbm.score(x_test, y_test)
print(__file__)
print("r2")
print(r2)
print("score")
print(score)

#6)selectFromModel

thresholds = np.sort(lgbm.feature_importances_)

idx_max = -1
max = r2

for idx, thresh in enumerate(thresholds):
    # data preprocessing
Example #11
# # Train the light GBM
# model = LGBMClassifier()
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, images, labels, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model = LGBMClassifier(objective="binary", class_weight="balanced")

start_time = time.time()

model = model.fit(images, labels)
print("Train Light GBM --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation Light GBM --- %s seconds ---" % (time.time() - start_time))

print("Light GBM scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model, images_validation, labels_validation, cv=5, scoring=('f1','roc_auc_ovo'), return_train_score=True)
print("Cross Validation Light GBM --- %s seconds ---" % (time.time() - start_time))

# note: this repeats the holdout score computed above; the fold-wise CV metrics are in `scores`
cross_score = model.score(images_validation, labels_validation)

print("Light GBM scikit learn holdout score: %0.4f" % cross_score)
print(scores)
Example #12
X_train_scaled = scaler.transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_scaled)
test_x_scaled = scaler.transform(test_x_scaled)

# visualization
import matplotlib.pyplot as plt
plt.hist(X_train_scaled)
plt.title('StandardScaler')
plt.show()

# measure accuracy
acc = LGBM.score(X_test, y_test)
print('acc: ', acc)  # 0.8454961374034351

# prediction
y_pred = LGBM.predict_proba(test_x)
print(y_pred)

# plot feature importances
import numpy as np
import matplotlib.pyplot as plt


def plot_feature_importances_orb(model):
    n_features = train_x.shape[1]
    plt.barh(np.arange(n_features), LGBM.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feat_labels)
Example #13
    # The snippet is truncated above; the constructor opening is reconstructed from the
    # `model` usage below (compare the full constructor in Example #18).
    model = LGBMClassifier(
        learning_rate=0.1,
        n_estimators=500,
        #max_bin=15,
        colsample_bytree=0.8,
        subsample=0.8,
        min_child_weight=6)

    print("Fitting lgbm model for unsw")
    model.fit(train,
              train_labels,
              early_stopping_rounds=3,
              eval_set=(train, train_labels),
              verbose=False)
    #pred = model.predict(test_dataset)
    print(model.score(train, train_labels))
    print(model.score(test, test_labels))

    train_dataset, train_labels, test_dataset, test_labels = get_nsl_data()
    train_labels = train_labels[:, 0]
    test_labels = test_labels[:, 0]
    train = np.column_stack(train_dataset.values())
    print(train.shape)
    test = np.column_stack(test_dataset.values())
    print(test.shape)

    print "Fitting lgbm model for nsl"
    model.fit(train,
              train_labels,
              early_stopping_rounds=3,
              eval_set=(train, train_labels),
Example #14
xg.fit(X_important_train, y_train,
       eval_set=[(X_important_train, y_train),(X_important_val, y_val)],
       early_stopping_rounds=10, verbose=True)

print("XGB Train score: %s" % xg.score(X_important_train,y_train))
print("XGB Val score:   %s" % xg.score(X_important_val,y_val))
print("XGB Test score:  %s" % xg.score(X_important_test,y_test))

"""### LGBM """

lgbm = LGBMClassifier().fit(X_important_train, y_train,
       eval_set=[(X_important_train, y_train),(X_important_val, y_val)],
       early_stopping_rounds=10, verbose=True)

print()
print("LGBM Train score: %s" % lgbm.score(X_important_train,y_train))
print("LGBM Val score:   %s" % lgbm.score(X_important_val,y_val))
print("LGBM Test score:  %s" % lgbm.score(X_important_test,y_test))

y_pred = lgbm.predict(X_important_test)

# draw classification report and confusion matrix for the LGBM model (base model)
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns

print(classification_report(y_test, y_pred))
cf = confusion_matrix(y_test, y_pred)
sns.heatmap(cf, annot=True)

import pickle
pickle.dump(lgbm, open("lbm.pkl", 'wb'))
pickle.dump(xg, open("xg.pkl", 'wb'))
Example #15
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=2018)

# Random Forest
rf = RandomForestClassifier(n_estimators=100,
                            oob_score=True,
                            random_state=2018)
rf.fit(x_train, y_train)
rf_acc = rf.score(x_test, y_test)
print("RandomForestClassifier Acc: ", rf_acc)

# GBDT
gb = GradientBoostingClassifier(random_state=2018)
gb.fit(x_train, y_train)
gb_acc = gb.score(x_test, y_test)
print("GradientBoostingClassifier Acc: ", gb_acc)

# XGBoost
xgb = XGBClassifier(random_state=2018)
xgb.fit(x_train, y_train)
xgb_acc = xgb.score(x_test, y_test)
print("XGBClassifier Acc: ", xgb_acc)

# LightGBM
lg = LGBMClassifier(random_state=2018)
lg.fit(x_train, y_train)
lg_acc = lg.score(x_test, y_test)
print("LGBMClassifier Acc: ", lg_acc)
Example #16
y_pred['PassengerId'] = df_test['PassengerId']
y_pred_rf = y_pred
y_pred.to_csv('titanic_pred_rfc.csv', index=False)

# # Light GBM

# In[ ]:

from lightgbm import LGBMClassifier

lgb = LGBMClassifier(learning_rate=0.01, max_depth=2,
                     num_leaves=3).fit(X_train, y_train)

# In[ ]:

lgb.score(X_train, y_train)

# In[ ]:

lgb.score(X_test, y_test)

# In[ ]:

y_pred = pd.DataFrame(lgb.predict(df_test))

y_pred['Survived'] = y_pred[0]
y_pred.drop(0, axis=1, inplace=True)
y_pred['PassengerId'] = df_test['PassengerId']
y_pred_lgb = y_pred
y_pred.to_csv('titanic_pred_lgb.csv', index=False)
Example #17
def train(MODEL="GNB"):

    # load voter data and merge with Census data
    df = pd.read_csv(DIR + "/data/nc_voter_geocoded_census_block_trigrams.csv")

    df = prep_data(df)

    tes = {}
    #tes = joblib.load(DIR + "/data/models/transformers_binary.joblib")

    models = {}

    # Loop through each race class, create model for each
    for race in ["W", "B", "A", "I", "HL"]:

        X = df.copy()

        # If hispanic, use ethnic_code instead of race code
        if race == "HL":
            X["ethnic_code"] = np.where(X["ethnic_code"] == race, True, False)
            y = X["ethnic_code"]

        # otherwise use the race code
        else:
            X["race_code"] = np.where(X["race_code"] == race, True, False)
            y = X["race_code"]

        # target encode names, save target encoder
        for col in ["first_name", "last_name", "middle_name"]:

            #te = tes[race][col]
            te = TargetEncoder()
            te.fit(X[col], y)

            X[col] = te.transform(X[col])

        # remove target variables and fill in any nas with 0
        #sample_weights = X["sample_weights"]
        #X = X.drop(["race_code", "ethnic_code", "zip", "sample_weights"], axis=1)
        X = X.fillna(0)

        sm = SMOTE(n_jobs=-1)
        X, y = sm.fit_resample(X, y)
        sample_weights = X["sample_weights"]
        X = X.drop(["zip", "sample_weights"], axis=1)

        # train model
        if MODEL == "LGBM":
            from lightgbm import LGBMClassifier
            model = LGBMClassifier(n_jobs=-1)
        elif MODEL == "GNB":
            from sklearn.naive_bayes import GaussianNB
            model = GaussianNB()
        elif MODEL == "XGB":
            from xgboost import XGBClassifier
            model = XGBClassifier(n_jobs=-1)
        elif MODEL == "SGD":
            model = SGDClassifier(alpha=0.0,
                                  eta0=0.1,
                                  fit_intercept=True,
                                  l1_ratio=1.0,
                                  learning_rate="constant",
                                  loss="modified_huber",
                                  penalty="elasticnet",
                                  power_t=0.0)
        elif MODEL == "RF":
            from sklearn.ensemble import RandomForestClassifier
            model = RandomForestClassifier(n_jobs=-1, max_depth=10)

        model.fit(X[MODEL_COLS], y, sample_weight=sample_weights)

        # save model
        models[race] = model

        # score model
        print(race, model.score(X[MODEL_COLS], y))

    # Save the models and encoders
    handle = MODEL.lower()

    #joblib.dump(tes, DIR + "/data/models/transformers_binary.joblib", compress=True)
    joblib.dump(models,
                DIR + "/data/models/models_binary_%s.joblib" % handle,
                compress=True)
    #joblib.dump(scalers, DIR + "/data/models/scalers_binary.joblib", compress=True)

    print("Trained model saved to ./data/models/")
Example #18
def supervised_shared(unsw_dict, nsl_dict, H1, U, num_epochs, batch_size,
                      beta):
    load = False
    if load:
        with open(r"SharedAutoEncoder/datasets.p", "rb") as i:
            EX_unsw, EX_unsw_test, EX_nsl, EX_nsl_test = pickle.load(i)
    else:
        logger.info('Using Shared AE with Linear Classifier')
        X_unsw = unsw_dict['X']
        X_unsw_test = unsw_dict['X_test']
        y_unsw = unsw_dict['y']
        y_unsw_test = unsw_dict['y_test']

        X_nsl = nsl_dict['X']
        X_nsl_test = nsl_dict['X_test']
        y_nsl = nsl_dict['y']
        y_nsl_test = nsl_dict['y_test']

        unsw_dim = X_unsw.shape[1]
        nsl_dim = X_nsl.shape[1]

        model_unsw, model_nsl, encoder_unsw, encoder_nsl = multimodal_autoencoder(
            unsw_dim, nsl_dim, H1, U)
        for x in range(num_epochs):
            print("Epoch:", x)
            model_unsw.fit(X_unsw, X_unsw, epochs=2, batch_size=batch_size)
            model_nsl.fit(X_nsl, X_nsl, epochs=2, batch_size=batch_size)

        # Get the shared representation of both datasets
        EX_unsw = encoder_unsw.predict(X_unsw)
        EX_unsw_test = encoder_unsw.predict(X_unsw_test)

        EX_nsl = encoder_nsl.predict(X_nsl)
        EX_nsl_test = encoder_nsl.predict(X_nsl_test)

        with open(r"SharedAutoEncoder/datasets.p", "wb") as o:
            pickle.dump((EX_unsw, EX_unsw_test, EX_nsl, EX_nsl_test), o)

        # Get accu5(unsw) and accu5(nsl)
        #EX_concat = np.concatenate((EX_unsw, EX_nsl), axis=0)
        #y_concat = np.concatenate((y_unsw, y_nsl), axis=0)

    #model = build_attention_model(EX_unsw.shape[1], 2)

    model = LGBMClassifier(
        n_jobs=8,
        max_depth=11,
        num_leaves=302,
        learning_rate=0.1,
        n_estimators=500
        # ,max_bin=15
        #, colsample_bytree=0.8
        #, subsample=0.8
        #, min_child_weight=6
    )

    #model.fit(EX_concat, y_concat[:,0])
    logger.info("Training lgbm model on NSL unified representation")
    model.fit(EX_nsl,
              y_nsl[:, 0],
              early_stopping_rounds=3,
              eval_set=(EX_nsl_test, y_nsl_test[:, 0]),
              verbose=False)
    logger.info("Shared model NSL train acc:\t%.6f" %
                model.score(EX_nsl, y_nsl[:, 0]))
    logger.info("Shared model NSL test acc:\t%.6f" %
                model.score(EX_nsl_test, y_nsl_test[:, 0]))

    logger.info("Training lgbm model on UNSW unified representation")
    model.fit(EX_unsw,
              y_unsw[:, 0],
              early_stopping_rounds=3,
              eval_set=(EX_unsw_test, y_unsw_test[:, 0]),
              verbose=False)
    #model.fit(EX_nsl, y_nsl[:, 0], early_stopping_rounds=3, eval_set=(EX_nsl_test, y_nsl_test[:, 0]), verbose=False)

    logger.info("Shared model UNSW train acc:\t%.6f" %
                model.score(EX_unsw, y_unsw[:, 0]))
    logger.info("Shared model UNSW test acc:\t%.6f" %
                model.score(EX_unsw_test, y_unsw_test[:, 0]))
    logger.info("Shared model NSL train acc:\t%.6f" %
                model.score(EX_nsl, y_nsl[:, 0]))
    logger.info("Shared model NSL test acc:\t%.6f" %
                model.score(EX_nsl_test, y_nsl_test[:, 0]))
Example #19
df = pd.read_csv('./KSJR_Car_Hacking_D_training-1(DS_CV)_0.csv')
df_x = df[[
    'Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7'
]]
df_y = df['Class']

train_x, test_x, train_y, test_y = train_test_split(df_x,
                                                    df_y,
                                                    test_size=0.3,
                                                    random_state=10)
print(train_x.shape, test_x.shape)

base_model = LGBMClassifier(random_state=0, metric='binary_logloss')
base_model.fit(train_x, train_y)
base_acc = base_model.score(test_x, test_y)

param_grid = {
    'n_estimators': [10, 100],
    'boosting_type': ['gbdt', 'rf', 'dart', 'goss'],
    'objective': ['binary'],
    'num_leaves': [6, 8, 12, 16],
    'learning_rate': [0.1, 0.001, 0.003]
}

grid_search = GridSearchCV(LGBMClassifier(random_state=0,
                                          metric='binary_error'),
                           param_grid,
                           cv=kfold,
                           verbose=2)
grid_search.fit(train_x, train_y)
Example #20
y_pred = model2.predict(best_x_test)
acc = accuracy_score(y_test, y_pred)
print('acc :', acc)

end1 = time.time()

import joblib
joblib.dump(best_model, './model/xgb_Save/sfm3-' + str(best_score) + '.dat')
model2 = joblib.load('./model/xgb_Save/sfm3-' + str(best_score) + '.dat')

#### LGBM select (SelectFromModel)

start2 = time.time()
model_LGBM = LGBMClassifier()
model_LGBM.fit(x_train, y_train)
score = model_LGBM.score(x_test, y_test)
print("acc : ", score)

thresholds = np.sort(model_LGBM.feature_importances_)

print(thresholds)
print(x_train.shape)
print("========================")

best_x_train = x_train
best_x_test = x_test
best_score = score
best_model = model_LGBM

for thresh in thresholds:
    selection = SelectFromModel(model_LGBM, threshold=thresh, prefit=True)
Example #21
# The snippet is truncated above; the splitter constructor opening is reconstructed from
# the `rskf` usage below (its remaining arguments were cut off in the source).
rskf = RepeatedStratifiedKFold(n_splits=n_splits,
                               random_state=None)
scores = np.array([])
# Make k-fold CV
for train_index, test_index in rskf.split(data, target):

    # Initialize model
    clf = LGBMClassifier(learning_rate=best_learning_rate,
                         min_data_in_leaf=best_min_data_in_leaf,
                         num_leaves=best_num_leaves)
    # Split data
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]

    # Fit and score the model on the held-out fold
    clf.fit(X_train, y_train)
    fold_score = clf.score(X_test, y_test)
    scores = np.append(scores, fold_score)

# Print final score
with open('ris/OUT-score_alglorithms.txt', mode='a') as f:
    print('Average score:',
          scores.mean(),
          '+-',
          scores.std() / np.sqrt(n_splits),
          file=f)

#####################
# Data augmentation #
#####################

params = {
Example #22
from sklearn.metrics import precision_score

# Importing the dataset
X = np.load('./project/mini/data/X.npy')
y = pd.read_csv('./project/mini/data/y_label.csv', header=0).iloc[:, 0]

X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

# Splitting the dataset into the Training set and Test set
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

# Feature Scaling
x_train /= -80
x_test /= -80

model = LGBMClassifier(objective='multiclass')

model.fit(x_train, y_train, categorical_feature=[0, 12])

print('feature_importances :', model.feature_importances_)

y_pred = model.predict(x_test)
print('final accuracy :', model.score(x_test, y_test))

# final accuracy : 0.5326016785022595
Example #23
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pickle
dataset = load_breast_cancer()
x = dataset.data
y = dataset.target

x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)
model = LGBMClassifier()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
# print(score)

thresholds = np.sort(model.feature_importances_)

# print(thresholds)
models = []  # create an empty list for the models
res = np.array([])  # create an empty array for the results
for thres in thresholds:
    selection = SelectFromModel(model, threshold=thres,
                                prefit=True)  # drop the least important columns one at a time while retraining
    #median
    selection_x_train = selection.transform(x_train)
    model2 = LGBMClassifier(n_estimators=1000)
    selection_x_test = selection.transform(x_test)
    model2.fit(selection_x_train,
Example #24
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()

    # See lightgbm library for python for a list of parameters: https://lightgbm.readthedocs.io/en/latest/Parameters.html
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="number of boosting iterations")
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help="shrinkage rate")
    parser.add_argument('--max_depth',
                        type=int,
                        default=-1,
                        help="max depth for tree model")
    parser.add_argument(
        '--subsample',
        type=float,
        default=1.0,
        help="randomly select part of data without resampling; useful to speed up training and prevent over-fitting"
    )

    args = parser.parse_args()

    run = Run.get_context()

    run.log("n_estimators:", np.int(args.n_estimators))
    run.log(
        "learning_rate:", np.float(args.learning_rate)
    )  # see here for more ideas = https://bit.ly/3c2zJOm & https://bit.ly/3o6OAth
    run.log("max_depth:", np.int(args.max_depth))
    run.log("subsample:", np.float(args.subsample))

    # training set
    train_split_data = run.input_datasets["output_split_train"]
    # train_split_data = train_split_data.parse_parquet_files()
    train_split_df = train_split_data.to_pandas_dataframe()
    print(train_split_df.head(10))

    x_train = train_split_df.loc[:, train_split_df.columns != 'Exited']
    y_train = train_split_df.loc[:, train_split_df.columns == 'Exited']

    #evaluation set
    test_split_data = run.input_datasets["output_split_test"]
    test_split_df = test_split_data.to_pandas_dataframe()

    x_test = test_split_df.loc[:, test_split_df.columns != 'Exited']
    y_test = test_split_df.loc[:, test_split_df.columns == 'Exited']

    print(x_train.head(10))
    print(x_test.head(10))

    # declaring our model with parameters - default and those declared in our hyperparameter space
    model = LGBMClassifier(n_estimators=args.n_estimators,
                           learning_rate=args.learning_rate,
                           max_depth=args.max_depth,
                           subsample=args.subsample).fit(x_train, y_train)

    # save model
    os.makedirs('./outputs/model', exist_ok=True)

    # files saved in the "outputs" folder are automatically uploaded into run history
    joblib.dump(model, './outputs/model/saved_model.joblib')

    accuracy = model.score(x_test, y_test)
    print(model)
    print(x_test.head(10))

    run.log("Accuracy", np.float(
        accuracy))  #source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx

    y_pred = model.predict(x_test)
    auc_weighted = roc_auc_score(y_test, y_pred, average='weighted')
    run.log("AUC_weighted", np.float(auc_weighted)
            )  #source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx

    # creating a confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
lg1 = LGBMClassifier()

print(x1)

print(
    "\n.................................................................................\n"
)

x1_train, x1_test, y1_train, y1_test = train_test_split(x1,
                                                        y1,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y1)
lg1.fit(x1_train, y1_train)
print("YOUR R2 MACHINE LEARNED WITH THIS ACCURACY : ",
      lg1.score(x1_test, y1_test))

print(
    "\n.................................................................................\n"
)

y1_pred = lg1.predict(x1_test)

print(classification_report(y1_test, y1_pred))

from sklearn.metrics import precision_recall_fscore_support as score
precision, recall, fscore, support = score(y1_test, y1_pred)

print('precision: ', np.mean(precision))
print('recall: ', np.mean(recall))
print('fscore: ', np.mean(fscore))
Example #26
from lightgbm import LGBMClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)
x = train.drop(columns='class', axis=1)  # new frame with the 'class' column dropped
y = train['class']  # target labels ('class')
TEST = test
train_x, test_x, train_y, test_y = train_test_split(x,
                                                    y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)
# split off 20% of the data as the test set
evals = [(test_x, test_y)]
lgbm = LGBMClassifier(n_estimators=1000,
                      learning_rate=0.03,
                      max_depth=12,
                      num_leaves=4000,
                      random_state=42,
                      boosting_type="goss")
lgbm.fit(train_x, train_y, early_stopping_rounds=20, eval_set=evals)
print("acc: {}".format(lgbm.score(train_x, train_y)))  # 훈련 데이터에 대한 정확도
print("acc: {}".format(lgbm.score(test_x, test_y)))  # 테스트 데이터에 대한 정확도
y_pred = np.argmax(lgbm.predict_proba(TEST), axis=1)  # 각 클래스에 대한 예측확률
submission = pd.DataFrame(data=y_pred,
                          columns=sample_submission.columns,
                          index=sample_submission.index)
submission.to_csv('submission5.csv', index=True)
# full_index = np.array([95,94,82,59,0])
# data_index = np.array([44,179,112,59,82,58,84])
# data_index = np.array([0])

data_index = np.array([0, 59, 94, 95, 84, 161, 44, 179, 82, 112, 58])
# classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P', 'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
#            'INTERACTIVE', 'GAMES']
classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P', 'DATABASE', 'FTP-DATA',
           'MULTIMEDIA', 'SERVICES', 'INTERACTIVE']

# file used for training, which generates x_train, x_test, y_train, y_test
# I also resampled the file `entry12`
file = os.path.join(data_dir, filename)
test_file = os.path.join(data_dir, test_filename)

if __name__ == '__main__':
    acc = []
    x_train, _, y_train, _ = get_data(file)
    _, x_test, _, y_test = get_data(test_file)
    np_dir = os.path.join(data_dir, 'estimators_100_150_5.txt')
    for i in range(50, 150, 5):
        clf = LGBMClassifier(n_estimators=i)
        clf.fit(x_train, y_train)
        print(clf.get_params())
        accuracy = clf.score(x_test, y_test)
        acc.append(accuracy)
    acc = np.array(acc)
    print(acc)
    np.savetxt(np_dir, acc)
import matplotlib.lines as lines
lines.lineStyles
Example #28
Y_predGB = modelGB.predict(X_valid)

print("Training Accuracy: ", modelGB.score(X_train, Y_train))
print('Testing Accuracy: ', modelGB.score(X_valid, Y_valid))

print("AUROC Score of Gradient Boosting = ", roc_auc_score(Y_valid, Y_predGB))

from lightgbm import LGBMClassifier

modelLGBM = LGBMClassifier()
modelLGBM.fit(X_train, Y_train)

Y_predLGBM = modelLGBM.predict(X_valid)

print("Training Accuracy: ", modelLGBM.score(X_train, Y_train))
print('Testing Accuracy: ', modelLGBM.score(X_valid, Y_valid))

print("AUROC Score of LGBM = ", roc_auc_score(Y_valid, Y_predLGBM))

test_Y_RF = modelRF.predict(test_X)
test_Y_XG = modelXG.predict(test_X)
test_Y_AB = modelAB.predict(test_X)
test_Y_LGBM = modelLGBM.predict(test_X)
test_Y_GB = modelGB.predict(test_X)
test_Y_pred = []

for i in range(len(test_Y_RF)):
  k = 0.35 * test_Y_LGBM[i] + 0.25 * test_Y_RF[i] + 0.175 * test_Y_GB[i] + 0.125 * test_Y_XG[i] + 0.1 * test_Y_AB[i] # weighted averaging
  test_Y_pred.append(k)
Example #29
# The snippet is truncated above; only the tail of the LGBMClassifier constructor survives.
model = LGBMClassifier(
                       learning_rate=0.1)  # the number of trees (n_estimators) plays the role of epochs

model.fit(x_train,
          y_train,
          verbose=True,
          eval_metric=["multi_logloss", "multi_error"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)

# results = model.evals_result()
# print("eval's results : ", results)

# y_pred = model.predict(x_test)
# r2 = r2_score(y_pred, y_test)
# print("r2 Score : %.2f%%:" %(r2*100.0))
score = model.score(x_test, y_test)
print("acc : ", score)
#########################################################################################################
# feature engineering
thresholds = np.sort(model.feature_importances_)
print(thresholds)

for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)

    select_x_train = selection.transform(x_train)

    selection_model = LGBMClassifier()
    selection_model.fit(select_x_train, y_train)

    select_x_test = selection.transform(x_test)
Example #30
x_train_data = train_set[:,1:]
y_train_data = train_set[:,:1].reshape(-1,)

x_test_data = test_set[:,1:]
y_test_data = test_set[:,:1].reshape(-1,)

#print("[x_train_data]",x_train_data.shape)
#print("[y_train_data]", y_train_data.shape)
#print("[x_test_data]", x_test_data.shape)
#print("[y_test_data]", y_test_data.shape)

lgb = LGBMClassifier(n_estimators=1500, learning_rate=0.1, max_depth=15,
                     application='binary', num_leaves=30, metrics='binary_logloss')
classifier = lgb.fit(x_train_data, y_train_data)

print(lgb.score(x_train_data, y_train_data))
#print(lgb.score(x_test_data, y_test_data))

y_pred = lgb.predict(x_test_data)
#y_pred = classifier.predict_proba(x_test_data)
#print(y_pred)
# for yy in y_pred:
#   print(yy)
print(confusion_matrix(y_test_data,y_pred))
print(classification_report(y_test_data,y_pred))

fig, ax = plt.subplots(figsize=(10,20))
plot_importance(lgb, ax, max_num_features=32)
plt.show()
# model save
#joblib.dump(lgb, open('lgb.model', 'wb'))