def LGBM(num_l, n_est, min_child):
    st.title("Prediction model - LightGBM")
    if st.checkbox("Show code"):
        st.code("""lgbm = LGBMClassifier(
    num_leaves=num_l, n_estimators=n_est, min_child_samples=min_child)
lgbm.fit(X_train, y_train)
lgbm_pred = lgbm.predict(X_test)
acc_lgbm = round(lgbm.score(X_train, y_train), 10)
st.text(acc_lgbm)""")
    if st.checkbox("Use imbalanced data"):
        lgbm = LGBMClassifier(num_leaves=num_l,
                              n_estimators=n_est,
                              min_child_samples=min_child)
        lgbm.fit(X_train, y_train)
        lgbm_pred = lgbm.predict(X_test)
        # Note: this reports training accuracy, not test accuracy.
        acc_lgbm = round(lgbm.score(X_train, y_train), 10)
        st.text(acc_lgbm)
    else:
        lgbm = LGBMClassifier(num_leaves=num_l,
                              n_estimators=n_est,
                              min_child_samples=min_child)
        # Fit on the resampled (balanced) training set.
        lgbm.fit(X_train_res, y_train_res)
        lgbm_pred = lgbm.predict(X_test0)
        acc_lgbm = round(lgbm.score(X_train0, y_train0), 10)
        st.text(acc_lgbm)
def lightGbmModel(X_train, Y_train):
    # use LightGBM (install with: conda install -c conda-forge lightgbm)
    from lightgbm import LGBMClassifier
    lightgbm = LGBMClassifier()
    lightgbm.fit(X_train, Y_train)  # fixed: was y_train, undefined in this scope
    print('\nLight GBM Training Score:', lightgbm.score(X_train, Y_train))
    return lightgbm, lightgbm.score(X_train, Y_train)
def evaluate_lgbm(trainX, trainy, testX, testy, params):
    sc = StandardScaler()
    trainX = sc.fit_transform(trainX)
    testX = sc.transform(testX)
    model = LGBMClassifier(**params)
    model.fit(trainX, trainy)
    test_acc = model.score(testX, testy)
    pred = model.predict_proba(testX)
    return model, test_acc, pred
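A minimal usage sketch for `evaluate_lgbm`, assuming scikit-learn's breast-cancer dataset and an illustrative parameter dict; neither the dataset nor the parameter values come from the original snippet.

```python
# Hypothetical usage example (not from the original source).
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier

X, y = load_breast_cancer(return_X_y=True)
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.2,
                                                random_state=0)
params = {"n_estimators": 200, "num_leaves": 31}  # illustrative values
model, test_acc, pred = evaluate_lgbm(trainX, trainy, testX, testy, params)
print(f"test accuracy: {test_acc:.4f}")
```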
def get_lgbm_score(X_train, y_train, X_test, y_test):
    lgbm_default = LGBMClassifier()
    lgbm_cross = LGBMClassifier()
    np.random.seed(200)
    # 5-fold cross-validated accuracy on the training set
    cross_score = np.mean(cross_val_score(lgbm_cross, X_train, y_train, cv=5))
    lgbm_default.fit(X_train, y_train)
    score_lgbm = lgbm_default.score(X_test, y_test)
    # Requires an initialized Neptune experiment.
    neptune.log_metric('lgbm', score_lgbm)
    neptune.log_metric('lgbm_cross_score', cross_score)
    return score_lgbm
def Test():
    train = pd.read_csv('./csvfile/cardio.csv')
    train["plus"] = train["smoke"] * train["alco"]
    train["age_year"] = round(train["age_year"], 0).astype(np.int64)
    # BMI = weight(kg) / height(m)^2; height is given in cm.
    train["BMI"] = round(
        train["weight"] / (train["height"] * train["height"] / 10000),
        2).astype(np.float64)
    train = train.dropna(axis=0)  # explicit axis: positional form is deprecated
    train = train[(train.BMI <= 50) & (train.BMI >= 10)]
    y = train["cardio"]
    print(train.shape)
    print(y.shape)
    train = train.drop(["id", "age_days", "cardio"], axis=1)
    print(train.shape)
    print(train["BMI"].max())
    print(train["BMI"].min())
    # Bin systolic blood pressure; this order keeps the already-assigned
    # codes (1-3) from being re-binned.
    train.loc[(train["ap_hi"] >= 140) & (train["ap_hi"] < 200), 'ap_hi'] = 3  # high
    train.loc[(train["ap_hi"] < 90) & (train["ap_hi"] >= 60), 'ap_hi'] = 1   # low
    train.loc[(train["ap_hi"] < 140) & (train["ap_hi"] >= 90), 'ap_hi'] = 2  # normal
    train = train.drop(["weight", "height", "ap_lo"], axis=1)
    rf = LGBMClassifier(n_estimators=200,
                        num_leaves=25,
                        colsample_bytree=0.6,
                        subsample=0.6)
    xf_train, xf_test, yf_train, yf_test = train_test_split(
        train, y, test_size=0.2, random_state=1)
    rf.fit(xf_train, yf_train)
    odd = round(rf.score(xf_test, yf_test) * 100, 1)
    print(odd)
    print(train.shape)
    importances_df = pd.DataFrame(rf.feature_importances_).rename(
        {0: "importances"}, axis=1)
    importances_df["columns"] = xf_train.columns
    importances_df = importances_df.sort_values("importances", ascending=False)
    # Normalize importances to percentages.
    importances_df["importances"] = (
        importances_df["importances"] /
        importances_df["importances"].values.sum()) * 100
    print(importances_df.head(10))
def lgbm_classifier(x_trn: pd.DataFrame, y_trn: np.ndarray,
                    x_val: pd.DataFrame, y_val: np.ndarray) -> tuple:
    x_trn, x_val = x_trn.copy(), x_val.copy()
    y_trn, y_val = y_trn.copy(), y_val.copy()
    # Note: num_iterations is a LightGBM alias for n_estimators, so passing
    # both (400 and 700) is ambiguous; only one should be set.
    model = LGBMClassifier(boosting_type='gbdt',
                           objective='binary',
                           metric='binary_logloss',
                           n_estimators=400,
                           learning_rate=0.05,
                           min_child_samples=16,
                           is_unbalance=True,
                           num_iterations=700,
                           n_jobs=-1,
                           random_state=7)
    _ = model.fit(x_trn, y_trn)
    training_score = model.score(x_trn, y_trn)
    validation_score = model.score(x_val, y_val)
    clf_report = classification_report(y_val, model.predict(x_val))
    ck_score = cohen_kappa_score(y_val, model.predict(x_val))
    return model, training_score, validation_score, clf_report, ck_score
def do_generate_metrics_lgbm_optimazed_model(X_train, y_train, X_test, y_test,
                                             grid):
    file_operations.write_logs(FILENAME, "LGBM metrics calculation\n")
    model = LGBMClassifier(random_state=0)
    model.set_params(**grid.best_params_)
    model.fit(X_train, y_train)
    metrics = calculate_metrics(model, X_test, y_test)
    file_operations.write_logs(
        FILENAME, "Generated model params and results\n params:" +
        str(model.get_params()) + "\nscore " +
        str(model.score(X_test, y_test)))
    file_operations.write_logs(
        FILENAME, "Search grid best params and results\n params:" +
        str(grid.best_params_) + "\nscore " + str(grid.best_score_))
    return model, metrics
def filter_LBGM_importance(dataframe, target, threshold=1):
    from lightgbm import LGBMClassifier
    from sklearn import preprocessing

    # Label-encode categorical (object) columns in place.
    categorical_feats = dataframe.select_dtypes('object').columns.tolist()
    for col in categorical_feats:
        lb = preprocessing.LabelEncoder()
        lb.fit(list(dataframe[col].values.astype('str')))
        dataframe[col] = lb.transform(list(
            dataframe[col].values.astype('str')))

    # Hold out a quarter of the rows for early-stopping validation.
    valid_size = int(dataframe.shape[0] / 4)
    valid = dataframe.sample(valid_size)
    train = dataframe.drop(valid.index, axis=0)
    train_x = train.drop([target], axis=1)
    train_y = train[target]
    valid_x = valid.drop([target], axis=1)
    valid_y = valid[target]

    clf = LGBMClassifier(
        nthread=4,
        n_estimators=10000,
        learning_rate=0.02,
        num_leaves=34,
        colsample_bytree=0.95,
        subsample=0.9,
        max_depth=8,
        reg_alpha=0.04,
        reg_lambda=0.07,
        min_split_gain=0.025,
        min_child_weight=40,
        # importance_type='split',
        silent=-1,
        verbose=-1,
    )
    clf.fit(train_x, train_y,
            eval_set=[(train_x, train_y), (valid_x, valid_y)],
            eval_metric='auc',
            verbose=100,
            early_stopping_rounds=200)
    # oof_preds = clf.predict_proba(valid_x, num_iteration=clf.best_iteration_)[:, 1]

    feats = train_x.columns.tolist()
    importance_df = pd.DataFrame()
    importance_df["feature"] = feats
    importance_df["importance"] = clf.feature_importances_
    importance_df.sort_values('importance', inplace=True, ascending=False)
    less_important_features = importance_df.loc[
        importance_df['importance'] < threshold, 'feature']
    dataframe.drop(less_important_features, axis=1, inplace=True)
    score = clf.score(train_x, train_y)
    trace('filter_LBGM_importance')
    trace(importance_df)
    trace('category features')
    trace(categorical_feats)
    trace('score')
    trace(score)
    trace('drop features')
    trace(less_important_features)
    return dataframe
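A minimal, hypothetical call (not from the original source) showing the expected usage: a DataFrame with a binary `target` column goes in, and the function returns it with the low-importance columns dropped in place.

```python
# Hypothetical usage; 'data.csv' and its 'target' column are assumptions.
import pandas as pd

df = pd.read_csv('data.csv')
df_reduced = filter_LBGM_importance(df, target='target', threshold=1)
print(df_reduced.columns.tolist())
```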
clf_lgbm = LGBMClassifier(  # reconstructed: the constructor head was truncated
    num_leaves=15,
    colsample_bytree=.8,
    subsample=.8,
    max_depth=7,
    reg_alpha=.1,
    reg_lambda=.1,
    min_split_gain=.01)
clf_lgbm.fit(X_train, Y_train,
             eval_set=[(X_train, Y_train)],
             eval_metric='auc',
             verbose=0,
             early_stopping_rounds=30)
acc_clf_lgbm = round(clf_lgbm.score(X_train, Y_train) * 100, 2)
acc_clf_lgbm

# In[ ]:

# Note: the submission below uses random_forest predictions, not clf_lgbm.
Y_pred = random_forest.predict(X_test)

# In[ ]:

my_submission = pd.DataFrame({
    "PassengerId": test_df["PassengerId"],
    "Survived": Y_pred
})
my_submission.to_csv('new_submission.csv', index=False)

# In[ ]:
lgbm = LGBMClassifier(n_estimators=100, learning_rate=0.1, n_jobs=-1)
lgbm.fit(x_train, y_train,
         verbose=True,
         eval_metric=["logloss", "rmse"],  # rmse, mae, logloss, error, auc
         eval_set=[(x_train, y_train), (x_test, y_test)],
         early_stopping_rounds=20)
y_pre = lgbm.predict(x_test)
r2 = r2_score(y_test, y_pre)
score = lgbm.score(x_test, y_test)
print(__file__)
print("r2")
print(r2)
print("score")
print(score)

# 6) SelectFromModel
thresholds = np.sort(lgbm.feature_importances_)
idx_max = -1
max = r2  # note: shadows the builtin max()
for idx, thresh in enumerate(thresholds):  # data preprocessing
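Several snippets here pass `early_stopping_rounds` and `verbose` to `fit()`; those keyword arguments were removed from the scikit-learn interface in LightGBM 4.0 in favor of callbacks. A minimal sketch of the callback form, reusing the `lgbm` model and splits above and assuming a binary target (hence `binary_logloss`):

```python
# Callback-based equivalent for LightGBM >= 4.0 (sketch, not from the source).
import lightgbm

lgbm.fit(x_train, y_train,
         eval_metric=["binary_logloss"],  # assumes a binary task
         eval_set=[(x_train, y_train), (x_test, y_test)],
         callbacks=[lightgbm.early_stopping(stopping_rounds=20),
                    lightgbm.log_evaluation(period=1)])
```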
# # Train the light GBM
# model = LGBMClassifier()
# cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# n_scores = cross_val_score(model, images, labels, scoring='accuracy',
#                            cv=cv, n_jobs=-1, error_score='raise')
# print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

# fit the model on the whole dataset
model = LGBMClassifier(objective="binary", class_weight="balanced")
start_time = time.time()
model = model.fit(images, labels)
print("Train Light GBM --- %s seconds ---" % (time.time() - start_time))

start_time = time.time()
basic_score = model.score(images_validation, labels_validation)
print("Validation Light GBM --- %s seconds ---" % (time.time() - start_time))
print("Light GBM scikit learn basic score: %0.4f" % basic_score)

# Validating the model and evaluation
start_time = time.time()
scores = cross_validate(model, images_validation, labels_validation,
                        cv=5, scoring=('f1', 'roc_auc_ovo'),
                        return_train_score=True)
print("Cross Validation Light GBM --- %s seconds ---" % (time.time() - start_time))
# Note: this re-scores the already-fitted model on the validation set;
# the actual cross-validated metrics live in `scores`.
cross_score = model.score(images_validation, labels_validation)
print("Light GBM scikit learn cross-val score: %0.4f" % cross_score)
print(scores)
# Scale train, test, and submission features with the fitted scaler
# (the source repeated these three lines verbatim; the duplicate is dropped).
X_train_scaled = scaler.transform(X_train_scaled)
X_test_scaled = scaler.transform(X_test_scaled)
test_x_scaled = scaler.transform(test_x_scaled)

# Visualization
import matplotlib.pyplot as plt
plt.hist(X_train_scaled)
plt.title('StandardScaler')
plt.show()

# Measure accuracy
acc = LGBM.score(X_test, y_test)
print('acc: ', acc)  # 0.8454961374034351

# Predict
y_pred = LGBM.predict_proba(test_x)
print(y_pred)

# Plot feature importances
import numpy as np
import matplotlib.pyplot as plt

def plot_feature_importances_orb(model):
    n_features = train_x.shape[1]
    plt.barh(np.arange(n_features), LGBM.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), feat_labels)
model = LGBMClassifier(  # reconstructed: the constructor head was truncated
    learning_rate=0.1,
    n_estimators=500,
    # max_bin=15,
    colsample_bytree=0.8,
    subsample=0.8,
    min_child_weight=6)

print("Fitting lgbm model for unsw")  # converted from Python 2 print statements
model.fit(train, train_labels,
          early_stopping_rounds=3,
          eval_set=(train, train_labels),
          verbose=False)
#pred = model.predict(test_dataset)
print(model.score(train, train_labels))
print(model.score(test, test_labels))

train_dataset, train_labels, test_dataset, test_labels = get_nsl_data()
train_labels = train_labels[:, 0]
test_labels = test_labels[:, 0]
train = np.column_stack(train_dataset.values())
print(train.shape)
test = np.column_stack(test_dataset.values())
print(test.shape)
print("Fitting lgbm model for nsl")
model.fit(train, train_labels,
          early_stopping_rounds=3,
          eval_set=(train, train_labels),
xg.fit(X_important_train, y_train,
       eval_set=[(X_important_train, y_train), (X_important_val, y_val)],
       early_stopping_rounds=10,
       verbose=True)
print("XGB Train score: %s" % xg.score(X_important_train, y_train))
print("XGB Val score: %s" % xg.score(X_important_val, y_val))
print("XGB Test score: %s" % xg.score(X_important_test, y_test))
"""### LGBM """

lgbm = LGBMClassifier().fit(
    X_important_train, y_train,
    eval_set=[(X_important_train, y_train), (X_important_val, y_val)],
    early_stopping_rounds=10,
    verbose=True)
print()
print("LGBM Train score: %s" % lgbm.score(X_important_train, y_train))
print("LGBM Val score: %s" % lgbm.score(X_important_val, y_val))
print("LGBM Test score: %s" % lgbm.score(X_important_test, y_test))
y_pred = lgbm.predict(X_important_test)

# draw classification report and confusion matrix for LGBM MODEL (BASE MODEL)
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))
cf = confusion_matrix(y_test, y_pred)
sns.heatmap(cf, annot=True)

import pickle
pickle.dump(lgbm, open("lbm.pkl", 'wb'))
pickle.dump(xg, open("xg.pkl", 'wb'))
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.3,
                                                    random_state=2018)

# Random Forest
rf = RandomForestClassifier(n_estimators=100, oob_score=True,
                            random_state=2018)
rf.fit(x_train, y_train)
rf_acc = rf.score(x_test, y_test)
print("RandomForestClassifier Acc: ", rf_acc)

# GBDT
gb = GradientBoostingClassifier(random_state=2018)
gb.fit(x_train, y_train)
gb_acc = gb.score(x_test, y_test)
print("GradientBoostingClassifier Acc: ", gb_acc)

# XGBoost
xgb = XGBClassifier(random_state=2018)
xgb.fit(x_train, y_train)
xgb_acc = xgb.score(x_test, y_test)
print("XGBClassifier Acc: ", xgb_acc)

# LightGBM
lg = LGBMClassifier(random_state=2018)
lg.fit(x_train, y_train)
lg_acc = lg.score(x_test, y_test)
print("LGBMClassifier Acc: ", lg_acc)
y_pred['PassengerId'] = df_test['PassengerId']
y_pred_rf = y_pred
y_pred.to_csv('titanic_pred_rfc.csv', index=False)

# # Light GBM

# In[ ]:

from lightgbm import LGBMClassifier
lgb = LGBMClassifier(learning_rate=0.01, max_depth=2,
                     num_leaves=3).fit(X_train, y_train)

# In[ ]:

lgb.score(X_train, y_train)

# In[ ]:

lgb.score(X_test, y_test)

# In[ ]:

y_pred = pd.DataFrame(lgb.predict(df_test))
y_pred['Survived'] = y_pred[0]
y_pred.drop(0, axis=1, inplace=True)
y_pred['PassengerId'] = df_test['PassengerId']
y_pred_lgb = y_pred
y_pred.to_csv('titanic_pred_lgb.csv', index=False)
def train(MODEL="GNB"): # load voter data and merge with Census data df = pd.read_csv(DIR + "/data/nc_voter_geocoded_census_block_trigrams.csv") df = prep_data(df) tes = {} #tes = joblib.load(DIR + "/data/models/transformers_binary.joblib") models = {} # Loop through each race class, create model for each for race in ["W", "B", "A", "I", "HL"]: X = df.copy() # If hispanic, use ethnic_code instead of race code if race == "HL": X["ethnic_code"] = np.where(X["ethnic_code"] == race, True, False) y = X["ethnic_code"] # other wise race code else: X["race_code"] = np.where(X["race_code"] == race, True, False) y = X["race_code"] # target encode names, save target encoder for col in ["first_name", "last_name", "middle_name"]: #te = tes[race][col] te = TargetEncoder() te.fit(X[col], y) X[col] = te.transform(X[col]) # remove target variables and fill in any nas with 0 #sample_weights = X["sample_weights"] #X = X.drop(["race_code", "ethnic_code", "zip", "sample_weights"], axis=1) X = X.fillna(0) sm = SMOTE(n_jobs=-1) X, y = sm.fit_resample(X, y) sample_weights = X["sample_weights"] X = X.drop(["zip", "sample_weights"], axis=1) # train model if MODEL == "LGBM": from lightgbm import LGBMClassifier model = LGBMClassifier(n_jobs=-1) elif MODEL == "GNB": from sklearn.naive_bayes import GaussianNB model = GaussianNB() elif MODEL == "XGB": from xgboost import XGBClassifier model = XGBClassifier(n_jobs=-1) elif MODEL == "SGD": model = SGDClassifier(alpha=0.0, eta0=0.1, fit_intercept=True, l1_ratio=1.0, learning_rate="constant", loss="modified_huber", penalty="elasticnet", power_t=0.0) elif MODEL == "RF": from sklearn.ensemble import RandomForestClassifier model = RandomForestClassifier(n_jobs=-1, max_depth=10) model.fit(X[MODEL_COLS], y, sample_weight=sample_weights) # save model models[race] = model # score model print(race, model.score(X[MODEL_COLS], y)) # Save the models and encoders handle = MODEL.lower() #joblib.dump(tes, DIR + "/data/models/transformers_binary.joblib", compress=True) joblib.dump(models, DIR + "/data/models/models_binary_%s.joblib" % handle, compress=True) #joblib.dump(scalers, DIR + "/data/models/scalers_binary.joblib", compress=True) print("Trained model saved to ./data/models/")
def supervised_shared(unsw_dict, nsl_dict, H1, U, num_epochs, batch_size, beta):
    load = False
    if load:
        with open(r"SharedAutoEncoder/datasets.p", "rb") as i:
            EX_unsw, EX_unsw_test, EX_nsl, EX_nsl_test = pickle.load(i)
    else:
        logger.info('Using Shared AE with Linear Classifier')
        X_unsw = unsw_dict['X']
        X_unsw_test = unsw_dict['X_test']
        y_unsw = unsw_dict['y']
        y_unsw_test = unsw_dict['y_test']
        X_nsl = nsl_dict['X']
        X_nsl_test = nsl_dict['X_test']
        y_nsl = nsl_dict['y']
        y_nsl_test = nsl_dict['y_test']
        unsw_dim = X_unsw.shape[1]
        nsl_dim = X_nsl.shape[1]
        model_unsw, model_nsl, encoder_unsw, encoder_nsl = multimodal_autoencoder(
            unsw_dim, nsl_dim, H1, U)
        # Alternate training of the two autoencoders, two epochs at a time.
        for x in range(num_epochs):
            print("Epoch:", x)
            model_unsw.fit(X_unsw, X_unsw, epochs=2, batch_size=batch_size)
            model_nsl.fit(X_nsl, X_nsl, epochs=2, batch_size=batch_size)
        # Get the shared representation of both datasets
        EX_unsw = encoder_unsw.predict(X_unsw)
        EX_unsw_test = encoder_unsw.predict(X_unsw_test)
        EX_nsl = encoder_nsl.predict(X_nsl)
        EX_nsl_test = encoder_nsl.predict(X_nsl_test)
        with open(r"SharedAutoEncoder/datasets.p", "wb") as o:
            pickle.dump((EX_unsw, EX_unsw_test, EX_nsl, EX_nsl_test), o)

    # Get accu5(unsw) and accu5(nsl)
    #EX_concat = np.concatenate((EX_unsw, EX_nsl), axis=0)
    #y_concat = np.concatenate((y_unsw, y_nsl), axis=0)
    #model = build_attention_model(EX_unsw.shape[1], 2)
    model = LGBMClassifier(
        n_jobs=8,
        max_depth=11,
        num_leaves=302,
        learning_rate=0.1,
        n_estimators=500
        # ,max_bin=15
        #, colsample_bytree=0.8
        #, subsample=0.8
        #, min_child_weight=6
    )
    #model.fit(EX_concat, y_concat[:,0])
    logger.info("Training lgbm model on NSL unified representation")
    model.fit(EX_nsl, y_nsl[:, 0],
              early_stopping_rounds=3,
              eval_set=(EX_nsl_test, y_nsl_test[:, 0]),
              verbose=False)
    logger.info("Shared model NSL train acc:\t%.6f" %
                model.score(EX_nsl, y_nsl[:, 0]))
    logger.info("Shared model NSL test acc:\t%.6f" %
                model.score(EX_nsl_test, y_nsl_test[:, 0]))
    logger.info("Training lgbm model on UNSW unified representation")
    model.fit(EX_unsw, y_unsw[:, 0],
              early_stopping_rounds=3,
              eval_set=(EX_unsw_test, y_unsw_test[:, 0]),
              verbose=False)
    #model.fit(EX_nsl, y_nsl[:, 0], early_stopping_rounds=3, eval_set=(EX_nsl_test, y_nsl_test[:, 0]), verbose=False)
    logger.info("Shared model UNSW train acc:\t%.6f" %
                model.score(EX_unsw, y_unsw[:, 0]))
    logger.info("Shared model UNSW test acc:\t%.6f" %
                model.score(EX_unsw_test, y_unsw_test[:, 0]))
    # Note: the model was refit on UNSW above, so these NSL scores come
    # from the UNSW-trained model.
    logger.info("Shared model NSL train acc:\t%.6f" %
                model.score(EX_nsl, y_nsl[:, 0]))
    logger.info("Shared model NSL test acc:\t%.6f" %
                model.score(EX_nsl_test, y_nsl_test[:, 0]))
df = pd.read_csv('./KSJR_Car_Hacking_D_training-1(DS_CV)_0.csv')
df_x = df[[
    'Data0', 'Data1', 'Data2', 'Data3', 'Data4', 'Data5', 'Data6', 'Data7'
]]
df_y = df['Class']
train_x, test_x, train_y, test_y = train_test_split(df_x,
                                                    df_y,
                                                    test_size=0.3,
                                                    random_state=10)
print(train_x.shape, test_x.shape)

# Baseline model with default hyperparameters.
base_model = LGBMClassifier(random_state=0, metric='binary_logloss')
base_model.fit(train_x, train_y)
base_acc = base_model.score(test_x, test_y)

param_grid = {
    'n_estimators': [10, 100],
    'boosting_type': ['gbdt', 'rf', 'dart', 'goss'],
    'objective': ['binary'],
    'num_leaves': [6, 8, 12, 16],
    'learning_rate': [0.1, 0.001, 0.003]
}
# `kfold` must be defined beforehand (e.g. a StratifiedKFold instance).
grid_search = GridSearchCV(LGBMClassifier(random_state=0,
                                          metric='binary_error'),
                           param_grid,
                           cv=kfold,
                           verbose=2)
grid_search.fit(train_x, train_y)
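A short follow-up sketch (not in the original) showing how the fitted grid search is typically consumed; `best_params_`, `best_score_`, and `best_estimator_` are standard scikit-learn `GridSearchCV` attributes, and `base_acc` is the baseline accuracy computed above.

```python
# Inspect and reuse the grid-search winner (illustrative follow-up).
print("best params:", grid_search.best_params_)
print("best CV score:", grid_search.best_score_)
best_model = grid_search.best_estimator_  # already refit on train_x, train_y
tuned_acc = best_model.score(test_x, test_y)
print("baseline acc: %.4f, tuned acc: %.4f" % (base_acc, tuned_acc))
```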
y_pred = model2.predict(best_x_test)
acc = accuracy_score(y_test, y_pred)
print('acc :', acc)
end1 = time.time()

import joblib
joblib.dump(best_model, './model/xgb_Save/sfm3-' + str(best_score) + '.dat')
model2 = joblib.load('./model/xgb_Save/sfm3-' + str(best_score) + '.dat')

#### LGBM selection
start2 = time.time()
model_LGBM = LGBMClassifier()
model_LGBM.fit(x_train, y_train)
score = model_LGBM.score(x_test, y_test)
print("acc : ", score)

thresholds = np.sort(model_LGBM.feature_importances_)
print(thresholds)
print(x_train.shape)
print("========================")

best_x_train = x_train
best_x_test = x_test  # fixed: was `best_x_train = x_test`, overwriting the train split
best_score = score
best_model = model_LGBM

for thresh in thresholds:
    selection = SelectFromModel(model_LGBM, threshold=thresh, prefit=True)
rskf = RepeatedStratifiedKFold(  # reconstructed: the call head was truncated
    random_state=None)
scores = np.array([])
# Make k-fold CV
for train_index, test_index in rskf.split(data, target):
    # Initialize model
    clf = LGBMClassifier(learning_rate=best_learning_rate,
                         min_data_in_leaf=best_min_data_in_leaf,
                         num_leaves=best_num_leaves)
    # Split data
    X_train, X_test = data[train_index], data[test_index]
    y_train, y_test = target[train_index], target[test_index]
    # Fit and score the model on the held-out fold
    # (despite the name, train_score holds the test-fold accuracy).
    clf.fit(X_train, y_train)
    train_score = clf.score(X_test, y_test)
    scores = np.append(scores, train_score)

# Print final score (mean +/- standard error over folds)
with open('ris/OUT-score_alglorithms.txt', mode='a') as f:
    print('Average score:',
          scores.mean(),
          '+-',
          scores.std() / np.sqrt(n_splits),
          file=f)

#####################
# Data augmentation #
#####################
params = {
from sklearn.metrics import precision_score

# Importing the dataset
X = np.load('./project/mini/data/X.npy')
y = pd.read_csv('./project/mini/data/y_label.csv', header=0).iloc[:, 0]
X = X.reshape(X.shape[0], X.shape[1] * X.shape[2])

# Splitting the dataset into the Training set and Test set
x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42,
                                                    stratify=y)

# Feature Scaling
x_train /= -80
x_test /= -80

model = LGBMClassifier(objective='multiclass')
model.fit(x_train, y_train, categorical_feature=[0, 12])
print('feature_importances :', model.feature_importances_)
y_pred = model.predict(x_test)
print('Final accuracy :', model.score(x_test, y_test))
# Final accuracy : 0.5326016785022595
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import pickle

dataset = load_breast_cancer()
x = dataset.data
y = dataset.target
x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=66)

model = LGBMClassifier()
model.fit(x_train, y_train)
score = model.score(x_test, y_test)
# print(score)

thresholds = np.sort(model.feature_importances_)
# print(thresholds)

models = []         # empty list to collect models
res = np.array([])  # empty array to collect results

# Drop the least important columns one by one and retrain each time
# (a 'median' threshold is another common choice).
for thres in thresholds:
    selection = SelectFromModel(model, threshold=thres, prefit=True)
    selection_x_train = selection.transform(x_train)
    selection_x_test = selection.transform(x_test)
    model2 = LGBMClassifier(n_estimators=1000)
    model2.fit(selection_x_train, y_train)  # completed: the call was truncated in the source
def main():
    # Add arguments to script
    parser = argparse.ArgumentParser()
    # See the lightgbm Python docs for the full parameter list:
    # https://lightgbm.readthedocs.io/en/latest/Parameters.html
    parser.add_argument('--n_estimators',
                        type=int,
                        default=100,
                        help="number of boosting iterations")
    parser.add_argument('--learning_rate',
                        type=float,
                        default=0.1,
                        help="shrinkage rate")
    parser.add_argument('--max_depth',
                        type=int,
                        default=-1,
                        help="max depth for tree model")
    parser.add_argument(
        '--subsample',
        type=float,
        default=1.0,
        help="randomly select part of data without resampling. useful to "
        "speed up training and prevent over-fitting")
    args = parser.parse_args()

    run = Run.get_context()
    # np.int/np.float were removed in NumPy 1.24; use the builtins instead.
    run.log("n_estimators:", int(args.n_estimators))
    run.log("learning_rate:", float(args.learning_rate))
    # see here for more ideas: https://bit.ly/3c2zJOm & https://bit.ly/3o6OAth
    run.log("max_depth:", int(args.max_depth))
    run.log("subsample:", float(args.subsample))

    # training set
    train_split_data = run.input_datasets["output_split_train"]
    # train_split_data = train_split_data.parse_parquet_files()
    train_split_df = train_split_data.to_pandas_dataframe()
    print(train_split_df.head(10))
    x_train = train_split_df.loc[:, train_split_df.columns != 'Exited']
    y_train = train_split_df.loc[:, train_split_df.columns == 'Exited']

    # evaluation set
    test_split_data = run.input_datasets["output_split_test"]
    test_split_df = test_split_data.to_pandas_dataframe()
    x_test = test_split_df.loc[:, test_split_df.columns != 'Exited']
    y_test = test_split_df.loc[:, test_split_df.columns == 'Exited']
    print(x_train.head(10))
    print(x_test.head(10))

    # declaring our model with parameters - default and those declared in
    # our hyperparameter space
    model = LGBMClassifier(n_estimators=args.n_estimators,
                           learning_rate=args.learning_rate,
                           max_depth=args.max_depth,
                           subsample=args.subsample).fit(x_train, y_train)

    # save model; files saved in the "outputs" folder are automatically
    # uploaded into run history
    os.makedirs('./outputs/model', exist_ok=True)
    joblib.dump(model, './outputs/model/saved_model.joblib')

    accuracy = model.score(x_test, y_test)
    print(model)
    print(x_test.head(10))
    run.log("Accuracy", float(accuracy))  # source: https://bit.ly/3mTxEWR && https://bit.ly/3hgonXx
    y_pred = model.predict(x_test)
    auc_weighted = roc_auc_score(y_test, y_pred, average='weighted')
    run.log("AUC_weighted", float(auc_weighted))

    # creating a confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
lg1 = LGBMClassifier()
print(x1)
print(
    "\n.................................................................................\n"
)
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y1)
lg1.fit(x1_train, y1_train)
# Note: .score() returns accuracy here, not R^2 (the original message said "R2").
print("YOUR MACHINE LEARNED WITH THIS ACCURACY : ",
      lg1.score(x1_test, y1_test))
print(
    "\n.................................................................................\n"
)
y1_pred = lg1.predict(x1_test)
print(classification_report(y1_test, y1_pred))
precision, recall, fscore, support = score(y1_test, y1_pred)
print('precision: ', np.mean(precision))
print('recall: ', np.mean(recall))
print('fscore: ', np.mean(fscore))
from lightgbm import LGBMClassifier, plot_importance
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, f_classif, chi2

train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)
sample_submission = pd.read_csv('sample_submission.csv', index_col=0)

x = train.drop(columns='class', axis=1)  # new frame without the class column
y = train['class']                       # target labels (class)
TEST = test

# Hold out 20% of the data as a test split.
train_x, test_x, train_y, test_y = train_test_split(x, y,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

evals = [(test_x, test_y)]
lgbm = LGBMClassifier(n_estimators=1000,
                      learning_rate=0.03,
                      max_depth=12,
                      num_leaves=4000,
                      random_state=42,
                      boosting_type="goss")
lgbm.fit(train_x, train_y, early_stopping_rounds=20, eval_set=evals)

print("acc: {}".format(lgbm.score(train_x, train_y)))  # training accuracy
print("acc: {}".format(lgbm.score(test_x, test_y)))    # test accuracy

# Per-class predicted probabilities -> predicted class indices.
y_pred = np.argmax(lgbm.predict_proba(TEST), axis=1)
submission = pd.DataFrame(data=y_pred,
                          columns=sample_submission.columns,
                          index=sample_submission.index)
submission.to_csv('submission5.csv', index=True)
# full_index = np.array([95,94,82,59,0])
# data_index = np.array([44,179,112,59,82,58,84])
# data_index = np.array([0])
data_index = np.array([0, 59, 94, 95, 84, 161, 44, 179, 82, 112, 58])
# classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P',
#            'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES',
#            'INTERACTIVE', 'GAMES']
classes = ['WWW', 'MAIL', 'FTP-CONTROL', 'FTP-PASV', 'ATTACK', 'P2P',
           'DATABASE', 'FTP-DATA', 'MULTIMEDIA', 'SERVICES', 'INTERACTIVE']

# file used to train, which generates x_train, x_test, y_train, y_test
# (the `entry12` file was also resampled)
file = os.path.join(data_dir, filename)
test_file = os.path.join(data_dir, test_filename)

if __name__ == '__main__':
    acc = []
    x_train, _, y_train, _ = get_data(file)
    _, x_test, _, y_test = get_data(test_file)
    np_dir = os.path.join(data_dir, 'estimators_100_150_5.txt')
    # Sweep n_estimators and record test accuracy for each setting.
    for i in range(50, 150, 5):
        clf = LGBMClassifier(n_estimators=i)
        clf.fit(x_train, y_train)
        print(clf.get_params())
        accuracy = clf.score(x_test, y_test)
        acc.append(accuracy)
    acc = np.array(acc)
    print(acc)
    np.savetxt(np_dir, acc)

import matplotlib.lines as lines
lines.lineStyles
Y_predGB = modelGB.predict(X_valid)
print("Training Accuracy: ", modelGB.score(X_train, Y_train))
print('Testing Accuracy: ', modelGB.score(X_valid, Y_valid))
print("AUROC Score of Gradient Boosting = ", roc_auc_score(Y_valid, Y_predGB))

from lightgbm import LGBMClassifier
modelLGBM = LGBMClassifier()
modelLGBM.fit(X_train, Y_train)
Y_predLGBM = modelLGBM.predict(X_valid)
print("Training Accuracy: ", modelLGBM.score(X_train, Y_train))
print('Testing Accuracy: ', modelLGBM.score(X_valid, Y_valid))
print("AUROC Score of LGBM = ", roc_auc_score(Y_valid, Y_predLGBM))

test_Y_RF = modelRF.predict(test_X)
test_Y_XG = modelXG.predict(test_X)
test_Y_AB = modelAB.predict(test_X)
test_Y_LGBM = modelLGBM.predict(test_X)
test_Y_GB = modelGB.predict(test_X)

# Weighted averaging of the five models' predictions.
test_Y_pred = []
for i in range(len(test_Y_RF)):
    k = (0.35 * test_Y_LGBM[i] + 0.25 * test_Y_RF[i] + 0.175 * test_Y_GB[i] +
         0.125 * test_Y_XG[i] + 0.1 * test_Y_AB[i])
    test_Y_pred.append(k)
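The weighted average leaves a fractional score per row; a plausible follow-up (assumed, not shown in the source) thresholds it at 0.5 to recover binary class labels:

```python
import numpy as np

# Assumed continuation: convert weighted ensemble scores to class labels.
test_Y_pred = np.array(test_Y_pred)
test_Y_labels = (test_Y_pred >= 0.5).astype(int)
```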
model = LGBMClassifier(  # reconstructed: the constructor head was truncated
    learning_rate=0.1)   # the number of trees (n_estimators) acts like epochs
model.fit(x_train, y_train,
          verbose=True,
          eval_metric=["multi_logloss", "multi_error"],
          eval_set=[(x_train, y_train), (x_test, y_test)],
          early_stopping_rounds=100)
# results = model.evals_result()
# print("eval's results : ", results)
# y_pred = model.predict(x_test)
# r2 = r2_score(y_pred, y_test)
# print("r2 Score : %.2f%%:" % (r2 * 100.0))

score = model.score(x_test, y_test)
print("acc : ", score)

#########################################################################
# feature engineering
thresholds = np.sort(model.feature_importances_)
print(thresholds)
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    select_x_train = selection.transform(x_train)
    selection_model = LGBMClassifier()
    selection_model.fit(select_x_train, y_train)
    select_x_test = selection.transform(x_test)
x_train_data = train_set[:, 1:]
y_train_data = train_set[:, :1].reshape(-1,)
x_test_data = test_set[:, 1:]
y_test_data = test_set[:, :1].reshape(-1,)
#print("[x_train_data]", x_train_data.shape)
#print("[y_train_data]", y_train_data.shape)
#print("[x_test_data]", x_test_data.shape)
#print("[y_test_data]", y_test_data.shape)

lgb = LGBMClassifier(n_estimators=1500,
                     learning_rate=0.1,
                     max_depth=15,
                     application='binary',
                     num_leaves=30,
                     metrics='binary_logloss')
classifier = lgb.fit(x_train_data, y_train_data)
print(lgb.score(x_train_data, y_train_data))
#print(lgb.score(x_test_data, y_test_data))

y_pred = lgb.predict(x_test_data)
#y_pred = classifier.predict_proba(x_test_data)
#print(y_pred)
# for yy in y_pred:
#     print(yy)
print(confusion_matrix(y_test_data, y_pred))
print(classification_report(y_test_data, y_pred))

fig, ax = plt.subplots(figsize=(10, 20))
plot_importance(lgb, ax, max_num_features=32)
plt.show()

# model save
#joblib.dump(lgb, open('lgb.model', 'wb'))