"""Train a LightGBM classifier to predict whether an NBA rookie lasts 5+ years.

Loads the dataset from data.world, fills missing 3P% values, splits into
train/test sets, fits a default LGBMClassifier and prints test accuracy.
"""
import lightgbm as lgb
import pandas as pd  # was missing: `pd` is used below but never imported
from sklearn.model_selection import train_test_split

# Loading data into a dataframe.
df = pd.read_csv('https://query.data.world/s/67p5gkjye5vocfiqm2cuxnrkx4ijim')

# Printing first five rows / basic details (script run interactively).
df.head()
df.info()

# 3P% is missing for some rows; fill with 0. Assign back instead of
# fillna(..., inplace=True) on a column selection, which is deprecated
# under pandas copy-on-write and may not modify `df` at all.
df['3P%'] = df['3P%'].fillna(0)

# Checking data balance of the binary target.
df['TARGET_5Yrs'].value_counts().plot.bar()

# Getting target and features in different variables
# ('Name' is an identifier, not a predictive feature).
y_train = df['TARGET_5Yrs']
X_train = df.drop(['TARGET_5Yrs', 'Name'], axis=1)

# Splitting data into train and test sets (names are deliberately reused).
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train)

# Training the classifier and reporting test-set accuracy.
clf = lgb.LGBMClassifier()
clf.fit(X_train, y_train)
acc = clf.score(X_test, y_test)
print(acc)
y_val = y_train[val_idx] X_test_ = X_test.copy() print('=' * 30, f'FOLD {i+1}/{cv.get_n_splits()}', '=' * 30) with timer('target encoding'): cat_cols = X_train.select_dtypes(['object']).columns.tolist() te = TargetEncoder(cols=cat_cols) X_trn.loc[:, cat_cols] = te.fit_transform(X_trn.loc[:, cat_cols], y_trn) X_val.loc[:, cat_cols] = te.transform(X_val.loc[:, cat_cols]) X_test_.loc[:, cat_cols] = te.transform(X_test.loc[:, cat_cols]) X_trn.fillna(-9999) X_val.fillna(-9999) X_test_.fillna(-9999) with timer('fit'): model = lgb.LGBMClassifier(**lgb_params) model.fit(X_trn, y_trn, eval_set=[(X_trn, y_trn), (X_val, y_val)], **fit_params) p = model.predict_proba(X_val)[:, 1] val_series.iloc[val_idx] = p cv_results.append(roc_auc_score(y_val, p)) test_df[i] = model.predict_proba(X_test_)[:, 1] feat_df[i] = model.feature_importances_ val_df = pd.DataFrame({ 'TARGET': y_train, 'p': val_series }).to_csv(OUTPUT / f'{NAME}_cv_pred.csv', index=False)
# -*- coding: utf-8 -*-
# @Time : 2018/5/9 19:34
# @Author : LeonHardt
# @File : predictor_lgm.py
"""Train a tuned LGBMClassifier on the 'error93' feature set and save
predicted test-set probabilities to ./prediction.

Holds out 20% of the training data as a validation set for early stopping.
"""

import os

import numpy as np
from sklearn.model_selection import train_test_split
import lightgbm as lgb

from score import score_roc
# Fix: `from sklearn.externals import joblib` was deprecated in scikit-learn
# 0.21 and removed in 0.23; the standalone joblib package is the migration path.
import joblib

# Load the pre-extracted feature matrices (comma-delimited text files).
x_train = np.loadtxt(os.getcwd()+'/data_error/x_train_error93.txt', delimiter=',')
y_label = np.loadtxt(os.getcwd()+'/data_error/y_train_error93.txt', delimiter=',')
x_test = np.loadtxt(os.getcwd()+'/data_error/x_test_error93.txt', delimiter=',')

# 80/20 split; the 20% slice is only used as the early-stopping eval set.
X_train, X_test, y_train, y_test = train_test_split(x_train, y_label,
                                                    test_size=0.20,
                                                    random_state=314)

# Previously tuned hyper-parameters (high-precision values suggest an
# automated search).
# NOTE(review): is_unbalance=True and scale_pos_weight=2 are mutually
# exclusive imbalance controls in LightGBM — confirm which one is intended.
gbm = lgb.LGBMClassifier(n_estimators=4000,
                         learning_rate=0.05,
                         objective='binary',
                         is_unbalance=True,
                         colsample_bytree=0.8665631328558623,
                         min_child_samples=122,
                         num_leaves=48,
                         reg_alpha=2,
                         reg_lambda=50,
                         subsample=0.7252600946741159,
                         scale_pos_weight=2)

# Early stopping on validation AUC, logging every 100 rounds.
fit_params = {"early_stopping_rounds": 30,
              "eval_metric": 'auc',
              "eval_set": [(X_test, y_test)],
              'eval_names': ['valid'],
              'verbose': 100}
gbm.fit(X_train, y_train, **fit_params)

# Predict class probabilities for the unlabeled test set and save them.
prob = gbm.predict_proba(x_test)
np.savetxt(os.getcwd()+"/prediction/lgb4000_error_kaggle.txt", prob, delimiter=',')
def complex_lightgbm():
    """Build a small LightGBM classifier with balanced class weighting.

    The import lives inside the function so lightgbm is only required
    when this factory is actually called.
    """
    import lightgbm

    settings = {
        "max_depth": 5,
        "num_leaves": 11,
        "class_weight": "balanced",
    }
    return lightgbm.LGBMClassifier(**settings)
import multiprocessing #test = pd.read_csv('./*.csv') train = pd.read_csv('/*.csv') #feat_cols=pd.read_csv('/*.csv') #feat_cols=feat_cols.iloc[:,0].values.tolist() num_cores = multiprocessing.cpu_count() print('core: ',num_cores) #del train['target'] #---------binary------------ myscore=None myobj='binary' target=train['target'] model=lgb.LGBMClassifier(boosting_type='gbdt',random_state=4590, n_jobs=num_cores, # max_depth=-1, bagging_freq= 1, bagging_seed= 11,verbosity=0) print('BINARY') #-------------------------- #target = train['target'] #myobj='regression' #myscore='neg_mean_squared_error' #param = {'num_leaves': 31, # 'min_data_in_leaf': 30, # 'objective':'regression', # 'max_depth': 10, # 'learning_rate': 0.01, # "min_child_samples": 20, # "boosting": "gbdt", # "feature_fraction": 1, # "bagging_fraction": 0.9 ,
y_temp, stratify=y_temp, test_size=0.5, random_state=42) print('Shape of X_train:', X_train.shape) print('Shape of X_val:', X_val.shape) print('Shape of X_test:', X_test.shape) # # Selection of features and plotting feature importance # In[61]: model_sk = lgb.LGBMClassifier(boosting_type='gbdt', max_depth=7, learning_rate=0.01, n_estimators=2000, class_weight='balanced', subsample=0.9, colsample_bytree=0.8, n_jobs=-1) train_features, valid_features, train_y, valid_y = train_test_split( X_train, y_train, test_size=0.15, random_state=42) model_sk.fit(train_features, train_y, early_stopping_rounds=100, eval_set=[(valid_features, valid_y)], eval_metric='auc', verbose=200) # In[62]: feature_imp = pd.DataFrame(sorted(
'learning_rate': 0.02, 'colsample_bytree': 0.3, 'subsample': 0.7, 'subsample_freq': 2, 'num_leaves': 16, 'seed': 99 } lgb_params3 = { 'n_estimators': 110, 'max_depth': 4, 'learning_rate': 0.02, 'seed': 99 } lgb_model = lgb.LGBMClassifier(**lgb_params) lgb_model2 = lgb.LGBMClassifier(**lgb_params2) lgb_model3 = lgb.LGBMClassifier(**lgb_params3) xgmodel = xgb.XGBClassifier(max_depth=8, n_estimators=1000, min_child_weight=300, colsample_bytree=0.8, subsample=0.8, eta=0.3, seed=42) lgb_model4 = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=15, reg_alpha=0, reg_lambda=0., max_depth=-1,
def recursive_feature_elimination(train, from_backup=True):
    """
    Conduct recursive feature elimination on the given training dataset.

    :param train: training dataset; must contain 'isFraud' (the target) and
        'TransactionDT' (used only to time-order rows before fitting)
    :param from_backup: load from historical result (stored as list of
        strings) instead of re-running the expensive RFECV, defaults to True
    :return: list of top-ranked feature names
    """
    # Defaults to return the cached result of a previous RFECV run.
    if from_backup:
        return ['TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3',
                'card4', 'card5', 'card6', 'addr1', 'addr2', 'dist1',
                'P_emaildomain', 'R_emaildomain', 'C1', 'C2', 'C4', 'C5',
                'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14',
                'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D8', 'D9', 'D10',
                'D11', 'D12', 'D13', 'D14', 'D15', 'M2', 'M3', 'M4', 'M5',
                'M6', 'M7', 'M8', 'M9', 'V3', 'V4', 'V5', 'V7', 'V12',
                'V13', 'V19', 'V20', 'V29', 'V30', 'V33', 'V34', 'V35',
                'V36', 'V37', 'V38', 'V40', 'V43', 'V44', 'V45', 'V47',
                'V48', 'V49', 'V51', 'V52', 'V53', 'V54', 'V56', 'V57',
                'V58', 'V60', 'V61', 'V62', 'V69', 'V70', 'V72', 'V74',
                'V75', 'V76', 'V78', 'V81', 'V82', 'V83', 'V87', 'V90',
                'V91', 'V94', 'V95', 'V96', 'V97', 'V99', 'V100', 'V126',
                'V127', 'V128', 'V130', 'V131', 'V139', 'V140', 'V143',
                'V145', 'V149', 'V150', 'V152', 'V156', 'V158', 'V159',
                'V160', 'V162', 'V164', 'V165', 'V166', 'V167', 'V169',
                'V170', 'V171', 'V187', 'V188', 'V189', 'V200', 'V201',
                'V202', 'V203', 'V204', 'V205', 'V206', 'V207', 'V208',
                'V209', 'V210', 'V212', 'V213', 'V215', 'V216', 'V217',
                'V218', 'V219', 'V221', 'V222', 'V223', 'V224', 'V225',
                'V226', 'V228', 'V231', 'V232', 'V233', 'V234', 'V243',
                'V244', 'V251', 'V254', 'V256', 'V257', 'V258', 'V261',
                'V262', 'V263', 'V264', 'V265', 'V266', 'V267', 'V268',
                'V270', 'V271', 'V272', 'V273', 'V274', 'V275', 'V276',
                'V277', 'V278', 'V279', 'V280', 'V282', 'V283', 'V285',
                'V287', 'V288', 'V289', 'V291', 'V292', 'V294', 'V303',
                'V306', 'V307', 'V308', 'V310', 'V312', 'V313', 'V314',
                'V315', 'V317', 'V322', 'V323', 'V324', 'V326', 'V331',
                'V332', 'V333', 'V335', 'id_01', 'id_02', 'id_03', 'id_05',
                'id_06', 'id_09', 'id_13', 'id_14', 'id_15', 'id_16',
                'id_17', 'id_19', 'id_20', 'id_30', 'id_31', 'id_32',
                'id_33', 'id_38', 'DeviceType', 'DeviceInfo', 'device_name',
                'OS_id_30', 'version_id_30', 'browser_id_31',
                'version_id_31', 'screen_width', 'screen_height',
                'P_emaildomain_bin', 'P_emaildomain_suffix',
                'R_emaildomain_bin', 'R_emaildomain_suffix',
                'TransactionAmt_Log', 'TransactionAmt_decimal']

    # If desire to conduct RFE again...
    train = reduce_mem_usage(train)
    # Order rows by transaction time; KFold below is unshuffled, so the
    # 5 folds become contiguous time slices.
    X = train.sort_values('TransactionDT').drop(['isFraud', 'TransactionDT'],
                                                axis=1)
    y = train.sort_values('TransactionDT')['isFraud']
    # Free the (large) source frame before fitting.
    del train
    gc.collect()
    # Replace missing values with a -999 sentinel.
    X.fillna(-999, inplace=True)

    # Parameters chosen by BayesianOptimization.
    # Credit to this notebook:
    # https://www.kaggle.com/vincentlugat/ieee-lgb-bayesian-opt/notebook
    params = {
        'num_leaves': 491,
        'min_child_weight': 0.03454472573214212,
        'feature_fraction': 0.3797454081646243,
        'bagging_fraction': 0.4181193142567742,
        'min_data_in_leaf': 106,
        'objective': 'binary',
        'max_depth': -1,
        'learning_rate': 0.006883242363721497,
        "boosting_type": "gbdt",
        "bagging_seed": 11,
        "metric": 'auc',
        "verbosity": -1,
        'reg_alpha': 0.3899927210061127,
        'reg_lambda': 0.6485237330340494,
        'random_state': 47
    }
    import lightgbm as lgb
    clf = lgb.LGBMClassifier(**params)
    # Eliminate 10 features per step; rank candidate subsets by ROC-AUC over
    # an unshuffled 5-fold split (see time-ordering note above).
    rfe = RFECV(estimator=clf, step=10, cv=KFold(n_splits=5, shuffle=False),
                scoring='roc_auc', verbose=2)
    rfe.fit(X, y)

    # Rank 1 == survived every elimination round.
    return X.columns[rfe.ranking_ == 1].tolist()
print(X_train.shape, X_test.shape, len(y_train), len(y_test)) X_train.reset_index(drop=True, inplace=True) X_test.reset_index(drop=True, inplace=True) y_train.reset_index(drop=True, inplace=True) y_test.reset_index(drop=True, inplace=True) model_lgb = lgb.LGBMClassifier( n_jobs=4, n_estimators=100000, boost_from_average='false', learning_rate=0.01, num_leaves=64, num_threads=4, max_depth=-1, tree_learner="serial", feature_fraction=0.7, bagging_freq=5, bagging_fraction=0.7, min_data_in_leaf=100, silent=-1, verbose=-1, max_bin=255, bagging_seed=11, ) kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10) auc_scores = [] models = [] for i, (train_idx, valid_idx) in enumerate(kf.split(X_train, y_train)):
train_set = lgb.Dataset(data = train_features, label = train_labels) test_set = lgb.Dataset(data = test_features, label = test_labels) return train_set, test_set, train_features, test_features, train_labels, test_labels # In[3]: train_set, test_set, train_features, test_features, train_labels, test_labels = prepare_lgb_df(df) # In[51]: model = lgb.LGBMClassifier() default_params = model.get_params() del default_params['n_estimators'] print(default_params) cv_results = lgb.cv(default_params, train_set, num_boost_round=10000, early_stopping_rounds=100, metrics='auc', nfold=n_folds, seed=50) # In[84]:
# trainX = clf.mergeToOne(trainX,new_feature)
# testX = clf.mergeToOne(testX, new_test_features)
#
# TODO: model construction
start = time.time()

# Gradient-boosted binary classifier.
model = lgb.LGBMClassifier(
    boosting_type="gbdt",
    num_leaves=48,
    max_depth=-1,
    learning_rate=0.05,
    n_estimators=3000,
    subsample_for_bin=50000,
    objective="binary",
    min_split_gain=0,
    min_child_weight=5,
    min_child_samples=30,  # 10
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=1,
    reg_alpha=3,
    reg_lambda=5,
    # NOTE(review): feature_fraction / bagging_fraction are LightGBM aliases
    # of colsample_bytree / subsample, so both pairs being set triggers alias
    # conflicts — confirm which values are actually intended.
    feature_fraction=0.9,
    bagging_fraction=0.9,  # added this time
    seed=2019,
    n_jobs=10,
    # Fix: was `slient=True` — a typo that LGBMClassifier silently forwarded
    # into the booster params as an unknown key, so verbosity was never
    # suppressed as intended.
    silent=True,
    # NOTE(review): num_boost_round is an alias of n_estimators (both 3000
    # here) — redundant, kept for behavioral parity.
    num_boost_round=3000)

n_splits = 7
random_seed = 2019
# Stratified folds keep the class ratio stable across all 7 splits.
skf = StratifiedKFold(shuffle=True, random_state=random_seed,
                      n_splits=n_splits)
if __name__ == '__main__': np.random.seed(2707) X_train, X_test, y_train = utils.load_data(data_name='log_flipped', columns=COLUMNS) clf = None metric = 'logloss' if CLASSIFIER == 'xgb' else 'binary_logloss' if CLASSIFIER == 'xgb': clf = xgb.XGBClassifier(**PARAMS) else: par = PARAMS.copy() par['num_leaves'] = 2**par['max_depth'] del par['gamma'] del par['max_depth'] clf = lgb.LGBMClassifier(**par) if MODE == 'cv': utils.perform_cv(X_train, y_train, clf, MODEL_NAME + '-' + CLASSIFIER, fit_params={'eval_metric': metric}, stratify_labels=utils.load_stratify_labels()) elif MODE == 'ensemble': utils.VJUH(X_train, X_test, y_train, clf, MODEL_NAME, 'ensemble',
test = get_feature(op_test, trans_test, sub).fillna(-1) train = train.drop(['Tag'], axis=1).fillna(-1) label = y['Tag'] test_id = test['UID'] test = test.drop(['Tag'], axis=1).fillna(-1) lgb_model = lgb.LGBMClassifier(boosting_type='gbdt', num_leaves=100, reg_alpha=3, reg_lambda=5, max_depth=-1, n_estimators=5000, objective='binary', subsample=0.9, colsample_bytree=0.77, subsample_freq=1, learning_rate=0.05, random_state=1000, n_jobs=16, min_child_weight=4, min_child_samples=5, min_split_gain=0) skf = StratifiedKFold(n_splits=5, random_state=2018, shuffle=True) best_score = [] oof_preds = np.zeros(train.shape[0]) sub_preds = np.zeros(test_id.shape[0]) for index, (train_index, test_index) in enumerate(skf.split(train, label)): lgb_model.fit(train.iloc[train_index],
# Load the test features and the sample-submission template.
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

# Drop the columns that are not used (the building_id row identifier).
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

# Flatten the target into a 1-D array and preprocess both feature sets.
y = np.ravel(data_y.values)
X, X_tst, selec = preprocessing(data_x, y, data_x_tst)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ LightGBM...")
# NOTE(review): 'regression_l1' is a regression objective passed to the
# *classifier* wrapper — confirm this is intentional and not meant to be
# a multiclass objective for the damage_grade target.
lgbm = lgb.LGBMClassifier(objective='regression_l1', n_estimators=200,
                          n_jobs=2, num_leaves=40, scale_pos_weight=0.1)
lgbm, y_test_lgbm = validacion_cruzada(lgbm, X, y, skf)

# Retrain on the full dataset.
# The score shown is on training data, so it will be better than on test.
clf = lgbm
clf = clf.fit(X, y)
plotImp(clf, selec, X.shape[1])
y_pred_tra = clf.predict(X)
print("F1 score (tra): {:.4f}".format(f1_score(y, y_pred_tra, average='micro')))

# Predict the test set and write the submission file; the two-character
# suffix of the script's own filename tags the output.
y_pred_tst = clf.predict(X_tst)
df_submission['damage_grade'] = y_pred_tst
df_submission.to_csv("../Submissions/submission_" + sys.argv[0][-5:-3] + ".csv",
                     index=False)
def stack_test(train_x, train_y, test_x, test_y):
    """Fit a stacking ensemble and report its test-set ROC-AUC.

    Three LightGBM base learners (gbdt / dart / rf boosting, otherwise
    identical) feed their averaged probabilities into an XGBoost meta
    learner. Returns the fitted StackingClassifier.
    """
    print("start stacking test")

    # Every base learner shares this configuration; only boosting_type varies.
    shared = dict(num_leaves=50, reg_alpha=0.0, reg_lambda=1, max_depth=-1,
                  n_estimators=150, objective='binary', min_child_weight=50,
                  subsample=0.8, colsample_bytree=0.8, subsample_freq=1,
                  learning_rate=0.1, random_state=2018, n_jobs=-1)
    base_learners = [lgb.LGBMClassifier(boosting_type=bt, **shared)
                     for bt in ('gbdt', 'dart', 'rf')]

    meta_learner = XGBClassifier(max_depth=5, learning_rate=0.1,
                                 n_estimators=150,
                                 objective='binary:logistic',
                                 booster='gbtree', n_jobs=-1,
                                 min_child_weight=50, subsample=0.8,
                                 colsample_bytree=0.8, reg_alpha=0,
                                 reg_lambda=1, random_state=2018)

    # Stack on averaged class probabilities rather than hard labels.
    stack_clf = StackingClassifier(classifiers=base_learners,
                                   meta_classifier=meta_learner,
                                   use_probas=True, average_probas=True,
                                   verbose=1)
    stack_clf.fit(train_x, train_y)

    pred_score = stack_clf.predict_proba(test_x)[:, 1]
    auc_score = roc_auc_score(test_y, pred_score)
    print("auc score is {}".format(auc_score))
    return stack_clf
""" # LIGHT GBM # Instantiate classifier classifier = lgbm.LGBMClassifier( objective='binary', #metric='binary_logloss', metric = 'auc', boosting='gbdt', num_leaves=10, learning_rate=0.01, n_estimators=20000, #max_bin=50, max_bin=200, max_depth=-1, min_gain_to_split = 2, bagging_fraction=0.75, bagging_freq=5, bagging_seed=7, feature_fraction=0.5, feature_fraction_seed=7, verbose=-1, min_data_in_leaf=80, min_sum_hessian_in_leaf=11 ) # Fit the data classifier.fit(X_train, y_train,)
(executors.CSharpExecutor, C_SHARP), (executors.PowershellExecutor, POWERSHELL), (executors.RExecutor, R), (executors.PhpExecutor, PHP), (executors.DartExecutor, DART), (executors.HaskellExecutor, HASKELL), (executors.RubyExecutor, RUBY), (executors.FSharpExecutor, F_SHARP), (executors.RustExecutor, RUST), ], # These models will be tested against each language specified in the previous list. [ # LightGBM regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS)), classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS)), classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS)), # LightGBM (DART) regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_DART)), classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_DART)), classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_DART)), # LightGBM (GOSS) regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_GOSS)), classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_GOSS)), classification_binary(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_GOSS)), # LightGBM (RF) regression(lgb.LGBMRegressor(**LIGHTGBM_PARAMS_RF)), classification(lgb.LGBMClassifier(**LIGHTGBM_PARAMS_RF)),
LossFunction = modelscore, label = 'is_trade', columnname = ColumnName[1::2], # the pattern for selection start = temp, CrossMethod = CrossMethod, # your cross term method PotentialAdd = [] # potential feature for Simulated Annealing ) try: a.run() finally: with open(RecordFolder, 'a') as f: f.write('\n{}\n%{}%\n'.format(type,'-'*60)) if __name__ == "__main__": model = {'xgb': xgb.XGBClassifier(seed = 1, max_depth = 5, n_estimators = 2000, nthread = -1), 'lgb': lgbm.LGBMClassifier(random_state=1,num_leaves = 29, n_estimators=1000), 'lgb2': lgbm.LGBMClassifier(random_state=1,num_leaves = 29, max_depth=5, n_estimators=1000), 'lgb3': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=1000,max_depth=3,learning_rate = 0.09, n_jobs=30), 'lgb4': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=5000,max_depth=3,learning_rate = 0.095, n_jobs=30), 'lgb5': lgbm.LGBMClassifier(random_state=1, num_leaves = 13, n_estimators=5000,max_depth=4,learning_rate = 0.05, n_jobs=30), 'lgb6': lgbm.LGBMClassifier(random_state=1, num_leaves = 6, n_estimators=5000,max_depth=3,learning_rate = 0.05, n_jobs=8) } # algorithm group CrossMethod = {'+':add, '-':substract, '*':times, '/':divide,} RecordFolder = 'record.log' # result record file modelselect = 'lgb6' # selected algorithm
predict['predicted_score'] = model.predict_proba(predict[features]) # predict[['instance_id', 'predicted_score']].to_csv('result4_18_dnn_2.csv', index=False, sep=' ') print(logloss) if mf == 'lgb': ## gbdt算法在预测集上表现非常不好。 log_loss_list = [] kf = KFold(n_splits=5, shuffle=True, random_state=1) for train_idx, test_idx in kf.split(all_train): train = all_train.iloc[train_idx, :] test = all_train.iloc[test_idx, :] X_train = train[features] y_train = train[target] X_test = test[features] y_test = test[target] clf = lgb.LGBMClassifier(num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.06, lambda_l2=1.0) clf.fit(X_train, y_train, feature_name=features) X_test['predicted_score'] = clf.predict_proba(X_test, )[:, 1] # X_test['predicted_score'] = X_test['predicted_score'] - 0.001 log_loss_value = log_loss(y_test, X_test['predicted_score']) log_loss_list.append(log_loss_value) print('the log loss of lgb model in cv with 5 splits: ', log_loss_list) # 0.08270811820531722 # (num_leaves=50, max_depth=5, n_estimators=120, n_jobs=20): 0.08258190767889406 # (num_leaves=50, max_depth=5, n_estimators=150, n_jobs=20, learning_rate=0.1, num_iterators=1000): 0.08258016897053531 # (num_leaves=36, max_depth=5, n_estimators=150, n_jobs=20, learning_rate=0.05, lambda_l2=1.0): 0.08255235859456307 # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.05, lambda_l2=1.0): 0.08252046284038406 # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.06, lambda_l2=1.0): 0.08243291190872347 # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.07, lambda_l2=1.0): 0.08246869876021136 # (num_leaves=65, max_depth=6, n_estimators=150, n_jobs=20, learning_rate=0.08, lambda_l2=1.0): 0.08250570813361169
#X = data_x.values #X_tst = data_x_tst.values y = np.ravel(data_y.values) oversampler = sv.MulticlassOversampling(sv.distance_SMOTE(proportion=0.5)) X_sample, y_sample = oversampler.sample(X, y) #X, y = shuffle(X, y, random_state=76592621) #X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, random_state=76592621) print("------ LightGBM...") lgbm = lgb.LGBMClassifier(n_estimators=2700, learning_rate=0.08, max_bin=350, num_leaves=34, objective='multiclassova', random_state=76592621, n_jobs=2) print("------ Generando submission...") submission(X_sample, y_sample, X_tst, lgbm) ''' fit_alg = lgb.LGBMClassifier(class_weight={1:0.9, 2:0.8, 3:0.7},num_leaves=40, learning_rate=0.08, objective='multiclassova', random_state=76592621, n_jobs=-1) param_dist = { 'n_estimators':[510, 1000] } clf = GridSearchCV(fit_alg, param_dist, verbose=1, cv=2, scoring='f1_micro', n_jobs=-1) clf = clf.fit(X,y) best_param2 = clf.best_params_['n_estimators']
def train_model(self, model='lr', balance=False):
    """Train the selected model family with 10-fold CV over two epochs.

    model: one of 'lr' (logistic regression), 'svm', 'lgb' (native
        LightGBM API with early stopping) or 'lgbm' (sklearn wrapper).
    balance: forwarded to the data loader; see load_train_x_train_y_test_x.

    Each fitted fold model is dumped to <daikuan_path>/model and its
    validation AUC is printed.
    """
    kfold = KFold(n_splits=10, shuffle=True, random_state=2021)
    for epoch in range(1, 3):  # two passes over freshly loaded data
        train_x, train_y, test = self.load_train_x_train_y_test_x(balance)
        train_x, train_y = self.shuffle(train_x, train_y)
        # train_x, train_y = self.balance_data(train_x, train_y)
        for i, (train_idx, valid_idx) in enumerate(kfold.split(train_x, train_y)):
            train_xx, train_yy = train_x[train_idx], train_y[train_idx]
            valid_xx, valid_yy = train_x[valid_idx], train_y[valid_idx]
            if model == 'lr':
                lr = LogisticRegression(C=10, solver='liblinear', max_iter=100, n_jobs=1)
                lr.fit(X=train_xx, y=train_yy.reshape(-1, ))
                joblib.dump(lr, os.path.join(daikuan_path, 'model', 'lr_epoch_{}_k_{}.model'.format(epoch, i)))
                # Predict on the validation fold.
                prob = lr.predict_proba(valid_xx)
                # NOTE(review): unpacking roc_curve into i, j, k clobbers the
                # fold index `i`, so later fold filenames embed an FPR array
                # instead of the fold number — confirm whether intended.
                i, j, k = metrics.roc_curve(valid_yy, prob[:, 1])
                roc_auc = metrics.auc(i, j)
                print('lr valid roc_auc is', roc_auc)
            elif model == 'svm':
                svc = SVC(C=10, kernel='rbf', verbose=True, max_iter=-1, gamma='scale')
                svc.fit(train_xx, train_yy.reshape(-1, ))
                joblib.dump(svc, os.path.join(daikuan_path, 'model', 'svc_epoch_{}_k_{}.model'.format(epoch, i)))
                # NOTE(review): SVC.predict_proba requires probability=True at
                # construction time; as written this call raises — confirm.
                prob = svc.predict_proba(valid_xx)
                i, j, k = metrics.roc_curve(valid_yy, prob[:, 1])
                roc_auc = metrics.auc(i, j)
                print('svc valid roc_auc is', roc_auc)
            elif model == 'lgb':
                # Native LightGBM API: Dataset wrappers + early stopping on AUC.
                train_m = lgb.Dataset(train_xx, train_yy.reshape(-1, ))
                valid_m = lgb.Dataset(valid_xx, valid_yy.reshape(-1, ))
                params = {
                    'boosting_type': 'gbdt',
                    'objective': 'binary',
                    'learning_rate': 0.01,
                    'metric': 'auc',
                    'num_leaves': 14,
                    'max_depth': 19,
                    'min_data_in_leaf': 37,
                    'min_child_weight': 1.6,
                    'reg_lambda': 9,
                    'reg_alpha': 7,
                    'feature_fraction': 0.69,
                    'bagging_fraction': 0.98,
                    'bagging_freq': 96,
                    'min_split_gain': 0.4,
                    'nthread': 4
                }
                # Earlier hyper-parameter set, kept for reference:
                # params = {
                #     'boosting_type': 'gbdt',
                #     'objective': 'binary',
                #     'learning_rate': 0.01,
                #     'metric': 'auc',
                #     'num_leaves': 32,
                #     'max_depth': 6,
                #     'min_data_in_leaf': 16,
                #     'min_child_weight': 1.9,
                #     # 'min_child_weight': 4.9,
                #     'reg_lambda': 9,
                #     'reg_alpha': 7,
                #     'feature_fraction': 0.8,
                #     'bagging_fraction': 0.65,
                #     'bagging_freq': 50,
                #     'min_split_gain': 0.4
                # }
                m = lgb.train(params=params, train_set=train_m, valid_sets=valid_m,
                              num_boost_round=20000, verbose_eval=1000,
                              early_stopping_rounds=200)
                val_pre_lgb = m.predict(valid_xx)
                fpr, tpr, threshold = metrics.roc_curve(valid_yy, val_pre_lgb)
                print(fpr.shape, tpr.shape, threshold.shape)
                roc_auc = metrics.auc(fpr, tpr)
                joblib.dump(m, os.path.join(daikuan_path, 'model', 'lgb4_s_epoch_{}_k_{}.model'.format(epoch, i)))
                print('调参lightgbm单模型在验证集上的AUC:{}'.format(roc_auc))
            elif model == 'lgbm':
                # sklearn wrapper, scored once by 10-fold cross_validate on
                # the full training set rather than per-fold.
                model_lgb = lgb.LGBMClassifier(
                    boosting_type='gbdt',
                    objective='binary',
                    metric='auc',
                    learning_rate=0.1,
                    n_estimators=2000,
                    num_leaves=40,
                    max_depth=4,
                    bagging_fraction=0.85,
                    feature_fraction=0.57,
                    bagging_freq=58,
                    min_data_in_leaf=25,
                    min_child_weight=4.9,
                    min_split_gain=0.4,
                    reg_lambda=4.6,
                    reg_alpha=9.7,
                    n_jobs=4
                )
                model_lgb.fit(train_xx, train_yy)
                from sklearn.metrics import roc_auc_score
                from sklearn.model_selection import cross_validate
                c = cross_validate(model_lgb, train_x, train_y, cv=10)
                print('c', c)
                # NOTE(review): this exits after the first fold of the first
                # epoch for the 'lgbm' path — confirm that is the intent.
                return
mean_f1Train += fper_class_train['f1'] / n_splits # print('mean valf1:',mean_f1) # print('mean trainf1:',mean_f1Train) return mean_f1 xlf = xgb.XGBClassifier(max_depth=7, learning_rate=0.05, n_estimators=55, reg_alpha=0.005, n_jobs=8, importance_type='total_cover') # llf = lgb.LGBMClassifier(num_leaves=9, max_depth=5, learning_rate=0.05, n_estimators=80, n_jobs=8) clf = cab.CatBoostClassifier(iterations=60, learning_rate=0.05, depth=10, silent=True, thread_count=8, task_type='CPU', cat_features=cat_features) rf = RandomForestClassifier(oob_score=True, random_state=2020, n_estimators=70, max_depth=13,
#------------------------------------------------------------------------ from sklearn.neural_network import MLPClassifier from sklearn.svm import SVC ''' print("------ XGB...") clf = xgb.XGBClassifier(n_estimators = 500,objective='multi:softmax',n_jobs=8,max_depth=11,num_class=4) clfb, y_test_clf = validacion_cruzada(clf,X_filtered,y,skf) #''' print("------ LightGBM...") lgbmb = lgb.LGBMClassifier(objective='multiclass',n_estimators=1000,num_threads=8,max_depth=-1) #lgbmb, y_test_lgbm = validacion_cruzada(lgbm,X_filtered,y,skf,'salida7lgb') #''' ''' print("------ MLPNN...") X_filtered = preprocessing.normalize(X_filtered) nn = MLPClassifier() nnb, y_test_nn = validacion_cruzada(nn,X_filtered,y,skf,'salida5nn') #''' ''' print("------ SVC...") svc = SVC() svcb, y_test_svc = validacion_cruzada(svc,X_filtered,y,skf,'salida5svc') #'''
def optimize(self, metrics='f1_score', n_splits=3, cv_type=StratifiedKFold,
             maxevals=200, do_predict_proba=None, model_id=0,
             reuse_experiment=False):
    """Bayesian hyper-parameter search for an LGBMClassifier, then refit
    and log the best model.

    Runs HyperparameterHunter's BayesianOptimization over
    self.hyperparameter_space(), reads the best parameters back from the
    experiment JSON on disk, refits on all of self.data, logs
    params/metric/model to MLflow and pickles both the model and the
    optimizer under self.PATH.

    :param metrics: metric name HyperparameterHunter optimizes
    :param n_splits: CV folds inside each optimization trial
    :param cv_type: sklearn splitter class used for those folds
    :param maxevals: number of optimization iterations
    :param do_predict_proba: forwarded to hyperparameter_hunter.Environment
    :param model_id: suffix used in the pickled output filenames
    :param reuse_experiment: if True, do not create a new MLflow experiment
    """
    params = self.hyperparameter_space()
    extra_params = self.extra_setup()

    env = Environment(
        train_dataset=self.data,
        results_path='HyperparameterHunterAssets',
        # results_path=self.PATH,
        metrics=[metrics],
        do_predict_proba=do_predict_proba,
        cv_type=cv_type,
        cv_params=dict(n_splits=n_splits),
    )

    # optimizer = opt.GradientBoostedRegressionTreeOptimization(iterations=maxevals)
    optimizer = opt.BayesianOptimization(iterations=maxevals)
    optimizer.set_experiment_guidelines(
        model_initializer=lgb.LGBMClassifier,
        model_init_params=params,
        model_extra_params=extra_params)
    optimizer.go()

    # There are a few fixes on their way and the next few lines will soon be
    # one. At the moment, to access the best parameters one has to read
    # them back from disc.
    best_experiment = 'HyperparameterHunterAssets/Experiments/Descriptions/'+\
        optimizer.best_experiment+'.json'
    with open(best_experiment) as best:
        best = json.loads(
            best.read())['hyperparameters']['model_init_params']

    # The next few lines are the only ones related to mlflow.
    if not Path('mlruns').exists():
        # Here set the tracking_uri. If None then http://localhost:5000
        client = MlflowClient()
        n_experiments = 0
    elif not reuse_experiment:
        client = MlflowClient()
        n_experiments = len(client.list_experiments())
        experiment_name = 'experiment_' + str(n_experiments)
        client.create_experiment(name=experiment_name)
    # NOTE(review): when 'mlruns' exists AND reuse_experiment is True neither
    # branch runs, leaving n_experiments undefined for start_run below —
    # confirm whether that combination is reachable.
    with mlflow.start_run(experiment_id=n_experiments):
        model = lgb.LGBMClassifier(**best)
        X, y = self.data.drop('target', axis=1), self.data.target
        model.fit(X, y,
                  feature_name=self.colnames,
                  categorical_feature=self.categorical_columns)
        for name, value in best.items():
            mlflow.log_param(name, value)
        # Optimizer minimizes, so negate to recover the score.
        mlflow.log_metric('f1_score', -optimizer.optimizer_result.fun)
        mlflow.sklearn.log_model(model, "model")

    # Persist both the refit model and the optimizer state.
    model_fname = 'model_{}_.p'.format(model_id)
    best_experiment_fname = 'best_experiment_{}_.p'.format(model_id)
    pickle.dump(model, open('/'.join([self.PATH, model_fname]), 'wb'))
    pickle.dump(optimizer, open('/'.join([self.PATH, best_experiment_fname]), 'wb'))
'objective': ['binary'], 'random_state': [501], # Updated from 'seed' 'colsample_bytree': [0.65, 0.75], 'subsample': [0.7, 0.75, 0.8], 'reg_alpha': [0.1, 1.2], 'reg_lambda': [0.2, 1.4], } # Create classifier to use mdl = lgb.LGBMClassifier( boosting_type='gbdt', objective='binary', n_jobs=5, # Updated from 'nthread' silent=False, max_depth=params['max_depth'], max_bin=params['max_bin'], subsample_for_bin=params['subsample_for_bin'], subsample=params['subsample'], subsample_freq=params['subsample_freq'], min_split_gain=params['min_split_gain'], min_child_weight=params['min_child_weight'], min_child_samples=params['min_child_samples'], scale_pos_weight=params['scale_pos_weight']) # View the default model params: mdl.get_params().keys() # Create the grid grid = RandomizedSearchCV(mdl, gridParams, verbose=2, cv=4, n_jobs=-1) # Run the grid grid.fit(train_early_stop_x, train_early_stop_y)
y = df.sort_values('TransactionDT')['isFraud'] df = df.sort_values('TransactionDT').drop( ['isFraud', 'TransactionDT', 'TransactionID'], axis=1) df = df_utils.clean_inf_nan(df) imputer = SimpleImputer(missing_values=np.nan, strategy='median') imputer.fit(df) df = pd.DataFrame(imputer.transform(df), columns=df.columns.values.tolist()) # 1) Stratifield 5 CV Training Data scores = [] y_pred_score = np.empty(shape=[0, 2]) predicted_index = np.empty(shape=[ 0, ]) model = lgb.LGBMClassifier() model.set_params(**lgb_optimal) if gral_parameters.get('sampling') == 'Adasyn': ovs_model = ADASYN().set_params(**oversampling) X_train, y_train = ovs_model.fit_sample(df, y) fileModel = model.fit(X_train, y_train) save_params = {'base_model': fileModel, 'imputer': imputer} joblib.dump( save_params, os.path.join( os.path.join( os.path.dirname(
def experience_mnist(config, path, param):
    """Membership-inference experiment on MNIST.

    Pipeline (helpers such as custum_MNIST, Net_mnist and train_model come
    from the surrounding module):
      1. train a target model with DP noise via a PrivacyEngine-wrapped
         optimizer;
      2. train ``config.general.number_shadow_model`` shadow models;
      3. build the attack train set from the shadow models' outputs and the
         attack test set from the target model's outputs;
      4. fit a LightGBM binary attack model and report macro-averaged
         precision/recall/accuracy, overall and per MNIST class.

    Args:
        config: configuration object with ``general`` and ``learning``
            sections (seed, batch size, learning rate, epochs, ...).
        path: directory where per-model accuracy arrays are np.save()'d.
        param: experiment tag embedded in the saved .npy file names.

    Returns:
        Tuple ``(precision_general, recall_general, accuracy_general,
        precision_per_class, recall_per_class, accuracy_per_class)``.
    """
    print("START MNIST")
    use_cuda = config.general.use_cuda and torch.cuda.is_available()
    torch.manual_seed(config.general.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    # ------------------------- target model -------------------------
    print("START TRAINING TARGET MODEL")
    data_train_target = custum_MNIST(True,
                                     0,
                                     config,
                                     '../data',
                                     train=True,
                                     download=True,
                                     transform=transforms.Compose([
                                         transforms.ToTensor(),
                                         transforms.Normalize((0.1307, ),
                                                              (0.3081, ))
                                     ]))
    data_test_target = custum_MNIST(True,
                                    0,
                                    config,
                                    '../data',
                                    train=False,
                                    transform=transforms.Compose([
                                        transforms.ToTensor(),
                                        transforms.Normalize((0.1307, ),
                                                             (0.3081, ))
                                    ]))
    criterion = nn.CrossEntropyLoss()
    train_loader_target = torch.utils.data.DataLoader(
        data_train_target, batch_size=config.learning.batch_size, shuffle=True)
    test_loader_target = torch.utils.data.DataLoader(
        data_test_target, batch_size=config.learning.batch_size, shuffle=True)
    dataloaders_target = {
        "train": train_loader_target,
        "val": test_loader_target
    }
    dataset_sizes_target = {
        "train": len(data_train_target),
        "val": len(data_test_target)
    }
    print("TAILLE dataset", dataset_sizes_target)
    model_target = Net_mnist().to(device)
    optimizer = optim.SGD(model_target.parameters(),
                          lr=config.learning.learning_rate,
                          momentum=config.learning.momentum)
    # Add DP noise!  The engine clips per-sample gradients and adds Gaussian
    # noise on every optimizer step.
    privacy_engine = PrivacyEngine(
        model_target,
        batch_size=config.learning.batch_size,
        sample_size=len(train_loader_target.dataset),
        alphas=[1 + x / 10.0 for x in range(1, 100)] + list(range(12, 64)),
        noise_multiplier=1.0,  # sigma
        max_grad_norm=1.0,  # Clip per-sample gradients to this norm
    )
    privacy_engine.attach(optimizer)
    # NOTE(review): step_size is fed from `decrease_lr_factor` and gamma from
    # `decrease_lr_every` — the config names suggest the two arguments are
    # swapped; confirm against the config definition.
    exp_lr_scheduler = lr_scheduler.StepLR(
        optimizer,
        step_size=config.learning.decrease_lr_factor,
        gamma=config.learning.decrease_lr_every)
    # train_model also returns the per-sample outputs used as the attack
    # model's *test* set (target-model members/non-members).
    model_target, best_acc_target, data_test_set, label_test_set, class_test_set = train_model(
        model_target,
        criterion,
        optimizer,
        exp_lr_scheduler,
        dataloaders_target,
        dataset_sizes_target,
        num_epochs=config.learning.epochs)
    np.save(path + "/res_train_target_" + str(param) + ".npy",
            best_acc_target)

    # ------------------------- shadow models -------------------------
    print("START TRAINING SHADOW MODEL")
    all_shadow_models = []
    all_dataloaders_shadow = []
    data_train_set = []
    label_train_set = []
    class_train_set = []
    for num_model_sahdow in range(config.general.number_shadow_model):
        criterion = nn.CrossEntropyLoss()
        data_train_shadow = custum_MNIST(False,
                                         num_model_sahdow,
                                         config,
                                         '../data',
                                         train=True,
                                         download=True,
                                         transform=transforms.Compose([
                                             transforms.ToTensor(),
                                             transforms.Normalize((0.1307, ),
                                                                  (0.3081, ))
                                         ]))
        data_test_shadow = custum_MNIST(False,
                                        num_model_sahdow,
                                        config,
                                        '../data',
                                        train=False,
                                        transform=transforms.Compose([
                                            transforms.ToTensor(),
                                            transforms.Normalize((0.1307, ),
                                                                 (0.3081, ))
                                        ]))
        train_loader_shadow = torch.utils.data.DataLoader(
            data_train_shadow,
            batch_size=config.learning.batch_size,
            shuffle=True)
        test_loader_shadow = torch.utils.data.DataLoader(
            data_test_shadow,
            batch_size=config.learning.batch_size,
            shuffle=True)
        dataloaders_shadow = {
            "train": train_loader_shadow,
            "val": test_loader_shadow
        }
        dataset_sizes_shadow = {
            "train": len(data_train_shadow),
            "val": len(data_test_shadow)
        }
        print("TAILLE dataset", dataset_sizes_shadow)
        model_shadow = Net_mnist().to(device)
        optimizer = optim.SGD(model_shadow.parameters(),
                              lr=config.learning.learning_rate,
                              momentum=config.learning.momentum)
        exp_lr_scheduler = lr_scheduler.StepLR(
            optimizer,
            step_size=config.learning.decrease_lr_factor,
            gamma=config.learning.decrease_lr_every)
        # NOTE(review): the shadow model is trained on dataloaders_target /
        # dataset_sizes_target even though dataloaders_shadow was just built
        # above (and is stored in all_dataloaders_shadow but never used for
        # training) — this looks unintended; confirm against the
        # commented-out get_data_for_final_eval calls further down.
        model_shadow, best_acc_sh, data_train_set_unit, label_train_set_unit, class_train_set_unit = train_model(
            model_shadow,
            criterion,
            optimizer,
            exp_lr_scheduler,
            dataloaders_target,
            dataset_sizes_target,
            num_epochs=config.learning.epochs)
        data_train_set.append(data_train_set_unit)
        label_train_set.append(label_train_set_unit)
        class_train_set.append(class_train_set_unit)
        np.save(
            path + "/res_train_shadow_" + str(num_model_sahdow) + "_" +
            str(param) + ".npy", best_acc_sh)
        all_shadow_models.append(model_shadow)
        all_dataloaders_shadow.append(dataloaders_shadow)

    # ------------------------- attack dataset -------------------------
    print("START GETTING DATASET ATTACK MODEL")
    data_train_set = np.concatenate(data_train_set)
    label_train_set = np.concatenate(label_train_set)
    class_train_set = np.concatenate(class_train_set)
    #data_test_set, label_test_set, class_test_set = get_data_for_final_eval([model_target], [dataloaders_target], device)
    #data_train_set, label_train_set, class_train_set = get_data_for_final_eval(all_shadow_models, all_dataloaders_shadow, device)
    # Shuffle both splits deterministically with the experiment seed.
    data_train_set, label_train_set, class_train_set = shuffle(
        data_train_set,
        label_train_set,
        class_train_set,
        random_state=config.general.seed)
    data_test_set, label_test_set, class_test_set = shuffle(
        data_test_set,
        label_test_set,
        class_test_set,
        random_state=config.general.seed)
    print("Taille dataset train", len(label_train_set))
    print("Taille dataset test", len(label_test_set))

    # ------------------------- attack model -------------------------
    print("START FITTING ATTACK MODEL")
    model = lgb.LGBMClassifier(objective='binary',
                               reg_lambda=config.learning.ml.reg_lambd,
                               n_estimators=config.learning.ml.n_estimators)
    model.fit(data_train_set, label_train_set)
    y_pred_lgbm = model.predict(data_test_set)
    # Overall macro-averaged attack metrics on the target-model test set.
    precision_general, recall_general, _, _ = precision_recall_fscore_support(
        y_pred=y_pred_lgbm, y_true=label_test_set, average="macro")
    accuracy_general = accuracy_score(y_true=label_test_set,
                                      y_pred=y_pred_lgbm)
    # Same metrics restricted to the samples of each MNIST class.
    precision_per_class, recall_per_class, accuracy_per_class = [], [], []
    for idx_class, classe in enumerate(data_train_target.classes):
        all_index_class = np.where(class_test_set == idx_class)
        precision, recall, _, _ = precision_recall_fscore_support(
            y_pred=y_pred_lgbm[all_index_class],
            y_true=label_test_set[all_index_class],
            average="macro")
        accuracy = accuracy_score(y_true=label_test_set[all_index_class],
                                  y_pred=y_pred_lgbm[all_index_class])
        precision_per_class.append(precision)
        recall_per_class.append(recall)
        accuracy_per_class.append(accuracy)
    print("END MNIST")
    return (precision_general, recall_general, accuracy_general,
            precision_per_class, recall_per_class, accuracy_per_class)
# ---- SGDClassifier evaluation ------------------------------------------
# (pipe_sgdc, X_train/y_train, X_valid/y_valid, `valid` and AUC_calculate
# are defined earlier in this script.)
print(' Training accuracy of sgdc :', pipe_sgdc.score(X_train, y_train))
print(' Valid Accuracy : %.8f' % pipe_sgdc.score(X_valid, y_valid))
print(' AUC value of sgdc : ',AUC_calculate(pipe_sgdc,valid,X_valid))
print('End sgdc model .')

## LGB
print('Start lgb model :')
import lightgbm as lgb

# LightGBM classifier inside a scaling pipeline.
# Fixes vs. the original configuration:
#  * removed `sub_feature` and `feature_fraction` — both are booster aliases
#    of `colsample_bytree`, so passing all three made the setting ambiguous
#    (LightGBM warns and picks one); `colsample_bytree=0.7` is kept;
#  * replaced the booster alias `min_data_in_leaf` with its sklearn-API
#    name `min_child_samples` (same value, 10);
#  * `metric='logloss'` -> `'binary_logloss'`, the canonical metric name;
#  * dropped `early_stop=50` — not a LightGBM parameter; early stopping
#    requires `early_stopping_rounds` plus an eval_set passed to fit().
lgb_ = lgb.LGBMClassifier(
    learning_rate=0.005,
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    max_depth=7,
    num_leaves=10,
    colsample_bytree=0.7,
    min_child_samples=10,
    n_estimators=500,
    verbose=-1)
pipe_lgb = make_pipeline(StandardScaler(), lgb_)
pipe_lgb.fit(X_train, y_train)
y_pred_lgb = pipe_lgb.predict(X_valid)
print(' Training accuracy of lgb: ', pipe_lgb.score(X_train, y_train))
print(' Valid Accuracy of lgb : %.10f' % pipe_lgb.score(X_valid, y_valid))
print(' AUC value of lgb : ',AUC_calculate(pipe_lgb,valid,X_valid))
print('End lgb model .')
# NOTE(review): fragment — the opening of this native-API `params` dict and
# the lgb_train / lgb_eval Dataset objects are defined above this excerpt.
'bagging_freq': 5,
'verbose': 0,
'min_sum_hessian_in_leaf': 100
}
# Booster via the native API: at most 20 rounds, stopping early after 10
# rounds without improvement on the validation set.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=lgb_eval,
                early_stopping_rounds=10)
# 10-fold cross-validation
#y_pred = gbm.predict(x_test, num_iteration=gbm.best_iteration)
#print(y_pred)
# specify your configurations as a dict
#'metric': 'binary_logloss',
#'num_iterations':500,

# NOTE(review): `criterion` is not a LightGBM parameter — LGBMClassifier
# forwards unknown kwargs to the booster, where "rmse" is not used as an
# optimization target here; confirm this was not meant to be `objective`.
clf4 = lgb.LGBMClassifier(
    criterion="rmse")  # n_estimators: number of trees; criterion: objective
model4 = clf4.fit(x_train, y_train)
y_hat4 = model4.predict(x_test)
# NOTE(review): this accuracy expression is evaluated but its result is
# neither stored nor printed.
sum(y_hat4 == y_test) / y_test.count()
# NOTE(review): sklearn's confusion_matrix signature is (y_true, y_pred);
# predictions are passed first here, so `c` is transposed relative to the
# sklearn convention and acc1/acc2 below are per-*predicted*-class rates
# (precision-like), not per-true-class recalls. Confirm intent.
c = confusion_matrix(y_hat4, y_test)
acc1 = c[0, 0] / sum(c[0, :])
acc2 = c[1, 1] / sum(c[1, :])
print('4-1:%.2f%%' % (acc1 * 100))
print('4-2:%.2f%%' % (acc2 * 100))
#print('confuse_matrix')
#print(c)  # print the confusion matrix
# Aliases reused by code below this excerpt.
train_x = x_train
train_y = y_train
validation_x = x_test
def train_model(X, y, params, exp_path):
    """Train LightGBM classifiers with stratified K-fold CV, logged to mlflow.

    For each fold an LGBMClassifier is fit with the held-out fold as its
    eval set; out-of-fold probabilities and label predictions, fold-averaged
    normalized feature importances, per-fold metrics and a set of diagnostic
    plots are logged to a single mlflow run, and the fold models are pickled
    as a run artifact.

    Args:
        X (pd.DataFrame): feature matrix.
        y (pd.Series): binary target aligned with X.
        params (dict): {'fold': StratifiedKFold kwargs,
                        'model': LGBMClassifier kwargs,
                        'fit': LGBMClassifier.fit kwargs}.
        exp_path (str): mlflow experiment name (created if missing).

    Returns:
        tuple: (experiment_id, run_uuid) of the mlflow run.
    """
    fold_params = params['fold']
    model_params = params['model']
    fit_params = params['fit']

    # Set the mlflow experiment, tolerating one that already exists.
    try:
        mlflow.create_experiment(exp_path)
    except (mlflow.exceptions.RestException,
            mlflow.exceptions.MlflowException):
        print('The specified experiment ({}) already exists.'.format(exp_path))
    mlflow.set_experiment(exp_path)

    skf = StratifiedKFold(**fold_params)

    models = []
    metrics = []
    # Out-of-fold probability / label prediction for every row of X.
    y_proba = np.zeros(len(X))
    y_pred = np.zeros(len(X))
    # Normalized importances accumulated as a running mean over folds.
    feature_importances_split = np.zeros(X.shape[1])
    feature_importances_gain = np.zeros(X.shape[1])
    scores = defaultdict(int)  # running mean of each score over the folds

    with mlflow.start_run() as run:
        corr = pd.concat((X, y), axis=1).corr()
        log_plot(corr, pf.corr_matrix, 'correlation_matrix.png')
        log_plot(y.value_counts(), pf.label_share, 'label_share.png')

        for fold_no, (idx_train, idx_valid) in enumerate(skf.split(X, y)):
            print_devider(f'Fold: {fold_no}')
            X_train, X_valid = X.iloc[idx_train, :], X.iloc[idx_valid, :]
            y_train, y_valid = y.iloc[idx_train], y.iloc[idx_valid]

            # train model, validating on the held-out fold
            model = lgbm.LGBMClassifier(**model_params)
            model.fit(X_train,
                      y_train,
                      **fit_params,
                      eval_set=[(X_valid, y_valid)],
                      eval_names=['valid'])
            metrics.append({
                'name': model.metric,
                'values': model.evals_result_['valid'][model.metric],
                'best_iteration': model.best_iteration_
            })
            models.append(model)

            # feature importance, normalized then averaged over folds
            feature_importances_split += devide_by_sum(
                model.booster_.feature_importance(
                    importance_type='split')) / skf.n_splits
            feature_importances_gain += devide_by_sum(
                model.booster_.feature_importance(
                    importance_type='gain')) / skf.n_splits

            # out-of-fold predictions at the best iteration
            y_valid_proba = model.predict_proba(
                X_valid, num_iteration=model.best_iteration_)[:, 1]
            y_valid_pred = model.predict(X_valid,
                                         num_iteration=model.best_iteration_)
            y_proba[idx_valid] = y_valid_proba
            y_pred[idx_valid] = y_valid_pred

            # evaluate
            scores_valid = get_scores(y_valid, y_valid_pred)
            mlflow.log_metrics(
                {
                    **scores_valid,
                    'best_iteration': model.best_iteration_,
                },
                step=fold_no)
            print('\nScores')
            print(scores_valid)

            # record scores
            for k, v in scores_valid.items():
                scores[k] += v / skf.n_splits

        # log training parameters
        mlflow.log_params({
            **fold_params,
            **model_params,
            **fit_params, 'cv': skf.__class__.__name__,
            'model': model.__class__.__name__
        })

        print_devider('Saving plots')

        # scores
        log_plot(scores, pf.scores, 'scores.png')

        # feature importance
        features = np.array(model.booster_.feature_name())
        log_plot(
            (features, feature_importances_split,
             'Feature Importance: split'), pf.feature_importance,
            'feature_importance_split.png')
        log_plot(
            (features, feature_importances_gain, 'Feature Importance: gain'),
            pf.feature_importance, 'feature_importance_gain.png')

        # metric history
        log_plot(metrics, pf.metric, 'metric_history.png')

        # confusion matrix from the out-of-fold label predictions
        cm = confusion_matrix(y, y_pred)
        log_plot(cm, pf.confusion_matrix, 'confusion_matrix.png')

        # roc curve
        # FIX: the AUC must be computed from the probabilities (as the curve
        # is), not from the thresholded 0/1 predictions — the original
        # roc_auc_score(y, y_pred) understated the plotted curve's area.
        fpr, tpr, _ = roc_curve(y, y_proba)
        roc_auc = roc_auc_score(y, y_proba)
        log_plot((fpr, tpr, roc_auc), pf.roc_curve, 'roc_curve.png')

        # precision-recall curve
        # FIX: same issue — average precision is defined over scores, not
        # hard predictions; use y_proba to match the plotted curve.
        pre, rec, _ = precision_recall_curve(y, y_proba)
        pr_auc = average_precision_score(y, y_proba)
        log_plot((pre, rec, pr_auc), pf.pr_curve, 'pr_curve.png')

        # pickle trained models as a single run artifact
        models_path = 'models.pkl'
        with open(models_path, 'wb') as f:
            pickle.dump(models, f)
        mlflow.log_artifact(models_path)
        mlflow.log_param('model_path',
                         os.path.join(run.info.artifact_uri, models_path))
        os.remove(models_path)

    return run.info.experiment_id, run.info.run_uuid