test_X = test[feature_names]

# Training: fit a LightGBM and an XGBoost classifier on the same split, both
# with early stopping monitored on the (train, validation) evaluation sets.
print('training...')
clf = LGBMClassifier(learning_rate=0.2,
                     n_estimators=1000,
                     subsample=0.4,
                     subsample_freq=1,
                     colsample_bytree=0.4,
                     random_state=2019,
                     num_leaves=10,
                     min_child_samples=20,
                     max_depth=3)
clf.fit(train_X, train_y,
        eval_set=[(train_X, train_y), (val_X, val_y)],
        early_stopping_rounds=10)
joblib.dump(clf, 'treemodel/lgb_final.model')

# `subsample_freq`, `num_leaves` and `min_child_samples` are LightGBM
# parameter names that XGBoost does not recognise (it only warns that they are
# unused), so they are not passed to XGBClassifier.
clf_xgb = XGBClassifier(learning_rate=0.2,
                        n_estimators=1000,
                        subsample=0.4,
                        colsample_bytree=0.4,
                        random_state=2019,
                        max_depth=3)
clf_xgb.fit(train_X, train_y,
            eval_set=[(train_X, train_y), (val_X, val_y)],
            early_stopping_rounds=10)
amt_oof = np.zeros(train_num) prob_oof = np.zeros((train_num, 33)) test_pred_prob = np.zeros((x_test.shape[0], 33)) for i, (trn_idx, val_idx) in enumerate(skf.split(x_train, y_train)): print(i, 'fold...') trn_x, trn_y = x_train[trn_idx], y_train[trn_idx] val_x, val_y = x_train[val_idx], y_train[val_idx] val_repay_amt = label_amt[val_idx] val_due_amt = x_train_due_amt.iloc[val_idx] clf.fit(trn_x, trn_y, eval_set=[(trn_x, trn_y), (val_x, val_y)], early_stopping_rounds=100, verbose=5) joblib.dump(clf, '../model/lgb.pkl') # shape = (-1, 33) val_pred_prob_everyday = clf.predict_proba( val_x, num_iteration=clf.best_iteration_) prob_oof[val_idx] = val_pred_prob_everyday val_pred_prob_today = [ val_pred_prob_everyday[i][val_y[i]] for i in range(val_pred_prob_everyday.shape[0]) ] val_pred_repay_amt = val_due_amt['due_amt'].values * val_pred_prob_today print('val rmse:', np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt))) print('val mae:', mean_absolute_error(val_repay_amt,
def main():
    """Tune, train and apply a LightGBM fraud classifier.

    Reads the train/test transaction CSVs and the ``ae_result.csv`` scores,
    label-encodes the categorical columns, runs a Bayesian hyper-parameter
    search (``bayes_ops``) on 70% of the labelled rows, prints KS and AUC for
    the 70/30 split, and writes predictions for the unlabelled rows to
    ``submitops.csv``. The encoded table is also written to
    ``transaction_new.csv``.
    """
    train_transaction = pd.read_csv('../data/train_transaction.csv')
    test_transaction = pd.read_csv('../data/test_transaction.csv')
    # 'split' records where each row came from: 1 = train file, 2 = test file.
    test_transaction['split'] = 2
    train_transaction['split'] = 1
    transaction = pd.concat([train_transaction, test_transaction])

    aer = pd.read_csv('../data/ae_result.csv')
    transaction = pd.merge(transaction,
                           aer[['TransactionID', 'autoscore']],
                           on='TransactionID',
                           how='left')

    categoricalDomain = [
        'M1', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
        'P_emaildomain', 'ProductCD', 'R_emaildomain', 'card4', 'card6'
    ]

    transaction = transaction.fillna(-1)
    step1 = ('label_encode', label_encoder_sk(cols=categoricalDomain))
    pipeline = Pipeline(steps=[step1])
    transaction_new = pipeline.fit_transform(transaction)

    feature = [
        f for f in transaction_new.columns
        if f != 'TransactionID' and f != 'split' and f != 'isFraud'
    ]
    transaction_new.to_csv('transaction_new.csv', index=False)

    data = transaction_new[transaction_new['split'] == 1]
    # Copy the slice: a prediction column is assigned into `valid` below, and
    # writing into a filtered view is chained assignment (SettingWithCopy).
    valid = transaction_new[transaction_new['split'] == 2].copy()

    train, test = train_test_split(data, test_size=0.3, random_state=42)
    train_x = train[feature]
    test_x = test[feature]
    train_y = train['isFraud']
    test_y = test['isFraud']

    # Search bounds (low, high) per hyper-parameter.
    parms = {
        'num_leaves': (5, 40),
        'colsample_bytree': (0.1, 0.5),
        'drop_rate': (0.1, 1),
        'learning_rate': (0.001, 0.1),
        'max_bin': (10, 1000),
        'max_depth': (2, 5),
        'min_split_gain': (0.1, 0.9),
        'min_child_samples': (2, 10000),
        'n_estimators': (50, 2000),
        'reg_alpha': (0.1, 1000),
        'reg_lambda': (0.1, 1000),
        'sigmoid': (0.1, 1),
        'subsample': (0.1, 1),
        'subsample_for_bin': (100, 50000),
        'subsample_freq': (1, 10)
    }

    def roc_auc_score_fix(y_true, y_score):
        """Return ROC AUC, except that any score above 0.8 is reported as 0."""
        # NOTE(review): mapping high scores to 0 makes the search avoid them;
        # presumably deliberate, but confirm this is the intended objective.
        score = metrics.roc_auc_score(y_true, y_score)
        if score > 0.8:
            return 0
        else:
            return score

    # Parameter type groupings; strictly only the names present in `parms`
    # need to be listed here.
    intdeal = [
        'max_bin', 'max_depth', 'max_drop', 'min_child_samples',
        'min_child_weight', 'n_estimators', 'num_leaves', 'scale_pos_weight',
        'subsample_for_bin', 'subsample_freq'
    ]  # integer-valued parameters
    # NOTE(review): the trailing '' entry looks accidental; kept because how
    # bayes_ops consumes this list is not visible here.
    middledeal = [
        'colsample_bytree', 'drop_rate', 'learning_rate', 'min_split_gain',
        'skip_drop', 'subsample', ''
    ]  # float, restricted to the range 0..1
    maxdeal = ['reg_alpha', 'reg_lambda', 'sigmoid']  # float, may exceed 1
    others = {'is_unbalance': True, 'random_state': 24}

    bayesopsObj = bayes_ops(estimator=LGBMClassifier,
                            param_grid=parms,
                            cv=5,
                            intdeal=intdeal,
                            middledeal=middledeal,
                            maxdeal=maxdeal,
                            score_func=make_scorer(
                                score_func=roc_auc_score_fix,
                                greater_is_better=True,
                                needs_threshold=True),
                            init_points=3,
                            n_iter=10,
                            acq="ucb",
                            kappa=0.1,
                            others=others)
    bayesopsObj.run(X=train_x, Y=train_y)
    parms = bayesopsObj.baseparms
    print(parms)

    # Refit on the training part with the selected parameters and report
    # KS / AUC on both the training and the held-out part.
    clf = LGBMClassifier(**parms)
    clf.fit(train_x, train_y)
    train_y_pred = clf.predict_proba(train_x)[:, 1]
    train_ks = cal_ks_scipy(train_y_pred, train_y)
    y_pred = clf.predict_proba(test_x)[:, 1]
    test_ks = cal_ks_scipy(y_pred, test_y)
    print(train_ks, test_ks)
    tr_auc = metrics.roc_auc_score(train_y, train_y_pred)
    te_auc = metrics.roc_auc_score(test_y, y_pred)
    print(tr_auc, te_auc)

    # Score the unlabelled rows using the feature order the booster was
    # trained with (public `booster_` accessor rather than `_Booster`).
    valid['isFraud'] = clf.predict_proba(
        valid[clf.booster_.feature_name()])[:, 1]
    valid[['TransactionID', 'isFraud']].to_csv('submitops.csv', index=False)
param_grid={ 'n_estimators': [100, 300, 600], 'reg_lambda': [0.001, 0.01, 0.1, 1] }) best_clf.fit(train_data, train_label) print( "Select best LGB model with n_estimators = {} with best_score={}".format( best_clf.best_params_['n_estimators'], best_clf.best_score_)) #%% for a in [100, 300, 600, 1000]: for b in [0.0001, 0.001, 0.01, 0.1, 1, 10]: LBMclf = LGBMClassifier(random_state=50, n_jobs=-1, n_estimators=a, reg_lambda=b) LBMclf.fit(train_data, train_label) print( "The reuslt AUC_ROC of the lightGBM with n_estimators={} and reg_lambda={} on test data is" .format(a, b), roc_auc_score(test_label.tolist(), LBMclf.predict(test_data).tolist())) #%% # Make the model with the specified regularization parameter clf = LogisticRegression() best_clf = GridSearchCV(clf, scoring='roc_auc', cv=5, n_jobs=-1, param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]}) best_clf.fit(train_data, train_label)
params = { 'boosting_type': 'goss', 'objective': 'binary', 'is_unbalance': True, 'categorical_feature': [0, 1, 3, 5, 6, 12, 15, 16, 17, 18, 19, 20], 'n_jobs': 4, 'learning_rate': 0.01, #'n_estimators':n_estimators_1, 'num_leaves': 75, 'max_depth': 6, 'min_child_samples': 40, 'colsample_bytree': 0.4 } lg = LGBMClassifier(silent=False, **params) lg.fit(X_train, y_train) # ## 保存模型,用于后续测试 # In[26]: import pickle pickle.dump(lg, open("HappyBank_LightGBM_.pkl", 'wb')) # ### 特征重要性 # In[27]: df = pd.DataFrame({ "columns": list(feat_names),
subsample=0.8, colsample_bytree=0.8, random_state=2019, bagging_fraction=0.9, bagging_freq=8, lambda_l1=0.5, lambda_l2=0, cat_smooth=10, is_unbalenced='True', metric=None) print('************** training **************') print(train_x.shape, val_x.shape) clf.fit(train_x, train_y, eval_set=[(val_x, val_y)], eval_metric='auc', early_stopping_rounds=50, verbose=100) train_df = pd.read_csv('../user_data/train_df.csv') train_df = train_df.merge(serial, how='left', on=['serial_number', 'model']) train_df['dt'] = pd.to_datetime(train_df['dt'], format='%Y-%m-%d %H:%M:%S') train_df['dt_first'] = pd.to_datetime(train_df['dt_first'], format='%Y-%m-%d %H:%M:%S') train_df['days'] = (train_df['dt'] - train_df['dt_first']).dt.days train_df = train_df.merge(tag, how='left', on=['serial_number', 'model']) train_df['days_1'] = (train_df['dt'] - train_df['fault_time_1']).dt.days train_df.loc[train_df.days_1 <= 0, 'tag'] = None train_df.loc[train_df.days_1 <= 0, 'days_1'] = None train_df['days_2'] = (train_df['fault_time_1'] - train_df['dt_first']).dt.days
import gc

import numpy as np
import pandas as pd
import xgboost as xgb
from lightgbm.sklearn import LGBMClassifier
from scipy.stats import rankdata

# Load the preprocessed test data, ordered by test_id.
ts = pd.read_csv('../input/test_3comb_nmf.csv')
ts = ts.sort_values(by='test_id').reset_index(drop=True)
X_cols = ts.columns.drop(['date_time', 'test_id', 'is_arrested']).tolist()

n_ = 5
n_rows = len(ts)
preds = np.zeros([n_rows, n_])

# Bagging: fit one default LightGBM model per under-sampled training file
# (files 101 .. 101 + n_ - 1) and store its rank-normalised test scores.
for i in range(n_):
    gc.collect()
    sample_path = '../input/under_sampled_{}_lda_nmf.csv'.format(101 + i)
    d_sample = pd.read_csv(sample_path)
    # clf = xgb.XGBClassifier(eta=0.05, min_child_weight=1, subsample=0.9, colsample_bylevel = 0.2, reg_lambda=1, reg_alpha=0.4)
    clf = LGBMClassifier()
    clf.fit(d_sample[X_cols], d_sample['is_arrested'])
    positive_proba = clf.predict_proba(ts[X_cols])[:, 1]
    preds[:, i] = rankdata(positive_proba) / n_rows

# Average the per-model rank scores and write the result.
res = pd.DataFrame({
    'test_id': ts['test_id'],
    'pred_proba': preds.mean(axis=1),
})
res = res.sort_values(by='test_id')
res.to_csv('../output/sub.csv')
) fea_imp_list = [] clf = LGBMClassifier(learning_rate=0.01, n_estimators=6000, num_leaves=255, subsample=0.9, colsample_bytree=0.8, random_state=2019, metric=None, n_jobs=20) print('************** training **************') clf.fit( train_x, train_y, eval_set=[(val_x, val_y)], eval_metric='auc', # categorical_feature=cate_cols, early_stopping_rounds=200, verbose=50) print('runtime:', time.time() - t) print('************** validation result **************') best_rounds = clf.best_iteration_ best_score = clf.best_score_['valid_0']['auc'] val_pred = clf.predict_proba(val_x)[:, 1] fea_imp_list.append(clf.feature_importances_) print('runtime:', time.time() - t) print( '=============================================== whole dataset training ===============================================' )
) fea_imp_list = [] clf = LGBMClassifier( learning_rate=0.07, n_estimators=300, num_leaves=512, subsample=0.8, max_depth=-1, colsample_bytree=0.8, random_state=2018, is_unbalenced='True', objective='multiclass', # metric=['binary_logloss'] ) # 'binary_error','xentropy' print('************** training **************') clf = clf.fit( train_x[smartcol], train_y, eval_set=[(val_x[smartcol], val_y)], eval_metric=['multi_logloss'], # ,'binary_error','xentropy' categorical_feature='auto', early_stopping_rounds=50, verbose=50) tag_pred = clf.predict_proba(data_45678[smartcol]) fea_imp_list.append(clf.feature_importances_) # 保存模型 with open('tag_model.pkl', 'wb') as pickle_file: pickle.dump(clf, pickle_file)
# Rows whose TARGET is the placeholder -9999 form the prediction set.
test = data[data.TARGET == -9999]

# `axis` must be passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and removed in pandas 2.0.
X = train.drop(['EID', 'TARGET'], axis=1).values
y = train.TARGET.values.ravel()

clf = LGBMClassifier(boosting_type='gbdt',
                     objective='binary',
                     max_depth=-1,
                     learning_rate=0.01,
                     n_estimators=2000,
                     subsample=0.6,
                     colsample_bytree=0.6,
                     reg_alpha=5.39,
                     reg_lambda=10,
                     num_leaves=2**6,
                     min_child_weight=10,
                     min_split_gain=0.05,
                     scale_pos_weight=1,
                     random_state=999,
                     n_jobs=-1)
clf.fit(X, y)


def get_res(clf, path='cv.07470.csv'):
    """Write positive-class probabilities for the module-level `test` rows.

    Args:
        clf: fitted classifier exposing ``predict_proba``.
        path: destination CSV; columns are EID, FORTARGET (always 0) and PROB.
    """
    res = clf.predict_proba(test.drop(['EID', 'TARGET'], axis=1))[:, 1]
    test[['EID']].assign(FORTARGET=0, PROB=res).to_csv(path, index=False)


if __name__ == '__main__':
    get_res(clf)
def train_lightgbm(verbose=True):
    """Train a boosted tree with LightGBM.

    Runs a grid search over ``all_params`` with 5-fold stratified CV, scoring
    each combination by mean validation log-loss, then refits the best
    combination on all data.

    Args:
        verbose: forwarded to ``LGBMClassifier.fit`` to control eval logging.

    Returns:
        The ``LGBMClassifier`` fitted on the full data set with the best
        parameters found.
    """
    logger.info("Training with LightGBM")
    df = pd.read_csv(STAGE1_LABELS)

    # One feature array per labelled id, loaded from FEATURE_FOLDER.
    x = np.array([
        np.load(FEATURE_FOLDER + '/%s.npy' % str(sample_id))
        for sample_id in df['id'].tolist()
    ])
    # `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
    y = df['cancer'].values

    logger.info('data size: {}'.format(x.shape))

    all_params = {
        'max_depth': [3, 5, 10],
        'learning_rate': [0.06, 0.1, 0.2],
        'n_estimators': [1500],
        'min_child_weight': [0],
        'subsample': [1],
        'colsample_bytree': [0.5, 0.6],
        'boosting_type': ['gbdt'],
        'seed': [2261]
    }

    # Start from +inf so the first evaluated combination is always accepted
    # and `min_params` cannot remain None.
    min_score = float('inf')
    min_params = None
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=871)

    for params in ParameterGrid(all_params):
        list_score = []
        for train, test in cv.split(x, y):
            trn_x = x[train]
            val_x = x[test]
            trn_y = y[train]
            val_y = y[test]

            clf = LGBMClassifier(**params)
            clf.fit(trn_x,
                    trn_y,
                    eval_set=[(val_x, val_y)],
                    verbose=verbose,
                    early_stopping_rounds=300)

            _score = log_loss(val_y, clf.predict_proba(val_x)[:, 1])
            list_score.append(_score)

        score = np.mean(list_score)

        # Reuse the early-stopped round count of the last fold as the tree
        # count. Current LightGBM exposes it as `best_iteration_`; older
        # releases used `best_iteration`. Keep the grid value when early
        # stopping did not fire (attribute is None or 0).
        best_iteration = getattr(clf, 'best_iteration_', None)
        if best_iteration is None:
            best_iteration = getattr(clf, 'best_iteration', None)
        if best_iteration:
            params['n_estimators'] = best_iteration

        logger.info('param: %s' % (params))
        logger.info('score: %s (avg %s min %s max %s)' %
                    (score, np.mean(list_score), np.min(list_score),
                     np.max(list_score)))
        if min_score > score:
            min_score = score
            min_params = params
        logger.info('best score: %s' % min_score)
        logger.info('best_param: %s' % (min_params))

    # Final model: best parameters, all data, no early stopping.
    clf = LGBMClassifier(**min_params)
    clf.fit(x, y)

    return clf
subsample_for_bin=800, n_jobs=4) # # specify your configurations as a dict # param_grid_xgboost={'min_child_samples':np.arange(10,100,10)} # start_time=time.clock() # grid_lgb=GridSearchCV(lgb,param_grid_xgboost,cv=5,scoring='accuracy') # grid_lgb.fit(X,y) # endtime=time.clock() # print('score',grid_lgb.grid_scores_) # print('Xgboost_best_estimator_',grid_lgb.best_estimator_) # print('Xgboost_best_score_',grid_lgb.best_score_) # print('Xgboost_best_params_',grid_lgb.best_params_) # print("run_time",endtime-start_time) start_time = time.clock() score_all = 0 kf = KFold(n_splits=5, shuffle=True) for train, test in kf.split(X): print(len(train), len(test)) X_train = X[train] X_test = X[test] y_train = y[train] y_test = y[test] lgb.fit(X_train, y_train) preds = lgb.predict(X_test) score = accuracy_score(y_test, preds) print("score:", score) score_all = score_all + score print("score_all", score_all / 5) endtime = time.clock() print("run_time", endtime - start_time)
def multi_machine_learing_models(data_train, data_cv):
    """Fit six classifiers, report their metrics, persist them and vote.

    Each model is fitted, scored on ``data_cv`` (accuracy, precision, recall),
    saved with ``joblib`` under ``classifier_model``, and contributes its
    0/1 prediction multiplied by a fixed weight to a per-sample vote. A sample
    is finally labelled 1 when its accumulated vote is at least 3.

    Args:
        data_train: DataFrame with 'URL', 'label' and feature columns.
        data_cv: DataFrame with the same columns, used for evaluation.
    """
    print('正在训练模型!')
    # NOTE(review): the evaluation rows are appended to the training set, so
    # every metric printed below is measured on data the models have seen.
    data_train = pd.concat([data_train, data_cv], axis=0)

    # Label encoding: 'good' -> 0, anything else -> 1.
    y_train = data_train['label'].apply(lambda x: 0 if x == 'good' else 1)
    y_test = data_cv['label'].apply(lambda x: 0 if x == 'good' else 1)
    X_train = data_train.drop(['URL', 'label'], axis=1)
    X_test = data_cv.drop(['URL', 'label'], axis=1)

    # (accuracy banner, estimator, vote weight, output file).
    # Paths are raw strings: '\c' is an invalid escape sequence in a normal
    # literal (SyntaxWarning on Python 3.12+); the resulting text is unchanged.
    classifiers = [
        ('\nbayes模型的准确度:', BernoulliNB(), 1,
         r'classifier_model\c_bayes.model'),
        ('LGBMClassifier模型的准确度:',
         LGBMClassifier(n_estimators=200, objective='binary'), 3,
         r'classifier_model\c_LGB.model'),
        # 100 boosting iterations
        ('ada模型的准确度:', AdaBoostClassifier(n_estimators=100), 2,
         r'classifier_model\c_ada.model'),
        ('\nrf模型的准确度:',
         RandomForestClassifier(n_estimators=100, oob_score=True), 3,
         r'classifier_model\c_rf.model'),
        ('\ndecision_tree模型的准确度:', tree.DecisionTreeClassifier(), 2,
         r'classifier_model\c_decision_tree.model'),
        ('\nLogisticRegression模型的准确度:', LogisticRegression(), 2,
         r'classifier_model\c_lgs.model'),
    ]

    vote = [0] * len(y_test)
    for banner, model, weight, filename in classifiers:
        model.fit(X_train, y_train)
        print(banner, model.score(X_test, y_test))
        predict = model.predict(X_test)
        # Accumulate this model's weighted 0/1 prediction per sample.
        vote = [weight * p + v for p, v in zip(predict, vote)]
        precision = metrics.precision_score(y_test, predict)
        recall = metrics.recall_score(y_test, predict)
        print("precison:", precision)
        print("recall:", recall)
        joblib.dump(model, filename)

    print('\n投票结果:')
    # Final decision: positive when the weighted vote reaches 3.
    vote_r = [1 if v >= 3 else 0 for v in vote]

    precision = metrics.precision_score(y_test, vote_r)
    recall = metrics.recall_score(y_test, vote_r)
    acc = metrics.accuracy_score(y_test, vote_r)
    print('准确度:', acc)
    print("precison:", precision)
    print("recall:", recall)
clf = LGBMClassifier( learning_rate=i_params["learning_rate"][j], n_estimators=i_params["n_estimators"][j], num_leaves=i_params["num_leaves"][j], subsample=0.8, colsample_bytree=0.8, random_state=2019, is_unbalenced='True', metric=None) print('************** training **************') print(train_x.shape) clf.fit( train_x, train_y, eval_set=[(train_x, train_y)], eval_metric='auc', early_stopping_rounds=10, verbose=10 ) # 保存模型 joblib.dump(clf, './model/model_saved/lgb_voting_{}.pkl'.format(model_index)) model_index += 1 # 预测部份 test_data_dir = "./data/disk_sample_smart_log_round2" test_file_list = os.listdir(test_data_dir) submit = pd.DataFrame([]) new_disks = [] for day in pd.date_range("2018-08-20", "2018-09-30"): print("start predicting for {}".format(day.strftime("%Y-%m-%d")))
def train_k_fold_lgbm(X, y, features, FOLDS=5, RANDOM_STATE=707, PARAM_COMBINATION=40):
    """Random-search LightGBM hyper-parameters, then refit on useful features.

    A ``RandomizedSearchCV`` (macro-F1, stratified K-fold) is run over
    ``X[features]``. Features with non-zero importance in the best estimator
    are kept, and a new classifier with the best estimator's parameters is
    fitted on all rows using only those features.

    Args:
        X: DataFrame holding at least the columns in ``features``.
        y: target labels aligned with ``X``.
        features: candidate feature column names.
        FOLDS: number of stratified folds.
        RANDOM_STATE: seed for the fold split and the random search.
        PARAM_COMBINATION: number of parameter settings sampled.

    Returns:
        Tuple ``(feature_importance, final_features, lgbm_hyped)``: the final
        model's importance table, the retained feature names, and the model.
    """

    def _importance_table(model, names):
        # Importances indexed by feature name, largest first.
        table = pd.DataFrame(model.feature_importances_,
                             index=names,
                             columns=['importance'])
        return table.sort_values('importance', ascending=False)

    print(f'X shape: {X.shape}')

    base_model = LGBMClassifier(learning_rate=0.1,
                                n_estimators=450,
                                max_depth=7,
                                min_child_weight=1,
                                subsample=0.8,
                                class_weight='balanced',
                                boosting='gbdt')
    search_space = {
        'num_leaves': [6, 12, 24, 64],
        'max_depth': [3, 5, 7, 14],
        'min_data_in_leaf': [20, 40, 80],
        'min_sum_hessian_in_leaf': [1e-5, 1e-2, 1, 1e2, 1e4],
        'bagging_fraction': [tenths / 10.0 for tenths in range(7, 11)],
        'bagging_freq': [0, 5, 10, 20, 30],
        'feature_fraction': [tenths / 10.0 for tenths in range(3, 7)],
        'lambda_l1': [0, 1e-5, 1e-2],
        'lambda_l2': [0, 1e-5, 1e-2]
    }
    print('lgbm params: ', search_space)

    folds = StratifiedKFold(n_splits=FOLDS,
                            shuffle=True,
                            random_state=RANDOM_STATE)
    search = RandomizedSearchCV(estimator=base_model,
                                param_distributions=search_space,
                                scoring='f1_macro',
                                n_jobs=-1,
                                pre_dispatch='2*n_jobs',
                                cv=folds.split(X.loc[:, features], y),
                                verbose=1,
                                random_state=RANDOM_STATE,
                                n_iter=PARAM_COMBINATION)

    print(f'randomcv shape: {X.loc[:, features].shape}')
    search.fit(X=X.loc[:, features], y=y)

    # Keep only the features the best estimator actually used.
    searched_importance = _importance_table(search.best_estimator_, features)
    used_mask = searched_importance['importance'] > 0
    feature_hyped = searched_importance[used_mask].index

    lgbm_hyped = LGBMClassifier(**search.best_estimator_.get_params())

    print('Training on whole population with best parameters and features...')
    final_features = list(feature_hyped)
    print(f'training final shape: {X.loc[:, final_features].shape}')
    lgbm_hyped.fit(X=X.loc[:, final_features], y=y)

    feature_importance = _importance_table(lgbm_hyped, final_features)

    print('Finished!')
    return feature_importance, final_features, lgbm_hyped
'colsample_bytree': 0.9497036, 'subsample': 0.8715623, 'max_depth': 8, 'reg_alpha': 0.041545473, 'reg_lambda': 0.0735294, 'min_child_weight': 2, 'silent': -1, 'verbose': -1, 'objective': 'binary', 'seed': 3 } model_single = LGBMClassifier(**lgb_params_classif2) model_single.fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=False, early_stopping_rounds=10) preds_val = model_single.predict_proba(x_val) print('Acc single model :', acc(y_val, preds_val)) model = CrossValClassifier(LGBMClassifier(**lgb_params_classif2), n_split=10) model.fit(x_train, y_train, x_val, y_val, eval_metric=acc) model.save_models('test_lgbclassifcv.pkl') del model with open('test_lgbclassifcv.pkl', 'rb') as f: model = pickle.load(f) preds = model.predict_proba(x_val) print("Evaluation CV : ", acc(y_val, preds))
pf = pd.read_csv("../../Datasets/Cancer.csv")

# Features: everything except the empty trailing column, the id and the label.
X = pf.drop(['Unnamed: 32', "id", "diagnosis"], axis=1)
# Target: one-hot encode the diagnosis, drop the first level, flatten to 1-D.
diagnosis_dummies = pd.get_dummies(pf['diagnosis'], drop_first=True)
Y = np.array(diagnosis_dummies).reshape(X.shape[0])

# Split the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.21,
    random_state=42,
)

# Build the model.
LightGBM = LGBMClassifier()
LightGBM.fit(X_train, y_train)

# Predict with the model.
pred = LightGBM.predict(X_test)

# Baseline (untuned) accuracy.
baseline_accuracy = accuracy_score(y_test, pred)
print(f"İlkel başarı değeri : {baseline_accuracy}")

# Hyper-parameter grid to search.
hiperparams = {
    'max_depth': np.arange(2, 10, 2),
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1],
    'n_estimators': np.arange(200, 1000, 200)
}
clf = LGBMClassifier( objective='multiclass', num_leaves=63, learning_rate=0.01, n_estimators=10000, subsample_freq=1, subsample=0.8, colsample_bytree=0.8, min_child_weight=5, random_state=2020, n_jobs=24, ) clf.fit( X_trn, Y_trn, eval_set=[(X_val, Y_val)], early_stopping_rounds=500, verbose=200, ) print('val_acc: {:.5f}'.format( accuracy_score(Y_val, clf.predict(X_val)))) oof[val_idx] = clf.predict_proba(X_val) sub += clf.predict_proba(X_sub) / skf.n_splits print('cv_acc : {:.5f}'.format(accuracy_score(Y_train, oof.argmax(axis=1)))) print( classification_report(Y_train, oof.argmax(axis=1), target_names=lbl.classes_))
def train_merged_Model(trainset_path,
                       valset_path,
                       model_save_folder,
                       lr=0.001,
                       isLGB=True):
    """Train the merged (fused) model and save it with joblib.

    Args:
        trainset_path: CSV with columns model, days, label and p_0..p_7.
        valset_path: CSV with the same columns, used for validation.
        model_save_folder: output directory; created if missing.
        lr: learning rate for the LightGBM model.
        isLGB: fit an ``LGBMClassifier`` when True, otherwise a default
            ``RandomForestClassifier``.
    """
    os.makedirs(model_save_folder, exist_ok=True)
    cols = ['model', 'days', 'label'] + ['p_{}'.format(i) for i in range(8)]

    def load_set(set_path):
        # Read the needed columns and add 0/1 indicators for model == 1 / 2.
        frame = pd.read_csv(set_path, usecols=cols)
        frame['model_1'] = frame['model'].apply(lambda m: int(m == 1))
        frame['model_2'] = frame['model'].apply(lambda m: int(m == 2))
        labels = frame['label']
        feats = frame.drop(['label', 'model'], axis=1)
        return feats, labels

    def auc_prc(y_true, y_pred):
        # Custom eval metric tuple: (name, value, is_higher_better).
        return 'AUC_PRC', average_precision_score(y_true, y_pred), True

    trainX, trainy = load_set(trainset_path)
    valX, valy = load_set(valset_path)
    print('trainset info, shape: {},value_counts: {}'.format(
        trainX.shape, trainy.value_counts()))
    print('valset info, shape: {},value_counts: {}'.format(
        valX.shape, valy.value_counts()))

    if isLGB:
        clf = LGBMClassifier(
            num_leaves=127,
            learning_rate=lr,
            n_estimators=10000,
            objective='binary',
            is_unbalance=True,
            subsample=0.8,
            colsample_bytree=0.8,
        )
        # Early stopping on the validation set is used for LightGBM only.
        fit_kwargs = dict(eval_set=[(valX, valy)],
                          eval_metric=auc_prc,
                          early_stopping_rounds=50,
                          verbose=100)
    else:
        clf = RandomForestClassifier()
        fit_kwargs = {}

    t0 = time.time()
    clf.fit(trainX, trainy, **fit_kwargs)
    print('fit time: {:.4f}'.format(time.time() - t0))

    # NOTE(review): the RF prefix has no trailing underscore, unlike the LGBM
    # one; kept as-is so the produced file names do not change.
    prefix = 'LGBM_Merged_' if isLGB else 'RF_Merged'
    save_name = prefix + datetime.now().strftime('%Y%m%d_%H%M%S')
    joblib.dump(clf, os.path.join(model_save_folder, save_name))
    print('Merged model is saved to {}'.format(save_name))