def train_gbtree(X_train, y_train):
    """Shuffle the training data and fit a gradient-boosted tree classifier.

    The backend is selected by the module-level ``args.gb_tool``:
    ``'xgboost'`` builds an ``XGBClassifier`` with fixed hyper-parameters,
    any other value builds a ``CatBoostClassifier`` configured from the
    module-level ``cat_features`` and ``args.rs_model``.

    :param X_train: training features.
    :param y_train: training labels.
    :return: the fitted classifier.
    """
    print('Training model...')

    # Shuffle X and y with a fixed seed before fitting.
    X_train, y_train = shuffle(X_train, y_train, random_state=0)

    use_xgboost = args.gb_tool == 'xgboost'
    if use_xgboost:
        xgb_settings = dict(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )
        model = XGBClassifier(**xgb_settings)
    else:
        model = CatBoostClassifier(
            verbose=0,
            cat_features=cat_features,
            random_state=args.rs_model,
            # scale_pos_weight=(1 - pos_rate) / pos_rate
        )

    model.fit(X_train, y_train)

    # The two libraries expose the effective parameters under different names.
    if use_xgboost:
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True,
                   cv_folds=5, early_stopping_rounds=50):
    """Train an XGBoost binary classifier, report test metrics and importances.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first and the number of
    rows of its result is used as ``n_estimators`` for the final fit.

    :param X_train: training features; ``.values`` and ``.columns`` are used.
    :param X_test: test features.
    :param y_train: training labels; ``.values`` is used.
    :param y_test: test labels.
    :param useTrainCV: run cross-validation to choose the number of rounds.
    :param cv_folds: number of folds passed to ``xgb.cv``.
    :param early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.
    :return: ``(cvresult, alg)``; ``cvresult`` is ``None`` when
        ``useTrainCV`` is false.
    """
    alg = XGBClassifier(learning_rate=0.1,
                        n_estimators=140,
                        max_depth=5,
                        min_child_weight=3,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=1.0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

    # Bound up front so the final ``return`` cannot hit an unbound local
    # when cross-validation is skipped.
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        # One row per boosting round that was kept by xgb.cv.
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistic")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score (test set): %f" % metrics.f1_score(y_test, predictions))

    # Feature importances, largest first, as a bar chart and as text.
    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(
        by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
def train_gbtree(X_train, y_train, pos_rate, args):
    """Fit a gradient-boosted tree classifier chosen by ``args.gb_tool``.

    ``'xgboost'`` selects an ``XGBClassifier`` with fixed hyper-parameters;
    anything else selects a ``CatBoostClassifier`` configured from
    ``args.lr``, ``args.depth`` and ``args.l2``.

    :param X_train: training features.
    :param y_train: training labels.
    :param pos_rate: not used by the active code (only referenced in a
        commented-out ``scale_pos_weight`` setting).
    :param args: object providing ``gb_tool``, ``lr``, ``depth`` and ``l2``.
    :return: the fitted classifier.
    """
    print('Training model...')

    is_xgb = args.gb_tool == 'xgboost'
    if not is_xgb:
        model = CatBoostClassifier(
            verbose=0,
            # scale_pos_weight=(1 - pos_rate) / pos_rate,
            learning_rate=args.lr,
            depth=args.depth,
            l2_leaf_reg=args.l2,
        )
    else:
        model = XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )

    model.fit(X_train, y_train)

    # Each library has its own accessor for the effective parameters.
    if is_xgb:
        params = model.get_params()
    else:
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')
    return model
def xgb_cv(X, y):
    """Run 10-fold ``xgb.cv`` with a fixed XGBoost configuration and print it.

    Prints the full cross-validation table followed by the best mean
    log-loss and the best mean AUC. Nothing is returned.

    :param X: feature data handed to ``xgb.DMatrix``.
    :param y: labels handed to ``xgb.DMatrix``.
    """
    n_estimators = 100  # 70
    separator = "=" * 80

    dtrain = xgb.DMatrix(X, y)

    # XGBoost was tuned on the raw data.
    bst = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=3,
        min_child_weight=5,
        gamma=0.5,
        learning_rate=0.05,
        subsample=0.7,
        colsample_bytree=0.7,
        reg_alpha=0.001,
        seed=1,
    )

    # The native CV API takes the booster parameters and the round count
    # separately, so both are pulled out of the sklearn-style instance.
    params = bst.get_xgb_params()
    num_boost_round = bst.get_params()['n_estimators']
    cvresult = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        nfold=10,
        metrics=['logloss', 'auc'],
        seed=1,
    )

    # Full per-round table.
    print(separator)
    print("\nXGBoost results for 10-fold cross-validation:")
    print(cvresult)
    print(separator)

    # Summary of the best rounds.
    best_logloss = cvresult['test-logloss-mean'].min()
    best_auc = cvresult['test-auc-mean'].max()
    print(separator)
    print("\nXGBoost summary for 100 rounds of 10-fold cross-validation:")
    print("\nBest mean log-loss: %.4f" % best_logloss)
    print("\nBest mean AUC: %.4f" % best_auc)
    print(separator)
def done(istrain=True):
    """Train and persist the XGBoost click model, or write test predictions.

    With ``istrain`` true, the data from ``gdbt_data_get_train(25)`` is split
    into the ``'click'`` label and the remaining feature columns, the model
    is fitted through ``modelfit_cv`` once per entry of ``op`` and dumped
    under ``FLAGS.tmp_data_path``. Otherwise the dumped model is loaded, the
    second ``predict_proba`` column is rounded to 4 decimals and written next
    to the ids from ``test_id.csv``.

    :param istrain: truthy to train, falsy to predict on the test data.
    """
    # op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    # scale_pos_weight rate_drop
    op = ['reg_alpha']

    if not istrain:
        X_test = gdbt_data_get_test()
        print(X_test.shape)
        for oper in op:
            model_path = FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat'
            xgb1 = load(model_path)
            logging.debug(xgb1.get_params()['n_estimators'])

            dtrain_predprob = xgb1.predict_proba(X_test)[:, 1]
            logging.debug(dtrain_predprob)
            rounded = [round(value, 4) for value in dtrain_predprob]
            logging.debug('-'*30)
            y_pred = np.array(rounded).reshape(-1, 1)
            logging.debug(y_pred.shape)

            test_id = pd.read_csv(FLAGS.tmp_data_path+'test_id.csv')
            logging.debug(test_id['id'].shape)
            test_id['id'] = test_id['id'].map(int)
            test_id['click'] = y_pred
            out_path = FLAGS.tmp_data_path+'1-'+oper+'-xgboost.test.csv'
            test_id.to_csv(out_path, index=False)
        return

    train_save = gdbt_data_get_train(25)

    # The seeded draw is kept so the global NumPy RNG state stays the same;
    # the sub-sampling that consumed it is disabled.
    np.random.seed(999)
    np.random.uniform(0, 1, train_save.shape[0])
    # train_save = train_save.ix[r1 < 0.2, :]
    print(train_save.shape)

    y_train = train_save['click']
    train_save.drop('click', axis=1, inplace=True)
    X_train = train_save

    xgb1 = XGBClassifier(**gbtree_param,
                         objective='binary:logistic',
                         eval_metric=['logloss'],
                         nthread=-1,
                         verbose=2,
                         seed=27,
                         silent=True,
                         **gpu_dict)

    for i, oper in enumerate(op):
        modelfit_cv(xgb1, X_train, y_train, cv_folds=kfold, cv_type=oper, random_state=i)
        logging.debug(oper+":to save validation predictions ...")
        model_path = FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat'
        ret = dump(xgb1, model_path)
        logging.debug(ret)
def get_params(self, deep=True):
    '''
    Return the estimator parameters as seen by ``XGBClassifier``.

    A hack to make it work through the XGB code. They use the base class 0
    to retrieve the parameters. Since base_class[0] is overwritten as
    OnehotEncodingClassifierMixin, the instance (and the bases of its class)
    are temporarily re-pointed at ``XGBClassifier`` for the duration of the
    call.

    The original class and bases are always restored, even when
    ``XGBClassifier.get_params`` raises, and the instance goes back to its
    own class rather than a hard-coded one, so subclasses keep working.

    :param deep: forwarded to ``XGBClassifier.get_params``.
    :return: the parameter dict produced by ``XGBClassifier.get_params``.
    '''
    own_class = self.__class__
    # ``__bases__`` is an immutable tuple, so holding a reference is enough.
    orig_bases = own_class.__bases__

    own_class.__bases__ = (XGBClassifier, )
    try:
        self.__class__ = XGBClassifier
        return XGBClassifier.get_params(self, deep=deep)
    finally:
        # Undo the swap in reverse order so the class is never left patched.
        self.__class__ = own_class
        own_class.__bases__ = orig_bases
def xgboost_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """
    Fit an XGBoost classifier on the data returned by ``ETL_k_mer``.

    The model is trained on the train split, evaluated on the test split and
    the outcome is handed to ``main_stats_model`` together with the run
    description.

    :param k: k-mer size; forwarded to ``ETL_k_mer`` and used to fill the
        ``{}`` placeholder of ``model_preprocessing``.
    :param sequence_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param primers_origin: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param taxonomy_level: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param selected_primer: forwarded to ``ETL_k_mer`` and ``main_stats_model``.
    :param model_preprocessing: description template, formatted with ``k``.
    :param test_size: forwarded to ``main_stats_model`` only.
    :return: ``(test_size, prop_main_class, accuracy)`` as returned by
        ``main_stats_model``.
    """
    preprocessing_label = model_preprocessing.format(k)

    # Settings shared by the data loader and the reporting helper.
    dataset_spec = dict(
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
    )
    X_train, X_test, y_train, y_test = ETL_k_mer(k=k, **dataset_spec)

    XGB = XGBClassifier(silent=0, eta=0.3, max_depth=3, n_estimators=100)
    fitted = XGB.fit(X_train, y_train)
    y_pred = fitted.predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_{}'.format(k),
        model_parameters=XGB.get_params(),
        model_preprocessing=preprocessing_label,
        test_size=test_size,
        k=k,
        feature_importances=XGB.feature_importances_,
        xgb_model=XGB,
        save_model=True,
        save_tree=20,
        **dataset_spec)

    return test_size, prop_main_class, accuracy
def __init__(
    self,
    model: XGBClassifier,
    feature_names: List[str],
    classification_labels: Optional[List[str]] = None,
):
    """Initialise from an ``XGBClassifier`` and record its number of classes.

    The booster, ``base_score`` and ``objective`` of ``model`` are passed to
    the parent initialiser along with the feature names and labels.

    :param model: the classifier to read from.
    :param feature_names: feature names, forwarded to the parent class.
    :param classification_labels: optional labels, forwarded to the parent
        class.
    """
    booster = model.get_booster()
    super().__init__(
        booster,
        feature_names,
        model.base_score,
        model.objective,
        classification_labels,
    )

    known_classes = model.classes_
    if known_classes is not None:
        self._num_classes = len(known_classes)
        return

    # No class list on the model: derive the count from the number of
    # trees in the booster divided by the configured ``n_estimators``.
    n_estimators = model.get_params()["n_estimators"]
    num_trees = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
    self._num_classes = num_trees // n_estimators
def opt_BDT(input, output, params, show, names):
    """Tune the number of boosting rounds by CV, fit, and report test scores.

    The data is split 80/20 with a fixed seed. ``xgb.cv`` (5 folds, AUC,
    early stopping after 30 rounds) determines ``n_estimators``; the model is
    then fitted on the training part and accuracy and AUC on the held-out
    part are printed.

    :param input: feature data.
    :param output: labels.
    :param params: keyword arguments for ``XGBClassifier``.
    :param show: when true, draw the separation, ROC and feature-importance
        plots (uses the module-level ``channel`` and ``selection``).
    :param names: feature names assigned to the booster before plotting
        importances.
    """
    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()

    X_train, X_test, y_train, y_test = train_test_split(
        input, output, test_size=0.2, random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)

    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    # One row per boosting round that survived early stopping.
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")

    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)
    # The original format string ended in a lone '%', which makes the
    # %-operator raise "ValueError: incomplete format".
    print("Accuracy: %.2f%%; AUC = %.4f" % (accuracy * 100, auc))

    if show:
        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        # NOTE(review): only the message is printed; nothing in this
        # function actually writes ``modelname``.
        print("Save to %s" % modelname)
        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)

        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
def grid_search_para(train_data, label, best_para=0, grid_param=0,
                     is_search_estimator=False, search_lr=0.1,
                     scoring='accuracy', search_estimators=10000, iid=False,
                     cv=skfold):
    """Refine XGBoost parameters by grid search or by an estimator-count search.

    With ``is_search_estimator`` false, a ``GridSearchCV`` over ``grid_param``
    is run on an ``XGBClassifier(**best_para)`` and the winning values are
    merged into ``best_para`` (the passed dict is updated in place).
    Otherwise ``n_estimators`` is determined by ``xgb_cv`` starting from
    ``search_estimators`` rounds at learning rate ``search_lr``.

    :param train_data: training features.
    :param label: training labels.
    :param best_para: current best parameters as a dict; ``0`` means
        "none yet".
    :param grid_param: dict mapping parameter names to candidate values;
        required when ``is_search_estimator`` is false.
    :param is_search_estimator: choose the estimator-count search instead of
        the grid search.
    :param search_lr: learning rate used for the estimator-count search.
    :param scoring: scoring passed to ``GridSearchCV``.
    :param search_estimators: upper bound of rounds for the estimator search.
    :param iid: forwarded to ``GridSearchCV``.
        NOTE(review): recent scikit-learn releases no longer accept this
        argument - confirm the pinned version.
    :param cv: cross-validation splitter passed to ``GridSearchCV``.
    :return: the updated parameter dict.
    """
    if not is_search_estimator:
        # The ``0`` sentinel used to be unpacked directly, which raised
        # TypeError; treat it as "no parameters chosen yet".
        if best_para == 0:
            best_para = {}

        # Announce every range; the search itself covers the whole grid.
        for key, value in grid_param.items():
            print('start GridSearchCV {} in range {}'.format(key, value))

        xgb_ = XGBClassifier(**best_para)
        grid_search = GridSearchCV(estimator=xgb_, param_grid=grid_param,
                                   scoring=scoring, iid=iid, cv=cv)
        grid_search.fit(train_data, label)
        best_para.update(grid_search.best_params_)
        print('the best parameter is ', grid_search.best_params_)
        print('the best score is %f' % grid_search.best_score_)
    else:
        xgb_ = XGBClassifier()
        if best_para == 0:
            # Start from the library defaults.
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBClassifier(**best_para)
        best_estimator = xgb_cv(xgb_, train_data, label)
        best_para['n_estimators'] = best_estimator
    return best_para
def _class_probability_columns():
    """Return the 22 labels '1-0' .. '1-10', '2-0' .. '2-10' for the probability columns."""
    return [str(i) + '-' + str(j) for i in [1, 2] for j in range(11)]


def _load_leaf_booster():
    """Load the booster saved as 'xgb_new_features.model' under FLAGS.tmp_data_path."""
    booster = xgb.Booster({'nthread': -1})  # init model
    booster.load_model(FLAGS.tmp_data_path + 'xgb_new_features.model')  # load data
    return booster


def _encode_leaf_features(xgb_leaves, n_trees):
    """Convert per-tree leaf indices into codes that are unique across trees.

    Column ``i`` of ``xgb_leaves`` becomes ``'xgb_basis<i>'``; its values are
    replaced by their category codes, offset by the number of codes used by
    all previous columns.
    """
    new_pd = pd.DataFrame()
    for i in range(n_trees):
        new_pd['xgb_basis' + str(i)] = xgb_leaves[:, i]

    idx_base = 0
    for i in range(n_trees):
        vn = 'xgb_basis' + str(i)
        _cat = np.asarray(new_pd[vn].astype('category').values.codes, dtype='int32')
        new_pd[vn] = _cat + idx_base
        idx_base += _cat.max() + 1
    return new_pd


def done(istrain='train'):
    """Run one stage of the multi-class XGBoost pipeline.

    Supported values of ``istrain``:

    * ``'train'`` - fit via ``modelfit_multi_cv`` on normalised, PCA-reduced
      training data and dump the model under ``FLAGS.tmp_data_path``.
    * ``'eval'`` - load the dumped model and log ``test_score`` on the
      evaluation data.
    * ``'train_predict'`` / ``'test_predict'`` - load the saved booster,
      predict leaf indices and write them as encoded features to
      ``xgb_new_train_features.csv`` / ``xgb_new_test_features.csv``.
    * ``'test'`` - load the dumped model and write class probabilities next
      to the device ids.

    Any other value only logs ``istrain``.

    :param istrain: name of the stage to run.
    """
    # op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    # scale_pos_weight rate_drop
    logging.debug(istrain)
    op = ['fin']

    if istrain == 'train':
        train_save = gdbt_data_get_train('n_class')
        print(train_save.shape)
        y_train = train_save['n_class']
        train_save.drop('n_class', axis=1, inplace=True)
        X_train = train_save

        # Normalisation, then PCA.
        X_train = data_normalization(X_train)
        X_train = data_pca(X_train)

        xgb1 = XGBClassifier(**gbtree_param,
                             objective='multi:softprob',
                             eval_metric=['mlogloss', ],
                             nthread=-1,
                             verbose=2,
                             seed=27,
                             silent=True, **gpu_dict)
        for oper in op:
            modelfit_multi_cv(xgb1, X_train, y_train, cv_type=oper)
            logging.debug(oper+":to save validation predictions ...")
            ret = dump(xgb1, FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(ret)
            gc.collect()

    elif istrain == 'eval':
        X_eval = gdbt_data_get_eval('n_class')
        print(X_eval.shape)
        y_eval = X_eval['n_class']
        X_eval.drop('n_class', axis=1, inplace=True)
        logging.debug(X_eval.shape)

        # Normalisation, then PCA.
        X_eval = data_normalization(X_eval)
        X_eval = data_pca(X_eval)

        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_eval)
            logging.debug(dtrain_predprob.shape)
            y_pred = pd.DataFrame(dtrain_predprob, columns=_class_probability_columns())
            # NOTE(review): the previous code rounded every row to 6 decimals
            # via ``apply`` but discarded the result, so the probabilities
            # were (and still are) used unrounded. Confirm whether rounding
            # is actually wanted before adding it.
            logging.debug('-'*30)
            logging.debug(test_score(y_pred, y_eval))

    elif 'train_predict' == istrain:
        train_save = gdbt_data_get_train('n_class')
        print(train_save.shape)
        y_train = train_save['n_class']
        train_save.drop('n_class', axis=1, inplace=True)
        X_train = train_save

        xgb_test_basis = _load_leaf_booster()
        dtrain = xgb.DMatrix(X_train, label=y_train)
        xgb_leaves = xgb_test_basis.predict(dtrain, pred_leaf=True)
        logging.debug(xgb_leaves.shape)

        # NOTE(review): FLAGS.n_trees is read here without being assigned in
        # this branch (the 'test_predict' branch sets it from gbtree_param);
        # presumably it is defined elsewhere - verify.
        new_pd = _encode_leaf_features(xgb_leaves, FLAGS.n_trees)
        logging.debug(new_pd.shape)
        logging.debug(new_pd.head(3))
        new_pd.to_csv(FLAGS.tmp_data_path+'xgb_new_train_features.csv', index=False)
        gc.collect()

    elif 'test_predict' == istrain:
        X_test = gdbt_data_get_test()
        logging.debug(X_test.shape)

        xgb_test_basis = _load_leaf_booster()
        dtrain = xgb.DMatrix(X_test)
        xgb_leaves = xgb_test_basis.predict(dtrain, pred_leaf=True)
        FLAGS.n_trees = gbtree_param['n_estimators']
        logging.debug(xgb_leaves.shape)

        new_pd = _encode_leaf_features(xgb_leaves, FLAGS.n_trees)
        logging.debug(new_pd.shape)
        logging.debug(new_pd.head(3))
        new_pd.to_csv(FLAGS.tmp_data_path+'xgb_new_test_features.csv', index=False)

    elif istrain == 'test':
        X_test = gdbt_data_get_test()
        print(X_test.shape)

        # Normalisation, then PCA.
        X_test = data_normalization(X_test)
        X_test = data_pca(X_test)

        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_test)
            logging.debug(dtrain_predprob.shape)
            y_pred = pd.DataFrame(dtrain_predprob, columns=_class_probability_columns())
            # NOTE(review): see the 'eval' branch - the former rounding step
            # had no effect, so unrounded probabilities are written.
            logging.debug('-'*30)
            logging.debug(y_pred)

            test_id = pd.read_csv(FLAGS.file_path+'deviceid_test.csv')
            logging.debug(test_id['device_id'].shape)
            test_id['device_id'] = test_id['device_id'].map(str)
            test_id.rename(columns={'device_id': 'DeviceID'}, inplace=True)
            fin = pd.concat([test_id, y_pred], axis=1)
            print(fin)
            fin.to_csv(FLAGS.tmp_data_path+'1-'+oper+'-xgboost.test.csv', index=False)
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
    silent=True, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1,
    max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, reg_alpha=0, reg_lambda=1,
    scale_pos_weight=1, base_score=0.5, random_state=0, seed=None, missing=None,
    sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True,
    xgb_model=None, sample_weight_eval_set=None):
    """Train an ``XGBClassifier`` on ``table`` and build the model/report dict.

    ``max_depth`` and ``n_estimators`` must be at least 1 and
    ``learning_rate`` at least 0; this is checked through ``validate``.

    :param table: data frame holding the feature and label columns.
    :param feature_cols: names of the feature columns.
    :param label_col: name of the label column.
    :param max_depth ... missing: forwarded to the ``XGBClassifier``
        constructor under the same names.
    :param sample_weight ... sample_weight_eval_set: forwarded to
        ``XGBClassifier.fit`` under the same names.
    :return: ``{'model': model}`` where ``model`` carries the feature/label
        column names, the classifier, its parameters and feature importances
        and the rendered report under ``'_repr_brtc_'``.
    """
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # Arguments are passed by keyword so the call does not depend on the
    # positional order of the library signature.
    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing)
    classifier.fit(table[feature_cols], table[label_col],
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_

    # Render the importance plot to markdown, then clear the figure.
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    params = dict2MD(get_param)
    # One row of importances with the feature names as column headers.
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
learning_rate =0.1, n_estimators=1000, max_depth=9, min_child_weight=1, gamma=0.2, subsample=0.8, colsample_bytree=0.8, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27, reg_alpha=1e-05) xgb_param = xgb_clf.get_xgb_params() xgtrain = xgb.DMatrix(x_train, label=y_train) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_clf.get_params()['n_estimators'], nfold=5, metrics='auc', early_stopping_rounds=50) xgb_clf.set_params(n_estimators=cvresult.shape[0]) xgb_clf.fit(x_train, y_train) y_pred_xgb=xgb_clf.predict(x_test) y_pred_xgb_test_data=xgb_clf.predict(test) score = accuracy_score(y_test, y_pred_xgb) f1_score_xgboost=f1_score(y_test,y_pred_xgb) print(cvresult.shape[0]) print( "\nModel Report") print( "Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred_xgb))
rfc.n_features_

# Check how well the random forest generalises.
cross_val_score(rfc, X_proc, y, cv=5)
cross_val_score(rfc, X_proc, y, cv=5).mean()

# Predict on the processed test set.
y_test = rfc.predict(X_test_proc)
y_test_df = pd.DataFrame(data=y_test, index=X_test.index)

# Switch to XGBoost.
from xgboost import XGBClassifier

# Use the default parameter configuration.
xgbc = XGBClassifier()
xgbc.fit(X_proc, y)
xgbc.get_booster()
xgbc.get_params()

# Evaluate with cross-validation.
cross_val_score(xgbc, X_proc, y, cv=5)

# Predict on the processed test set.
y_test = xgbc.predict(X_test_proc)
y_test_df = pd.DataFrame(data=y_test, index=X_test.index)

# ---------------- sample code from the hands-on Kaggle exercise ----------------
dtype = dict(PassengerId=str)
train_all = pd.read_csv("train.csv", dtype=dtype)

# Drop one column, addressed by its position in the column index.
train = train_all.drop(labels=train_all.columns[1], axis=1)
# train['PassengerId'] = train['PassengerId'].astype(str)
test = pd.read_csv("test.csv", dtype=dtype)
def get_params(self, deep=False):
    """Return the parameters of a newly constructed reference ``XGBClassifier``.

    The classifier is built on every call with ``max_depth=10``,
    ``n_estimators=1000`` and ``learning_rate=0.1``; no state of ``self`` is
    consulted.

    :param deep: forwarded to ``XGBClassifier.get_params``.
    :return: the parameter dict of that classifier.
    """
    reference_model = XGBClassifier(
        learning_rate=0.1,
        n_estimators=1000,
        max_depth=10,
    )
    return reference_model.get_params(deep=deep)
color='m') #prettify using pyplot: https://matplotlib.org/api/pyplot_api.html plt.title('Machine Learning Algorithm Accuracy Score \n') plt.xlabel('Accuracy Score (%)') plt.ylabel('Algorithm') #base model - tune 1 xgboost = XGBClassifier() base_results = model_selection.cross_validate(xgboost, data1[data1_x_bin], data1[Target], cv=cv_split) xgboost.fit(data1[data1_x_bin], data1[Target]) print('BEFORE DT Parameters: ', xgboost.get_params()) print("BEFORE DT Training w/bin score mean: {:.2f}".format( base_results['train_score'].mean() * 100)) print("BEFORE DT Test w/bin score mean: {:.2f}".format( base_results['test_score'].mean() * 100)) print("BEFORE DT Test w/bin score 3*std: +/- {:.2f}".format( base_results['test_score'].std() * 100 * 3)) #print("BEFORE DT Test w/bin set score min: {:.2f}". format(base_results['test_score'].min()*100)) print('-' * 10) #tune hyper-parameters: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier param_grid = { 'n_estimators': [50, 100, 200, 400], 'max_depth': [2, 4, 6, 8, 10], #max depth tree can grow; default is none 'random_state': [ 0
def done(istrain, X_train, y_train, flag):
    """Train, evaluate or apply the multi-class XGBoost model named by ``flag``.

    * ``'train'`` - fit via ``modelfit_multi_cv`` and dump the model.
    * ``'eval'`` - load the dumped model and log its accuracy on
      ``X_train`` / ``y_train``.
    * ``'test'`` - load the dumped model, predict class probabilities for
      ``X_train`` and write them next to the device ids.

    :param istrain: one of ``'train'``, ``'eval'`` or ``'test'``.
    :param X_train: feature data for the selected stage.
    :param y_train: labels (used by ``'train'`` and ``'eval'``).
    :param flag: prefix used in the model and output file names.
    :return: the written prediction frame in ``'test'`` mode, else ``None``.
    """
    # op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    op = ['n_estimators']
    # Bound up front: only the 'test' stage produces a result frame.
    fin = None

    if istrain == 'train':
        xgb1 = XGBClassifier(**gbtree_param,
                             objective='multi:softprob',
                             eval_metric=['mlogloss'],
                             nthread=-1,
                             verbose=1,
                             seed=27,
                             silent=True, **gpu_dict)
        for i, oper in enumerate(op):
            modelfit_multi_cv(xgb1, X_train, y_train, cv_type=oper, random_state=i)
            logging.debug(oper+":to save validation predictions ...")
            ret = dump(xgb1, FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(ret)
            gc.collect()

    elif istrain == 'eval':
        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            y_pred = xgb1.predict(X_train)
            acc = accuracy_score(y_train, y_pred)
            logging.debug('acc:'+str(acc*100.0)+'%')
            logging.debug('-'*30)

    elif istrain == 'test':
        # Probability column labels '1-0' .. '1-10', '2-0' .. '2-10'.
        columns = [str(i)+'-'+str(j) for i in [1, 2] for j in range(11)]
        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_train)
            logging.debug(dtrain_predprob.shape)
            y_pred = pd.DataFrame(dtrain_predprob, columns=columns)
            # NOTE(review): the previous code rounded every row to 6 decimals
            # via ``apply`` but discarded the result, so the probabilities
            # were (and still are) written unrounded. Confirm whether
            # rounding is actually wanted before adding it.
            logging.debug('-'*30)
            logging.debug(y_pred)

            test_id = pd.read_csv(FLAGS.file_path+'deviceid_test.csv')
            logging.debug(test_id['device_id'].shape)
            test_id['device_id'] = test_id['device_id'].map(str)
            test_id.rename(columns={'device_id': 'DeviceID'}, inplace=True)
            fin = pd.concat([test_id, y_pred], axis=1)
            fin.to_csv(FLAGS.tmp_data_path+flag+'_'+oper+'-xgboost.test.csv', index=False)

    return fin
y_train, scoring='accuracy', return_estimator=True ) score_mean = val_info['test_score'].mean() score_std = val_info['test_score'].std() print(f'{score_mean} accuracy with a standard deviation of {score_std}') # Cell clf = val_info['estimator'][0] # Cell importance_df = pd.DataFrame({'feature':X_train.columns, 'importance': clf.feature_importances_}).sort_values('importance', ascending=False) # Cell importance_df.to_html('../output/data/feature_importance.html', index=False) # Cell if LOG_MLFLOW: with mlflow.start_run(experiment_id=EX_ID): mlflow.log_param('num_features', X_train.shape[1]) mlflow.log_param('n_estimators', clf.get_params()['n_estimators']) mlflow.log_param('max_depth', clf.get_params()['max_depth']) mlflow.log_param('learning_rate', clf.get_params()['learning_rate']) mlflow.log_param('booster', clf.get_params()['booster']) mlflow.log_metric('mean_accuracy', score_mean) mlflow.log_metric('std_accuracy', score_std) mlflow.log_artifact('../output/data/feature_importance.html')
y_devset = devset4['is_used']

# Candidate values for the exhaustive sweep.
N_ESTIMATORS = [10, 50, 100, 300, 500, 1000]
MAX_DEPTH = [4, 6, 8]
LEARNING_RATE = [0.05, 0.1, 0.15, 0.2]
param_grid = dict(max_depth=MAX_DEPTH, n_estimators=N_ESTIMATORS, learning_rate=LEARNING_RATE)

results_list = list()
parameters = ParameterGrid(param_grid)
for g in parameters:
    #clf = XGBClassifier(random_state=0, n_jobs=-1, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set)
    clf = XGBClassifier(random_state=42, n_jobs=-1)
    clf.set_params(**g)
    # Single-argument call form: valid on Python 2 and 3, and prints the
    # same text as the former ``print "parameters: ", ...`` statement.
    print("parameters:  %s" % (clf.get_params(),))
    clf.fit(X_train, y_train)

    # http://xgboost.readthedocs.io/en/latest/python/python_intro.html
    # Class probabilities are computed once and reused below.
    y_pred = clf.predict_proba(X_devset)
    y_pred_class = clf.predict(X_devset)

    # Pick the probability column that belongs to class label 1.
    if clf.classes_[1] == 1:
        y_pred_prob = y_pred[:, 1]
    else:
        y_pred_prob = y_pred[:, 0]

    cmatrix = confusion_matrix(y_devset, y_pred_class, labels=[1,0])
    print(cmatrix)
    F5 = fbeta_score(y_devset, y_pred_class, beta=0.5, labels=[1,0])
# Assemble the pipeline: PECARN preprocessing, feature selection, classifier.
pipeline = Pipeline(
    steps=[
        ('data.pecarn.preprocess', pecarn.make_preprocess_pipeline()),
        ('feature_selection', feature_selector),
        ('xgboost', clf),
    ]
)

# Keep an empty slice of X so the feature names are available at prediction time.
pipeline.input_features = X[0:0]

# Initialise neptune; the NEPTUNE_API_TOKEN and NEPTUNE_PROJECT environment
# variables must be set.
neptune.init()

# Open the neptune experiment that this run is logged to.
with neptune.create_experiment(
        name='xgboost.XGBClassifier_selected_features',
        params=clf.get_params(),
        upload_source_files=[__file__, 'src/data/pecarn/*.py'],
        send_hardware_metrics=False) as exp:

    # Fit the whole pipeline on the training split.
    pipeline.fit(X_train, y_train)

    # Score the fitted pipeline on the data it was trained on.
    y_train_pred = pipeline.predict(X_train)
    train_scores = dict(
        accuracy=accuracy_score(y_train, y_train_pred),
        f1=f1_score(y_train, y_train_pred),
        f1_weighted=f1_score(y_train, y_train_pred, average='weighted'),
        avg_precision=average_precision_score(y_train, y_train_pred),
    )
RFC_clf_report, RFC_clf_acc_score, RFC_f1score, RFC_y_pred = modeling(
    RFC, X_train_resampled, y_train_resampled, X_test, y_test)
svc_clf_report, svc_clf_acc_score, svc_f1score, svc_y_pred = modeling(
    svc, X_train_resampled, y_train_resampled, X_test, y_test)

# Summary table; every score is shown as the first five characters of its text form.
results = {
    'Classifier': ["XGB Classifier", "Gradient Boosting Classifier", "Random Forest Classifier", "SVC"],
    'Accuracy': [
        str(score)[:5]
        for score in (XGB_clf_acc_score, GBC_clf_acc_score, RFC_clf_acc_score, svc_clf_acc_score)
    ],
    'F1_macro': [
        str(score)[:5]
        for score in (XGB_f1score, GBC_f1score, RFC_f1score, svc_f1score)
    ],
}
score_report_df = pd.DataFrame(data=results, columns=["Classifier", "Accuracy", "F1_macro"])
print("Base models with upsampled train datasets, holdout method (8:2) validation")
print(score_report_df)

# Tuned hyperparameters WITH the resampled dataset
# Print the baseline hyperparameters
pprint(XGB.get_params())

# Pass the tuned hyperparameters.
# When objective="multi:softmax" is set, num_class=3 (func, non func,
# need repair) has to be set manually!
# get_params().keys() does NOT show that parameter!
XGB_t = XGBClassifier(
    colsample_bylevel=0.5,
    max_depth=10,
    objective="multi:softmax",
    num_class=3,
)
pprint(XGB_t.get_params())

XGB_t_clf_report_r, XGB_t_clf_acc_score_r, XGB_t_f1score_r, XGB_t_y_pred_r = modeling(
    XGB_t, X_train_resampled, y_train_resampled, X_test, y_test)
print(XGB_t_clf_report_r)

# Confusion matrix visualisation
XGB_t_confusion_matrix_ = confusion_matrix(y_test, XGB_t_y_pred_r)
class_names = ["Func", "Need Repair", "Non Func"]
fig, ax = plot_confusion_matrix(
    conf_mat=XGB_t_confusion_matrix_,
    colorbar=True,
    show_absolute=False,
    show_normed=True,
    class_names=class_names,
)
class XGBoostClassifier(AbstractSKLearnClassifier):
    """XGBoost classifier with optional hyperparameter search.

    ``train_classifier`` either uses the single value supplied for every
    tunable hyperparameter, or runs a scikit-learn grid / randomized search
    over the candidate values, and then fits a final ``XGBClassifier`` that is
    stored in ``self.model``.
    """

    def __init__(self):
        AbstractSKLearnClassifier.__init__(self)
        # False until train_classifier() has stored a fitted XGBClassifier.
        self.model = False

    def set_label_encoder(self, labels):
        """Delegate label-encoder setup to the base class."""
        AbstractSKLearnClassifier.set_label_encoder(self, labels)

    def return_label_encoding(self, labels):
        """Return the base class' encoding of ``labels``."""
        return AbstractSKLearnClassifier.return_label_encoding(self, labels)

    @staticmethod
    def _candidate_values(raw, search_space, cast):
        """Turn a textual hyperparameter setting into a list of candidates.

        ``raw`` is either the literal ``'search'`` (use ``search_space``) or a
        whitespace-separated list of values, each converted with ``cast``.
        """
        if raw == 'search':
            return list(search_space)
        return [cast(token) for token in raw.split()]

    def train_classifier(self, trainvectors, labels, booster='gbtree', silent='1', learning_rate='0.1', min_child_weight='1', max_depth='6', gamma='0', max_delta_step='0', subsample='1', colsample_bytree='1', reg_lambda='1', reg_alpha='0', scale_pos_weight='1',objective='binary:logistic', seed='7', n_estimators='100',jobs='12', iterations='50',scoring='roc_auc',v=2):
        """Select hyperparameters and fit the final model into ``self.model``.

        All hyperparameters arrive as strings. The tunable ones
        (``n_estimators``, ``min_child_weight``, ``max_depth``, ``gamma``,
        ``subsample``, ``colsample_bytree``, ``reg_alpha``, ``reg_lambda``,
        ``scale_pos_weight``) are either ``'search'`` (use a built-in range)
        or a whitespace-separated list of candidate values.

        * If every tunable has exactly one candidate, no search is run.
        * If at most three tunables have several candidates, an exhaustive
          ``GridSearchCV`` (5-fold) is used.
        * Otherwise a ``RandomizedSearchCV`` with ``iterations`` draws is used.

        ``jobs`` is forwarded as ``nthread``, ``scoring`` as the search metric
        and ``v`` as the verbosity. ``booster`` and ``objective`` are accepted
        but not forwarded to XGBoost; ``seed`` is only validated as an integer.
        """
        tunables = ['n_estimators', 'min_child_weight', 'max_depth', 'gamma',
                    'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda',
                    'scale_pos_weight']
        # With more than two classes the estimator is wrapped in an
        # OutputCodeClassifier during the search, so the parameter names need
        # the 'estimator__' prefix.
        multi = len(set(labels)) > 2
        if multi:
            parameters = ['estimator__' + name for name in tunables]
        else:
            parameters = tunables

        silent = int(silent)
        nthread = int(jobs)
        seed = int(seed)  # validated only; not forwarded to the model
        iterations = int(iterations)
        learning_rate = float(learning_rate)
        max_delta_step = float(max_delta_step)

        tenths_low = [i / 10 for i in range(0, 5)]
        tenths_high = [i / 10 for i in range(6, 10)]
        parse = self._candidate_values
        # Order must match `tunables` above.
        grid_values = [
            parse(n_estimators, range(100, 1000, 100), int),
            parse(min_child_weight, range(1, 6, 1), int),
            parse(max_depth, range(3, 10, 1), int),
            parse(gamma, tenths_low, float),
            parse(subsample, tenths_high, float),
            parse(colsample_bytree, tenths_high, float),
            parse(reg_alpha, [1e-5, 1e-2, 0.1, 1, 100], float),
            parse(reg_lambda, tenths_low, float),
            parse(scale_pos_weight, [1, 3, 5, 7, 9], int),
        ]

        if all(len(values) == 1 for values in grid_values):
            # Only single parameter settings: nothing to search.
            settings = {name: values[0] for name, values in zip(parameters, grid_values)}
        else:
            param_grid = dict(zip(parameters, grid_values))
            model = XGBClassifier(silent=silent, nthread=nthread, learning_rate=learning_rate, max_delta_step=max_delta_step)
            if multi:
                model = OutputCodeClassifier(model)
                trainvectors = trainvectors.todense()
            n_varying = sum(1 for values in grid_values if len(values) > 1)
            if n_varying <= 3:
                # Exhaustive grid search with one to three variant parameters.
                paramsearch = GridSearchCV(model, param_grid, verbose=v, scoring=scoring, cv=5, n_jobs=1)
            else:
                # Too many combinations: random grid search.
                paramsearch = RandomizedSearchCV(model, param_grid, verbose=v, n_iter=iterations, scoring=scoring, cv=5, n_jobs=1)
            paramsearch.fit(trainvectors, labels)
            settings = paramsearch.best_params_

        # `settings` is keyed by the (possibly prefixed) names in `parameters`.
        self.model = XGBClassifier(
            learning_rate=learning_rate,
            max_delta_step=max_delta_step,
            silent=silent,
            nthread=nthread,
            n_estimators=settings[parameters[0]],
            min_child_weight=settings[parameters[1]],
            max_depth=settings[parameters[2]],
            gamma=settings[parameters[3]],
            subsample=settings[parameters[4]],
            colsample_bytree=settings[parameters[5]],
            reg_alpha=settings[parameters[6]],
            reg_lambda=settings[parameters[7]],
            scale_pos_weight=settings[parameters[8]],
            verbose=v
        )
        self.model.fit(trainvectors, labels)

    def return_classifier(self):
        """Return the fitted model (``False`` if training has not run)."""
        return self.model

    def return_feature_importance(self, vocab=False):
        """Return feature importances as tab-separated lines, highest first.

        Each line is ``<feature>\\t<importance>``. The feature name is taken
        from ``vocab`` (indexed by feature position) when given, otherwise the
        feature index is used as the name.
        """
        # The original no-vocab branch read `coef_`, which XGBoost only
        # defines for linear boosters; `feature_importances_` is what the
        # vocab branch already used, so both branches now share it.
        importances = self.model.feature_importances_.T.tolist()
        if vocab:
            feature_importance = [[vocab[i], val] for i, val in enumerate(importances)]
        else:
            feature_importance = [[str(i), val] for i, val in enumerate(importances)]
        feature_importance.sort(key=lambda row: row[1], reverse=True)
        return '\n'.join('\t'.join(str(x) for x in row) for row in feature_importance)

    def return_parameter_settings(self):
        """Return the main model hyperparameters as ``name: value`` lines."""
        reported = ['n_estimators', 'min_child_weight', 'max_depth', 'gamma',
                    'subsample', 'colsample_bytree', 'reg_alpha',
                    'scale_pos_weight', 'learning_rate', 'max_delta_step',
                    'reg_lambda']
        model_params = self.model.get_params()
        return '\n'.join(': '.join([param, str(model_params[param])]) for param in reported)

    def return_model_insights(self, vocab=False):
        """Return ``[filename, content]`` pairs describing the fitted model."""
        model_insights = [['feature_importance.txt', self.return_feature_importance(vocab)],
                          ['parameter_settings.txt', self.return_parameter_settings()]]
        return model_insights
colsample_bylevel=0.7, learning_rate=0.01, # 学习率,控制每次迭代更新权重时的步长,值越小,训练越慢。默认0.3,典型值为0.01-0.2。 n_estimators=1000000, # 总共迭代的次数,即决策树的个数,数值大没关系,cv会自动返回合适的n_estimators max_depth=5, # 树的深度,默认值为6,典型值3-10。 min_child_weight=2, # 值越大,越容易欠拟合;值越小,越容易过拟合(值较大时,避免模型学习到局部的特殊样本)。默认值为1 gamma=0, # 惩罚项系数,指定节点分裂所需的最小损失函数下降值。 objective='multi:softprob', ) if useTrainCV: xgb_param = xgb1.get_xgb_params() xgtrain = xgb.DMatrix(X_train, label=y_train) cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], folds=cv_folds, metrics='mlogloss', early_stopping_rounds=early_stopping_rounds) n_estimators = cvresult.shape[0] xgb1.set_params(n_estimators=n_estimators) # print(cvresult) # Fit the algorithm on the data xgb1.fit(X_train, y_train, eval_metric='mlogloss') # Predict training set: train_predprob = xgb1.predict_proba(X_train) logloss = metrics.log_loss(y_train, train_predprob) # Predict training set: print("logloss of train :%.4f" % logloss) y_pred = np.array(xgb1.predict(X_test)) predictions = [round(value) for value in y_pred]
nthread = 4, min_child_weight = 1, subsample= 0.8, seed = 1337, objective= 'multi:softprob', max_depth = 7, gamma= .2) # use the xgb interface xgb_param = clf.get_xgb_params() xgb_param['num_class'] = 5 xgb_param['eval_metric'] = 'mlogloss' Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan) cvresult = xgb.cv(xgb_param, Xg_train, num_boost_round = clf.get_params()['n_estimators'], nfold = 5, show_progress = True, early_stopping_rounds = 100) clf.set_params(n_estimators=cvresult.shape[0]) clf.fit(X_train, y_train) best_outcome_params = clf.get_params() best_outcome_score = cvresult.min() try: # predict the outcome probabilities y_pred = grid.predict_proba(X_test) except: # predict the outcome probabilities y_pred = clf.predict_proba(X_test)
# Load the cleaned dataset; the first 110 columns of the first 59400 rows are
# used as features and `status_group` as the target.
df = pd.read_csv("data/tanzania_cleaned_df2.csv")
X = df.iloc[0:59400, 0:110]
y = df[['status_group']]

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))
print(df.status_group.value_counts())

# XGB model and its (default) parameters
XGB = XGBClassifier()
pprint(XGB.get_params())

# 5-fold cross-validation without shuffling. `random_state` is deliberately
# not passed: it has no effect when shuffle=False, and recent scikit-learn
# releases raise a ValueError for that combination.
kf = KFold(n_splits=5, shuffle=False)

# Hyperparameter grid for the XGBoost search.
params_xgb = {
    #'n_estimators': [100], this is default
    'max_depth': [6, 8, 10],
    #'validate_parameters': [True], this is default
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.5],
    # NOTE(review): "0, 4" looks like a typo for 0.4 -- a learning rate of 0
    # or 4 is unusual. Values kept unchanged; confirm the intended grid.
    'learning_rate': [0.05, 0.1, 0.3, 0, 4],
    'colsample_bytree': [1, 0.5]
}
# Scoring ="f1_macro"
gamma=gamma, subsample=subsample, colsample_bytree=colsample_bytree, reg_alpha=reg_alpha, objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27 ) ml.xgbfit(xgb4, trn, tst, use_columns, printFeatureImportance=False, target=target) params4 = xgb4.get_params() params4['eval_metric'] = 'auc' # 0.954 params4['learning_rate'] = 0.1 # 0.952 params4['grow_policy'] = 'lossguide' params4['max_leaves'] = 1400 params4['alpha'] = 4 params4['scale_pos_weight'] = 9 params5 = {'learning_rate': 0.3, 'tree_method': "auto", 'grow_policy': "lossguide", 'max_leaves': 1400, 'max_depth': 4, 'min_child_weight':1, 'subsample': 0.9,
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    """Train an XGBoost binary classifier and report its test-set performance.

    Optionally tunes the number of boosting rounds with ``xgb.cv`` (early
    stopping), fits the classifier on the training data, prints accuracy, AUC
    and F1 computed on ``X_test``/``y_test``, and plots and prints the feature
    importances.

    Args:
        X_train: training features; must expose ``.values`` and ``.columns``.
        X_test: test features.
        y_train: training labels; must expose ``.values``.
        y_test: test labels.
        useTrainCV: when true, choose ``n_estimators`` by cross-validation.
        cv_folds: number of folds for ``xgb.cv``.
        early_stopping_rounds: early-stopping patience for ``xgb.cv``.

    Returns:
        ``(cvresult, alg)``: the ``xgb.cv`` result (``None`` when
        ``useTrainCV`` is false) and the fitted classifier.

    Results recorded by the author for earlier runs of this model:

        accuracy 0.9995, AUC 0.887708, F1 0.847584
        accuracy 0.9996, AUC 0.977480, F1 0.858209
        accuracy 0.9996, AUC 0.978563, F1 0.859259, using the features
        ['V14', 'V4', 'V17', 'V10', 'V12', 'V20', 'Amount', 'V21', 'V26',
         'V28', 'V11', 'V19', 'V8', 'V7', 'V13']

    Scores noted during hyperparameter tuning:

        {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3}  0.862920874517388
        {'colsample_bytree': 1.0, 'gamma': 0.2}                        0.871
        {'gamma': 0.2, 'scale_pos_weight': 1}                          0.8702009952422571
        {'subsample': 0.6}                                             0.864310306628855
    """
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)

    # Stays None when cross-validation is skipped, so the return is always valid.
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        # One row per boosting round kept after early stopping.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Build the model. The evaluation metric is set on the estimator instead
    # of being passed to fit(), which newer XGBoost releases no longer accept.
    print('Start Training')
    alg.set_params(eval_metric='auc')
    alg.fit(X_train, y_train)

    # Predict on the held-out test set.
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Print model statistics. These are test-set metrics, so the labels say
    # "测试集" (test set) rather than the former "训练集" (training set).
    print("\n关于现在这个模型")
    print("准确率 : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC 得分 (测试集): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 Score 得分 (测试集): %f" % metrics.f1_score(y_test, predictions))

    # Rank the features by importance, then plot and print the ranking.
    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({
        'Features': feat,
        'Importance': feat_imp
    }).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())

    return cvresult, alg
def _xgb_classification_train(table, feature_cols, label_col, max_depth=3, learning_rate=0.1, n_estimators=100,
                              silent=True, objective='binary:logistic', booster='gbtree', n_jobs=1, nthread=None,
                              gamma=0, min_child_weight=1, max_delta_step=0, subsample=1, colsample_bytree=1,
                              colsample_bylevel=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5,
                              random_state=None, seed=None, missing=None, importance_type='gain', class_weight=None,
                              eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True,
                              xgb_model=None, sample_weight_eval_set=None):
    """Fit an ``XGBClassifier`` on ``table`` and package the model with a report.

    Args:
        table: input data; ``table[label_col]`` supplies the labels and
            ``check_col_type(table, feature_cols)`` supplies the features.
        feature_cols: names of the feature columns.
        label_col: name of the label column.
        class_weight: optional sequence with one weight per class, given in
            the sorted order of the distinct labels; it is converted into
            per-sample weights. ``None`` disables sample weighting.
        random_state: seed for the classifier; drawn at random when ``None``.
        eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model,
        sample_weight_eval_set: forwarded to ``XGBClassifier.fit``.
        Remaining keyword arguments are forwarded to the ``XGBClassifier``
        constructor unchanged.

    Returns:
        ``{'model': model}`` where ``model`` holds the feature and label
        column names, the classifier parameters, the feature importances (raw
        and as a table), the fitted classifier and a markdown report under
        ``'_repr_brtc_'``.

    Raises:
        ValueError: if ``class_weight`` does not have one entry per class.
    """
    feature_names, features = check_col_type(table, feature_cols)
    if isinstance(features, list):
        features = np.array(features)

    if random_state is None:
        random_state = randint(-2**31, 2**31 - 1)

    y_train = table[label_col]
    class_labels = sorted(set(y_train))
    if class_weight is None:
        sample_weight = None
    else:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        # Map each label to its weight, then expand to one weight per sample.
        class_weight = dict(zip(class_labels, class_weight))
        sample_weight = np.vectorize(_make_sample_weight)(y_train, class_weight)

    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing,
                               importance_type=importance_type)
    # Pass the fit options by keyword: the positional order of fit() is not
    # the same in every XGBoost release, so positional arguments could be
    # bound to the wrong parameters.
    classifier.fit(features, y_train,
                   sample_weight=sample_weight,
                   eval_set=eval_set,
                   eval_metric=eval_metric,
                   early_stopping_rounds=early_stopping_rounds,
                   verbose=verbose,
                   xgb_model=xgb_model,
                   sample_weight_eval_set=sample_weight_eval_set)

    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_

    # Render the importance plot to markdown, then clear the figure so it
    # does not leak into later plots.
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # Report: importance plot, importance table and the parameter list.
    params = dict2MD(get_param)
    # Single-row frame with one column per feature, used for the report table.
    feature_importance_df = pd.DataFrame(data=feature_importance, index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Feature Importance
    | {fig_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    # Long-format table: one (feature_name, importance) row per feature.
    feature_importance_table = pd.DataFrame(
        [[name, importance] for name, importance in zip(feature_names, feature_importance)],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
def main(): # Start timer t_start = time.time() # Command line options parser = argparse.ArgumentParser() group_model = parser.add_mutually_exclusive_group() group_model.add_argument('-x', '--xgboost', action='store_true', help='Run gradient BDT') group_model.add_argument('-n', '--nn', action='store_true', help='Run neural network') group_model.add_argument('-p', '--prepare_hdf5', type=str, nargs='?', default='', help='Prepare input datasets for ML and store in HDF5 file; options: "2L2J" or "2L3J+"') group_read_dataset = parser.add_mutually_exclusive_group() group_read_dataset.add_argument('-r', '--read_hdf5', action='store_true', help='Read prepared datasets from HDF5 file') #group_read_dataset.add_argument('-d', '--direct_read', action='store_true', help='Read unprepared datasets from ROOT file') parser.add_argument('-l', '--load_pretrained_model', action='store_true', help='Load pre-trained classifier model, i.e. only run on test data') #parser.add_argument('-B', '--N_sig_events', type=lambda x: int(float(x)), default=0, help='Number of signal events to read from the dataset') #parser.add_argument('-S', '--N_bkg_events', type=lambda x: int(float(x)), default=0, help='Number of background events to read from the dataset for each class') parser.add_argument('-s', '--signal_region', type=str, nargs='?', default='int', help='Choose signal region: low-2J, int-2J, high-2J, low-3J+, int-3J+, high-3J+') parser.add_argument('-b', '--balanced', type=int, nargs='?', default=-1, help='Balance dataset for training; 0: oversample signal, 1: undersample background') parser.add_argument('-m', '--multiclass', action='store_true', help='Use multiple background classes in addition to the signal class') parser.add_argument('-w', '--event_weight', action='store_true', help='Apply event weights during training') parser.add_argument('-c', '--class_weight', action='store_true', help='Apply class weights to account for unbalanced dataset') parser.add_argument('-t', '--do_train', 
action='store_true', help='Train the classifier') parser.add_argument('-T', '--do_test', action='store_true', help='Test the classifier on data it has not been trained on') parser.add_argument('-e', '--train_even', action='store_true', help='Use even run numbers for training and odd run numbers for testing') parser.add_argument('-o', '--train_odd', action='store_true', help='Use odd run numbers for training and even run numbers for testing') parser.add_argument('-C', '--doCV', action='store_true', help='Perform a k-fold cross-validation on the training set during training') parser.add_argument('-O', '--oversample', action='store_true', help='Balance imbalanced dataset using oversampling') parser.add_argument('-U', '--undersample', action='store_true', help='Balance imbalanced dataset using undersampling') parser.add_argument('--n_nodes', type=int, nargs='?', default=20, help='Number of nodes in each hidden neural network layer') parser.add_argument('--n_hidden_layers', type=int, nargs='?', default=1, help='Number of nodes in each hidden neural network layer') parser.add_argument('--dropout', type=float, nargs='?', default=0., help='Use dropout regularization on neural network layers to reduce overfitting') parser.add_argument('--L1', type=float, nargs='?', default=0., help='Use L1 regularization on neural network weights to reduce overfitting') parser.add_argument('--L2', type=float, nargs='?', default=0., help='Use L2 regularization (weights decay) on neural network weights to reduce overfitting') parser.add_argument('--lr', type=float, nargs='?', default=0.001, help='Set learning rate for the neural network or BDT optimizer') parser.add_argument('--batch_size', type=int, nargs='?', default=32, help='Number of events to use for each weight update') parser.add_argument('--epochs', type=lambda x: int(float(x)), nargs='?', default=1, help='Number of passes through the training set') parser.add_argument('--max_depth', type=int, nargs='?', default=3, help='Maximum tree 
depth for BDT') parser.add_argument('--n_estimators', type=lambda x: int(float(x)), nargs='?', default=100, help='Number of trees in BDT ensemble') parser.add_argument('--gamma', type=float, nargs='?', default=0, help='Minimum loss reduction required to make a further partition on a leaf node of the XGBoost tree') parser.add_argument('--min_child_weight', type=float, nargs='?', default=1, help='Minimum sum of instance weight(hessian) needed in a child') parser.add_argument('--max_delta_step', type=float, nargs='?', default=0, help='Maximum delta step we allow each tree’s weight estimation to be') parser.add_argument('--subsample', type=float, nargs='?', default=1, help='Subsample ratio of the training instance') parser.add_argument('--colsample_bytree', type=float, nargs='?', default=1, help='Subsample ratio of columns when constructing each tree') parser.add_argument('--colsample_bylevel', type=float, nargs='?', default=1, help='Subsample ratio of columns for each level') parser.add_argument('--colsample_bynode', type=float, nargs='?', default=1, help='Subsample ratio of columns for each node') parser.add_argument('-G', '--doGridSearchCV', action='store_true', help='Perform a grid search for optimal hyperparameter values using cross-validation') parser.add_argument('-V', '--plot_validation_curve', action='store_true', help='Calculate and plot perforance score as function of number of training events') parser.add_argument('-L', '--plot_learning_curve', action='store_true', help='Calculate and plot perforance score for different values of a chosen hyperparameter') args = parser.parse_args() # Set which sample types to prepare HDF5s for use_sig = 1 use_bkg = 1 use_data = 0 # Where to put preprocessed datasets preproc_dir = 'preprocessed_datasets/' preproc_suffix = '' if args.prepare_hdf5: preproc_suffix = '_group_{}_preprocessed.h5'.format(args.prepare_hdf5) elif '2J' in args.signal_region: preproc_suffix = '_group_2L2J_preprocessed.h5' elif '3J+' in 
args.signal_region: preproc_suffix = '_group_2L3J+_preprocessed.h5' filename_sig_low_preprocessed = preproc_dir + 'sig_low' + preproc_suffix filename_sig_int_preprocessed = preproc_dir + 'sig_int' + preproc_suffix filename_sig_high_preprocessed = preproc_dir + 'sig_high' + preproc_suffix filename_sig_preprocessed = filename_sig_low_preprocessed filename_bkg_preprocessed = preproc_dir + 'bkg' + preproc_suffix filename_data_preprocessed = preproc_dir + 'data' + preproc_suffix # Where to put output output_dir = 'output/' #trained_model_dir = 'trained_models/' trained_model_dir = output_dir trained_model_xgb_suffix = '2LJets_trained_model.joblib' trained_model_nn_suffix = '2LJets_trained_model.h5' # Counters n_events_read = n_events_kept = 0 n_events_read_sample = n_events_kept_sample = 0 n_events_read_sample_type = n_events_kept_sample_type = 0 if args.xgboost: output_dir += 'xgboost/latest/xgb_' trained_model_dir += 'xgboost/latest/xgb_' elif args.nn: output_dir += 'neural_network/latest/nn_' trained_model_dir += 'neural_network/latest/nn_' if 'low' in args.signal_region: output_dir += 'low_' trained_model_dir += 'low_' elif 'int' in args.signal_region: output_dir += 'int_' trained_model_dir += 'int_' elif 'high' in args.signal_region: output_dir += 'high_' trained_model_dir += 'high_' if args.train_even: output_dir += 'trainEven_' trained_model_dir += 'trainEven_' elif args.train_odd: output_dir += 'trainOdd_' trained_model_dir += 'trainOdd_' if args.xgboost: trained_model_path = trained_model_dir + trained_model_xgb_suffix elif args.nn: trained_model_path = trained_model_dir + trained_model_nn_suffix global df_sig_feat, df_bkg_feat, df_data_feat l_sig = [] if use_sig: if 'low' in args.signal_region: l_sig = d_sig['low'] filename_sig_preprocessed = filename_sig_low_preprocessed elif 'int' in args.signal_region: #elif args.signal_region == 'int': l_sig = d_sig['int'] filename_sig_preprocessed = filename_sig_int_preprocessed elif 'high' in args.signal_region: l_sig = 
d_sig['high'] filename_sig_preprocessed = filename_sig_high_preprocessed d_sig_infile = {'low': filename_sig_low_preprocessed, 'int': filename_sig_int_preprocessed, 'high': filename_sig_high_preprocessed} class Logger(object): def __init__(self): self.terminal = sys.stdout self.log = open(output_dir+".log", "w") def write(self, message): self.terminal.write(message) self.log.write(message) def flush(self): #this flush method is needed for python 3 compatibility. #this handles the flush command by doing nothing. #you might want to specify some extra behavior here. pass sys.stdout = Logger() if args.prepare_hdf5: """Read input dataset in chunks, select features and perform cuts, before storing DataFrame in HDF5 file""" # Prepare and store signal dataset if use_sig: prepareHDF5(filename_sig_low_preprocessed, d_sig['low'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0) prepareHDF5(filename_sig_int_preprocessed, d_sig['int'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0) prepareHDF5(filename_sig_high_preprocessed, d_sig['high'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0) # Prepare and store background dataset if use_bkg: prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e6, n_chunks=None, entrystart=0) #prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e4, n_chunks=1, entrystart=0) # Prepare and store real dataset if use_data: prepareHDF5(filename_data_preprocessed, l_data, sample_type='data', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0) return elif args.read_hdf5: if use_sig: # Read in preprocessed signal DataFrame from HDF5 file df_sig_feat = pd.DataFrame({}) for key_sig, value_sig_infile in d_sig_infile.items(): if key_sig in args.signal_region: print("\nReading in file:", 
value_sig_infile) sig_store = pd.HDFStore(value_sig_infile) for i_sig in sig_store.keys(): #d_sig[key_sig]: if len(df_sig_feat) is 0: df_sig_feat = sig_store[i_sig]#.astype('float64') df_sig_feat['group'] = i_sig else: df_sig_sample = sig_store[i_sig]#.astype('float64') df_sig_sample['group'] = i_sig df_sig_feat = df_sig_feat.append(df_sig_sample) if 'mTl3' in df_sig_feat: df_sig_feat.drop(columns='mTl3', inplace=True) print("\ndf_sig_feat.head():\n", df_sig_feat.head()) sig_store.close() print("Closed store") if use_bkg: # Read in preprocessed background DataFrame from HDF5 file df_bkg_feat = pd.DataFrame({}) print("\nReading in file:", filename_bkg_preprocessed) bkg_store = pd.HDFStore(filename_bkg_preprocessed) for i_bkg in bkg_store.keys(): #l_bkg: if len(df_bkg_feat) is 0: df_bkg_feat = bkg_store[i_bkg]#.astype('float64') df_bkg_feat['group'] = i_bkg else: df_bkg_sample = bkg_store[i_bkg]#.astype('float64') df_bkg_sample['group'] = i_bkg df_bkg_feat = df_bkg_feat.append(df_bkg_sample) if 'mTl3' in df_bkg_feat: df_bkg_feat.drop(columns='mTl3', inplace=True) print("\ndf_bkg_feat.head():\n", df_bkg_feat.head()) bkg_store.close() print("Closed store") if use_data: # Read in preprocessed DataFrame of real data from HDF5 file data_store = pd.HDFStore(filename_data_preprocessed) df_data_feat = data_store['data'] print("\ndf_data_feat.head():\n", df_data_feat.head()) data_store.close() print("Closed store") elif args.direct_read: """Read the input dataset for direct use, without reading in chunks and storing to output file""" print("Not available at the moment") return #entry_start = 0 #sig_entry_stop = 1e4 #bkg_entry_stop = 1e4 ## import signal dataset #df_sig = importOpenData(sample_type="sig", entrystart=entry_start, entrystop=sig_entry_stop) #df_sig = shuffle(df_sig) # shuffle the rows/events #df_sig_feat = selectFeatures(df_sig, l_features) #df_sig_feat = df_sig_feat*1 # multiplying by 1 to convert booleans to integers #df_sig_feat["eventweight"] = 
getEventWeights(df_sig, l_eventweights) ## import background dataset #df_bkg = importOpenData(sample_type="bkg", entrystart=entry_start, entrystop=bkg_entry_stop) #df_bkg = shuffle(df_bkg) # shuffle the rows/events #df_bkg_feat = selectFeatures(df_bkg, l_features) #df_bkg_feat = df_bkg_feat*1 # multiplying by 1 to convert booleans to integers #df_bkg_feat["eventweight"] = getEventWeights(df_bkg, l_eventweights) ## import data ##df_data = importOpenData(sample_type="data", entrystart=entry_start, entrystop=entry_stop) if 'low' in args.signal_region: print('\nBefore xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head()) df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] = df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] * 0.08836675497457203 print('\nAfter xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head()) # Preselection cuts l_presel = ['met_Sign > 2', 'mt2leplsp_0 > 10'] #df_sig_feat.query('&'.join(l_presel), inplace=True) print("\n======================================") print("df_sig_feat.shape =", df_sig_feat.shape) print("df_bkg_feat.shape =", df_bkg_feat.shape) print("======================================") # make array of features df_X = pd.concat([df_bkg_feat, df_sig_feat], axis=0)#, sort=False) print("\ndf_X.isna().sum().sum()", df_X.isna().sum().sum()) #print("\ndf_X.dtypes", df_X.dtypes) #col_float32 = (df_X.dtypes == 'float32').values #df_X.iloc[:, col_float32] = df_X.iloc[:, col_float32].astype('float64') #print("\nAfter converting all columns to float64:\ndf_X.dtypes", df_X.dtypes) # make array of labels y_bkg = np.zeros(len(df_bkg_feat)) y_sig = np.ones(len(df_sig_feat)) y = np.concatenate((y_bkg, y_sig), axis=0).astype(int) df_X['ylabel'] = y if args.multiclass: df_X.loc[df_X.group=='Zjets', 'ylabel'] = 2 
df_X.loc[df_X.group=='diboson', 'ylabel'] = 3 df_X = df_X.query('group=="diboson" | group=="Zjets" | ylabel==1') Y = df_X.ylabel # encode class values as integers encoder = LabelEncoder() encoder.fit(Y) encoded_Y = encoder.transform(Y) # convert integers to dummy variables (i.e. one hot encoded) y_multi = np_utils.to_categorical(encoded_Y) # Split the dataset in train and test sets test_size = 0.5 seed = 42 df_X_even = df_X.query("RandomRunNumber % 2 == 0") df_X_odd = df_X.query("RandomRunNumber % 2 == 1") df_X_even = shuffle(df_X_even) df_X_odd = shuffle(df_X_odd) if args.train_even: X_train = df_X_even X_test = df_X_odd elif args.train_odd: X_train = df_X_odd X_test = df_X_even # Balance dataset by resampling: equal number of signal and background events if args.balanced >= 0: # Oversample signal if args.balanced is 0: N_train_sig = len(X_train.query('ylabel==0')) # Undersample background elif args.balanced is 1: N_train_sig = len(X_train.query('ylabel==1')) N_train_bkg = N_train_sig # Draw balanced training datasets where the number of signal and background events are equal X_train_sig = resample(X_train.query('ylabel==1'), replace=True, n_samples=N_train_sig, random_state=42)#, stratify=None) X_train_bkg = resample(X_train.query('ylabel==0'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None) X_train = pd.concat([X_train_bkg, X_train_sig], axis=0) print("\n---------- After balancing ----------") print("args.balanced =", args.balanced) print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==1').shape) print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==0').shape) print("---------------------------------------") #X_train_bkg = resample(X_train.query('group==Zjets'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None) #X_train = X_train.query('group=="diboson" | ylabel==1') # Draw validation set as subsample of test set, for quicker evaluation of validation loss during training n_val_samples = 1e5 
X_val = resample(X_test, replace=False, n_samples=n_val_samples, random_state=42, stratify=X_test.ylabel) y_val = X_val.ylabel y_train = X_train.ylabel y_test = X_test.ylabel # Making a copy of the DFs with only feature columns X_train_feat_only = X_train.copy() X_test_feat_only = X_test.copy() X_val_feat_only = X_val.copy() l_non_features = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel'] X_train_feat_only.drop(l_non_features, axis=1, inplace=True) X_test_feat_only.drop(l_non_features, axis=1, inplace=True) X_val_feat_only.drop(l_non_features, axis=1, inplace=True) print("\nX_train_feat_only:", X_train_feat_only.columns) print("X_test_feat_only:", X_test_feat_only.columns) print("X_val_feat_only:", X_val_feat_only.columns) print("\nX_train_feat_only:", X_train_feat_only.shape) print("X_test_feat_only:", X_test_feat_only.shape) print("X_val_feat_only:", X_val_feat_only.shape) # Feature scaling # Scale all variables to the interval [0,1] #scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True) scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True) print("\nscaler.fit_transform(X_train_feat_only)") X_train_scaled = scaler.fit_transform(X_train_feat_only) print("scaler.transform(X_test_feat_only)") X_test_scaled = scaler.transform(X_test_feat_only) print("scaler.transform(X_val_feat_only)") X_val_scaled = scaler.transform(X_val_feat_only) print("\n\n//////////////////// ML part ////////////////////////") global model scale_pos_weight = 1 event_weight = None class_weight = None class_weight_dict = {} if args.event_weight: event_weight = X_train.eventweight #event_weight = eventweight_train_resampled if args.class_weight: if args.xgboost: # XGBoost: Scale signal events up by a factor n_bkg_train_events / n_sig_train_events scale_pos_weight = len(X_train[X_train.ylabel == 0]) / len(X_train[X_train.ylabel == 1]) #scale_pos_weight = 10 else: # sciki-learn: Scale overrespresented sample down (bkg) and 
underrepresented sample up (sig) class_weight = "balanced" else: class_weight = None print("\n# bkg train events / # sig train events = {0:d} / {1:d}".format(len(X_train[X_train.ylabel == 0]), len(X_train[X_train.ylabel == 1]))) print("scale_pos_weight =", scale_pos_weight) classes = np.unique(y) class_weight_vect = compute_class_weight(class_weight, classes, y) class_weight_dict = {0: class_weight_vect[0], 1: class_weight_vect[1]} # Initialize variables for storing CV output valid_score = test_score = fit_time = score_time = 0 # Initialize variables for storing validation and learning curve output train_scores_vc_mean = train_scores_vc_std = 0 valid_scores_vc_mean = valid_scores_vc_std = 0 train_scores_lc_mean = train_scores_lc_std = 0 valid_scores_lc_mean = valid_scores_lc_std = 0 # List of training set sizes for plotting of learning curve train_sizes = [0.5, 0.75, 1.0] # List of parameter values for hyperparameter grid search # XGBoost max_depth = [5, 6, 8, 10] n_estimators = [50, 100, 200, 500, 1000] learning_rate = [0.001, 0.01, 0.1, 0.5, 1.0] reg_alpha = [0, 0.001, 0.01, 0.1, 1.] reg_lambda = [0, 0.001, 0.01, 0.1, 1.] 
d_param_grid_xgb = {'max_depth': max_depth, 'n_estimators': n_estimators, 'learning_rate': learning_rate, 'reg_alpha': reg_alpha, 'reg_lambda': reg_lambda } # Specify one of the above parameter lists to plot validation curve for param_name_xgb = 'max_depth' param_range_xgb = d_param_grid_xgb[param_name_xgb] # Neural network n_hidden_layers = [1, 3, 5, 7, 10] n_nodes = [10, 20, 50, 100, 500] batch_size = [8, 16, 32, 64, 128] epochs = [10, 50, 100, 500, 1000] #kernel_regularizer = [l1_l2(l1=1e-6, l2=1e-6), l1_l2(l1=1e-6, l2=1e-5), l1_l2(l1=1e-5, l2=1e-6), l1_l2(l1=1e-5, l2=1e-5)] d_param_grid_nn = {'n_hidden_layers': [1] #n_hidden_layers, #'n_nodes': #n_nodes, #'batch_size': batch_size, #'epochs': epochs, #'kernel_regularizer': kernel_regularizer } # Specify one of the above parameter lists to plot validation curve for param_name_nn = 'n_hidden_layers' param_range_nn = d_param_grid_nn[param_name_nn] if args.xgboost: param_range = param_range_xgb param_name = param_name_xgb elif args.nn: param_range = param_range_nn param_name = param_name_nn # Run XGBoost BDT if args.xgboost: if args.multiclass: objective = 'multi:softmax' eval_metric = 'mlogloss' else: objective = 'binary:logistic' eval_metric = 'logloss' #eval_metric = 'auc' max_depth = args.max_depth lr = args.lr n_estimators = args.n_estimators gamma = args.gamma min_child_weight = args.min_child_weight max_delta_step = args.max_delta_step subsample = args.subsample colsample_bytree = args.colsample_bytree colsample_bylevel = args.colsample_bylevel colsample_bynode = args.colsample_bynode reg_alpha = args.L1 reg_lambda = args.L2 if not args.load_pretrained_model: model = XGBClassifier(max_depth=max_depth, learning_rate=lr, n_estimators=n_estimators, verbosity=1, objective=objective, n_jobs=-1, gamma=gamma, min_child_weight=min_child_weight, max_delta_step=max_delta_step, subsample=subsample, colsample_bytree=colsample_bytree, colsample_bylevel=colsample_bylevel, colsample_bynode=colsample_bynode, 
reg_alpha=reg_alpha, # L1 regularization reg_lambda=reg_alpha, # L2 regularization scale_pos_weight=scale_pos_weight) print("\nmodel.get_params()\n", model.get_params()) if not args.plot_validation_curve and not args.plot_learning_curve: if args.doGridSearchCV: model = GridSearchCV(model, d_param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1) print("\nTraining XGBoost BDT...") if args.doCV: cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, return_train_score=True) valid_score = cv_results['test_score'] train_score = cv_results['train_score'] fit_time = cv_results['fit_time'] score_time = cv_results['score_time'] fit_time = cv_results['fit_time'] else: model.fit(X_train_scaled, y_train, sample_weight=event_weight, eval_set=[(X_train_scaled, y_train), (X_val_scaled, y_val)], #eval_set=[(X_val_scaled, y_val)], eval_metric=eval_metric, early_stopping_rounds=20, verbose=True) evals_result = model.evals_result() sns.set() ax = sns.lineplot(x=range(0, len(evals_result['validation_0'][eval_metric])), y=evals_result['validation_0'][eval_metric], label='Training loss') ax = sns.lineplot(x=range(0, len(evals_result['validation_1'][eval_metric])), y=evals_result['validation_1'][eval_metric], label='Validation loss') ax.set(xlabel='Epochs', ylabel='Loss') plt.show() print("\nTraining done!") if args.doGridSearchCV: joblib.dump(model.best_estimator_, trained_model_path) else: joblib.dump(model, trained_model_path) print("\nSaving the trained XGBoost BDT:", trained_model_path) elif args.load_pretrained_model: print("\nReading in pre-trained XGBoost BDT:", trained_model_path) model = joblib.load(trained_model_path) # Run neural network elif args.nn: n_inputs = X_train_scaled.shape[1] n_nodes = args.n_nodes n_hidden_layers = args.n_hidden_layers dropout_rate = args.dropout batch_size = args.batch_size epochs = args.epochs l1 = args.L1 l2 = args.L2 lr = args.lr if not args.load_pretrained_model: print("\nBuilding and 
training neural network") es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20) model = KerasClassifier(build_fn=create_model, n_inputs=n_inputs, n_hidden_layers=n_hidden_layers, n_nodes=n_nodes, dropout_rate=dropout_rate, l1=l1, l2=l2, lr=lr, batch_size=batch_size, epochs=epochs, verbose=1, ) if not args.plot_validation_curve and not args.plot_learning_curve: if args.doGridSearchCV: param_grid = d_param_grid_nn model = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1) history = model.fit(X_train_scaled, y_train, sample_weight=event_weight, class_weight=class_weight_dict, verbose=1, callbacks=[es], validation_data=(X_val_scaled, y_val) #validation_data=(X_test_scaled, y_test) ) print("\nmodel.model.summary()\n", model.model.summary()) if not args.doGridSearchCV: d_val_loss = {'Training loss': history.history['loss'], 'Validation loss': history.history['val_loss']} df_val_loss = pd.DataFrame(d_val_loss) sns.set() ax = sns.lineplot(data=df_val_loss) ax.set(xlabel='Epochs', ylabel='Loss') plt.show() if args.doGridSearchCV: model.best_estimator_.model.save(trained_model_path) else: model.model.save(trained_model_path) print("\nSaving the trained neural network:", trained_model_path) elif args.load_pretrained_model: print("\nReading in pre-trained neural network:", trained_model_path) model = load_model(trained_model_path) if not args.plot_validation_curve and not args.plot_learning_curve: # Print results of grid search if args.doGridSearchCV: print("Best parameters set found on development set:") print("") print("model.best_params_", model.best_params_) print("") print("Grid scores on development set:") means = model.cv_results_['mean_test_score'] stds = model.cv_results_['std_test_score'] for mean, std, params in zip(means, stds, model.cv_results_['params']): print("{0:0.3f} (+/-{1:0.03f}) for {2!r}".format(mean, std, params)) print("") df = pd.DataFrame.from_dict(model.cv_results_) print("pandas 
DataFrame of cv results") print(df) print("") # Get predicted signal probabilities for train and test sets output_train = model.predict_proba(X_train_scaled) output_test = model.predict_proba(X_test_scaled) #X_train = X_train.copy() #X_test = X_test.copy() if args.multiclass: output_test = output_test.reshape(output_test.shape[0], 3) print("output_train", len(output_train[0])) for i_output in range(len(output_train[0])): X_train["output"+str(i_output)] = output_train[:,i_output] X_test["output"+str(i_output)] = output_test[:,i_output] elif output_train.shape[1] is 2: print("output_train[:10,1]", output_train[:10,1]) X_train["output"] = output_train[:,1] X_test["output"] = output_test[:,1] else: X_train["output"] = output_train X_test["output"] = output_test print("\n\n//////////////////// Plotting part ////////////////////////\n") if not args.multiclass: print("len(X_train.query('ylabel==0').loc[:,'eventweight'])", len(X_train.query('ylabel==0').loc[:,'eventweight'])) print("len(X_train.query('ylabel==0').loc[:,'output'])", len(X_train.query('ylabel==0').loc[:,'output'])) print("X_train.query('ylabel==0').loc[:,'eventweight']", X_train.query("ylabel==0").loc[:,"eventweight"].head()) print("X_train.query('ylabel==0').loc[:,'output']", X_train.query("ylabel==0").loc[:,"output"].head()) print("X_train[['eventweight', 'output']].min(): \n", X_train[['eventweight', 'output']].min()) print("X_train[['eventweight', 'output']].max(): \n", X_train[['eventweight', 'output']].max()) l_X_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg] l_ew_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg] l_X_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg] l_ew_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg] l_X_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in 
l_sig] l_ew_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig] l_X_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in l_sig] l_ew_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig] d_X_train_bkg = dict(zip(l_bkg, l_X_train_bkg)) d_ew_train_bkg = dict(zip(l_bkg, l_ew_train_bkg)) d_X_test_bkg = dict(zip(l_bkg, l_X_test_bkg)) d_ew_test_bkg = dict(zip(l_bkg, l_ew_test_bkg)) # Plot unweighted training and test output #plt.figure(1) #plotTrainTestOutput(d_X_train_bkg, None, # X_train.query("ylabel==1").loc[:,"output"], None, # d_X_test_bkg, None, # X_test.query("ylabel==1").loc[:,"output"], None) #plotTrainTestOutput(d_X_train_bkg, None, # X_train.query("ylabel==1").loc[:,"output"], None, # d_X_test_bkg, None, # X_test.query("ylabel==1").loc[:,"output"], None) #plt.savefig(output_dir + 'hist1_train_test_unweighted.pdf') # Plot weighted train and test output, with test set multiplied by 2 to match number of events in training set plt.figure() #for i_output in range(output_train.shape[1]): plotTrainTestOutput(d_X_train_bkg, d_ew_train_bkg, X_train.query("ylabel==1").filter(like='output'), X_train.query("ylabel==1").loc[:,"eventweight"], d_X_test_bkg, d_ew_test_bkg, X_test.query("ylabel==1").filter(like='output'), X_test.query("ylabel==1").loc[:,"eventweight"], args.signal_region) plt.savefig(output_dir + 'hist_train_test_weighted_comparison.pdf') # Plot final signal vs background estimate for test set, scaled to 10.6/fb if 'low' in args.signal_region: plt.figure() plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg, X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").filter(like='output'), X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").loc[:,"eventweight"], args.signal_region, figure_text='(200, 100) GeV') plt.savefig(output_dir + 
'hist_test_392330_396210_C1N2_WZ_2L2J_200_100_weighted.pdf') elif 'int' in args.signal_region: plt.figure() plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg, X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"output"], X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"eventweight"], args.signal_region, figure_text='(500, 200) GeV') plt.savefig(output_dir + 'hist_test_392325_C1N2_WZ_2L2J_500_200_weighted.pdf') elif 'high' in args.signal_region: plt.figure() plotFinalTestOutput(d_X_test_bkg, d_ew_test_bkg, X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"output"], X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"eventweight"], args.signal_region, figure_text='(600, 0) GeV') plt.savefig(output_dir + 'hist5_test_392356_C1N2_WZ_2L2J_600_0_weighted.pdf') if args.xgboost and not args.doGridSearchCV: # Plot feature importance print("model.feature_importances_", model.feature_importances_) print("np.sum(model.feature_importances_)", np.sum(model.feature_importances_)) if args.multiclass: l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output0', 'output1', 'output2'] else: l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output'] s_feat_importance = pd.Series(model.feature_importances_, index=X_train.drop(l_feat_drop, axis=1).columns) print("X_train.drop(l_feat_drop, axis=1).columns\n", X_train.drop(l_feat_drop, axis=1).columns) s_feat_importance.sort_values(ascending=False, inplace=True) plt.figure() sns.set(style="ticks", color_codes=True) n_top_feat_importance = 20 ax = sns.barplot(x=s_feat_importance[:n_top_feat_importance]*100, y=s_feat_importance[:n_top_feat_importance].index)#, palette="Blues_r") #ax.set_yticklabels(s_feat_importance.index) ax.set(xlabel="Feature importance [%]") plt.savefig(output_dir + 'feature_importance.pdf') if not args.multiclass: # Plot ROC curve fpr, tpr, thresholds = metrics.roc_curve(X_test.loc[:,"ylabel"], X_test.loc[:,"output"]) auc = 
metrics.roc_auc_score(X_test.loc[:,"ylabel"], X_test.loc[:,"output"]) plt.figure() ax = sns.lineplot(x=tpr, y=1-fpr, estimator=None, label='ROC curve: AUC = %0.2f' % auc) plt.plot([1,0], [0,1], linestyle="--") ax.set(xlabel="Signal efficiency", ylabel="Background efficiency") plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_1minBkgEff.pdf') plt.figure() ax = sns.lineplot(x=tpr, y=1/(fpr), estimator=None, label='ROC curve: AUC = %0.2f' % auc) #plt.plot([0,1], [0,1], linestyle="--") ax.set(xlabel="Signal efficiency", ylabel="Background rejection = 1/(1 - bkg eff.)", yscale='log') plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_bkgRej.pdf') plt.show() # Signal significance print("\n///////////////// Signal significance /////////////////") def significance(cut_string_sig, cut_string_bkg, rel_unc=0.3): sig_exp = np.sum(X_test.query("ylabel == 1 & "+cut_string_sig).loc[:,"eventweight"]) bkg_exp = np.sum(X_test.query("(ylabel == 0 | ylabel == 2 | ylabel == 3) & "+cut_string_bkg).loc[:,"eventweight"]) Z_N_exp = RooStats.NumberCountingUtils.BinomialExpZ(sig_exp, bkg_exp, rel_unc) return [sig_exp, bkg_exp, Z_N_exp] #cut_string_DSID = 'DatasetNumber == {0:d}'.format(dsid) if 'low' in args.signal_region: key = '(200, 100)' cut_string_DSID = '(DatasetNumber == 392330 | DatasetNumber == 396210)' elif 'int' in args.signal_region: key = '(500, 200)' cut_string_DSID = 'DatasetNumber == 392325' elif 'high' in args.signal_region: key = '(600, 0)' cut_string_DSID = 'DatasetNumber == 392356' l_cuts = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99] global cut_optimal cut_optimal = 0 Z_N_optimal = 0 for cut in l_cuts: if args.multiclass: cut_string_SR = 'output0 > {:f}'.format(cut) else: cut_string_SR = 'output > {:f}'.format(cut) cut_string_bkg = cut_string_SR cut_string_sig = cut_string_SR + " & " + cut_string_DSID print('\ncut_string_sig:', cut_string_sig) print('cut_string_bkg:', cut_string_bkg) [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, 
rel_unc=0.3) print("---", key) print("S_exp =", sig_exp) print("B_exp =", bkg_exp) for i in range(len(l_X_train_bkg)): l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg] B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"]) print(" {0}: {1}".format(l_bkg[i], B_exp_i)) print("Z_N_exp =", Z_N_exp) if sig_exp >= 3 and bkg_exp >= 1: if Z_N_exp > Z_N_optimal: Z_N_optimal = Z_N_exp cut_optimal = cut # Print the optimal SR values if args.multiclass: cut_string_SR = 'output0 > {:f}'.format(cut_optimal) else: cut_string_SR = 'output > {:f}'.format(cut_optimal) cut_string_bkg = cut_string_SR cut_string_sig = cut_string_SR + " & " + cut_string_DSID print('\ncut_string_sig:', cut_string_sig) print('cut_string_bkg:', cut_string_bkg) [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3) print("---", key) print("Optimal cut =", cut_optimal) print("S_exp =", sig_exp) print("B_exp =", bkg_exp) for i in range(len(l_X_train_bkg)): l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg] B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"]) print(" {0}: {1}".format(l_bkg[i], B_exp_i)) print("Z_N_exp =", Z_N_exp) if args.plot_validation_curve: print("\nCalculating validation curve...") train_scores, valid_scores = validation_curve(model, X_train_scaled, y_train, param_name=param_name, param_range=param_range, cv=3, scoring='roc_auc', n_jobs=-1, verbose=11) train_scores_vc_mean = np.mean(train_scores, axis=1) train_scores_vc_std = np.std(train_scores, axis=1) valid_scores_vc_mean = np.mean(valid_scores, axis=1) valid_scores_vc_std = np.std(valid_scores, axis=1) # Plot validation curves figF, axsF = plt.subplots() # Training score axsF.plot( param_range, train_scores_vc_mean, 'o-', label="Training score", color="darkorange", lw=2) axsF.fill_between( param_range, train_scores_vc_mean - train_scores_vc_std, train_scores_vc_mean + 
train_scores_vc_std, alpha=0.2, color="darkorange", lw=2) # Test score axsF.plot( param_range, valid_scores_vc_mean, 'o-', label="Cross-validation score", color="navy", lw=2) axsF.fill_between( param_range, valid_scores_vc_mean - valid_scores_vc_std, valid_scores_vc_mean + valid_scores_vc_std, alpha=0.2, color="navy", lw=2) axsF.set_xlabel(param_name) axsF.set_ylabel('Score') axsF.legend(loc="best") axsF.set_title('Validation curves') #axsF.set_ylim(0., 1.) plt.savefig(output_dir + 'validation_curve_{}.pdf'.format(param_name)) plt.show() if args.plot_learning_curve: print("\nCalculating learning curve...") train_sizes, train_scores, valid_scores = learning_curve(model, X_train_scaled, y_train, train_sizes=train_sizes, cv=3, scoring='roc_auc', n_jobs=1, verbose=3) train_scores_lc_mean = np.mean(train_scores, axis=1) train_scores_lc_std = np.std(train_scores, axis=1) valid_scores_lc_mean = np.mean(valid_scores, axis=1) valid_scores_lc_std = np.std(valid_scores, axis=1) # Plot learning curves figG, axsG = plt.subplots() # 68% CL bands #if runBDT: #elif runNN: axsG.fill_between( train_sizes, train_scores_lc_mean - train_scores_lc_std, train_scores_lc_mean + train_scores_lc_std, alpha=0.2, color="r", lw=2) axsG.fill_between( train_sizes, valid_scores_lc_mean - valid_scores_lc_std, valid_scores_lc_mean + valid_scores_lc_std, alpha=0.2, color="g", lw=2) # Training and validation scores axsG.plot( train_sizes, train_scores_lc_mean, 'o-', label="Training score", color="r", lw=2) axsG.plot( train_sizes, valid_scores_lc_mean, 'o-', label="Cross-validation score", color="g", lw=2) axsG.set_xlabel("Training examples") axsG.set_ylabel('Score') axsG.legend(loc="best") axsG.set_title('Learning curves') #axsG.set_ylim(0., 1.) plt.savefig(output_dir + 'learning_curve.pdf') plt.show() # Stop timer t_end = time.time() print("\nProcess time: {:4.2f} s".format(t_end - t_start))
def main():
    """Tune an XGBoost classifier, cross-validate it and report test metrics.

    Steps performed:

    1. Load ``X`` and ``y`` from ``./pkl/X.pkl`` and ``./pkl/y.pkl``.
    2. Hold out 30% of the samples as a stratified test set.
    3. Search the XGBoost parameter grid with ``dasksearchCV`` (3-fold,
       ROC AUC) on the training part.
    4. Build a ``Normalizer`` -> ``SMOTETomek`` -> ``XGBClassifier`` pipeline
       that uses the best parameters found, run ``plot_cross_validation`` on
       it with a 5-fold stratified split, fit it, and print the confusion
       matrix, the elapsed time and a classification report for the test set.

    The pipeline's on-disk cache directory is removed before returning.
    """
    import time
    from shutil import rmtree

    start = time.time()

    # NOTE(review): dill.load can execute arbitrary code while unpickling;
    # only point these paths at trusted files.
    with open('./pkl/X.pkl', 'rb') as fh:  # Load data set
        X = dill.load(fh)
    with open('./pkl/y.pkl', 'rb') as fh:
        y = dill.load(fh)

    # A parameter grid for XGBoost
    params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1, 3, 4, 5, 10],
    }

    # Single stratified 70/30 split into training and test set.
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    train_index, test_index = next(sss.split(X, y))
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Hyper-parameter search on the bare classifier.
    search = dasksearchCV(XGBClassifier(), params, n_jobs=8, cv=3,
                          scoring='roc_auc', refit=True)
    search.fit(X_train, y_train)
    print(search.best_params_)
    print(search.best_score_)
    best_parameters, score = search.best_params_, search.best_score_
    print('Raw AUC score:', score)
    for param_name in sorted(best_parameters):
        print("%s: %r" % (param_name, best_parameters[param_name]))

    # Build the final estimator from the tuned parameters. The pipeline is
    # assembled only now so that it really contains the tuned classifier
    # (previously it kept the untuned default instance), and the parallelism
    # option is spelled `n_jobs` (it used to be the misspelled `njobs`).
    classifier = XGBClassifier(**best_parameters, n_jobs=-1)
    scaler = Normalizer()
    # NOTE(review): newer imbalanced-learn releases name this argument
    # `sampling_strategy` instead of `ratio` - confirm against the installed
    # version.
    smote_etomek = SMOTETomek(ratio='auto')
    cv = StratifiedKFold(n_splits=5, shuffle=True)

    cachedir = mkdtemp()
    try:
        pipeline = Pipeline([
            ('scaler', scaler),
            ('smt', smote_etomek),
            ('clf', classifier),
        ], memory=cachedir)

        # do 5 fold stratified cross-validation
        plot_cross_validation(cv, X_train, y_train, pipeline)

        clf = pipeline.fit(X_train, y_train)
        expected = y_test
        predicted = clf.predict(X_test)  # test performance on test set
        plot_confusion_matrix(confusion_matrix(expected, predicted),
                              classes=["Non-Zika", "Zika"])
        print(time.time() - start)

        from sklearn import metrics
        print("Classification report for classifier %s:\n%s\n" %
              (clf, metrics.classification_report(expected, predicted)))
    finally:
        # The cache directory only serves this run; do not leave it behind.
        rmtree(cachedir, ignore_errors=True)
learning_rate =0.01, n_estimators=1000, max_depth=5, min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42 ) xgb_param = xgb.get_xgb_params() xgb_param cvresult = xgboost.cv(xgb_param, xgtrain, num_boost_round=xgb.get_params()['n_estimators'], nfold=5, metrics='auc', early_stopping_rounds=50, seed=42 ) cvresult.head() cvresult.shape xgb_best_param = {'n_estimators': cvresult.shape[0]} xgb_best_param # best n_estimators value to be used in the stack model # update xgb with the optimal n_estimators
class XGBoostClassifier(ClassifierBase):
    """Binary XGBoost classifier with optional CV-based choice of tree count.

    The wrapped estimator is stored in ``self.clf``. When ``useTrainCV`` is
    true, ``train`` first runs ``xgb.cv`` with early stopping and uses the
    number of boosting rounds it kept as ``n_estimators`` for the final fit.
    """

    def __init__(self, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        """Create the underlying ``XGBClassifier`` with fixed hyper-parameters.

        Args:
            useTrainCV: Run ``xgb.cv`` before fitting to pick ``n_estimators``.
            cv_folds: Number of folds passed to ``xgb.cv``.
            early_stopping_rounds: Early-stopping patience passed to ``xgb.cv``.
        """
        super(XGBoostClassifier, self).__init__()
        self.useTrainCV = useTrainCV
        self.cv_folds = cv_folds
        self.early_stopping_rounds = early_stopping_rounds
        self.clf = XGBClassifier(learning_rate=0.1,
                                 n_estimators=140,
                                 max_depth=5,
                                 min_child_weight=3,
                                 gamma=0.2,
                                 subsample=0.6,
                                 colsample_bytree=1.0,
                                 objective='binary:logistic',
                                 n_jobs=6,
                                 scale_pos_weight=1,
                                 seed=27)

    def train(self, X_train, y_train):
        """Fit ``self.clf`` on the training data.

        Args:
            X_train: Training features, accepted by ``xgb.DMatrix`` and
                ``XGBClassifier.fit``.
            y_train: Training labels.
        """
        if self.useTrainCV:
            print("Start Feeding Data for Cross Validation")
            xgb_param = self.clf.get_xgb_params()
            xgtrain = xgb.DMatrix(X_train, label=y_train)
            cvresult = xgb.cv(
                xgb_param,
                xgtrain,
                num_boost_round=self.clf.get_params()['n_estimators'],
                nfold=self.cv_folds,
                early_stopping_rounds=self.early_stopping_rounds)
            # xgb.cv returns one row per boosting round it kept, so the row
            # count is the tree count to train with. (The result table itself
            # must not be unpacked into set_params: its columns are metric
            # names, not estimator parameters.)
            self.clf.set_params(n_estimators=cvresult.shape[0])

        # NOTE(review): passing eval_metric to fit() is deprecated in recent
        # xgboost releases - confirm against the installed version.
        self.clf.fit(X_train, y_train, eval_metric='auc')

    def predict(self, X_test, y_test=None):
        """Return the predicted probability of the positive class.

        Args:
            X_test: Features to score.
            y_test: Optional true labels; when given, the estimator score,
                accuracy, F1 and AUC are printed.

        Returns:
            Column 1 of ``self.clf.predict_proba(X_test)``.
        """
        y_pred_proba = self.clf.predict_proba(X_test)[:, 1]
        if y_test is not None:
            print("Score: ", self.clf.score(X_test, y_test))
            y_pred = self.clf.predict(X_test)
            print("Acc : %.4g" % metrics.accuracy_score(y_test, y_pred))
            print("F1 score is: {}".format(f1_score(y_test, y_pred)))
            print("AUC Score is: {}".format(roc_auc_score(
                y_test, y_pred_proba)))
        return y_pred_proba

    def printFeatureImportance(self, X_train):
        """Print the feature (column) names of ``X_train``.

        Args:
            X_train: Object exposing ``columns.tolist()`` (e.g. a DataFrame).
        """
        feat = X_train.columns.tolist()
        print('Importance feats:', feat)

    def save(self, path):
        """Dump the estimator to ``clf.joblib`` inside directory ``path``."""
        dump(self.clf, os.path.join(path, 'clf.joblib'))

    def load(self, path):
        """Replace the estimator with the one stored in ``path``/clf.joblib."""
        self.clf = load(os.path.join(path, 'clf.joblib'))