def train_gbtree(X_train, y_train):

    # Training
    print('Training model...')
    # shuffle X and y
    X_train, y_train = shuffle(X_train, y_train, random_state=0)
    if args.gb_tool == 'xgboost':
        model = XGBClassifier(
            objective='binary:logistic',
            booster='gbtree',
            learning_rate=0.05,
            n_estimators=200,
            max_depth=3,
            min_child_weight=6,
            verbosity=1,
        )
        model.fit(X_train, y_train)
        params = model.get_params()
    else:
        model = CatBoostClassifier(
            verbose=0,
            cat_features=cat_features,
            random_state=args.rs_model,
            # scale_pos_weight=(1 - pos_rate) / pos_rate
        )
        model.fit(X_train, y_train)
        params = model.get_all_params()

    print('Parameters:', params)
    print('Done.')

    return model
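
# A hypothetical driver sketch for the train_gbtree snippet above. The function
# reads the module-level names args, cat_features and shuffle, so a flat script
# could provide them like this; argparse.Namespace stands in for the real CLI
# parsing and the toy data is an assumption, not part of the original code.
import argparse
import numpy as np
from sklearn.utils import shuffle          # used inside train_gbtree
from xgboost import XGBClassifier          # used inside train_gbtree

args = argparse.Namespace(gb_tool='xgboost', rs_model=0)
cat_features = []                          # only needed on the CatBoost branch

rng = np.random.RandomState(0)
X_train, y_train = rng.rand(200, 8), rng.randint(0, 2, 200)
model = train_gbtree(X_train, y_train)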
def xgb_classifier(X_train, X_test, y_train, y_test, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    alg = XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
                        min_child_weight=3, gamma=0.2, subsample=0.6, colsample_bytree=1.0,
                        objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27)
    cvresult = None
    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        display(cvresult)
        # xgb.cv stops at the best round, so its row count is the tuned n_estimators
        alg.set_params(n_estimators=cvresult.shape[0])

    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Model performance
    print("\nModel statistics")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
    res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
    return cvresult, alg
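
# A hypothetical call sketch for xgb_classifier above. It expects pandas objects
# (it uses .values and .columns) and relies on the snippet's own imports
# (xgb, XGBClassifier, metrics, pd, plt, display) plus a pre-2.0 xgboost where
# fit(..., eval_metric=...) is still accepted. The toy data is an assumption.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, n_features=15, weights=[0.9, 0.1], random_state=0)
X = pd.DataFrame(X, columns=['V%d' % i for i in range(15)])
y = pd.Series(y)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
cvresult, model = xgb_classifier(X_tr, X_te, y_tr, y_te, useTrainCV=True)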
def train_gbtree(X_train, y_train, pos_rate, args):
    # Training
    print('Training model...')
    if args.gb_tool == 'xgboost':
        model = XGBClassifier(objective='binary:logistic',
                              booster='gbtree',
                              learning_rate=0.05,
                              n_estimators=200,
                              max_depth=3,
                              min_child_weight=6,
                              verbosity=1
                              )
    else:
        model = CatBoostClassifier(verbose=0,
                                   # scale_pos_weight=(1 - pos_rate) / pos_rate,
                                   learning_rate=args.lr,
                                   depth=args.depth,
                                   l2_leaf_reg=args.l2
                                   )

    model.fit(X_train, y_train)
    params = model.get_params() if args.gb_tool == 'xgboost' else model.get_all_params()
    print('Parameters:', params)
    print('Done.')

    return model
def xgb_cv(X, y):
    # Instantiate XGBoost
    n_estimators = 100
    dtrain = xgb.DMatrix(X, y)

    # XGBoost was tuned on the raw data.
    bst = XGBClassifier(n_estimators=n_estimators, #70
                        max_depth=3, 
                        min_child_weight=5, 
                        gamma=0.5, 
                        learning_rate=0.05, 
                        subsample=0.7, 
                        colsample_bytree=0.7, 
                        reg_alpha=0.001,
                        seed=1)

    # Cross-validate XGBoost
    params = bst.get_xgb_params() # Extract parameters from XGB instance to be used for CV
    num_boost_round = bst.get_params()['n_estimators'] # XGB-CV has different names than sklearn

    cvresult = xgb.cv(params, dtrain, num_boost_round=num_boost_round, 
                      nfold=10, metrics=['logloss', 'auc'], seed=1)

    print("="*80)
    print("\nXGBoost results for 10-fold cross-validation:")
    print(cvresult)
    print("="*80)

    # XGBoost summary
    print("="*80)
    print("\nXGBoost summary for 100 rounds of 10-fold cross-validation:")
    print("\nBest mean log-loss: %.4f" % cvresult['test-logloss-mean'].min())
    print("\nBest mean AUC: %.4f" % cvresult['test-auc-mean'].max())
    print("="*80)
Example #5
def done(istrain=True):
#    test_save.drop('click',axis=1,inplace=True)
#    op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    #  scale_pos_weight   rate_drop
    op=['reg_alpha']
    if istrain:
        train_save = gdbt_data_get_train(25)
        
        np.random.seed(999)
        r1 = np.random.uniform(0, 1, train_save.shape[0])  # generate random numbers for the 0~40M rows (used for sub-sampling below)
#        train_save = train_save.ix[r1 < 0.2, :]
        print(train_save.shape)
        y_train = train_save['click']
        train_save.drop('click',axis=1,inplace=True)
        X_train = train_save
#        dtrain = xgb.DMatrix(X_train, label=y_train)
#        n_estimators = [i for i in range(200,1000,1)]
        xgb1 = XGBClassifier(**gbtree_param,
                             objective='binary:logistic',
                             eval_metric=['logloss'],
                             nthread=-1,
                             verbose=2,
                             seed=27,
                             silent=True,
                             **gpu_dict)
        for i,oper in enumerate(op):
            modelfit_cv(xgb1, X_train,y_train, cv_folds = kfold,cv_type=oper,random_state=i)        
            logging.debug(oper+":to save validation predictions ...")
            ret=dump(xgb1, FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat') 
            logging.debug(ret)
        del train_save
        del X_train
        del y_train
    else:
        X_test = gdbt_data_get_test()
        print(X_test.shape)
#        X_test.drop('click',axis=1,inplace=True)

        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_test)[:,1]
            logging.debug(dtrain_predprob)
            y_pred = [round(value,4) for value in dtrain_predprob]
            logging.debug('-'*30)
            y_pred=np.array(y_pred).reshape(-1,1)
            logging.debug(y_pred.shape)
            test_id=pd.read_csv(FLAGS.tmp_data_path+'test_id.csv')
            logging.debug(test_id['id'].shape)
            test_id['id']=test_id['id'].map(int)
            test_id['click']=y_pred
            test_id.to_csv(FLAGS.tmp_data_path+'1-'+oper+'-xgboost.test.csv',index=False)
        del X_test
Example #6
    def get_params(self, deep=True):
        '''
        A hack to make get_params work with the XGBoost code, which uses __bases__[0] to retrieve the parameters.
        Since I override base_class[0] with OnehotEncodingClassifierMixin, temporarily reassign the base class
        (and the class itself) to XGBClassifier while delegating.
        '''
        orig_bases = copy.deepcopy(self.__class__.__bases__)
        self.__class__.__bases__ = (XGBClassifier, )
        self.__class__ = XGBClassifier

        params = XGBClassifier.get_params(self, deep=deep)
        self.__class__ = MyXGBClassifier
        self.__class__.__bases__ = orig_bases
        return params
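
# Background sketch (not from the original code): sklearn's get_params() is
# driven by the __init__ signature of type(self), and the XGBoost wrapper also
# walks __bases__[0] (the "base class 0" the docstring mentions), which is why
# the hack above temporarily rewrites __class__/__bases__ before delegating.
from sklearn.base import BaseEstimator

class TinyEstimator(BaseEstimator):
    def __init__(self, alpha=1.0, beta=2.0):
        self.alpha = alpha
        self.beta = beta

print(TinyEstimator(alpha=0.5).get_params())   # {'alpha': 0.5, 'beta': 2.0}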
Example #7
def xgboost_k_default(
        k=4,
        sequence_origin='DairyDB',
        primers_origin='DairyDB',
        taxonomy_level: int = 1,
        selected_primer: str = 'V4',
        model_preprocessing='Computing frequency of {}-mer (ATCG) in every sequence',
        test_size=0.2):
    """
    Apply Random Forest model on a set of sequence preprocessed data.
    :return:
    """
    model_preprocessing = model_preprocessing.format(k)
    X_train, X_test, y_train, y_test = ETL_k_mer(
        k=k,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer)

    XGB = XGBClassifier(silent=0, eta=0.3, max_depth=3, n_estimators=100)
    y_pred = XGB.fit(X_train, y_train).predict(X_test)

    test_size, prop_main_class, accuracy = main_stats_model(
        y_train=y_train,
        y_test=y_test,
        y_pred=y_pred,
        model_name='XGB_{}'.format(k),
        model_parameters=XGB.get_params(),
        model_preprocessing=model_preprocessing,
        sequence_origin=sequence_origin,
        primers_origin=primers_origin,
        taxonomy_level=taxonomy_level,
        selected_primer=selected_primer,
        test_size=test_size,
        k=k,
        feature_importances=XGB.feature_importances_,
        xgb_model=XGB,
        save_model=True,
        save_tree=20)

    del XGB, X_train, X_test, y_train, y_test, y_pred

    return test_size, prop_main_class, accuracy
Example #8
 def __init__(
     self,
     model: XGBClassifier,
     feature_names: List[str],
     classification_labels: Optional[List[str]] = None,
 ):
     super().__init__(
         model.get_booster(),
         feature_names,
         model.base_score,
         model.objective,
         classification_labels,
     )
     if model.classes_ is None:
         n_estimators = model.get_params()["n_estimators"]
         num_trees = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
         self._num_classes = num_trees // n_estimators
     else:
         self._num_classes = len(model.classes_)
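
# Sketch of the relationship the constructor above relies on (toy data is an
# assumption): for a multiclass objective XGBoost grows one tree per class per
# boosting round, so total trees // n_estimators recovers the class count.
import numpy as np
from xgboost import XGBClassifier

rng = np.random.RandomState(0)
X, y = rng.rand(300, 5), rng.randint(0, 3, 300)            # 3 classes
model = XGBClassifier(n_estimators=10).fit(X, y)

num_trees = model.get_booster().trees_to_dataframe()["Tree"].max() + 1
print(num_trees // model.get_params()["n_estimators"])     # -> 3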
Example #9
def opt_BDT(input, output, params, show, names):

    model = XGBClassifier(**params)
    xgb_param = model.get_xgb_params()
    cvscores = []
    AUC = []
    X_train, X_test, y_train, y_test = train_test_split(input,
                                                        output,
                                                        test_size=0.2,
                                                        random_state=42)
    matrix_train = xgb.DMatrix(X_train, label=y_train)
    cvresult = xgb.cv(
        xgb_param,
        matrix_train,
        num_boost_round=model.get_params()["n_estimators"],
        nfold=5,
        metrics="auc",
        early_stopping_rounds=30,
        verbose_eval=True,
    )
    model.set_params(n_estimators=cvresult.shape[0])
    model.fit(X_train, y_train, eval_metric="auc")
    y_prob = model.predict_proba(X_test)
    y_pred = model.predict(X_test)
    prediction = [round(value) for value in y_pred]
    auc = roc_auc_score(y_test, y_prob[:, 1])
    accuracy = accuracy_score(y_test, prediction)

    print("Accuracy: %.2f%%; AUC = %.4f%" % (accuracy * 100, auc))
    if show:

        name = "channel_" + str(channel) + "_BDT"
        name = "%s_%s" % (name, selection)
        modelname = "models/%s.h5" % name
        print("Save to %s" % modelname)

        plotter.plot_separation(model, X_test, y_test, name, False)
        plotter.plot_ROC(model, X_test, y_test, name, False)
        model.get_booster().feature_names = names
        mp.rc("figure", figsize=(5, 5))
        plot_importance(model.get_booster())
        plt.subplots_adjust(left=0.3)
        plt.show()
def grid_search_para(train_data,
                     label,
                     best_para=0,
                     grid_param=0,
                     is_search_estimator=False,
                     search_lr=0.1,
                     scoring='accuracy',
                     search_estimators=10000,
                     iid=False,
                     cv=skfold):
    if not is_search_estimator:
        for key, value in grid_param.items():
            print('start GridSearchCV {} in range {}'.format(key, value))

        xgb_ = XGBClassifier(**best_para)

        grid_search = GridSearchCV(estimator=xgb_,
                                   param_grid=grid_param,
                                   scoring=scoring,
                                   iid=iid,
                                   cv=cv)

        grid_search.fit(train_data, label)

        best_para.update(grid_search.best_params_)

        print('the best parameter is ', grid_search.best_params_)
        print('the best score is %f' % grid_search.best_score_)

    else:
        xgb_ = XGBClassifier()
        if best_para == 0:
            best_para = xgb_.get_params()
        best_para['n_estimators'] = search_estimators
        best_para['learning_rate'] = search_lr
        xgb_ = XGBClassifier(**best_para)

        best_estimator = xgb_cv(xgb_, train_data, label)

        best_para['n_estimators'] = best_estimator

    return best_para
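
# Self-contained sketch of the staged tuning pattern grid_search_para wraps
# (toy data and grids are assumptions): fix the learning rate, then grid-search
# a couple of tree parameters at a time. The iid argument used above was removed
# in newer scikit-learn, so it is simply omitted here.
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)

grid = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=100),
                    param_grid={'max_depth': [3, 5, 7],
                                'min_child_weight': [1, 3, 5]},
                    scoring='accuracy', cv=skf)
grid.fit(X, y)
print('best params:', grid.best_params_, 'best score: %.4f' % grid.best_score_)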
Example #11
def done(istrain='train'):
#    columns=['hour_t1_start_hour_len_32', 'app_id_weight', 'every_app_len', 'type_no', 'times_len', 'brand', 'close_hour_len_t1_32', 'brand_cnt', 'app_len', 'app_time_t2_61', 'type_no_cnt', 'close_day_t1_32', 'hour_t1_close_hour_weight_32', 'close_hour_t1_32', 'hour_t1_start_hour_weight_32', 'hour_t2_start_hour_len_132', 'hour_t2_start_hour_len_94', 'close_hour_t1_33', 'close_day_t1_33', 'close_hour_t1_43', 'close_hour_t1_22', 'hour_t2_start_hour_len_124', 'hour_t2_start_hour_len_251', 'close_day_t1_19', 'close_hour_t1_36', 'hour_t1_start_hour_size_32', 'close_day_t1_4', 'close_day_size_t1_32', 'close_hour_t1_4', 'hour_t1_close_hour_weight_19', 'close_day_t1_36', 'close_day_t1_43', 'hour_t2_close_hour_weight_124', 'hour_t1_start_hour_len_33', 'close_hour_len_t1_33', 'hour_t2_start_hour_len_11', 'hour_t2_start_hour_len_158', 'close_hour_t1_19', 'hour_t2_start_hour_weight_132', 'close_day_size_t1_33', 'hour_t1_start_hour_size_36', 'hour_t1_start_hour_len_17', 'hour_t1_start_hour_len_43', 'app_time_t1_32',]
    
#    test_save.drop('click',axis=1,inplace=True)
#    op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    #  scale_pos_weight   rate_drop
    logging.debug(istrain) 
    op=['fin']
    if istrain=='train':
        train_save = gdbt_data_get_train('n_class')
        
#        np.random.seed(999)
#        train_save = train_save.ix[r1 < 0.2, :]
        print(train_save.shape)
        y_train = train_save['n_class']
        train_save.drop('n_class',axis=1,inplace=True)
        X_train = train_save
#        X_train = train_save.ix[:,columns]
        
        """
        归一化
        """
        X_train=data_normalization(X_train)
        
        """
        PCA
        """
        X_train=data_pca(X_train)
        
        
#        dtrain = xgb.DMatrix(X_train, label=y_train)
#        n_estimators = [i for i in range(200,1000,1)]
        xgb1 = XGBClassifier(**gbtree_param,
                             objective='multi:softprob',
                             eval_metric=['mlogloss'],
                             nthread=-1,
                             verbose=2,
                             seed=27,
                             silent=True,
                             **gpu_dict)
        for i,oper in enumerate(op):
            modelfit_multi_cv(xgb1, X_train,y_train,cv_type=oper,)#random_state=i)        
            logging.debug(oper+":to save validation predictions ...")
            ret=dump(xgb1, FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat') 
            logging.debug(ret)
            gc.collect()
#            xgb1.save_model(FLAGS.tmp_data_path+'xgb_new_features.model')
#        feature selection
#        feature_selectfrommodel(xgb1, X_train,y_train)
        del train_save
        del X_train
        del y_train
    elif istrain=='eval':
        
        X_eval = gdbt_data_get_eval('n_class')
        print(X_eval.shape)
        y_eval = X_eval['n_class']
        X_eval.drop('n_class',axis=1,inplace=True)
        logging.debug(X_eval.shape)
#        X_eval = X_eval.ix[:,columns]
        
        """
        归一化
        """
        X_eval=data_normalization(X_eval)
        
        """
        PCA
        """
        X_eval=data_pca(X_eval)
        
        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_eval)
            logging.debug(dtrain_predprob.shape)
            columns=[]
            for i in [1,2]:
                for j in range(11):
                    columns.append(str(i)+'-'+str(j))
            # round the class probabilities to 6 decimals
            y_pred = pd.DataFrame(dtrain_predprob, columns=columns).round(6)
    
    
            logging.debug('-'*30)
            logging.debug(test_score(y_pred,y_eval))
        

        del X_eval
    elif 'train_predict'==istrain:
        
        train_save = gdbt_data_get_train('n_class')
        print(train_save.shape)
        y_train = train_save['n_class']
        train_save.drop('n_class',axis=1,inplace=True)
        X_train = train_save
        X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size = 0.8,random_state = 7)
        dtrain = xgb.DMatrix(X_train_part, label=y_train_part)
        dvalid = xgb.DMatrix(X_val, label=y_val)
    #    del y_train  
    
#        watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
#    #    logging.debug (X_train_part.shape, y_train_part.shape)
#        plst = list(gbtree_param.items()) + [('eval_metric', 'mlogloss')]
#        FLAGS.n_trees=gbtree_param['n_estimators']
#        xgb_test_basis = xgb.train(plst, dtrain, FLAGS.n_trees, watchlist)
#        xgb_test_basis.save_model(FLAGS.tmp_data_path+'xgb_new_features.model')
#        del dtrain,dvalid
#        gc.collect()
        xgb_test_basis = xgb.Booster({'nthread':-1}) #init model
        xgb_test_basis.load_model(FLAGS.tmp_data_path+'xgb_new_features.model') # load data
#        xgb_test_basis = load(FLAGS.tmp_data_path+'xgb_new_features.model')
        dtrain = xgb.DMatrix(X_train, label=y_train)
        xgb_leaves = xgb_test_basis.predict(dtrain, pred_leaf = True)
        
        new_pd = pd.DataFrame()
        logging.debug(xgb_leaves.shape)
        for i in range(FLAGS.n_trees):
            pred2 = xgb_leaves[:, i]
#            logging.debug(i, np.unique(pred2).size)
            new_pd['xgb_basis'+str(i)] = pred2
    
    #    train_save = gdbt_data_get_train(799)
    
        idx_base = 0
        for vn in ['xgb_basis' + str(i) for i in range(FLAGS.n_trees)]:
            _cat = np.asarray(new_pd[vn].astype('category').values.codes, dtype='int32')
            _cat1 = _cat + idx_base
#            logging.debug(vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size)
            new_pd[vn] = _cat1
            idx_base += _cat.max() + 1
        logging.debug(new_pd.shape)
        logging.debug(new_pd.head(3))
        new_pd.to_csv(FLAGS.tmp_data_path+'xgb_new_train_features.csv',index=False)
        gc.collect()
        
        
    elif 'test_predict'==istrain:
        X_test = gdbt_data_get_test()
        logging.debug(X_test.shape)
        oper=op[0]
        xgb_test_basis = xgb.Booster({'nthread':-1}) #init model
        xgb_test_basis.load_model(FLAGS.tmp_data_path+'xgb_new_features.model') # load data

        dtrain = xgb.DMatrix(X_test)
#        xgb_test_basis = load(FLAGS.tmp_data_path+'xgb_new_features.model')
        xgb_leaves = xgb_test_basis.predict(dtrain, pred_leaf = True)
        FLAGS.n_trees=gbtree_param['n_estimators']
        new_pd = pd.DataFrame()
        logging.debug(xgb_leaves.shape)
        for i in range(FLAGS.n_trees):
            pred2 = xgb_leaves[:, i]
#            logging.debug(i, np.unique(pred2).size)
            new_pd['xgb_basis'+str(i)] = pred2
    
    #    train_save = gdbt_data_get_train(799)
    
        idx_base = 0
        for vn in ['xgb_basis' + str(i) for i in range(FLAGS.n_trees)]:
            _cat = np.asarray(new_pd[vn].astype('category').values.codes, dtype='int32')
            _cat1 = _cat + idx_base
#            logging.debug(vn, idx_base, _cat1.min(), _cat1.max(), np.unique(_cat).size)
            new_pd[vn] = _cat1
            idx_base += _cat.max() + 1
        logging.debug(new_pd.shape)
        logging.debug(new_pd.head(3))
        new_pd.to_csv(FLAGS.tmp_data_path+'xgb_new_test_features.csv',index=False)
    elif istrain=='test':
        X_test = gdbt_data_get_test()
        print(X_test.shape)
#        X_test = X_test.ix[:,columns]
#        X_test.drop('click',axis=1,inplace=True)
        """
        归一化
        """
        X_test=data_normalization(X_test)
        
        """
        PCA
        """
        X_test=data_pca(X_test)
        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+'xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_test)
            logging.debug(dtrain_predprob.shape)
            columns=[]
            for i in [1,2]:
                for j in range(11):
                    columns.append(str(i)+'-'+str(j))
            # round the class probabilities to 6 decimals
            y_pred = pd.DataFrame(dtrain_predprob, columns=columns).round(6)


            logging.debug('-'*30)
#            y_pred=np.array(y_pred).reshape(-1,1)
            logging.debug(y_pred)
            test_id=pd.read_csv(FLAGS.file_path+'deviceid_test.csv')
            logging.debug(test_id['device_id'].shape)
            test_id['device_id']=test_id['device_id'].map(str)
            test_id.rename(columns={'device_id':'DeviceID'}, inplace = True)
            fin=pd.concat([test_id,y_pred],axis=1)
            
            print(fin)

            
            fin.to_csv(FLAGS.tmp_data_path+'1-'+oper+'-xgboost.test.csv',index=False)
        del X_test
Example #12
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=0,
                              seed=None,
                              missing=None,
                              sample_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    validate(greater_than_or_equal_to(max_depth, 1, 'max_depth'),
             greater_than_or_equal_to(learning_rate, 0.0, 'learning_rate'),
             greater_than_or_equal_to(n_estimators, 1, 'n_estimators'))

    # keyword arguments keep the constructor call robust across xgboost versions
    classifier = XGBClassifier(max_depth=max_depth, learning_rate=learning_rate,
                               n_estimators=n_estimators, silent=silent, objective=objective,
                               booster=booster, n_jobs=n_jobs, nthread=nthread, gamma=gamma,
                               min_child_weight=min_child_weight, max_delta_step=max_delta_step,
                               subsample=subsample, colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel, reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda, scale_pos_weight=scale_pos_weight,
                               base_score=base_score, random_state=random_state,
                               seed=seed, missing=missing)
    classifier.fit(table[feature_cols], table[label_col], sample_weight,
                   eval_set, eval_metric, early_stopping_rounds, verbose,
                   xgb_model, sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    #     plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    #     get_param_list = []
    #     get_param_list.append(['feature_cols', feature_cols])
    #     get_param_list.append(['label_col', label_col])

    params = dict2MD(get_param)
    #     for key, value in get_param.items():
    #         temp = [key, value]
    #         get_param_list.append(temp)
    #     get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_cols).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Importance
    | {fig_importance}
    |
    | ### Feature Importance
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
Example #13
xgb_clf = XGBClassifier(learning_rate=0.1,
 n_estimators=1000,
 max_depth=9,
 min_child_weight=1,
 gamma=0.2,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27,
 reg_alpha=1e-05)

xgb_param = xgb_clf.get_xgb_params()
xgtrain = xgb.DMatrix(x_train, label=y_train)
cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb_clf.get_params()['n_estimators'], nfold=5,
                          metrics='auc', early_stopping_rounds=50)
xgb_clf.set_params(n_estimators=cvresult.shape[0])
xgb_clf.fit(x_train, y_train)
y_pred_xgb=xgb_clf.predict(x_test)
y_pred_xgb_test_data=xgb_clf.predict(test)
score = accuracy_score(y_test, y_pred_xgb)
f1_score_xgboost=f1_score(y_test,y_pred_xgb)

print(cvresult.shape[0])


print("\nModel Report")
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pred_xgb))
Example #14
rfc.n_features_
# check the generalization ability
cross_val_score(rfc, X_proc, y, cv=5)
cross_val_score(rfc, X_proc, y, cv=5).mean()
# predict
y_test = rfc.predict(X_test_proc)
y_test_df = pd.DataFrame(y_test, index=X_test.index)


# use XGBoost
from xgboost import XGBClassifier
# use the default parameter configuration
xgbc = XGBClassifier()
xgbc.fit(X_proc,y)
xgbc.get_booster()
xgbc.get_params()
# evaluate with cross-validation
cross_val_score(xgbc, X_proc, y, cv=5)
# predict
y_test = xgbc.predict(X_test_proc)
y_test_df = pd.DataFrame(y_test, index=X_test.index)



# ---------------- sample code for the Kaggle exercise ----------------
dtype = {'PassengerId': str}
train_all = pd.read_csv("train.csv", dtype=dtype)
# drop a column by its index
train = train_all.drop(train_all.columns[1], axis=1)
# train['PassengerId'] = train['PassengerId'].astype(str)
test = pd.read_csv("test.csv", dtype=dtype)
Example #15
 def get_params(self, deep=False):
     model5 = XGBClassifier(max_depth=10,
                            n_estimators=1000,
                            learning_rate=0.1)
     return model5.get_params(deep=deep)
Example #16
            color='m')

#prettify using pyplot: https://matplotlib.org/api/pyplot_api.html
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')

#base model - tune 1
xgboost = XGBClassifier()
base_results = model_selection.cross_validate(xgboost,
                                              data1[data1_x_bin],
                                              data1[Target],
                                              cv=cv_split)
xgboost.fit(data1[data1_x_bin], data1[Target])

print('BEFORE DT Parameters: ', xgboost.get_params())
print("BEFORE DT Training w/bin score mean: {:.2f}".format(
    base_results['train_score'].mean() * 100))
print("BEFORE DT Test w/bin score mean: {:.2f}".format(
    base_results['test_score'].mean() * 100))
print("BEFORE DT Test w/bin score 3*std: +/- {:.2f}".format(
    base_results['test_score'].std() * 100 * 3))
#print("BEFORE DT Test w/bin set score min: {:.2f}". format(base_results['test_score'].min()*100))
print('-' * 10)

#tune hyper-parameters: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
param_grid = {
    'n_estimators': [50, 100, 200, 400],
    'max_depth': [2, 4, 6, 8, 10],  #max depth tree can grow; default is none
    'random_state': [
        0
Example #17
def done(istrain,X_train,y_train,flag):
#    test_save.drop('click',axis=1,inplace=True)
#    op=['n_estimators','max_depth','min_child_weight','subsample','reg_alpha','gamma','fin']
    op=['n_estimators']

    if istrain=='train':
        xgb1 = XGBClassifier(**gbtree_param,
                             objective='multi:softprob',
                             eval_metric=['mlogloss'],
                             nthread=-1,
                             verbose=1,
                             seed=27,
                             silent=True,
                             **gpu_dict)
        for i,oper in enumerate(op):
            modelfit_multi_cv(xgb1, X_train,y_train,cv_type=oper,random_state=i)        
            logging.debug(oper+":to save validation predictions ...")
            ret=dump(xgb1, FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat') 
            logging.debug(ret)
            gc.collect()
        del X_train
        del y_train
    elif istrain=='eval':
        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
#            dtrain_predprob = xgb1.predict_proba(X_train)
            y_pred = xgb1.predict(X_train)
        
            acc = accuracy_score(y_train, y_pred)
            logging.debug('acc:'+str( acc*100.0)+'%')
    
    
            logging.debug('-'*30)
            
        

        del X_train
    elif istrain=='test':

        for oper in op:
            xgb1 = load(FLAGS.tmp_data_path+flag+'_xgboost.cv_'+oper+'.model.joblib_dat')
            logging.debug(xgb1.get_params()['n_estimators'])
            dtrain_predprob = xgb1.predict_proba(X_train)
            logging.debug(dtrain_predprob.shape)
            columns=[]
            for i in [1,2]:
                for j in range(11):
                    col=str(i)+'-'+str(j)
                    columns.append(col)
            # round the class probabilities to 6 decimals
            y_pred = pd.DataFrame(dtrain_predprob, columns=columns).round(6)


            logging.debug('-'*30)
#            y_pred=np.array(y_pred).reshape(-1,1)
            logging.debug(y_pred)
            test_id=pd.read_csv(FLAGS.file_path+'deviceid_test.csv')
            logging.debug(test_id['device_id'].shape)
            test_id['device_id']=test_id['device_id'].map(str)
            test_id.rename(columns={'device_id':'DeviceID'}, inplace = True)
            fin=pd.concat([test_id,y_pred],axis=1)
            
#            print(fin)
#
#            
#            
##            df1=pd.read_csv(FLAGS.tmp_data_path+'sex_fin-xgboost.test.csv')
#            fin['sex']=X_train['sex'].values
#            columns=['DeviceID']
#
#                    fitle=np.logical_and(fin['sex'].values==i,True)
#                    fin.ix[fitle,col]=fin.ix[fitle,str(j)].values
#                    fin.ix[np.logical_and(fitle,False),col]=0.000001
#            
#            fin.drop('sex',axis=1,inplace=True)
            fin.to_csv(FLAGS.tmp_data_path+flag+'_'+oper+'-xgboost.test.csv',index=False)
#            test_concat(df1,fin)
        del X_train
        return fin
Example #18
    y_train,
    scoring='accuracy',
    return_estimator=True
)

score_mean = val_info['test_score'].mean()
score_std = val_info['test_score'].std()
print(f'{score_mean} accuracy with a standard deviation of {score_std}')

# Cell
clf = val_info['estimator'][0]

# Cell
importance_df = pd.DataFrame({'feature':X_train.columns, 'importance': clf.feature_importances_}).sort_values('importance', ascending=False)

# Cell
importance_df.to_html('../output/data/feature_importance.html', index=False)

# Cell
if LOG_MLFLOW:
    with mlflow.start_run(experiment_id=EX_ID):
        mlflow.log_param('num_features', X_train.shape[1])
        mlflow.log_param('n_estimators', clf.get_params()['n_estimators'])
        mlflow.log_param('max_depth', clf.get_params()['max_depth'])
        mlflow.log_param('learning_rate', clf.get_params()['learning_rate'])
        mlflow.log_param('booster', clf.get_params()['booster'])

        mlflow.log_metric('mean_accuracy', score_mean)
        mlflow.log_metric('std_accuracy', score_std)

        mlflow.log_artifact('../output/data/feature_importance.html')
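
# Sketch of the same get_params-driven logging with a standalone model
# (assumes mlflow is installed and uses its default local tracking; the
# parameter names mirror the ones logged above).
import mlflow
from xgboost import XGBClassifier

clf = XGBClassifier(n_estimators=200, max_depth=4, learning_rate=0.05)
with mlflow.start_run():
    for key in ('n_estimators', 'max_depth', 'learning_rate', 'booster'):
        mlflow.log_param(key, clf.get_params()[key])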
y_devset = devset4['is_used']

N_ESTIMATORS = [10, 50, 100, 300, 500, 1000]
MAX_DEPTH = [4, 6, 8]
LEARNING_RATE = [0.05, 0.1, 0.15, 0.2]


param_grid = dict(max_depth = MAX_DEPTH, n_estimators=N_ESTIMATORS, learning_rate = LEARNING_RATE)
results_list = list()
parameters = ParameterGrid(param_grid)

for g in parameters:
    #clf = XGBClassifier(random_state=0, n_jobs=-1, early_stopping_rounds=10, eval_metric="auc", eval_set=eval_set)
    clf = XGBClassifier(random_state=42, n_jobs=-1)
    clf.set_params(**g)
    print "parameters: ", clf.get_params()
    clf.fit(X_train, y_train)

    # http://xgboost.readthedocs.io/en/latest/python/python_intro.html
    y_pred = clf.predict_proba(X_devset)
    y_pred_class = clf.predict(X_devset)

    if clf.classes_[1] == 1:
        y_pred_prob = clf.predict_proba(X_devset)[:, 1]
    else:
        y_pred_prob = clf.predict_proba(X_devset)[:, 0]

    cmatrix = confusion_matrix(y_devset, y_pred_class, labels=[1,0])
    print(cmatrix)

    F5 = fbeta_score(y_devset, y_pred_class, beta=0.5, labels=[1,0])
    # create the pipeline
    pipeline = Pipeline(steps=[('data.pecarn.preprocess',
                                pecarn.make_preprocess_pipeline()
                                ), ('feature_selection',
                                    feature_selector), ('xgboost', clf)])

    # save the feature names for use later in prediction
    pipeline.input_features = X[0:0]

    # neptune initialization - NEPTUNE_API_TOKEN and NEPTUNE_PROJECT environment variables must be set
    neptune.init()

    # create a neptune experiment to log to
    with neptune.create_experiment(
            name='xgboost.XGBClassifier_selected_features',
            params=clf.get_params(),
            upload_source_files=[__file__, 'src/data/pecarn/*.py'],
            send_hardware_metrics=False) as exp:

        # train the classifier
        pipeline.fit(X_train, y_train)

        # calculate scores on train set
        y_train_pred = pipeline.predict(X_train)
        train_scores = {
            'accuracy': accuracy_score(y_train, y_train_pred),
            'f1': f1_score(y_train, y_train_pred),
            'f1_weighted': f1_score(y_train, y_train_pred, average='weighted'),
            'avg_precision': average_precision_score(y_train, y_train_pred)
        }
RFC_clf_report, RFC_clf_acc_score, RFC_f1score, RFC_y_pred = modeling(RFC,X_train_resampled,y_train_resampled,X_test,y_test)
svc_clf_report, svc_clf_acc_score, svc_f1score,svc_y_pred = modeling(svc,X_train_resampled,y_train_resampled,X_test,y_test)

results = {'Classifier': ["XGB Classifier", "Gradient Boosting Classifier", "Random Forest Classifier", "SVC"],
           'Accuracy': [str(XGB_clf_acc_score)[:5], str(GBC_clf_acc_score)[:5], str(RFC_clf_acc_score)[:5], str(svc_clf_acc_score)[:5]],
           'F1_macro': [str(XGB_f1score)[:5], str(GBC_f1score)[:5], str(RFC_f1score)[:5], str(svc_f1score)[:5]]}

score_report_df =pd.DataFrame(data=results,columns=["Classifier","Accuracy","F1_macro"])
print("Base models with upsampled train datasets,  holdout method (8:2) validation")
print(score_report_df)


# Tuned hyperparameters WITH the resampled dataset

# Print baseline hyperparameters
pprint(XGB.get_params())
# Pass tuned hyperparameters
# when you set objective="multi:softmax", num_class=3 (func, non func, need repair) needs to be set manually!
# get_params().keys() does NOT SHOW the param!
XGB_t= XGBClassifier(colsample_bylevel=0.5,max_depth=10,objective="multi:softmax",num_class=3)
pprint(XGB_t.get_params())

XGB_t_clf_report_r, XGB_t_clf_acc_score_r, XGB_t_f1score_r, XGB_t_y_pred_r= modeling(XGB_t,X_train_resampled,y_train_resampled,X_test,y_test)
print(XGB_t_clf_report_r)

# Confusion matrix visualisation
XGB_t_confusion_matrix_ =confusion_matrix(y_test,XGB_t_y_pred_r)
class_names = ["Func","Need Repair","Non Func"]
fig,ax =plot_confusion_matrix(conf_mat = XGB_t_confusion_matrix_,colorbar = True,
                             show_absolute=False, show_normed=True,
                             class_names = class_names)
Example #22
class XGBoostClassifier(AbstractSKLearnClassifier):

    def __init__(self):
        AbstractSKLearnClassifier.__init__(self)
        self.model = False

    def set_label_encoder(self, labels):
        AbstractSKLearnClassifier.set_label_encoder(self, labels)

    def return_label_encoding(self, labels):
        return AbstractSKLearnClassifier.return_label_encoding(self, labels)

    def train_classifier(self, trainvectors, labels, 
        booster='gbtree', silent='1', learning_rate='0.1', min_child_weight='1', max_depth='6', 
        gamma='0', max_delta_step='0', subsample='1', colsample_bytree='1', reg_lambda='1', reg_alpha='0', 
        scale_pos_weight='1',objective='binary:logistic', seed='7', n_estimators='100',jobs='12',
        iterations='50',scoring='roc_auc',v=2):
        # prepare grid search
        if len(list(set(labels))) > 2: # more than two classes to distinguish
            parameters = ['estimator__n_estimators','estimator__min_child_weight', 'estimator__max_depth', 'estimator__gamma', 'estimator__subsample','estimator__colsample_bytree','estimator__reg_alpha','estimator__reg_lambda','estimator__scale_pos_weight']
            multi = True
        else: # only two classes to distinguish
            parameters = ['n_estimators','min_child_weight', 'max_depth', 'gamma', 'subsample','colsample_bytree','reg_alpha', 'reg_lambda', 'scale_pos_weight'] 
            multi = False
        silent = int(silent)
        nthread=int(jobs)
        seed=int(seed)
        iterations=int(iterations)
        learning_rate = float(learning_rate)
        max_delta_step = float(max_delta_step)
        reg_lambda_values = [i/10 for i in range(0,5)] if reg_lambda == 'search' else [float(x) for x in reg_lambda.split()]
        n_estimators_values = list(range(100,1000,100)) if n_estimators == 'search' else [int(x) for x in n_estimators.split()]
        min_child_weight_values = list(range(1,6,1)) if min_child_weight == 'search' else [int(x) for x in min_child_weight.split()]
        max_depth_values = list(range(3,10,1)) if max_depth == 'search' else [int(x) for x in max_depth.split()]
        gamma_values = [i/10 for i in range(0,5)] if gamma == 'search' else [float(x) for x in gamma.split()]
        subsample_values = [i/10 for i in range(6,10)] if subsample == 'search' else [float(x) for x in subsample.split()]
        colsample_bytree_values = [i/10 for i in range(6,10)] if colsample_bytree == 'search' else [float(x) for x in colsample_bytree.split()]
        reg_alpha_values = [1e-5,1e-2,0.1,1,100] if reg_alpha == 'search' else [float(x) for x in reg_alpha.split()]
        scale_pos_weight_values = [1,3,5,7,9] if scale_pos_weight == 'search' else [int(x) for x in scale_pos_weight.split()]
        grid_values = [n_estimators_values,min_child_weight_values, max_depth_values, gamma_values, subsample_values, colsample_bytree_values, reg_alpha_values, reg_lambda_values, scale_pos_weight_values]
        if False not in [len(x) == 1 for x in grid_values]:  # only single parameter settings
            settings = {}
            for i, parameter in enumerate(parameters):
                settings[parameter] = grid_values[i][0]
        else:
            param_grid = {}
            for i, parameter in enumerate(parameters):
                param_grid[parameter] = grid_values[i]
            model = XGBClassifier(silent=silent,nthread=nthread,learning_rate=learning_rate,max_delta_step=max_delta_step)
            if multi:
                model = OutputCodeClassifier(model)
                trainvectors = trainvectors.todense()
            if [len(x) > 1 for x in grid_values].count(True) <= 3: # exhaustive grid search with one to three variant parameters
                paramsearch = GridSearchCV(model, param_grid, verbose=v, scoring=scoring, cv=5, n_jobs=1)
            else: # random grid search
                paramsearch = RandomizedSearchCV(model, param_grid, verbose=v, n_iter=iterations, scoring=scoring, cv=5, n_jobs=1)
            paramsearch.fit(trainvectors, labels)
            settings = paramsearch.best_params_
        self.model = XGBClassifier(
            learning_rate = learning_rate, 
            max_delta_step = max_delta_step, 
            silent = silent,
            nthread = nthread,
            n_estimators = settings[parameters[0]], 
            min_child_weight = settings[parameters[1]], 
            max_depth = settings[parameters[2]],
            gamma = settings[parameters[3]],
            subsample = settings[parameters[4]],
            colsample_bytree = settings[parameters[5]],
            reg_alpha = settings[parameters[6]],
            reg_lambda = settings[parameters[7]], 
            scale_pos_weight = settings[parameters[8]],
            verbose = v
        )
        self.model.fit(trainvectors, labels)

    def return_classifier(self):
        return self.model

    def return_feature_importance(self,vocab=False):
        feature_importance = []
        if vocab:
            for i,val in enumerate(self.model.feature_importances_.T.tolist()):
                feature_importance.append([vocab[i],val])
        else:
            for i,val in enumerate(self.model.coef_.T.tolist()):
                feature_importance.append([str(i),val])
        sorted_feature_importance = sorted(feature_importance,key = lambda k : k[1],reverse=True)
        sorted_feature_importance_str = '\n'.join(['\t'.join([str(x) for x in row]) for row in sorted_feature_importance])
        return sorted_feature_importance_str
    
    def return_parameter_settings(self):
        parameter_settings = []
        for param in ['n_estimators','min_child_weight', 'max_depth', 'gamma', 'subsample','colsample_bytree',
            'reg_alpha', 'scale_pos_weight','learning_rate','max_delta_step','reg_lambda']:
            parameter_settings.append([param,str(self.model.get_params()[param])])
        return '\n'.join([': '.join(x) for x in parameter_settings])

    def return_model_insights(self,vocab=False):
        model_insights = [['feature_importance.txt',self.return_feature_importance(vocab)],['parameter_settings.txt',self.return_parameter_settings()]]
        return model_insights
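
# Sketch of the 'search'-string convention train_classifier uses for its
# hyperparameters (an illustrative helper, not part of the original class):
# 'search' expands to a default grid, otherwise the space-separated string is
# parsed into concrete values.
def parse_grid(raw, default_grid, cast=float):
    return default_grid if raw == 'search' else [cast(x) for x in raw.split()]

print(parse_grid('search', [i / 10 for i in range(0, 5)]))    # default gamma grid
print(parse_grid('0.1 0.3', [i / 10 for i in range(0, 5)]))   # explicit values
print(parse_grid('100 300', list(range(100, 1000, 100)), cast=int))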
Example #23
xgb1 = XGBClassifier(colsample_bylevel=0.7,

                    learning_rate=0.01,  # learning rate: step size of each boosting update; smaller values train more slowly. Default 0.3, typical 0.01-0.2.
                    n_estimators=1000000,  # total number of boosting rounds (trees); a huge value is fine because cv returns a suitable n_estimators
                    max_depth=5,  # tree depth; default 6, typical 3-10
                    min_child_weight=2,  # larger values tend to underfit, smaller values to overfit (larger values keep the model from fitting very local samples). Default 1
                    gamma=0,  # regularization term: minimum loss reduction required to make a split
                    objective='multi:softprob',
                    )

if useTrainCV:
    xgb_param = xgb1.get_xgb_params()

    xgtrain = xgb.DMatrix(X_train, label=y_train)

    cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=xgb1.get_params()['n_estimators'], folds=cv_folds,
                      metrics='mlogloss', early_stopping_rounds=early_stopping_rounds)

    n_estimators = cvresult.shape[0]
    xgb1.set_params(n_estimators=n_estimators)
    # print(cvresult)
# Fit the algorithm on the data
xgb1.fit(X_train, y_train, eval_metric='mlogloss')
# Predict training set:
train_predprob = xgb1.predict_proba(X_train)
logloss = metrics.log_loss(y_train, train_predprob)

# Predict training set:
print("logloss of train :%.4f" % logloss)
y_pred = np.array(xgb1.predict(X_test))
predictions = [round(value) for value in y_pred]
Example #24
clf = XGBClassifier(nthread = 4,
					min_child_weight = 1, 
					subsample= 0.8, 
					seed = 1337, 
					objective= 'multi:softprob', 
					max_depth = 7, 
					gamma= .2)

# use the xgb interface
xgb_param = clf.get_xgb_params()
xgb_param['num_class'] = 5
xgb_param['eval_metric'] = 'mlogloss'
Xg_train = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
cvresult = xgb.cv(xgb_param, 
				  Xg_train, 
 				  num_boost_round = clf.get_params()['n_estimators'],
 				  nfold = 5,
 				  show_progress = True,
				  early_stopping_rounds = 100)
clf.set_params(n_estimators=cvresult.shape[0])
clf.fit(X_train, y_train)
best_outcome_params = clf.get_params()
best_outcome_score = cvresult.min()

try:
	# predict the outcome probabilities
	y_pred = grid.predict_proba(X_test)
except:
	# predict the outcome probabilities
	y_pred = clf.predict_proba(X_test)
Example #25
df = pd.read_csv("data/tanzania_cleaned_df2.csv")
X = df.iloc[0:59400, 0:110]
y = df[['status_group']]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))
print(df.status_group.value_counts())

# XGB model and its parameters
XGB = XGBClassifier()
pprint(XGB.get_params())

# kfold 5
kf = KFold(n_splits=5, shuffle=False)  # random_state only has an effect when shuffle=True

#
params_xgb = {  #'n_estimators': [100], this is default
    'max_depth': [6, 8, 10],
    #'validate_parameters': [True], this is default
    'min_child_weight': [1, 2, 3],
    'gamma': [0, 0.5],
    'learning_rate': [0.05, 0.1, 0.3, 0.4],
    'colsample_bytree': [1, 0.5]
}

# Scoring ="f1_macro"
Example #26
xgb4 = XGBClassifier(gamma=gamma,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    reg_alpha=reg_alpha,
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=1,
    seed=27 )



ml.xgbfit(xgb4, trn, tst, use_columns, printFeatureImportance=False, target=target)



params4 = xgb4.get_params()
params4['eval_metric'] = 'auc' # 0.954
params4['learning_rate'] = 0.1 # 0.952
params4['grow_policy'] = 'lossguide' 
params4['max_leaves'] = 1400
params4['alpha'] = 4
params4['scale_pos_weight'] = 9


params5 = {'learning_rate': 0.3,
          'tree_method': "auto",
          'grow_policy': "lossguide",
          'max_leaves': 1400,  
          'max_depth': 4, 
          'min_child_weight':1,
          'subsample': 0.9, 
Example #27
def xgb_classifier(X_train,
                   X_test,
                   y_train,
                   y_test,
                   useTrainCV=True,
                   cv_folds=5,
                   early_stopping_rounds=50):
    """
    关于现在这个模型
    准确率 : 0.9995
    AUC : 0.887708
    F1 Score : 0.847584
    ----------------------------------->
    关于现在这个模型
    准确率 : 0.9996
    AUC 得分 (训练集): 0.977480
    F1 Score 得分 (训练集): 0.858209
    ---------------------------------->
    关于现在这个模型
    ['V14', 'V4', 'V17', 'V10', 'V12', 'V20', 'Amount', 'V21', 'V26', 'V28', 'V11', 'V19', 'V8', 'V7', 'V13']
    准确率 : 0.9996
    AUC 得分 (训练集): 0.978563
    F1 Score 得分 (训练集): 0.859259
    ---------------------------------->
    # {'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 3} 0.862920874517388
    # {'colsample_bytree': 1.0, 'gamma': 0.2} 0.871
    # {'gamma': 0.2, 'scale_pos_weight': 1} 0.8702009952422571
    # {'subsample': 0.6} 0.864310306628855
    """
    alg = XGBClassifier(learning_rate=0.1,
                        n_estimators=140,
                        max_depth=5,
                        min_child_weight=3,
                        gamma=0.2,
                        subsample=0.6,
                        colsample_bytree=1.0,
                        objective='binary:logistic',
                        nthread=4,
                        scale_pos_weight=1,
                        seed=27)

    if useTrainCV:
        print("Start Feeding Data")
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values)
        # xgtest = xgb.DMatrix(X_test.values, label=y_test.values)
        cvresult = xgb.cv(xgb_param,
                          xgtrain,
                          num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds,
                          early_stopping_rounds=early_stopping_rounds)
        alg.set_params(n_estimators=cvresult.shape[0])

    # Build the model
    print('Start Training')
    alg.fit(X_train, y_train, eval_metric='auc')

    # param_test1 = {}
    # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
    #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
    #                                                 colsample_bytree=1.0,
    #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
    #                                                 seed=27),
    #                         param_grid=param_test1,
    #                         scoring='f1',
    #                         n_jobs=4, iid=False, cv=5)
    # gsearch1.fit(X_train, y_train)
    # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

    # Predict on the test set
    print("Start Predicting")
    predictions = alg.predict(X_test)
    pred_proba = alg.predict_proba(X_test)[:, 1]

    # Print some model statistics
    print("\nModel report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y_test, predictions))
    print("AUC score (test set): %f" % metrics.roc_auc_score(y_test, pred_proba))
    print("F1 score (test set): %f" % metrics.f1_score(y_test, predictions))

    feat_imp = alg.feature_importances_
    feat = X_train.columns.tolist()
    # clf.best_estimator_.booster().get_fscore()
    res_df = pd.DataFrame({
        'Features': feat,
        'Importance': feat_imp
    }).sort_values(by='Importance', ascending=False)
    res_df.plot('Features',
                'Importance',
                kind='bar',
                title='Feature Importances')
    plt.ylabel('Feature Importance Score')
    plt.show()
    print(res_df)
    print(res_df["Features"].tolist())
Example #28
def _xgb_classification_train(table,
                              feature_cols,
                              label_col,
                              max_depth=3,
                              learning_rate=0.1,
                              n_estimators=100,
                              silent=True,
                              objective='binary:logistic',
                              booster='gbtree',
                              n_jobs=1,
                              nthread=None,
                              gamma=0,
                              min_child_weight=1,
                              max_delta_step=0,
                              subsample=1,
                              colsample_bytree=1,
                              colsample_bylevel=1,
                              reg_alpha=0,
                              reg_lambda=1,
                              scale_pos_weight=1,
                              base_score=0.5,
                              random_state=None,
                              seed=None,
                              missing=None,
                              importance_type='gain',
                              class_weight=None,
                              eval_set=None,
                              eval_metric=None,
                              early_stopping_rounds=None,
                              verbose=True,
                              xgb_model=None,
                              sample_weight_eval_set=None):
    feature_names, features = check_col_type(table, feature_cols)

    if isinstance(features, list):
        features = np.array(features)

    if random_state is None:
        random_state = randint(-2**31, 2**31 - 1)
    y_train = table[label_col]
    class_labels = sorted(set(y_train))
    if class_weight is None:
        sample_weight = None
    else:
        if len(class_weight) != len(class_labels):
            raise ValueError(
                "Number of class weights should match number of labels.")
        else:
            class_weight = {
                class_labels[i]: class_weight[i]
                for i in range(len(class_labels))
            }
            sample_weight = np.vectorize(_make_sample_weight)(y_train,
                                                              class_weight)

    classifier = XGBClassifier(max_depth=max_depth,
                               learning_rate=learning_rate,
                               n_estimators=n_estimators,
                               silent=silent,
                               objective=objective,
                               booster=booster,
                               n_jobs=n_jobs,
                               nthread=nthread,
                               gamma=gamma,
                               min_child_weight=min_child_weight,
                               max_delta_step=max_delta_step,
                               subsample=subsample,
                               colsample_bytree=colsample_bytree,
                               colsample_bylevel=colsample_bylevel,
                               reg_alpha=reg_alpha,
                               reg_lambda=reg_lambda,
                               scale_pos_weight=scale_pos_weight,
                               base_score=base_score,
                               random_state=random_state,
                               seed=seed,
                               missing=missing,
                               importance_type=importance_type)

    classifier.fit(features, table[label_col], sample_weight, eval_set,
                   eval_metric, early_stopping_rounds, verbose, xgb_model,
                   sample_weight_eval_set)

    # json
    get_param = classifier.get_params()
    feature_importance = classifier.feature_importances_
    #     plt.rcdefaults()
    plot_importance(classifier)
    plt.tight_layout()
    fig_plot_importance = plt2MD(plt)
    plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier)
    #     fig_plot_tree_UT = plt2MD(plt)
    #     plt.clf()
    #     plt.rcParams['figure.dpi'] = figure_dpi
    #     plot_tree(classifier, rankdir='LR')
    #     fig_plot_tree_LR = plt2MD(plt)
    #     plt.rcdefaults()
    #     plt.clf()

    model = _model_dict('xgb_classification_model')
    model['feature_cols'] = feature_cols
    model['label_col'] = label_col
    model['parameters'] = get_param
    model['feature_importance'] = feature_importance
    model['classifier'] = classifier

    # report
    #     get_param_list = []
    #     get_param_list.append(['feature_cols', feature_cols])
    #     get_param_list.append(['label_col', label_col])

    params = dict2MD(get_param)
    #     for key, value in get_param.items():
    #         temp = [key, value]
    #         get_param_list.append(temp)
    #     get_param_df = pd.DataFrame(data=get_param_list, columns=['parameter', 'value'])
    feature_importance_df = pd.DataFrame(data=feature_importance,
                                         index=feature_names).T

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## XGB Classification Train Result
    |
    | ### Plot Feature Importance
    | {fig_importance}
    |
    | ### Normalized Feature Importance Table
    | {table_feature_importance}
    |
    | ### Parameters
    | {list_parameters}
    |
    """.format(fig_importance=fig_plot_importance,
               table_feature_importance=pandasDF2MD(feature_importance_df, 20),
               list_parameters=params)))
    model['_repr_brtc_'] = rb.get()
    feature_importance_table = pd.DataFrame(
        [[feature_names[i], feature_importance[i]]
         for i in range(len(feature_names))],
        columns=['feature_name', 'importance'])
    model['feature_importance_table'] = feature_importance_table
    return {'model': model}
Example #29
def main():

  # Start timer
  t_start = time.time()

  # Command line options
  parser = argparse.ArgumentParser()
  group_model = parser.add_mutually_exclusive_group() 
  group_model.add_argument('-x', '--xgboost', action='store_true', help='Run gradient BDT')
  group_model.add_argument('-n', '--nn', action='store_true', help='Run neural network')
  group_model.add_argument('-p', '--prepare_hdf5', type=str, nargs='?', default='', help='Prepare input datasets for ML and store in HDF5 file; options: "2L2J" or "2L3J+"')
  group_read_dataset = parser.add_mutually_exclusive_group() 
  group_read_dataset.add_argument('-r', '--read_hdf5', action='store_true', help='Read prepared datasets from HDF5 file')
  #group_read_dataset.add_argument('-d', '--direct_read', action='store_true', help='Read unprepared datasets from ROOT file')
  parser.add_argument('-l', '--load_pretrained_model', action='store_true', help='Load pre-trained classifier model, i.e. only run on test data')
  #parser.add_argument('-B', '--N_sig_events', type=lambda x: int(float(x)), default=0, help='Number of signal events to read from the dataset')
  #parser.add_argument('-S', '--N_bkg_events', type=lambda x: int(float(x)), default=0, help='Number of background events to read from the dataset for each class')
  parser.add_argument('-s', '--signal_region', type=str, nargs='?', default='int', help='Choose signal region: low-2J, int-2J, high-2J, low-3J+, int-3J+, high-3J+')
  parser.add_argument('-b', '--balanced', type=int, nargs='?', default=-1, help='Balance dataset for training; 0: oversample signal, 1: undersample background')
  parser.add_argument('-m', '--multiclass', action='store_true', help='Use multiple background classes in addition to the signal class')
  parser.add_argument('-w', '--event_weight', action='store_true', help='Apply event weights during training')
  parser.add_argument('-c', '--class_weight', action='store_true', help='Apply class weights to account for unbalanced dataset')
  parser.add_argument('-t', '--do_train', action='store_true', help='Train the classifier')
  parser.add_argument('-T', '--do_test', action='store_true', help='Test the classifier on data it has not been trained on')
  parser.add_argument('-e', '--train_even', action='store_true', help='Use even run numbers for training and odd run numbers for testing')
  parser.add_argument('-o', '--train_odd', action='store_true', help='Use odd run numbers for training and even run numbers for testing')
  parser.add_argument('-C', '--doCV', action='store_true', help='Perform a k-fold cross-validation on the training set during training')
  parser.add_argument('-O', '--oversample', action='store_true', help='Balance imbalanced dataset using oversampling')
  parser.add_argument('-U', '--undersample', action='store_true', help='Balance imbalanced dataset using undersampling')
  parser.add_argument('--n_nodes', type=int, nargs='?', default=20, help='Number of nodes in each hidden neural network layer')
  parser.add_argument('--n_hidden_layers', type=int, nargs='?', default=1, help='Number of hidden layers in the neural network')
  parser.add_argument('--dropout', type=float, nargs='?', default=0., help='Use dropout regularization on neural network layers to reduce overfitting')
  parser.add_argument('--L1', type=float, nargs='?', default=0., help='Use L1 regularization on neural network weights to reduce overfitting')
  parser.add_argument('--L2', type=float, nargs='?', default=0., help='Use L2 regularization (weights decay) on neural network weights to reduce overfitting')
  parser.add_argument('--lr', type=float, nargs='?', default=0.001, help='Set learning rate for the neural network or BDT optimizer')
  parser.add_argument('--batch_size', type=int, nargs='?', default=32, help='Number of events to use for each weight update')
  parser.add_argument('--epochs', type=lambda x: int(float(x)), nargs='?', default=1, help='Number of passes through the training set')
  parser.add_argument('--max_depth', type=int, nargs='?', default=3, help='Maximum tree depth for BDT')
  parser.add_argument('--n_estimators', type=lambda x: int(float(x)), nargs='?', default=100, help='Number of trees in BDT ensemble')
  parser.add_argument('--gamma', type=float, nargs='?', default=0, help='Minimum loss reduction required to make a further partition on a leaf node of the XGBoost tree')
  parser.add_argument('--min_child_weight', type=float, nargs='?', default=1, help='Minimum sum of instance weight(hessian) needed in a child')
  parser.add_argument('--max_delta_step', type=float, nargs='?', default=0, help='Maximum delta step we allow each tree’s weight estimation to be')
  parser.add_argument('--subsample', type=float, nargs='?', default=1, help='Subsample ratio of the training instance')
  parser.add_argument('--colsample_bytree', type=float, nargs='?', default=1, help='Subsample ratio of columns when constructing each tree')
  parser.add_argument('--colsample_bylevel', type=float, nargs='?', default=1, help='Subsample ratio of columns for each level')
  parser.add_argument('--colsample_bynode', type=float, nargs='?', default=1, help='Subsample ratio of columns for each node')
  parser.add_argument('-G', '--doGridSearchCV', action='store_true', help='Perform a grid search for optimal hyperparameter values using cross-validation')
  parser.add_argument('-V', '--plot_validation_curve', action='store_true', help='Calculate and plot performance score for different values of a chosen hyperparameter')
  parser.add_argument('-L', '--plot_learning_curve', action='store_true', help='Calculate and plot performance score as a function of the number of training events')
  args = parser.parse_args()

  # Set which sample types to prepare HDF5s for
  use_sig = 1
  use_bkg = 1
  use_data = 0

  # Where to put preprocessed datasets
  preproc_dir = 'preprocessed_datasets/'
  preproc_suffix = ''
  if args.prepare_hdf5:
    preproc_suffix = '_group_{}_preprocessed.h5'.format(args.prepare_hdf5)
  elif '2J' in args.signal_region:
    preproc_suffix = '_group_2L2J_preprocessed.h5'
  elif '3J+' in args.signal_region:
    preproc_suffix = '_group_2L3J+_preprocessed.h5'
  filename_sig_low_preprocessed = preproc_dir + 'sig_low' + preproc_suffix
  filename_sig_int_preprocessed = preproc_dir + 'sig_int' + preproc_suffix
  filename_sig_high_preprocessed = preproc_dir + 'sig_high' + preproc_suffix
  filename_sig_preprocessed = filename_sig_low_preprocessed
  filename_bkg_preprocessed = preproc_dir + 'bkg' + preproc_suffix
  filename_data_preprocessed = preproc_dir + 'data' + preproc_suffix

  # Where to put output
  output_dir = 'output/'
  #trained_model_dir = 'trained_models/'
  trained_model_dir = output_dir
  trained_model_xgb_suffix = '2LJets_trained_model.joblib'
  trained_model_nn_suffix = '2LJets_trained_model.h5'

  # Counters
  n_events_read = n_events_kept = 0
  n_events_read_sample = n_events_kept_sample = 0
  n_events_read_sample_type = n_events_kept_sample_type = 0

  if args.xgboost:
    output_dir += 'xgboost/latest/xgb_'
    trained_model_dir += 'xgboost/latest/xgb_'
  elif args.nn:
    output_dir += 'neural_network/latest/nn_'
    trained_model_dir += 'neural_network/latest/nn_'

  if 'low' in args.signal_region:
    output_dir += 'low_'
    trained_model_dir += 'low_'
  elif 'int' in args.signal_region:
    output_dir += 'int_'
    trained_model_dir += 'int_'
  elif 'high' in args.signal_region:
    output_dir += 'high_'
    trained_model_dir += 'high_'

  if args.train_even:
    output_dir += 'trainEven_'
    trained_model_dir += 'trainEven_'
  elif args.train_odd:
    output_dir += 'trainOdd_'
    trained_model_dir += 'trainOdd_'

  if args.xgboost:
    trained_model_path = trained_model_dir + trained_model_xgb_suffix
  elif args.nn:
    trained_model_path = trained_model_dir + trained_model_nn_suffix

  global df_sig_feat, df_bkg_feat, df_data_feat

  l_sig = []
  if use_sig:
    if 'low' in args.signal_region:
      l_sig = d_sig['low']
      filename_sig_preprocessed = filename_sig_low_preprocessed
    elif 'int' in args.signal_region:
    #elif args.signal_region == 'int':
      l_sig = d_sig['int']
      filename_sig_preprocessed = filename_sig_int_preprocessed
    elif 'high' in args.signal_region:
      l_sig = d_sig['high']
      filename_sig_preprocessed = filename_sig_high_preprocessed

    d_sig_infile = {'low': filename_sig_low_preprocessed, 
                    'int': filename_sig_int_preprocessed, 
                    'high': filename_sig_high_preprocessed}

  class Logger(object):
      def __init__(self):
          self.terminal = sys.stdout
          self.log = open(output_dir+".log", "w")

      def write(self, message):
          self.terminal.write(message)
          self.log.write(message)  

      def flush(self):
          #this flush method is needed for python 3 compatibility.
          #this handles the flush command by doing nothing.
          #you might want to specify some extra behavior here.
          pass    

  sys.stdout = Logger()

  if args.prepare_hdf5:
    """Read input dataset in chunks, select features and perform cuts,
    before storing DataFrame in HDF5 file"""
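    # prepareHDF5 (not shown in this snippet) presumably loops over the input ROOT files in
    # chunks of chunk_size events, applies the feature selection and cuts for the chosen
    # region, and appends each sample to the HDF5 store under its own key.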

    # Prepare and store signal dataset
    if use_sig:
      prepareHDF5(filename_sig_low_preprocessed, d_sig['low'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)
      prepareHDF5(filename_sig_int_preprocessed, d_sig['int'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)
      prepareHDF5(filename_sig_high_preprocessed, d_sig['high'], sample_type='sig', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

    # Prepare and store background dataset
    if use_bkg:
      prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e6, n_chunks=None, entrystart=0)
      #prepareHDF5(filename_bkg_preprocessed, l_bkg, sample_type='bkg', selection=args.prepare_hdf5, chunk_size=1e4, n_chunks=1, entrystart=0)

    # Prepare and store real dataset
    if use_data:
      prepareHDF5(filename_data_preprocessed, l_data, sample_type='data', selection=args.prepare_hdf5, chunk_size=1e5, n_chunks=None, entrystart=0)

    return

  elif args.read_hdf5:

    if use_sig:
      # Read in preprocessed signal DataFrame from HDF5 file
      df_sig_feat = pd.DataFrame({})

      for key_sig, value_sig_infile in d_sig_infile.items():
        if key_sig in args.signal_region:
          print("\nReading in file:", value_sig_infile)
          sig_store = pd.HDFStore(value_sig_infile)
          for i_sig in sig_store.keys(): #d_sig[key_sig]:
            if len(df_sig_feat) == 0:
              df_sig_feat = sig_store[i_sig]#.astype('float64')
              df_sig_feat['group'] = i_sig
            else:
              df_sig_sample = sig_store[i_sig]#.astype('float64')
              df_sig_sample['group'] = i_sig
              df_sig_feat = df_sig_feat.append(df_sig_sample)

      if 'mTl3' in df_sig_feat:
        df_sig_feat.drop(columns='mTl3', inplace=True)

      print("\ndf_sig_feat.head():\n", df_sig_feat.head())
      sig_store.close()
      print("Closed store")

    if use_bkg:
      # Read in preprocessed background DataFrame from HDF5 file
      df_bkg_feat = pd.DataFrame({})

      print("\nReading in file:", filename_bkg_preprocessed)
      bkg_store = pd.HDFStore(filename_bkg_preprocessed)
      for i_bkg in bkg_store.keys(): #l_bkg:
        if len(df_bkg_feat) == 0:
          df_bkg_feat = bkg_store[i_bkg]#.astype('float64')
          df_bkg_feat['group'] = i_bkg
        else:
          df_bkg_sample = bkg_store[i_bkg]#.astype('float64')
          df_bkg_sample['group'] = i_bkg
          df_bkg_feat = df_bkg_feat.append(df_bkg_sample)

      if 'mTl3' in df_bkg_feat:
        df_bkg_feat.drop(columns='mTl3', inplace=True)

      print("\ndf_bkg_feat.head():\n", df_bkg_feat.head())
      bkg_store.close()
      print("Closed store")

    if use_data:
      # Read in preprocessed DataFrame of real data from HDF5 file
      data_store = pd.HDFStore(filename_data_preprocessed)
      df_data_feat = data_store['data']
      print("\ndf_data_feat.head():\n", df_data_feat.head())
      data_store.close()
      print("Closed store")

  elif getattr(args, 'direct_read', False):  # the -d/--direct_read option is commented out above, so guard against the missing attribute
    """Read the input dataset for direct use, without reading in chunks
    and storing to output file"""

    print("Not available at the moment")
    return

    #entry_start = 0
    #sig_entry_stop = 1e4
    #bkg_entry_stop = 1e4

    ## import signal dataset
    #df_sig = importOpenData(sample_type="sig", entrystart=entry_start, entrystop=sig_entry_stop)
    #df_sig = shuffle(df_sig)  # shuffle the rows/events
    #df_sig_feat = selectFeatures(df_sig, l_features)
    #df_sig_feat = df_sig_feat*1  # multiplying by 1 to convert booleans to integers
    #df_sig_feat["eventweight"] = getEventWeights(df_sig, l_eventweights)

    ## import background dataset
    #df_bkg = importOpenData(sample_type="bkg", entrystart=entry_start, entrystop=bkg_entry_stop)
    #df_bkg = shuffle(df_bkg)  # shuffle the rows/events
    #df_bkg_feat = selectFeatures(df_bkg, l_features)
    #df_bkg_feat = df_bkg_feat*1  # multiplying by 1 to convert booleans to integers
    #df_bkg_feat["eventweight"] = getEventWeights(df_bkg, l_eventweights)

    ## import data
    ##df_data = importOpenData(sample_type="data", entrystart=entry_start, entrystop=entry_stop)

  if 'low' in args.signal_region:
    print('\nBefore xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head())
    df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] = df_sig_feat.loc[df_sig_feat.DatasetNumber==396210,'eventweight'] * 0.08836675497457203
    print('\nAfter xsec correction: df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"]\n', df_sig_feat.query("DatasetNumber == 396210").loc[:,"eventweight"].head())

  # Preselection cuts
  l_presel = ['met_Sign > 2', 'mt2leplsp_0 > 10']
  #df_sig_feat.query('&'.join(l_presel), inplace=True)

  print("\n======================================")
  print("df_sig_feat.shape =", df_sig_feat.shape)
  print("df_bkg_feat.shape =", df_bkg_feat.shape)
  print("======================================")

  # make array of features
  df_X = pd.concat([df_bkg_feat, df_sig_feat], axis=0)#, sort=False)

  print("\ndf_X.isna().sum().sum()", df_X.isna().sum().sum())

  #print("\ndf_X.dtypes", df_X.dtypes)
  #col_float32 = (df_X.dtypes == 'float32').values
  #df_X.iloc[:, col_float32] = df_X.iloc[:, col_float32].astype('float64')
  #print("\nAfter converting all columns to float64:\ndf_X.dtypes", df_X.dtypes)

  # make array of labels
  y_bkg = np.zeros(len(df_bkg_feat))
  y_sig = np.ones(len(df_sig_feat))
  y = np.concatenate((y_bkg, y_sig), axis=0).astype(int)
  df_X['ylabel'] = y

  if args.multiclass:
    df_X.loc[df_X.group=='Zjets', 'ylabel'] = 2
    df_X.loc[df_X.group=='diboson', 'ylabel'] = 3
    df_X = df_X.query('group=="diboson" | group=="Zjets" | ylabel==1')
    Y = df_X.ylabel
    # encode class values as integers
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    # convert integers to dummy variables (i.e. one hot encoded)
    y_multi = np_utils.to_categorical(encoded_Y)
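    # e.g. after LabelEncoder maps the labels {1: signal, 2: Zjets, 3: diboson} to {0, 1, 2},
    # to_categorical turns them into the one-hot vectors [1,0,0], [0,1,0] and [0,0,1]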

  # Split the dataset in train and test sets
  test_size = 0.5
  seed = 42

  df_X_even = df_X.query("RandomRunNumber % 2 == 0")
  df_X_odd  = df_X.query("RandomRunNumber % 2 == 1")
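  # Splitting on RandomRunNumber parity gives two statistically independent halves:
  # the -e/-o options train on one half and test on the other, so the classifier is
  # never evaluated on events it was trained on.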

  df_X_even = shuffle(df_X_even)
  df_X_odd = shuffle(df_X_odd)

  if args.train_even:
    X_train = df_X_even
    X_test = df_X_odd
  elif args.train_odd:
    X_train = df_X_odd
    X_test = df_X_even

  # Balance dataset by resampling: equal number of signal and background events
  if args.balanced >= 0:
    # Oversample signal
    if args.balanced == 0:
      N_train_sig = len(X_train.query('ylabel==0'))
    # Undersample background
    elif args.balanced == 1:
      N_train_sig = len(X_train.query('ylabel==1'))
    N_train_bkg = N_train_sig
    # Draw balanced training datasets where the number of signal and background events are equal
    X_train_sig = resample(X_train.query('ylabel==1'), replace=True, n_samples=N_train_sig, random_state=42)#, stratify=None)
    X_train_bkg = resample(X_train.query('ylabel==0'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None)
    X_train = pd.concat([X_train_bkg, X_train_sig], axis=0)
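    # resample with replace=True draws events with replacement, so the signal can be
    # oversampled up to the background count (args.balanced == 0) or the background
    # drawn down to the signal count (args.balanced == 1).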

  print("\n---------- After balancing ----------")
  print("args.balanced =", args.balanced)
  print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==1').shape)
  print("X_train.query('ylabel==1').shape =", X_train.query('ylabel==0').shape)
  print("---------------------------------------")

  #X_train_bkg = resample(X_train.query('group==Zjets'), replace=True, n_samples=N_train_bkg, random_state=42)#, stratify=None)
  #X_train = X_train.query('group=="diboson" | ylabel==1')

  # Draw validation set as subsample of test set, for quicker evaluation of validation loss during training
  n_val_samples = int(1e5)  # resample expects an integer n_samples
  X_val = resample(X_test, replace=False, n_samples=n_val_samples, random_state=42, stratify=X_test.ylabel)
  y_val = X_val.ylabel

  y_train = X_train.ylabel
  y_test = X_test.ylabel

  # Making a copy of the DFs with only feature columns
  X_train_feat_only = X_train.copy()
  X_test_feat_only = X_test.copy()
  X_val_feat_only = X_val.copy()
  l_non_features = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel']
  X_train_feat_only.drop(l_non_features, axis=1, inplace=True)
  X_test_feat_only.drop(l_non_features, axis=1, inplace=True)
  X_val_feat_only.drop(l_non_features, axis=1, inplace=True)

  print("\nX_train_feat_only:", X_train_feat_only.columns)
  print("X_test_feat_only:", X_test_feat_only.columns)
  print("X_val_feat_only:", X_val_feat_only.columns)

  print("\nX_train_feat_only:", X_train_feat_only.shape)
  print("X_test_feat_only:", X_test_feat_only.shape)
  print("X_val_feat_only:", X_val_feat_only.shape)

  # Feature scaling
  # Scale all variables to the interval [0,1]
  #scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
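  # StandardScaler standardizes each feature to zero mean and unit variance; it is fit on
  # the training set only, and the same transformation is applied to the test and
  # validation sets to avoid leaking their statistics into the training.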
  scaler = preprocessing.StandardScaler(copy=True, with_mean=True, with_std=True)
  print("\nscaler.fit_transform(X_train_feat_only)")
  X_train_scaled = scaler.fit_transform(X_train_feat_only)
  print("scaler.transform(X_test_feat_only)")
  X_test_scaled = scaler.transform(X_test_feat_only)
  print("scaler.transform(X_val_feat_only)")
  X_val_scaled = scaler.transform(X_val_feat_only)

  
  print("\n\n//////////////////// ML part ////////////////////////")

  global model
  scale_pos_weight = 1
  event_weight = None
  class_weight = None
  class_weight_dict = {}

  if args.event_weight:
    event_weight = X_train.eventweight
    #event_weight = eventweight_train_resampled

  if args.class_weight:
    if args.xgboost:
      # XGBoost: Scale signal events up by a factor n_bkg_train_events / n_sig_train_events
      scale_pos_weight = len(X_train[X_train.ylabel == 0]) / len(X_train[X_train.ylabel == 1]) 
      #scale_pos_weight = 10
    else:
      # scikit-learn: Scale the overrepresented sample (bkg) down and the underrepresented sample (sig) up
      class_weight = "balanced"
  else:
    class_weight = None

  print("\n# bkg train events / # sig train events = {0:d} / {1:d}".format(len(X_train[X_train.ylabel == 0]), len(X_train[X_train.ylabel == 1])))
  print("scale_pos_weight =", scale_pos_weight)

  classes = np.unique(y)
  class_weight_vect = compute_class_weight(class_weight=class_weight, classes=classes, y=y)
  class_weight_dict = {0: class_weight_vect[0], 1: class_weight_vect[1]}
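  # With class_weight='balanced', compute_class_weight assigns each class the weight
  # n_samples / (n_classes * n_samples_in_class), so the rarer class gets the larger
  # weight; with class_weight=None all classes get weight 1.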

  # Initialize variables for storing CV output
  valid_score = test_score = fit_time = score_time = 0
  # Initialize variables for storing validation and learning curve output
  train_scores_vc_mean = train_scores_vc_std = 0
  valid_scores_vc_mean = valid_scores_vc_std = 0
  train_scores_lc_mean = train_scores_lc_std = 0
  valid_scores_lc_mean = valid_scores_lc_std = 0

  # List of training set sizes for plotting of learning curve
  train_sizes = [0.5, 0.75, 1.0]

  # List of parameter values for hyperparameter grid search
  # XGBoost
  max_depth = [5, 6, 8, 10]
  n_estimators = [50, 100, 200, 500, 1000]
  learning_rate = [0.001, 0.01, 0.1, 0.5, 1.0]
  reg_alpha = [0, 0.001, 0.01, 0.1, 1.]
  reg_lambda = [0, 0.001, 0.01, 0.1, 1.]

  d_param_grid_xgb = {'max_depth': max_depth,
                      'n_estimators': n_estimators,
                      'learning_rate': learning_rate,
                      'reg_alpha': reg_alpha,
                      'reg_lambda': reg_lambda
                      }

  # Specify one of the above parameter lists to plot validation curve for
  param_name_xgb = 'max_depth'
  param_range_xgb = d_param_grid_xgb[param_name_xgb]

  # Neural network
  n_hidden_layers = [1, 3, 5, 7, 10]
  n_nodes = [10, 20, 50, 100, 500]
  batch_size = [8, 16, 32, 64, 128]
  epochs = [10, 50, 100, 500, 1000]
  #kernel_regularizer = [l1_l2(l1=1e-6, l2=1e-6), l1_l2(l1=1e-6, l2=1e-5), l1_l2(l1=1e-5, l2=1e-6), l1_l2(l1=1e-5, l2=1e-5)]
  d_param_grid_nn = {'n_hidden_layers': [1] #n_hidden_layers,
                     #'n_nodes': #n_nodes,
                     #'batch_size': batch_size,
                     #'epochs': epochs,
                     #'kernel_regularizer': kernel_regularizer
                    }

  # Specify one of the above parameter lists to plot validation curve for
  param_name_nn = 'n_hidden_layers'
  param_range_nn = d_param_grid_nn[param_name_nn]

  if args.xgboost:
    param_range = param_range_xgb
    param_name = param_name_xgb
  elif args.nn:
    param_range = param_range_nn
    param_name = param_name_nn


  # Run XGBoost BDT
  if args.xgboost:

    if args.multiclass:
      objective = 'multi:softmax'
      eval_metric = 'mlogloss'
    else:
      objective = 'binary:logistic'
      eval_metric = 'logloss'
      #eval_metric = 'auc'

    max_depth = args.max_depth
    lr = args.lr
    n_estimators = args.n_estimators
    gamma = args.gamma
    min_child_weight = args.min_child_weight
    max_delta_step = args.max_delta_step
    subsample = args.subsample
    colsample_bytree = args.colsample_bytree
    colsample_bylevel = args.colsample_bylevel
    colsample_bynode = args.colsample_bynode
    reg_alpha = args.L1
    reg_lambda = args.L2

    if not args.load_pretrained_model:
      model = XGBClassifier(max_depth=max_depth, 
                            learning_rate=lr,
                            n_estimators=n_estimators, 
                            verbosity=1,
                            objective=objective, 
                            n_jobs=-1,
                            gamma=gamma,
                            min_child_weight=min_child_weight,
                            max_delta_step=max_delta_step,
                            subsample=subsample,
                            colsample_bytree=colsample_bytree,
                            colsample_bylevel=colsample_bylevel,
                            colsample_bynode=colsample_bynode,
                            reg_alpha=reg_alpha,  # L1 regularization
                            reg_lambda=reg_lambda, # L2 regularization
                            scale_pos_weight=scale_pos_weight)

      print("\nmodel.get_params()\n", model.get_params())

      if not args.plot_validation_curve and not args.plot_learning_curve:

        if args.doGridSearchCV:
          model = GridSearchCV(model, d_param_grid_xgb, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)
    
        print("\nTraining XGBoost BDT...")

        if args.doCV:

          cv_results = cross_validate(model, X_train_scaled, y_train, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1, return_train_score=True)

          valid_score = cv_results['test_score']
          train_score = cv_results['train_score']
          fit_time = cv_results['fit_time']
          score_time = cv_results['score_time']

        else:
          model.fit(X_train_scaled, y_train, 
                    sample_weight=event_weight, 
                    eval_set=[(X_train_scaled, y_train), (X_val_scaled, y_val)],
                    #eval_set=[(X_val_scaled, y_val)],
                    eval_metric=eval_metric,
                    early_stopping_rounds=20,
                    verbose=True)
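          # early_stopping_rounds=20 halts training once the metric on the last eval_set
          # entry (the validation split) has not improved for 20 consecutive boosting rounds.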

          evals_result = model.evals_result()
          sns.set()
          ax = sns.lineplot(x=range(0, len(evals_result['validation_0'][eval_metric])), y=evals_result['validation_0'][eval_metric], label='Training loss')
          ax = sns.lineplot(x=range(0, len(evals_result['validation_1'][eval_metric])), y=evals_result['validation_1'][eval_metric], label='Validation loss')
          ax.set(xlabel='Epochs', ylabel='Loss')
          plt.show()

        print("\nTraining done!")

        if args.doGridSearchCV:
          joblib.dump(model.best_estimator_, trained_model_path)
        else:
          joblib.dump(model, trained_model_path)
        print("\nSaving the trained XGBoost BDT:", trained_model_path)

    elif args.load_pretrained_model:
      print("\nReading in pre-trained XGBoost BDT:", trained_model_path)
      model = joblib.load(trained_model_path)


  # Run neural network
  elif args.nn:

    n_inputs = X_train_scaled.shape[1]
    n_nodes = args.n_nodes
    n_hidden_layers = args.n_hidden_layers
    dropout_rate = args.dropout
    batch_size = args.batch_size
    epochs = args.epochs
    l1 = args.L1
    l2 = args.L2
    lr = args.lr

    if not args.load_pretrained_model:
      print("\nBuilding and training neural network")

      es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=20)

      model = KerasClassifier(build_fn=create_model,
                              n_inputs=n_inputs,
                              n_hidden_layers=n_hidden_layers,
                              n_nodes=n_nodes,
                              dropout_rate=dropout_rate,
                              l1=l1,
                              l2=l2,
                              lr=lr,
                              batch_size=batch_size, 
                              epochs=epochs, 
                              verbose=1,
                              )
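      # The scikit-learn wrapper calls create_model(n_inputs=..., n_hidden_layers=..., ...)
      # to build and compile a fresh Keras model each time fit() is invoked; batch_size,
      # epochs and verbose are forwarded to the underlying Keras fit().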

      if not args.plot_validation_curve and not args.plot_learning_curve:

        if args.doGridSearchCV:
          param_grid = d_param_grid_nn
          model = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, scoring='roc_auc', n_jobs=-1, verbose=1)

        history = model.fit(X_train_scaled, y_train, 
                            sample_weight=event_weight, 
                            class_weight=class_weight_dict,
                            verbose=1,
                            callbacks=[es],
                            validation_data=(X_val_scaled, y_val)
                            #validation_data=(X_test_scaled, y_test)
                            )

        print("\nmodel.model.summary()\n", model.model.summary())

        if not args.doGridSearchCV:
          d_val_loss = {'Training loss': history.history['loss'], 'Validation loss': history.history['val_loss']}
          df_val_loss = pd.DataFrame(d_val_loss)

          sns.set()
          ax = sns.lineplot(data=df_val_loss)
          ax.set(xlabel='Epochs', ylabel='Loss')
          plt.show()

        if args.doGridSearchCV:
          model.best_estimator_.model.save(trained_model_path)
        else:
          model.model.save(trained_model_path)
        print("\nSaving the trained neural network:", trained_model_path)

    elif args.load_pretrained_model:
      print("\nReading in pre-trained neural network:", trained_model_path)
      model = load_model(trained_model_path)

  if not args.plot_validation_curve and not args.plot_learning_curve:

    # Print results of grid search
    if args.doGridSearchCV:
      print("Best parameters set found on development set:")
      print("")
      print("model.best_params_", model.best_params_)
      print("")
      print("Grid scores on development set:")
      means = model.cv_results_['mean_test_score']
      stds = model.cv_results_['std_test_score']
      for mean, std, params in zip(means, stds, model.cv_results_['params']):
          print("{0:0.3f} (+/-{1:0.03f}) for {2!r}".format(mean, std, params))
      print("")
      df = pd.DataFrame.from_dict(model.cv_results_)
      print("pandas DataFrame of cv results")
      print(df)
      print("")

    # Get predicted signal probabilities for train and test sets
    output_train = model.predict_proba(X_train_scaled)
    output_test = model.predict_proba(X_test_scaled)
    #X_train = X_train.copy()
    #X_test = X_test.copy()

    if args.multiclass:
      output_test = output_test.reshape(output_test.shape[0], 3)
      print("output_train", len(output_train[0]))

      for i_output in range(len(output_train[0])):
        X_train["output"+str(i_output)] = output_train[:,i_output]
        X_test["output"+str(i_output)] = output_test[:,i_output]

    elif output_train.shape[1] == 2:
      print("output_train[:10,1]", output_train[:10,1])
      X_train["output"] = output_train[:,1]
      X_test["output"] = output_test[:,1]

    else:
      X_train["output"] = output_train
      X_test["output"] = output_test


    print("\n\n//////////////////// Plotting part ////////////////////////\n")

    if not args.multiclass:
      print("len(X_train.query('ylabel==0').loc[:,'eventweight'])", len(X_train.query('ylabel==0').loc[:,'eventweight']))
      print("len(X_train.query('ylabel==0').loc[:,'output'])", len(X_train.query('ylabel==0').loc[:,'output']))
      print("X_train.query('ylabel==0').loc[:,'eventweight']", X_train.query("ylabel==0").loc[:,"eventweight"].head())
      print("X_train.query('ylabel==0').loc[:,'output']", X_train.query("ylabel==0").loc[:,"output"].head())

      print("X_train[['eventweight', 'output']].min(): \n", X_train[['eventweight', 'output']].min())
      print("X_train[['eventweight', 'output']].max(): \n", X_train[['eventweight', 'output']].max())
    
    l_X_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg]
    l_ew_train_bkg = [X_train.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg]
    l_X_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').filter(like='output') for i_bkg in l_bkg]
    l_ew_test_bkg = [X_test.query('group=="/bkg/'+i_bkg+'"').loc[:,'eventweight'] for i_bkg in l_bkg]

    l_X_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in l_sig]
    l_ew_train_sig = [X_train.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig]
    l_X_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').filter(like='output') for i_sig in l_sig]
    l_ew_test_sig = [X_test.query('ylabel==1 & group=="/sig/'+i_sig+'"').loc[:,'eventweight'] for i_sig in l_sig]

    d_X_train_bkg = dict(zip(l_bkg, l_X_train_bkg))
    d_ew_train_bkg = dict(zip(l_bkg, l_ew_train_bkg))
    d_X_test_bkg = dict(zip(l_bkg, l_X_test_bkg))
    d_ew_test_bkg = dict(zip(l_bkg, l_ew_test_bkg))

    # Plot unweighted training and test output
    #plt.figure(1)
    #plotTrainTestOutput(d_X_train_bkg, None,
    #                    X_train.query("ylabel==1").loc[:,"output"], None,
    #                    d_X_test_bkg, None,
    #                    X_test.query("ylabel==1").loc[:,"output"], None)
    #plt.savefig(output_dir + 'hist1_train_test_unweighted.pdf')

    # Plot weighted train and test output, with test set multiplied by 2 to match number of events in training set
    plt.figure()
    #for i_output in range(output_train.shape[1]):
    plotTrainTestOutput(d_X_train_bkg, d_ew_train_bkg,
                        X_train.query("ylabel==1").filter(like='output'), X_train.query("ylabel==1").loc[:,"eventweight"],
                        d_X_test_bkg, d_ew_test_bkg,
                        X_test.query("ylabel==1").filter(like='output'), X_test.query("ylabel==1").loc[:,"eventweight"],
                        args.signal_region)
    plt.savefig(output_dir + 'hist_train_test_weighted_comparison.pdf')

    # Plot final signal vs background estimate for test set, scaled to 10.6/fb
    if 'low' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").filter(like='output'),
                          X_test.query("ylabel==1 & (DatasetNumber==392330 | DatasetNumber==396210)").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(200, 100) GeV')
      plt.savefig(output_dir + 'hist_test_392330_396210_C1N2_WZ_2L2J_200_100_weighted.pdf')
    elif 'int' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"output"],
                          X_test.query("ylabel==1 & DatasetNumber==392325").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(500, 200) GeV')
      plt.savefig(output_dir + 'hist_test_392325_C1N2_WZ_2L2J_500_200_weighted.pdf')
    elif 'high' in args.signal_region:
      plt.figure()
      plotFinalTestOutput(d_X_test_bkg,
                          d_ew_test_bkg,
                          X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"output"],
                          X_test.query("ylabel==1 & DatasetNumber==392356").loc[:,"eventweight"],
                          args.signal_region,
                          figure_text='(600, 0) GeV')
      plt.savefig(output_dir + 'hist5_test_392356_C1N2_WZ_2L2J_600_0_weighted.pdf')


    if args.xgboost and not args.doGridSearchCV:
      # Plot feature importance
      print("model.feature_importances_", model.feature_importances_)
      print("np.sum(model.feature_importances_)", np.sum(model.feature_importances_))
      if args.multiclass:
        l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output0', 'output1', 'output2']
      else:
        l_feat_drop = ['DatasetNumber', 'RandomRunNumber', 'eventweight', 'group', 'ylabel', 'output']
      s_feat_importance = pd.Series(model.feature_importances_, index=X_train.drop(l_feat_drop, axis=1).columns)
      print("X_train.drop(l_feat_drop, axis=1).columns\n", X_train.drop(l_feat_drop, axis=1).columns)
      s_feat_importance.sort_values(ascending=False, inplace=True)

      plt.figure()
      sns.set(style="ticks", color_codes=True)
      n_top_feat_importance = 20
      ax = sns.barplot(x=s_feat_importance[:n_top_feat_importance]*100, y=s_feat_importance[:n_top_feat_importance].index)#, palette="Blues_r")
      #ax.set_yticklabels(s_feat_importance.index)
      ax.set(xlabel="Feature importance [%]")
      plt.savefig(output_dir + 'feature_importance.pdf')


    if not args.multiclass:
      # Plot ROC curve
      fpr, tpr, thresholds = metrics.roc_curve(X_test.loc[:,"ylabel"], X_test.loc[:,"output"])
      auc = metrics.roc_auc_score(X_test.loc[:,"ylabel"], X_test.loc[:,"output"])

      plt.figure()
      ax = sns.lineplot(x=tpr, y=1-fpr, estimator=None, label='ROC curve: AUC = %0.2f' % auc)
      plt.plot([1,0], [0,1], linestyle="--")
      ax.set(xlabel="Signal efficiency", ylabel="Background efficiency")
      plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_1minBkgEff.pdf')

      plt.figure()
      ax = sns.lineplot(x=tpr, y=1/(fpr), estimator=None, label='ROC curve: AUC = %0.2f' % auc)
      #plt.plot([0,1], [0,1], linestyle="--")
      ax.set(xlabel="Signal efficiency", ylabel="Background rejection = 1/(1 - bkg eff.)", yscale='log')
      plt.savefig(output_dir + 'ROC_curve_AUC_sigEff_vs_bkgRej.pdf')


    plt.show()


    # Signal significance
    print("\n///////////////// Signal significance /////////////////")

    def significance(cut_string_sig, cut_string_bkg, rel_unc=0.3):
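      # sig_exp and bkg_exp are the expected (event-weighted) signal and background yields
      # passing the cuts; Z_N_exp is the expected number-counting significance from RooStats,
      # assuming a relative background uncertainty rel_unc.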
      sig_exp = np.sum(X_test.query("ylabel == 1 & "+cut_string_sig).loc[:,"eventweight"])
      bkg_exp = np.sum(X_test.query("(ylabel == 0 | ylabel == 2 | ylabel == 3) & "+cut_string_bkg).loc[:,"eventweight"])
      Z_N_exp = RooStats.NumberCountingUtils.BinomialExpZ(sig_exp, bkg_exp, rel_unc)
      return [sig_exp, bkg_exp, Z_N_exp]

    #cut_string_DSID = 'DatasetNumber == {0:d}'.format(dsid)
    if 'low' in args.signal_region: 
      key = '(200, 100)'
      cut_string_DSID = '(DatasetNumber == 392330 | DatasetNumber == 396210)'
    elif 'int' in args.signal_region: 
      key = '(500, 200)'
      cut_string_DSID = 'DatasetNumber == 392325'
    elif 'high' in args.signal_region: 
      key = '(600, 0)'
      cut_string_DSID = 'DatasetNumber == 392356'

    l_cuts = [0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99]
    global cut_optimal
    cut_optimal = 0
    Z_N_optimal = 0
    for cut in l_cuts:

      if args.multiclass:
        cut_string_SR = 'output0 > {:f}'.format(cut)
      else:
        cut_string_SR = 'output > {:f}'.format(cut)
      cut_string_bkg = cut_string_SR
      cut_string_sig = cut_string_SR + " & " + cut_string_DSID
      print('\ncut_string_sig:', cut_string_sig)
      print('cut_string_bkg:', cut_string_bkg)

      [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3)
      print("---", key)
      print("S_exp =", sig_exp)
      print("B_exp =", bkg_exp)
      for i in range(len(l_X_train_bkg)):
        l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg]
        B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"])
        print("  {0}: {1}".format(l_bkg[i], B_exp_i))
      print("Z_N_exp =", Z_N_exp)

      if sig_exp >= 3 and bkg_exp >= 1:
        if Z_N_exp > Z_N_optimal:
          Z_N_optimal = Z_N_exp
          cut_optimal = cut

    # Print the optimal SR values
    if args.multiclass:
      cut_string_SR = 'output0 > {:f}'.format(cut_optimal)
    else:
      cut_string_SR = 'output > {:f}'.format(cut_optimal)
    cut_string_bkg = cut_string_SR
    cut_string_sig = cut_string_SR + " & " + cut_string_DSID
    print('\ncut_string_sig:', cut_string_sig)
    print('cut_string_bkg:', cut_string_bkg)


    [sig_exp, bkg_exp, Z_N_exp] = significance(cut_string_sig, cut_string_bkg, rel_unc=0.3)
    print("---", key)
    print("Optimal cut =", cut_optimal)
    print("S_exp =", sig_exp)
    print("B_exp =", bkg_exp)
    for i in range(len(l_X_train_bkg)):
      l_cut_strings = ['ylabel == 0', 'group == "/bkg/{}"'.format(l_bkg[i]), cut_string_bkg]
      B_exp_i = np.sum(X_test.query('&'.join(l_cut_strings)).loc[:,"eventweight"])
      print("  {0}: {1}".format(l_bkg[i], B_exp_i))
    print("Z_N_exp =", Z_N_exp)



  if args.plot_validation_curve:
    print("\nCalculating validation curve...")
    train_scores, valid_scores = validation_curve(model, X_train_scaled, y_train, 
                                                  param_name=param_name, param_range=param_range,
                                                  cv=3, 
                                                  scoring='roc_auc', 
                                                  n_jobs=-1,
                                                  verbose=11)

    train_scores_vc_mean = np.mean(train_scores, axis=1)
    train_scores_vc_std = np.std(train_scores, axis=1)
    valid_scores_vc_mean = np.mean(valid_scores, axis=1)
    valid_scores_vc_std = np.std(valid_scores, axis=1)
 
    # Plot validation curves
    figF, axsF = plt.subplots()
    # Training score
    axsF.plot( param_range, train_scores_vc_mean, 'o-', label="Training score", color="darkorange", lw=2)
    axsF.fill_between( param_range, train_scores_vc_mean - train_scores_vc_std, train_scores_vc_mean + train_scores_vc_std, alpha=0.2, color="darkorange", lw=2)
    # Test score
    axsF.plot( param_range, valid_scores_vc_mean, 'o-', label="Cross-validation score", color="navy", lw=2)
    axsF.fill_between( param_range, valid_scores_vc_mean - valid_scores_vc_std, valid_scores_vc_mean + valid_scores_vc_std, alpha=0.2, color="navy", lw=2)
    axsF.set_xlabel(param_name)
    axsF.set_ylabel('Score')
    axsF.legend(loc="best")
    axsF.set_title('Validation curves')
    #axsF.set_ylim(0., 1.)
    plt.savefig(output_dir + 'validation_curve_{}.pdf'.format(param_name))
    plt.show()

  if args.plot_learning_curve:
    print("\nCalculating learning curve...")
    train_sizes, train_scores, valid_scores = learning_curve(model, X_train_scaled, y_train, train_sizes=train_sizes,
                                                             cv=3, scoring='roc_auc', n_jobs=1, verbose=3)
    train_scores_lc_mean = np.mean(train_scores, axis=1)
    train_scores_lc_std = np.std(train_scores, axis=1)
    valid_scores_lc_mean = np.mean(valid_scores, axis=1)
    valid_scores_lc_std = np.std(valid_scores, axis=1)

    # Plot learning curves
    figG, axsG = plt.subplots()
    # 68% CL bands
    #if runBDT:
    #elif runNN:
    axsG.fill_between( train_sizes, train_scores_lc_mean - train_scores_lc_std, train_scores_lc_mean + train_scores_lc_std, alpha=0.2, color="r", lw=2)
    axsG.fill_between( train_sizes, valid_scores_lc_mean - valid_scores_lc_std, valid_scores_lc_mean + valid_scores_lc_std, alpha=0.2, color="g", lw=2)
    # Training and validation scores
    axsG.plot( train_sizes, train_scores_lc_mean, 'o-', label="Training score", color="r", lw=2)
    axsG.plot( train_sizes, valid_scores_lc_mean, 'o-', label="Cross-validation score", color="g", lw=2)
    axsG.set_xlabel("Training examples")
    axsG.set_ylabel('Score')
    axsG.legend(loc="best")
    axsG.set_title('Learning curves')
    #axsG.set_ylim(0., 1.)
    plt.savefig(output_dir + 'learning_curve.pdf')
    plt.show()


  # Stop timer
  t_end = time.time()
  print("\nProcess time: {:4.2f} s".format(t_end - t_start))
Example #30
def main():
    import time
    start = time.time()
    with open('./pkl/X.pkl', 'rb') as fh:  # Load data set
        X = dill.load(fh)
    with open('./pkl/y.pkl', 'rb') as fh:
        y = dill.load(fh)
    scaler = Normalizer()
    smote_etomek = SMOTETomek(sampling_strategy='auto')  # 'ratio' was renamed to 'sampling_strategy' in newer imbalanced-learn
    cachedir = mkdtemp()
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    classifier = XGBClassifier()

    # A parameter grid for XGBoost
    params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0, 0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [1, 3, 4, 5, 10],
    }
    pipeline = Pipeline([
        ('scaler', scaler),
        ('smt', smote_etomek),
        ('clf', classifier),
    ],
                        memory=cachedir)
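    # Note: SMOTETomek only provides fit_resample (no transform), so this is assumed to be
    # imblearn.pipeline.Pipeline, which applies the resampler during fit only and leaves
    # test data untouched at prediction time.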
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    sss.get_n_splits(X, y)
    for train_index, test_index in sss.split(X, y):
        #print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[
            test_index]  # make training and test set
        y_train, y_test = y[train_index], y[test_index]

        clf = dasksearchCV(classifier,
                           params,
                           n_jobs=8,
                           cv=3,
                           scoring='roc_auc',
                           refit=True)

        clf.fit(X_train, y_train)
        print(clf.best_params_)
        print(clf.best_score_)
        best_parameters, score = clf.best_params_, clf.best_score_
        print('Raw AUC score:', score)
        for param_name in sorted(best_parameters.keys()):
            print("%s: %r" % (param_name, best_parameters[param_name]))
        classifier = XGBClassifier(**best_parameters, n_jobs=-1)  # note: the pipeline's 'clf' step still refers to the original classifier unless updated via pipeline.set_params(clf=classifier)
        plot_cross_validation(
            cv, X_train, y_train,
            pipeline)  # do 5 fold stratified cross-validation
        clf = pipeline.fit(X_train, y_train)  #

        print(classifier.get_params())
        expected = y_test
        predicted = clf.predict(X_test)  # test performance on test set
        plot_confusion_matrix(confusion_matrix(expected, predicted),
                              classes=["Non-Zika", "Zika"])
    print(time.time() - start)
    from sklearn import metrics
    print("Classification report for classifier %s:\n%s\n" %
          (clf, metrics.classification_report(expected, predicted)))
# The opening line of this snippet is missing in the source; given the calls to
# xgb.get_xgb_params() below, it is assumed to construct an XGBClassifier, e.g.:
xgb = XGBClassifier(learning_rate=0.01,
                    n_estimators=1000,
                    max_depth=5,
                    min_child_weight=1,
                    gamma=0,
                    subsample=0.8,
                    colsample_bytree=0.8,
                    n_jobs=-1,
                    random_state=42
                    )

xgb_param = xgb.get_xgb_params()
xgb_param
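# xgtrain is assumed to be an xgboost.DMatrix built from the training features and labels
# earlier in the notebook (not shown in this excerpt).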

cvresult = xgboost.cv(xgb_param, xgtrain, 
                  num_boost_round=xgb.get_params()['n_estimators'], 
                  nfold=5,
                  metrics='auc', 
                  early_stopping_rounds=50,
                  seed=42
                  )
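# With early stopping, xgboost.cv truncates its output at the best iteration, so
# cvresult has one row per boosting round kept and cvresult.shape[0] is the tuned
# number of trees.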

cvresult.head()

cvresult.shape

xgb_best_param = {'n_estimators': cvresult.shape[0]}
xgb_best_param
# best n_estimators value to be used in the stack model

# update xgb with the optimal n_estimators
Example #32
class XGBoostClassifier(ClassifierBase):
    def __init__(self, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
        super(XGBoostClassifier, self).__init__()
        self.useTrainCV = useTrainCV
        self.cv_folds = cv_folds
        self.early_stopping_rounds = early_stopping_rounds
        self.clf = XGBClassifier(learning_rate=0.1,
                                 n_estimators=140,
                                 max_depth=5,
                                 min_child_weight=3,
                                 gamma=0.2,
                                 subsample=0.6,
                                 colsample_bytree=1.0,
                                 objective='binary:logistic',
                                 n_jobs=6,
                                 scale_pos_weight=1,
                                 seed=27)

    def train(self, X_train, y_train):
        if self.useTrainCV:
            print("Start Feeding Data for Cross Validation")
            xgb_param = self.clf.get_xgb_params()
            xgtrain = xgb.DMatrix(X_train, label=y_train)
            cvresult = xgb.cv(
                xgb_param,
                xgtrain,
                num_boost_round=self.clf.get_params()['n_estimators'],
                nfold=self.cv_folds,
                early_stopping_rounds=self.early_stopping_rounds)
            self.clf.set_params(n_estimators=cvresult.shape[0])  # adopt the CV-selected number of boosting rounds
            # param_test1 = {}
            # gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=140, max_depth=5,
            #                                                 min_child_weight=3, gamma=0.2, subsample=0.8,
            #                                                 colsample_bytree=1.0,
            #                                                 objective='binary:logistic', nthread=4, scale_pos_weight=1,
            #                                                 seed=27),
            #                         param_grid=param_test1,
            #                         scoring='f1',
            #                         n_jobs=4, iid=False, cv=5)
            # gsearch1.fit(X_train, y_train)
            # print(gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_)

        self.clf.fit(X_train, y_train, eval_metric='auc')

    def predict(self, X_test, y_test=None):

        y_pred_proba = self.clf.predict_proba(X_test)[:, 1]
        if not (y_test is None):
            print("Score: ", self.clf.score(X_test, y_test))
            y_pred = self.clf.predict(X_test)
            print("Acc : %.4g" % metrics.accuracy_score(y_test, y_pred))
            print("F1 score is: {}".format(f1_score(y_test, y_pred)))
            print("AUC Score is: {}".format(roc_auc_score(
                y_test, y_pred_proba)))
        return y_pred_proba

    def printFeatureImportance(self, X_train):
        feat_imp = self.clf.feature_importances_
        feat = X_train.columns.tolist()
        #res_df = pd.DataFrame({'Features': feat, 'Importance': feat_imp}).sort_values(by='Importance', ascending=False)
        #res_df.plot('Features', 'Importance', kind='bar', title='Feature Importances')
        #plt.ylabel('Feature Importance Score')
        #plt.show()
        #print(res_df)
        #print(res_df["Features"].tolist())
        print('Importance feats:', feat)

    def save(self, path):
        dump(self.clf, os.path.join(path, 'clf.joblib'))

    def load(self, path):
        self.clf = load(os.path.join(path, 'clf.joblib'))