Example #1
def mainXGB(options):

  import xgboost as xgb
  from glob import glob
  from taggerOptions import StandardVariables, getJetVarNames

  print "PROCESSING TRAINING DATA"

  #get variables 
  globalVars, jetVars = StandardVariables(options.variables)
  allVars = globalVars + getJetVarNames(jetVars)

  # Import data
  dg = DataGetter(allVars)
  dataFiles = []
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5")
  trainData = dg.importData(samplesToRun = tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)
  # Alternative: train on a single input file
  #trainData = dg.importData(samplesToRun = tuple(glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[0].h5")), prescale=True, ptReweight=options.ptReweight)

  print "TRAINING XGB"

  # Pack the training data into an xgboost DMatrix and train the gradient-boosted classifier
  xgData = xgb.DMatrix(trainData["data"], label=trainData["labels"][:,0])#, weight=trainData["weights"][:,0])
  param = {'max_depth':6, 'eta':0.03, 'objective':'binary:logistic', 'eval_metric':['error', 'auc', 'logloss'], 'nthread':28 }
  gbm = xgb.train(param, xgData, num_boost_round=2000)
  
  #Dump output from training
  gbm.save_model(options.directory + "/" + 'TrainingModel.xgb')
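
Note that the eval_metric list in param is only reported when xgb.train is given an evals watchlist. A minimal sketch of that variant, assuming the xgData and param built in the function above (the verbose_eval interval is an illustrative choice):

# Hypothetical variant: report error/auc/logloss on the training set every 50 rounds
watchlist = [(xgData, 'train')]
gbm = xgb.train(param, xgData, num_boost_round=2000, evals=watchlist, verbose_eval=50)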
Example #2
def mainXGB(options):

  import xgboost as xgb
  from glob import glob
  from taggerOptions import StandardVariables, getJetVarNames

  print "PROCESSING TRAINING DATA"

  #get variables 
  globalVars, jetVars = StandardVariables(options.variables)
  allVars = globalVars + getJetVarNames(jetVars)

  # Import data
  dg = DataGetter(allVars)
  dataFiles = []
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5")
  trainData = dg.importData(samplesToRun = tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)
  # Alternative: train on a single input file
  #trainData = dg.importData(samplesToRun = tuple(glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[0].h5")), prescale=True, ptReweight=options.ptReweight)

  print "TRAINING XGB"

  # Pack the training data into an xgboost DMatrix and train the gradient-boosted classifier
  xgData = xgb.DMatrix(trainData["data"], label=trainData["labels"][:,0])#, weight=trainData["weights"][:,0])
  param = {'max_depth':6, 'eta':0.03, 'objective':'binary:logistic', 'eval_metric':['error', 'auc', 'logloss'], 'nthread':28 }
  gbm = xgb.train(param, xgData, num_boost_round=2000)
  
  #Dump output from training
  gbm.save_model(options.directory + "/" + 'TrainingModel.xgb')
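
To use the saved model later, it can be reloaded into a fresh Booster. A minimal sketch, assuming the same options object and a feature array X with the same columns as trainData["data"]:

import xgboost as xgb

# Hypothetical inference snippet: reload the dumped model and score new events
booster = xgb.Booster()
booster.load_model(options.directory + "/" + 'TrainingModel.xgb')
scores = booster.predict(xgb.DMatrix(X))  # per-event signal probability under binary:logistic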
Example #3
def mainSKL(options):

    from sklearn.ensemble import RandomForestClassifier
    from glob import glob
    import pickle

    print "PROCESSING TRAINING DATA"

    from taggerOptions import StandardVariables, getJetVarNames

    #get variables
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)

    # Import data
    dg = DataGetter(allVars)
    trainData = dg.importData(samplesToRun=tuple(
        glob(
            options.dataFilePath +
            "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_0.h5"
        )),
                              prescale=True,
                              ptReweight=options.ptReweight)

    # Create random forest
    clf = RandomForestClassifier(n_estimators=500,
                                 max_depth=10,
                                 n_jobs=4,
                                 verbose=True)

    print "TRAINING RF"

    # Train random forest
    clf = clf.fit(trainData["data"],
                  trainData["labels"][:, 0],
                  sample_weight=trainData["weights"][:, 0])

    # Dump the trained classifier to disk
    with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
        pickle.dump(clf, fileObject)
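
The pickled forest can be reloaded anywhere scikit-learn is available. A minimal sketch, assuming a validation dictionary valData with the same layout as trainData:

import pickle

# Hypothetical evaluation snippet: reload the forest and take the signal-class probability
with open(options.directory + "/" + "TrainingOutput.pkl", 'rb') as f:
    clf = pickle.load(f)
sigProb = clf.predict_proba(valData["data"])[:, 1]  # column 1 = probability of label 1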
Example #4
def mainSKL(options):

    from sklearn.ensemble import RandomForestClassifier
    from glob import glob
    #  import xgboost as xgb
    import pickle

    print "PROCESSING TRAINING DATA"

    from taggerOptions import StandardVariables, getJetVarNames

    #get variables
    globalVars, jetVars = StandardVariables(options.variables)
    allVars = globalVars + getJetVarNames(jetVars)
    print allVars

    # Import data
    #dgSig = DataGetter.DefinedVariables(allVars, signal = True)
    #dgBg = DataGetter.DefinedVariables(allVars, background = True)
    #
    #validDataSig = [(glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_0_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_0_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_20_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_20_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_40_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_40_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_60_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6p1/trainingTuple_60_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),]
    #
    #validDataBgTTbar = [(glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_20_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_20_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_40_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_40_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_60_division_0_TTbarSingleLepT_training_[01234].h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_60_division_0_TTbarSingleLepTbar_training_[01234].h5", ), 1),]
    #
    #validDataBgQCDMC = [(glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT100to200_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT200to300_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT300to500_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT500to700_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT700to1000_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT1000to1500_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT1500to2000_training_0.h5", ), 1),
    #                    (glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_QCD_HT2000toInf_training_0.h5", ), 1)]
    #
    #validDataBgQCDData = [(glob("/cms/data/pastika/trainData_pt20_30_40_dRPi_tightMass_deepFlavor_v6/trainingTuple_0_division_0_Data_JetHT_2016_training_0.h5", ), 1)]
    #
    #
    #print "Input Variables: ",len(dgSig.getList())
    #
    ## Import data
    ##print options.runOp.validationSamples
    #
    #validDataSig =       getValidData(dgSig, validDataSig,       options)
    #validDataBgTTbar =   getValidData(dgBg,  validDataBgTTbar,   options)
    #validDataBgQCDMC =   getValidData(dgBg,  validDataBgQCDMC,   options)
    #validDataBgQCDData = getValidData(dgBg,  validDataBgQCDData, options)
    #
    #validDataTTbar = combineValidationData(validDataSig, validDataBgTTbar)
    #validDataQCDMC = combineValidationData(validDataSig, validDataBgQCDMC)
    #validDataQCDData = combineValidationData(validDataSig, validDataBgQCDData)

    dg = DataGetter(allVars)
    dataFiles = []
    dataFiles += glob(
        options.dataFilePath +
        "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5"
    )
    dataFiles += glob(
        options.dataFilePath +
        "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5"
    )
    dataFiles += glob(
        options.dataFilePath +
        "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5"
    )
    dataFiles += glob(
        options.dataFilePath +
        "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5"
    )
    print dataFiles
    trainData = dg.importData(samplesToRun=tuple(dataFiles),
                              prescale=True,
                              ptReweight=options.ptReweight)

    # Create random forest
    #clf = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
    #                        gamma=0, learning_rate=0.001, max_delta_step=0, max_depth=6,
    #                        min_child_weight=0.1, missing=None, n_estimators=2000, nthread=28,
    #                        objective='binary:logistic', reg_alpha=0, reg_lambda=0.01,
    #                        scale_pos_weight=1, seed=0, silent=False, subsample=1 )
    clf = RandomForestClassifier(n_estimators=1000,
                                 max_depth=10,
                                 n_jobs=28,
                                 verbose=True)

    print "TRAINING RF"

    # Train random forest
    clf = clf.fit(trainData["data"],
                  trainData["labels"][:, 0],
                  sample_weight=trainData["weights"][:, 0])

    # Dump the trained classifier to disk
    with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
        pickle.dump(clf, fileObject)
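
One advantage of the random forest is its built-in variable ranking. A minimal sketch, assuming the fitted clf and the allVars list from the function above:

# Hypothetical ranking snippet: print input variables ordered by importance
ranked = sorted(zip(clf.feature_importances_, allVars), reverse=True)
for importance, name in ranked:
    print "%-40s %.4f" % (name, importance)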
Example #5
def mainSKL(options):

  from sklearn.ensemble import RandomForestClassifier
  from sklearn.neural_network import MLPClassifier
  from glob import glob
  import numpy
  #import xgboost as xgb
  import pickle

  print "PROCESSING TRAINING DATA"

  from taggerOptions import StandardVariables, getJetVarNames

  #get variables 
  globalVars, jetVars = StandardVariables(options.variables)
  allVars = globalVars + getJetVarNames(jetVars)

  print allVars

  # Import data
  #dg = DataGetter(allVars)
  dgSig = DataGetter.DefinedVariables(allVars, signal = True,  background = False)
  dgBg = DataGetter.DefinedVariables(allVars,  signal = False, background = True)
  dataFiles = []
  dataFiles += glob(options.dataFilePath + "/trainingTuple_*_division_*_rpv_stop_*_training_0.h5")
  dataFiles2 = glob(options.dataFilePath + "/trainingTuple_*_division_0_TT_training_0.h5")

  dataSig = dgSig.importData(samplesToRun = tuple(dataFiles), prescale=True, ptReweight=False)
  dataBg = dgBg.importData(samplesToRun = tuple(dataFiles2), prescale=True, ptReweight=False)
  
  # Truncate both samples to the same length so signal and background are balanced
  minLen = min(len(dataSig["data"]), len(dataBg["data"]))

  # Combine the truncated signal and background arrays key by key
  trainDataArray = [dataSig, dataBg]
  trainData = {}
  for data in trainDataArray:
    for key in data:
      if key in trainData:
        trainData[key] = numpy.vstack([trainData[key], data[key][:minLen]])
      else:
        trainData[key] = data[key][:minLen]


  # Shuffle so signal and background events are interleaved
  perms = numpy.random.permutation(trainData["data"].shape[0])
  for key in trainData:
    trainData[key] = trainData[key][perms]

  # Create the classifier (an MLP here; the random-forest and xgboost setups are kept commented out)
  #clf = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
  #                        gamma=0, learning_rate=0.001, max_delta_step=0, max_depth=6,
  #                        min_child_weight=0.1, missing=None, n_estimators=2000, nthread=28,
  #                        objective='binary:logistic', reg_alpha=0, reg_lambda=0.01,
  #                        scale_pos_weight=1, seed=0, silent=False, subsample=1 )
  #clf = RandomForestClassifier(n_estimators=500, max_depth=10, n_jobs = 28, verbose = True)
  clf = MLPClassifier(hidden_layer_sizes=(20,))

  print "TRAINING RF"
  
  # Train the classifier
  clf = clf.fit(trainData["data"], trainData["labels"][:,0])#, sample_weight=trainData["weights"][:,0])
  
  # Dump the trained classifier to disk
  with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
    pickle.dump(clf, fileObject)
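
Unlike tree ensembles, an MLP is sensitive to the scale of its inputs, so standardizing the features usually helps. A minimal sketch of that variant using a scikit-learn Pipeline, with the layer size carried over from above:

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# Hypothetical variant: zero-mean/unit-variance scaling feeding the network
clf = Pipeline([('scale', StandardScaler()),
                ('mlp', MLPClassifier(hidden_layer_sizes=(20,)))])
clf = clf.fit(trainData["data"], trainData["labels"][:,0])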
Example #6
def mainSKL(options):

  from sklearn.ensemble import RandomForestClassifier
  from glob import glob
  #  import xgboost as xgb
  import pickle

  print "PROCESSING TRAINING DATA"

  from taggerOptions import StandardVariables, getJetVarNames

  #get variables 
  globalVars, jetVars = StandardVariables(options.variables)
  allVars = globalVars + getJetVarNames(jetVars)
  print allVars

  # Import data

  dg = DataGetter(allVars)
  dataFiles = []
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_0_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_20_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_40_division_0_TTbarSingleLepT*_training_[01234].h5")
  dataFiles += glob(options.dataFilePath + "/trainingTuple_TTbarSingleLepT*_60_division_0_TTbarSingleLepT*_training_[01234].h5")
  print dataFiles
  trainData = dg.importData(samplesToRun = tuple(dataFiles), prescale=True, ptReweight=options.ptReweight)

  # Create random forest
  #clf = xgb.XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
  #                        gamma=0, learning_rate=0.001, max_delta_step=0, max_depth=6,
  #                        min_child_weight=0.1, missing=None, n_estimators=2000, nthread=28,
  #                        objective='binary:logistic', reg_alpha=0, reg_lambda=0.01,
  #                        scale_pos_weight=1, seed=0, silent=False, subsample=1 )
  clf = RandomForestClassifier(n_estimators=1000, max_depth=10, n_jobs=28, verbose=True)

  print "TRAINING RF"
  
  # Train random forest 
  clf = clf.fit(trainData["data"], trainData["labels"][:,0], sample_weight=trainData["weights"][:,0])
  
  # Dump the trained classifier to disk
  with open(options.directory + "/" + "TrainingOutput.pkl", 'wb') as fileObject:
    pickle.dump(clf, fileObject)
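
All of these entry points only assume that options carries a handful of attributes. A minimal driver sketch; the attribute names match the code above, but the defaults are placeholders:

import optparse

# Hypothetical driver: attribute names taken from mainSKL/mainXGB above
parser = optparse.OptionParser()
parser.add_option("--dataFilePath", default=".",        help="directory holding the training .h5 files")
parser.add_option("--directory",    default=".",        help="output directory for the trained model")
parser.add_option("--variables",    default="Standard", help="key understood by StandardVariables (placeholder)")
parser.add_option("--ptReweight",   default=False, action="store_true")
options, args = parser.parse_args()

mainSKL(options)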