コード例 #1
0
#Adding additional variables needed
import os
#st + pt/mgg, OR + ptMjj+dR
#additionalCut_names = 'MX,Mjj,CMS_hgg_mass'.split(",")
additionalCut_names = 'MX,CMS_hgg_mass'.split(",")
#st + pt/mgg+pt/mjj+dR
outTag = 'Hggbb/legacy_branch_flattrees/'
#outTag = 'Hggbb'
outDir = os.path.expanduser("/afs/cern.ch/work/i/ivovtin/" + outTag)
if not os.path.exists(outDir):
    os.mkdir(outDir)

bkg_count_df = rpd.read_root(utils.IO.backgroundName[0],
                             "bbggSelectionTree",
                             columns=branch_names + additionalCut_names)
preprocessing.define_process_weight(bkg_count_df, utils.IO.bkgProc[0],
                                    utils.IO.backgroundName[0])

nTot, dictVar = postprocessing.stackFeatures(
    bkg_count_df, branch_names + additionalCut_names)

processPath = os.path.expanduser(
    '/afs/cern.ch/work/i/ivovtin/') + outTag + '/' + utils.IO.backgroundName[
        0].split("/")[len(utils.IO.backgroundName[0].split("/")) - 1].replace(
            "output_", "").replace(".root", "") + "_preselection" + ".root"
postprocessing.saveTree(processPath, dictVar, nTot, Y_pred_bkg)

processPath = os.path.expanduser(
    '/afs/cern.ch/work/i/ivovtin/') + outTag + '/' + utils.IO.backgroundName[
        0].split("/")[len(utils.IO.backgroundName[0].split("/")) - 1].replace(
            "output_", "").replace(".root",
                                   "") + "_preselection_diffNaming" + ".root"
コード例 #2
0
#additionalCut_names = 'noexpand:leadingPhoton_pt/CMS_hgg_mass,noexpand:subleadingPhoton_pt/CMS_hgg_mass,MX'.split(",")
#st + pt/mgg, OR + ptMjj+dR
additionalCut_names = 'MX,Mjj,CMS_hgg_mass'.split(",")
#additionalCut_names = 'MX,CMS_hgg_mass'.split(",")
#outTag = 'Hggbb/legacy_branch_flattrees/reduceTree_st_ptmgg_ptmjj_dR_2018'
outTag = 'Hggbb/legacy_branch_flattrees/reduceTree_rho_rew_2018'
#outTag = 'Hggbb/legacy_branch_flattrees/train_withMjj/reduceTree_st_Mjj_2018_v2'
#outTag = 'Hggbb'
outDir=os.path.expanduser("/afs/cern.ch/work/i/ivovtin/"+outTag)
if not os.path.exists(outDir):
    os.mkdir(outDir)
    
    
#Save Signal
sig_count_df = rpd.read_root(utils.IO.signalName[0],"bbggSelectionTree", columns = branch_names+additionalCut_names)
preprocessing.define_process_weight(sig_count_df,utils.IO.sigProc[0],utils.IO.signalName[0])

#nTot is a multidim vector with all additional variables, dictVar is a dictionary associating a name of the variable
#to a position in the vector
nTot,dictVar = postprocessing.stackFeatures(sig_count_df,branch_names+additionalCut_names)
print "Y_pred"
print Y_pred_sig.shape

processPath=os.path.expanduser('/afs/cern.ch/work/i/ivovtin/')+outTag+'/'+utils.IO.signalName[0].split("/")[len(utils.IO.signalName[0].split("/"))-1].replace("output_","").replace(".root","")+"_preselection"+".root"
postprocessing.saveTree(processPath,dictVar,nTot,Y_pred_sig)

processPath=os.path.expanduser('/afs/cern.ch/work/i/ivovtin/')+outTag+'/'+utils.IO.signalName[0].split("/")[len(utils.IO.signalName[0].split("/"))-1].replace("output_","").replace(".root","")+"_preselection_diffNaming"+".root"
postprocessing.saveTree(processPath,dictVar,nTot,Y_pred_sig,nameTree="reducedTree_sig")


# do gJets not in the loop since they have two samples for one process
コード例 #3
0
def main(options, args):

    addSamples()

    #mva variables, use noexpand for root expressions, it needs this file https://github.com/ibab/root_pandas/blob/master/root_pandas/readwrite.py
    branch_names = 'leadingJet_bDis,subleadingJet_bDis,noexpand:fabs(CosThetaStar_CS),noexpand:fabs(CosTheta_bb),noexpand:fabs(CosTheta_gg)'.split(
        ",")
    branch_names += 'noexpand:diphotonCandidate.Pt()/diHiggsCandidate.M(),noexpand:dijetCandidate.Pt()/diHiggsCandidate.M()'.split(
        ",")
    branch_names += 'customLeadingPhotonIDMVA,customSubLeadingPhotonIDMVA,leadingPhotonSigOverE,subleadingPhotonSigOverE,sigmaMOverMDecorr,PhoJetMinDr'.split(
        ",")
    branch_names = [c.strip() for c in branch_names]
    print "using following variables for MVA: "
    print branch_names

    # no need to shuffle here, we just count events
    preprocessing.set_signals_and_backgrounds("bbggSelectionTree",
                                              branch_names,
                                              shuffle=False)
    X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.set_variables(
        branch_names)

    X_data, y_data, weights_data = preprocessing.set_data(
        "bbggSelectionTree", branch_names)
    X_data, y_data, weights_data = preprocessing.clean_signal_events_single_dataset(
        X_data, y_data, weights_data)

    #bbggTrees have by default signal and CR events, let's be sure that we clean it
    X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig = preprocessing.clean_signal_events(
        X_bkg, y_bkg, weights_bkg, X_sig, y_sig, weights_sig)

    # load the model from disk
    from sklearn.externals import joblib
    loaded_model = joblib.load(
        os.path.expanduser('~/HHbbgg_ETH_devel/Training/output_files/' +
                           options.trainingVersion + '.pkl'))

    bkg = []
    for i in range(0, len(utils.IO.backgroundName) - 1):
        bkg.append(X_bkg[y_bkg == -i - 1])

    #compute the MVA
    if not options.addHHTagger:
        Y_pred_sig = loaded_model.predict_proba(
            X_sig)[:, loaded_model.n_classes_ - 1].astype(np.float64)
        Y_pred_bkg = []
        for i in range(0, len(utils.IO.backgroundName) - 1):
            print str(i)
            Y_pred_bkg.append(
                loaded_model.predict_proba(bkg[i])[:, loaded_model.n_classes_ -
                                                   1].astype(np.float64))

        Y_pred_data = loaded_model.predict_proba(
            X_data)[:, loaded_model.n_classes_ - 1].astype(np.float64)
        print Y_pred_data

    #define MVA cut and additional variables needed
    additionalCut_names = 'noexpand:diphotonCandidate.M(),noexpand:dijetCandidate.M(),MX,isSignal'.split(
        ",")
    #mva output
    if options.addHHTagger:
        additionalCut_names += 'HHTagger2017'.split(",")
    outTag = options.outTag
    outDir = os.path.expanduser("~/HHbbgg_ETH_devel/outfiles/" + outTag)
    if not os.path.exists(outDir):
        os.mkdir(outDir)

    sig_count_df = rpd.read_root(utils.IO.signalName[0],
                                 "bbggSelectionTree",
                                 columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(sig_count_df, utils.IO.sigProc[0],
                                        utils.IO.signalName[0])

    #nTot is a multidim vector with all additional variables, dictVar is a dictionary associating a name of the variable
    #to a position in the vector
    nTot, dictVar = postprocessing.stackFeatures(
        sig_count_df, branch_names + additionalCut_names)
    #apply isSignal cleaning
    nCleaned = nTot[np.where(nTot[:, dictVar['weight']] != 0), :][0]

    processPath = os.path.expanduser(
        '~/HHbbgg_ETH_devel/outfiles/') + outTag + '/' + utils.IO.signalName[
            0].split("/")[len(utils.IO.signalName[0].split("/")) - 1].replace(
                "output_", "").replace(".root", "") + "_preselection" + ".root"

    if not options.addHHTagger:
        postprocessing.saveTree(processPath, dictVar, nCleaned, Y_pred_sig)
    else:
        postprocessing.saveTree(processPath, dictVar, nCleaned)

    processPath = os.path.expanduser(
        '~/HHbbgg_ETH_devel/outfiles/') + outTag + '/' + utils.IO.signalName[
            0].split("/")[len(utils.IO.signalName[0].split("/")) - 1].replace(
                "output_", "").replace(
                    ".root", "") + "_preselection_diffNaming" + ".root"

    if not options.addHHTagger:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                Y_pred_sig,
                                nameTree="reducedTree_sig")
    else:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                nameTree="reducedTree_sig")

    ## do gJets not in the loop since they have two samples for one process, to be fixed
    bkg_1_count_df = rpd.read_root(utils.IO.backgroundName[1],
                                   "bbggSelectionTree",
                                   columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(bkg_1_count_df, utils.IO.bkgProc[1],
                                        utils.IO.backgroundName[1])

    crazySF_20 = 25
    nTot, dictVar = postprocessing.stackFeatures(bkg_1_count_df,
                                                 branch_names +
                                                 additionalCut_names,
                                                 SF=crazySF_20)

    print nTot.shape

    bkg_2_count_df = rpd.read_root(utils.IO.backgroundName[2],
                                   "bbggSelectionTree",
                                   columns=branch_names + additionalCut_names)
    preprocessing.define_process_weight(bkg_2_count_df, utils.IO.bkgProc[2],
                                        utils.IO.backgroundName[2])

    crazySF_40 = 3
    nTot_2, dictVar = postprocessing.stackFeatures(bkg_2_count_df,
                                                   branch_names +
                                                   additionalCut_names,
                                                   SF=crazySF_40)

    nTot_3 = np.concatenate((nTot, nTot_2))

    print nTot_3.shape
    nCleaned = nTot_3[np.where(nTot_3[:, dictVar['weight']] != 0), :][0]
    print "nCleaned"
    print nCleaned.shape

    processPath = (os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') +
                   outTag + '/' + utils.IO.backgroundName[1].split("/")[
                       len(utils.IO.backgroundName[1].split("/")) - 1].replace(
                           "output_", "").replace(".root", "") +
                   "_preselection" + ".root").replace("_Pt-20to40", "")
    if not options.addHHTagger:
        postprocessing.saveTree(processPath, dictVar, nCleaned, Y_pred_bkg[1])
    else:
        postprocessing.saveTree(processPath, dictVar, nCleaned)

    processPath = (os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') +
                   outTag + '/' + utils.IO.backgroundName[1].split("/")[
                       len(utils.IO.backgroundName[1].split("/")) - 1].replace(
                           "output_", "").replace(".root", "") +
                   "_preselection_diffNaming" + ".root").replace(
                       "_Pt-20to40", "")

    if not options.addHHTagger:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                Y_pred_bkg[1],
                                nameTree="reducedTree_bkg_2")
    else:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                nameTree="reducedTree_bkg_2")

    for iProcess in range(0, len(utils.IO.backgroundName)):
        ##gJets which are two samples for one process are skipped
        iSample = iProcess
        if iProcess == 1 or iProcess == 2:
            continue
        if iProcess > 2:
            iSample = iProcess - 1

        print "Processing sample: " + str(iProcess)
        bkg_count_df = rpd.read_root(utils.IO.backgroundName[iProcess],
                                     "bbggSelectionTree",
                                     columns=branch_names +
                                     additionalCut_names)
        preprocessing.define_process_weight(bkg_count_df,
                                            utils.IO.bkgProc[iProcess],
                                            utils.IO.backgroundName[iProcess])

        crazySF = 1
        ##scale diphoton + jets
        if iProcess == 0:
            crazySF = 1.45
        nTot, dictVar = postprocessing.stackFeatures(bkg_count_df,
                                                     branch_names +
                                                     additionalCut_names,
                                                     SF=crazySF)

        nCleaned = nTot[np.where(nTot[:, dictVar['weight']] != 0), :][0]
        print "nCleaned"
        print nCleaned.shape

        processPath = os.path.expanduser(
            '~/HHbbgg_ETH_devel/outfiles/'
        ) + outTag + '/' + utils.IO.backgroundName[iProcess].split("/")[
            len(utils.IO.backgroundName[7].split("/")) - 1].replace(
                "output_", "").replace(".root", "") + "_preselection" + ".root"
        if not options.addHHTagger:
            postprocessing.saveTree(processPath, dictVar, nCleaned,
                                    Y_pred_bkg[iSample])
        else:
            postprocessing.saveTree(processPath, dictVar, nCleaned)

        processPath = os.path.expanduser(
            '~/HHbbgg_ETH_devel/outfiles/'
        ) + outTag + '/' + utils.IO.backgroundName[iProcess].split("/")[
            len(utils.IO.backgroundName[7].split("/")) - 1].replace(
                "output_", "").replace(
                    ".root", "") + "_preselection_diffNaming" + ".root"
        if "GluGluToHHTo2B2G_node_" in processPath:
            treeName = "reducedTree_sig_node_" + str(iProcess - 6)
        else:
            treeName = "reducedTree_bkg_" + str(iProcess)
        if not options.addHHTagger:
            postprocessing.saveTree(processPath,
                                    dictVar,
                                    nCleaned,
                                    Y_pred_bkg[iSample],
                                    nameTree=treeName)
        else:
            postprocessing.saveTree(processPath,
                                    dictVar,
                                    nCleaned,
                                    nameTree=treeName)

    ##data
    data_count_df = rpd.read_root(utils.IO.dataName[0],
                                  "bbggSelectionTree",
                                  columns=branch_names + additionalCut_names)

    nTot, dictVar = postprocessing.stackFeatures(data_count_df,
                                                 branch_names +
                                                 additionalCut_names,
                                                 isData=1)

    #apply isSignal cleaning
    nCleaned = nTot[np.where(nTot[:, dictVar['weight']] != 0), :][0]
    print "nCleaned"
    print nCleaned.shape

    #save preselection data
    processPath = os.path.expanduser(
        '~/HHbbgg_ETH_devel/outfiles/') + outTag + '/' + utils.IO.dataName[
            0].split("/")[len(utils.IO.dataName[0].split("/")) - 1].replace(
                "output_", "").replace(".root", "") + "_preselection" + ".root"
    if not options.addHHTagger:
        postprocessing.saveTree(processPath, dictVar, nCleaned, Y_pred_data)
    else:
        postprocessing.saveTree(processPath, dictVar, nCleaned)

    processPath = os.path.expanduser(
        '~/HHbbgg_ETH_devel/outfiles/') + outTag + '/' + utils.IO.dataName[
            0].split("/")[len(utils.IO.dataName[0].split("/")) - 1].replace(
                "output_", "").replace(
                    ".root", "") + "_preselection_diffNaming" + ".root"
    if not options.addHHTagger:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                Y_pred_data,
                                nameTree="reducedTree_bkg")
    else:
        postprocessing.saveTree(processPath,
                                dictVar,
                                nCleaned,
                                nameTree="reducedTree_bkg")

    os.system('hadd ' + os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') +
              outTag + '/' + 'Total_preselection_diffNaming.root ' +
              os.path.expanduser('~/HHbbgg_ETH_devel/outfiles/') + outTag +
              '/' + '*diffNaming.root')