Example #1
def gen_jet_pair_mass(df):
    gjmass = None
    gjets = df.GenJet
    gleptons = df.GenPart[(abs(df.GenPart.pdgId) == 13)
                          | (abs(df.GenPart.pdgId) == 11)
                          | (abs(df.GenPart.pdgId) == 15)]
    gl_pair = ak.cartesian({
        "jet": gjets,
        "lepton": gleptons
    },
                           axis=1,
                           nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    if ak.count(gjets[isolated], axis=None) > 0:
        # TODO: convert only relevant fields!
        gjet1 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 0],
                                                  ["pt", "eta", "phi", "mass"]]
        gjet2 = ak.to_pandas(gjets[isolated]).loc[pd.IndexSlice[:, 1],
                                                  ["pt", "eta", "phi", "mass"]]
        gjet1.index = gjet1.index.droplevel("subentry")
        gjet2.index = gjet2.index.droplevel("subentry")

        gjsum = p4_sum(gjet1, gjet2)
        gjmass = gjsum.mass
    return gjmass
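
The isolation step above relies on ak.cartesian with nested=True (one sub-list of lepton pairings per jet) followed by ak.all over the innermost axis. Below is a minimal, self-contained sketch of that pattern with toy records; the delta-R is computed inline here rather than with the delta_r helper used above.

import awkward as ak
import numpy as np

jets = ak.Array([[{"eta": 0.1, "phi": 0.0}, {"eta": 2.0, "phi": 1.0}],
                 [{"eta": -1.0, "phi": 0.5}]])
leptons = ak.Array([[{"eta": 0.1, "phi": 0.05}],
                    [{"eta": 3.0, "phi": 0.5}]])

pairs = ak.cartesian({"jet": jets, "lepton": leptons}, axis=1, nested=True)
deta = pairs["jet"].eta - pairs["lepton"].eta
dphi = (pairs["jet"].phi - pairs["lepton"].phi + np.pi) % (2 * np.pi) - np.pi
dr = np.sqrt(deta**2 + dphi**2)

# a jet counts as isolated only if it is far from every lepton in its event
isolated = ak.all(dr > 0.3, axis=-1)
# isolated -> [[False, True], [True]]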
Example #2
def test_count():
    array = ak.Array(
        [
            [
                [np.datetime64("2022"), np.datetime64("2023"), np.datetime64("2025")],
                [],
                [np.datetime64("2027"), np.datetime64("2011")],
                [np.datetime64("2013")],
            ],
            [],
            [[np.datetime64("2017"), np.datetime64("2019")], [np.datetime64("2023")]],
        ],
        check_valid=True,
    )
    assert ak.count(array) == 9
    assert ak.to_list(ak.count(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=2)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-1, keepdims=True)) == [
        [[3], [0], [2], [1]],
        [],
        [[2], [1]],
    ]
    assert ak.to_list(ak.count(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=1)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-2, keepdims=True)) == [
        [[3, 2, 1]],
        [[]],
        [[2, 1]],
    ]
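
A note on the semantics exercised by this test: ak.count tallies the values that are actually present and skips missing (None) entries, whereas ak.num reports raw list lengths. A small sketch of the difference:

import awkward as ak

arr = ak.Array([[1.1, None, 2.2], [], [3.3]])
assert ak.num(arr, axis=1).tolist() == [3, 0, 1]    # list lengths, None included
assert ak.count(arr, axis=1).tolist() == [2, 0, 1]  # only non-missing values are counted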
Example #3
def get_sum_wgts(file):
    try:
        events = NanoEventsFactory.from_root(
            file, "Delphes", schemaclass=DelphesSchema).events()
        # result = (file, ak.sum(events.Event.Weight))
        result = (file, ak.count(events.Event.Number))
    except Exception:
        result = (file, 0)
    return result
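
A hypothetical driver for the helper above (the file names are placeholders, not files from the original project), summing the per-file counts it returns:

files = ["delphes_chunk_0.root", "delphes_chunk_1.root"]
results = [get_sum_wgts(f) for f in files]
total = sum(n for _, n in results)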
Example #4
def test_nested_collection(collection, subcollection, arr_type, element, events):
    assert ak.type(events[collection][subcollection])
    assert ak.type(events[collection][subcollection + "Counts"])
    assert (
        ak.type(events[collection][subcollection])
        .type.type.type.__str__()
        .startswith(arr_type)
    )
    if element is None:
        assert ak.all(
            events[collection][subcollection + "Counts"]
            == ak.count(events[collection][subcollection], axis=-1)
        )
    else:
        assert ak.all(
            events[collection][subcollection + "Counts"]
            == ak.count(events[collection][subcollection][element], axis=-1)
        )
Example #5
 def __iadd__(self, other):
     for branch, branch_data in other.data.items():
         if branch in self.data.keys():
             if ak.count(self.data[branch], axis=None) == 0:
                 self.data[branch] = branch_data
             else:
                 self.data[branch] = ak.concatenate(
                     [self.data[branch], branch_data])
         else:
             self.data[branch] = branch_data
     return self
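
The ak.count(..., axis=None) == 0 test above is a whole-array emptiness check: it ignores the jagged structure and counts every value. A quick illustration of why it works as the "branch not yet filled" condition:

import awkward as ak

empty = ak.Array([[], []])
assert ak.count(empty, axis=None) == 0   # no values at all, so the incoming branch is taken as-is

merged = ak.concatenate([ak.Array([[1.0], [2.0, 3.0]]), ak.Array([[4.0]])])
assert ak.count(merged, axis=None) == 4  # events are appended, values accumulate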
Example #6
 def __iadd__(self, other):
     attrs = [
         a for a in dir(other)
         if not a.startswith('__') and not callable(getattr(other, a))
     ]
     for a in attrs:
         if hasattr(self, a):
             attr = getattr(self, a)
             if ak.count(attr, axis=None) == 0:
                 setattr(self, a, getattr(other, a))
             else:
                 setattr(self, a, ak.concatenate([attr, getattr(other, a)]))
         else:
             setattr(self, a, getattr(other, a))
     return self
Example #7
def PassTrigger(triggerPass):
    indicesOfHighEffTrig = [11, 12, 13, 14, 67, 107, 108, 131, 8, 90, 98, 116]
    tPassedHEList = []
    tPassedList = []
    for evt in triggerPass:
        tPassed = []
        tPassedHE = []
        for tp in range(len(evt)):
            if evt[tp] == 1:
                if tp in indicesOfHighEffTrig:
                    tPassedHE.append(tp)
        tPassedList.append(tPassed)
        tPassedHEList.append(tPassedHE)
    tPassedList = ak.Array(tPassedList)
    tPassedHEList = ak.Array(tPassedHEList)
    return ak.count(tPassedHEList, axis=-1) > 0
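
The per-event Python loop above can usually be written directly on the jagged array. A sketch of an equivalent, vectorised selection, assuming triggerPass is a jagged array of 0/1 flags as in the function above:

import awkward as ak

def pass_trigger_vectorized(triggerPass):
    indicesOfHighEffTrig = [11, 12, 13, 14, 67, 107, 108, 131, 8, 90, 98, 116]
    idx = ak.local_index(triggerPass, axis=1)       # position of each flag within its event
    is_high_eff = idx == indicesOfHighEffTrig[0]
    for i in indicesOfHighEffTrig[1:]:
        is_high_eff = is_high_eff | (idx == i)      # index belongs to a high-efficiency trigger
    fired = (triggerPass == 1) & is_high_eff        # ... and that trigger actually fired
    return ak.any(fired, axis=-1)                   # at least one per event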
Example #8
def mask_inf(var_array, var_name=None, var_inf_counter=None):
    """
    Mask inf values in `var_array` with None. If var_inf_counter is passed, append there inplace for a given `var_name` the fraction of its inf values.

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_inf_counter (optional, default=None): defaultdict(list), stores fraction of inf values for variables

    Returns
        var_array witn masked infs values to None
    """
    if np.sum(np.isinf(var_array)) > 0:
        is_inf_mask = np.isinf(var_array)
        var_array = ak.mask(var_array, is_inf_mask, valid_when=False)
        if var_inf_counter is not None:
            var_inf_counter[var_name].append(
                np.sum(is_inf_mask) / ak.count(var_array))
    return var_array
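
A short usage sketch for the helper above, with toy values; the fraction appended to the counter is the number of inf entries divided by the count of remaining valid values:

import awkward as ak
import numpy as np
from collections import defaultdict

values = ak.Array([[1.0, np.inf], [2.0, 3.0]])
inf_fractions = defaultdict(list)
masked = mask_inf(values, var_name="toy_feature", var_inf_counter=inf_fractions)
# masked        -> [[1.0, None], [2.0, 3.0]]
# inf_fractions -> {"toy_feature": [0.333...]}   (1 inf / 3 remaining values)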
Example #9
def test_reducers():
    # axis=None reducers are implemented in NumPy.
    assert ak.sum(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]])) == 6 + 6j
    assert ak.prod(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]])) == -12 + 12j

    # axis != None reducers are implemented in libawkward; this should be ReducerSum.
    assert ak.sum(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                  axis=1).tolist() == [
                      3 + 3j,
                      0 + 0j,
                      3 + 3j,
                  ]
    # And this is in ReducerProd.
    assert ak.prod(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                   axis=1).tolist() == [
                       0 + 4j,
                       1 + 0j,
                       3 + 3j,
                   ]

    # ReducerCount, ReducerCountNonzero, ReducerAny, and ReducerAll work.
    assert ak.count(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                    axis=1).tolist() == [2, 0, 1]
    assert ak.count_nonzero(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                            axis=1).tolist() == [2, 0, 1]
    assert ak.any(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                  axis=1).tolist() == [
                      True,
                      False,
                      True,
                  ]
    assert ak.all(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]),
                  axis=1).tolist() == [
                      True,
                      True,
                      True,
                  ]
    assert ak.any(ak.from_iter([[1 + 1j, 2 + 2j, 0 + 0j], [], [3 + 3j]]),
                  axis=1).tolist() == [True, False, True]
    assert ak.all(ak.from_iter([[1 + 1j, 2 + 2j, 0 + 0j], [], [3 + 3j]]),
                  axis=1).tolist() == [False, True, True]
Example #10
def lhe_weights(df, output, dataset, year):
    factor2 = ("dy_m105_160_amc" in dataset) and (("2017" in year) or ("2018" in year))
    if factor2:
        lhefactor = 2.0
    else:
        lhefactor = 1.0
    nLHEScaleWeight = ak.count(df.LHEScaleWeight, axis=1)
    lhe_df = pd.DataFrame(
        data=ak.to_numpy(nLHEScaleWeight),
        index=output.index,
        columns=["nLHEScaleWeight"],
    )
    for i in [1, 3, 4, 5, 6, 7, 15, 24, 34]:
        cut = lhe_df.nLHEScaleWeight > i
        cut_ak = nLHEScaleWeight > i
        lhe_df[f"LHE{i}"] = 1.0
        lhe_df.loc[cut, f"LHE{i}"] = ak.to_numpy(df.LHEScaleWeight[cut_ak][:, i])

    cut8 = lhe_df.nLHEScaleWeight > 8
    cut30 = lhe_df.nLHEScaleWeight > 30
    lhe_ren_up = lhe_df.LHE6 * lhefactor
    lhe_ren_up[cut8] = lhe_df.LHE7 * lhefactor
    lhe_ren_up[cut30] = lhe_df.LHE34 * lhefactor
    lhe_ren_down = lhe_df.LHE1 * lhefactor
    lhe_ren_down[cut8] = lhe_df.LHE1 * lhefactor
    lhe_ren_down[cut30] = lhe_df.LHE5 * lhefactor

    lhe_fac_up = lhe_df.LHE4 * lhefactor
    lhe_fac_up[cut8] = lhe_df.LHE5 * lhefactor
    lhe_fac_up[cut30] = lhe_df.LHE24 * lhefactor
    lhe_fac_down = lhe_df.LHE3 * lhefactor
    lhe_fac_down[cut8] = lhe_df.LHE3 * lhefactor
    lhe_fac_down[cut30] = lhe_df.LHE15 * lhefactor

    lhe_ren = {"up": lhe_ren_up, "down": lhe_ren_down}
    lhe_fac = {"up": lhe_fac_up, "down": lhe_fac_down}
    return lhe_ren, lhe_fac
Example #11
def get_nsv(sj, sv, R=0.4):

    sv_dr = sj.delta_r(sv)
    nsv = ak.count(sv_dr[sv_dr < R], axis=1)

    return nsv
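
get_nsv relies on the delta_r behaviour of the (sub)jet and SV collections; the counting step itself is plain ak.count on a masked jagged array, for example:

import awkward as ak

sv_dr = ak.Array([[0.1, 0.6, 0.3], [0.8], []])   # toy per-event delta-R values
nsv = ak.count(sv_dr[sv_dr < 0.4], axis=1)
assert nsv.tolist() == [2, 0, 0]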
Example #12
def future_savez(dataset, currentfile):

    # note: relies on module-level globals (events_slice, options, i) set by the caller
    print('before selection ', len(events_slice))
    # select Muon
    myMuon = events_slice.Muon[:]
    myMuon['istight'] = ((events_slice.Muon.tightId == 1) &
                         (events_slice.Muon.pfRelIso03_all < 0.15) &
                         (events_slice.Muon.pt > 20.))
    events_slice['Muon'] = myMuon[myMuon.istight]
    # select electrons
    myElectron = events_slice.Electron[:]
    myElectron['istight'] = ((events_slice.Electron.mvaFall17V1Iso_WP80 == 1) &
                             (events_slice.Electron.pt > 20.0))
    events_slice['Electron'] = myElectron[myElectron.istight]
    # select events with n tight leptons
    n_tight_leptons = ak.count(
        events_slice.Muon.pt[events_slice.Muon.istight], axis=-1) + ak.count(
            events_slice.Electron.pt[events_slice.Electron.istight], axis=-1)
    # number of leptons can be larger than the required number
    events_selected = events_slice[n_tight_leptons >= options.n_leptons]
    print('after selection ', len(events_selected))

    muons = events_selected.Muon[events_selected.Muon.istight]
    electrons = events_selected.Electron[events_selected.Electron.istight]
    # mix leptons and sort according to pt
    leptons = ak.concatenate([muons, electrons], axis=1)
    leptons = leptons[ak.argsort(leptons.pt, axis=1, ascending=False)]
    leptons = leptons[:, 0:int(options.n_leptons_subtract)]
    # only want the first n_leptons_subtract leptons
    #print('number of leptons ', ak.count(leptons.pt, axis=-1))
    leptons_px = leptons.pt * np.cos(leptons.phi)
    leptons_py = leptons.pt * np.sin(leptons.phi)
    leptons_px = ak.sum(leptons_px, axis=1)
    leptons_py = ak.sum(leptons_py, axis=1)
    met_list = np.column_stack([
        events_selected.GenMET.pt * np.cos(events_selected.GenMET.phi) +
        leptons_px,
        events_selected.GenMET.pt * np.sin(events_selected.GenMET.phi) +
        leptons_py,
        events_selected.MET.pt * np.cos(events_selected.MET.phi) + leptons_px,
        events_selected.MET.pt * np.sin(events_selected.MET.phi) + leptons_py,
        events_selected.PuppiMET.pt * np.cos(events_selected.PuppiMET.phi) +
        leptons_px,
        events_selected.PuppiMET.pt * np.sin(events_selected.PuppiMET.phi) +
        leptons_py, events_selected.DeepMETResponseTune.pt *
        np.cos(events_selected.DeepMETResponseTune.phi) + leptons_px,
        events_selected.DeepMETResponseTune.pt *
        np.sin(events_selected.DeepMETResponseTune.phi) + leptons_py,
        events_selected.DeepMETResolutionTune.pt *
        np.cos(events_selected.DeepMETResolutionTune.phi) + leptons_px,
        events_selected.DeepMETResolutionTune.pt *
        np.sin(events_selected.DeepMETResolutionTune.phi) + leptons_py,
        events_selected.LHE.HT
    ])
    overlap_removal = run_deltar_matching(events_selected.PFCands,
                                          leptons,
                                          drname='deltaR',
                                          radius=0.001,
                                          unique=True,
                                          sort=False)
    # remove the closest PF particle (the one matched to a lepton)
    mask = ak.count(overlap_removal.deltaR, axis=-1) == 0
    #print(len(events_selected.PFCands.pt[0]))
    events_selected['PFCands'] = events_selected.PFCands[mask]
    #print(len(events_selected.PFCands.pt[0]))
    # save the remaining PF candidates

    nparticles_per_event = max(ak.num(events_selected.PFCands.pt, axis=1))
    print("max NPF in this range: ", nparticles_per_event)

    # pad/clip each PF-candidate field to the per-slice maximum length and
    # stack the features into one array (first axis runs over the features)
    pf_fields = [
        "pt", "eta", "phi", "d0", "dz", "mass", "puppiWeight", "pdgId",
        "charge", "fromPV", "pvRef", "pvAssocQuality"
    ]
    particle_list = ak.concatenate([
        [
            ak.fill_none(
                ak.pad_none(events_selected.PFCands[field],
                            nparticles_per_event,
                            clip=True), -999)
        ] for field in pf_fields
    ])
    npz_file = os.environ['PWD'] + '/raw/' + dataset + '_file' + str(
        currentfile) + '_slice_' + str(i) + '_nevent_' + str(
            len(events_selected))
    np.savez(npz_file, x=particle_list, y=met_list)
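
The pad-and-fill pattern used repeatedly above turns ragged per-event lists into fixed-length rows suitable for np.savez; a minimal sketch:

import awkward as ak

pts = ak.Array([[10.0, 8.0, 5.0], [20.0]])
fixed = ak.fill_none(ak.pad_none(pts, 3, clip=True), -999.0)
assert fixed.tolist() == [[10.0, 8.0, 5.0], [20.0, -999.0, -999.0]]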
Example #13
    def process(self, events):
        output = self.accumulator.identity()

        dataset = events.metadata['dataset']
        output['sumw'][dataset] += ak.sum(events.genWeight)
        
        ##############
        # Trigger level
        triggers = [
        "HLT_Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ",
        "HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ",    
        ]
        
        trig_arrs = [events.HLT[_trig.strip("HLT_")] for _trig in triggers]
        req_trig = np.zeros(len(events), dtype='bool')
        for t in trig_arrs:
            req_trig = req_trig | t

        ############
        # Event level
        
        ## Muon cuts
        # muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2
        events.Muon = events.Muon[(events.Muon.pt > 30) & (abs(events.Muon.eta) < 2.4)] # & (events.Muon.tightId > .5)
        events.Muon = ak.pad_none(events.Muon, 1, axis=1)
        req_muon = (ak.count(events.Muon.pt, axis=1) == 1)
        
        ## Electron cuts
        # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2
        events.Electron = events.Electron[(events.Electron.pt > 30) & (abs(events.Electron.eta) < 2.4)]
        events.Electron = ak.pad_none(events.Electron, 1, axis=1) 
        req_ele = (ak.count(events.Electron.pt, axis=1) == 1)
        
        ## Jet cuts
        events.Jet = events.Jet[(events.Jet.pt > 25) & (abs(events.Jet.eta) <= 2.5)]
        req_jets = (ak.count(events.Jet.pt, axis=1) >= 2)    
        
        req_opposite_charge = events.Electron[:, 0].charge * events.Muon[:, 0].charge == -1
        
        event_level = req_trig & req_muon & req_ele & req_opposite_charge & req_jets
        
        # Selected
        selev = events[event_level]    
        
        #########
        
        # Per electron
        el_eta   = (abs(selev.Electron.eta) <= 2.4)
        el_pt    = selev.Electron.pt > 30
        el_level = el_eta & el_pt
        
        # Per muon
        mu_eta   = (abs(selev.Muon.eta) <= 2.4)
        mu_pt    = selev.Muon.pt > 30
        mu_level = mu_eta & mu_pt
        
        # Per jet
        jet_eta    = (abs(selev.Jet.eta) <= 2.4)
        jet_pt     = selev.Jet.pt > 25
        jet_pu     = ( ((selev.Jet.puId > 6) & (selev.Jet.pt < 50)) | (selev.Jet.pt > 50) ) 
        jet_id     = selev.Jet.jetId >= 2 
        #jet_id     = selev.Jet.isTight() == 1 & selev.Jet.isTightLeptonVeto() == 0
        jet_level  = jet_pu & jet_eta & jet_pt & jet_id

        # b-tag twiki : https://twiki.cern.ch/twiki/bin/viewauth/CMS/BtagRecommendation102X
        bjet_disc_t  = selev.Jet.btagDeepB > 0.7264 # L=0.0494, M=0.2770, T=0.7264
        bjet_disc_m  = selev.Jet.btagDeepB > 0.2770 # L=0.0494, M=0.2770, T=0.7264
        bjet_disc_l  = selev.Jet.btagDeepB > 0.0494 # L=0.0494, M=0.2770, T=0.7264
        bjet_level_t = jet_level & bjet_disc_t
        bjet_level_m = jet_level & bjet_disc_m
        bjet_level_l = jet_level & bjet_disc_l
        
        sel    = selev.Electron[el_level]
        smu    = selev.Muon[mu_level]
        sjets  = selev.Jet[jet_level]
        sbjets_t = selev.Jet[bjet_level_t]
        sbjets_m = selev.Jet[bjet_level_m]
        sbjets_l = selev.Jet[bjet_level_l]
        
        # output['pt'].fill(dataset=dataset, pt=selev.Jet.pt.flatten())
        # Fill histograms dynamically  
        for histname, h in output.items():
            if (histname not in self.jet_hists) and (histname not in self.deepcsv_hists): continue
            # Get valid fields per histogram to fill
            fields = {k: ak.flatten(sjets[k], axis=None) for k in h.fields if k in dir(sjets)}
            h.fill(dataset=dataset, **fields)


        def flatten(ar): # flatten awkward into a 1d array to hist
            return ak.flatten(ar, axis=None)

        def num(ar):
            return ak.num(ak.fill_none(ar[~ak.is_none(ar)], 0), axis=0)

        output['njet'].fill(dataset=dataset,  njet=flatten(ak.num(sjets)))
        output['nbjet_t'].fill(dataset=dataset, nbjet_t=flatten(ak.num(sbjets_t)))
        output['nbjet_m'].fill(dataset=dataset, nbjet_m=flatten(ak.num(sbjets_m)))
        output['nbjet_l'].fill(dataset=dataset, nbjet_l=flatten(ak.num(sbjets_l)))
        output['nel'].fill(dataset=dataset,   nel=flatten(ak.num(sel)))
        output['nmu'].fill(dataset=dataset,   nmu=flatten(ak.num(smu)))

        output['lelpt'].fill(dataset=dataset, lelpt=flatten(selev.Electron[:, 0].pt))
        output['lmupt'].fill(dataset=dataset, lmupt=flatten(selev.Muon[:, 0].pt))
        output['ljpt'].fill(dataset=dataset,  ljpt=flatten(selev.Jet[:, 0].pt))
        output['sljpt'].fill(dataset=dataset,  sljpt=flatten(selev.Jet[:, 1].pt))

        return output
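
The object-multiplicity requirements above (req_muon, req_ele, req_jets) are all built the same way: count the surviving objects per event and compare. A toy version of that pattern:

import awkward as ak

muon_pt = ak.Array([[35.0, 31.0], [40.0], []])
exactly_one_muon = ak.count(muon_pt, axis=1) == 1
assert exactly_one_muon.tolist() == [False, True, False]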
Example #14
    "mfv_splitSUSY_tau000000300um_M2000_1900_2017",
    "mfv_splitSUSY_tau000001000um_M2000_1800_2017",
    "mfv_splitSUSY_tau000001000um_M2000_1900_2017",
    "mfv_splitSUSY_tau000010000um_M2000_1800_2017",
    "mfv_splitSUSY_tau000010000um_M2000_1900_2017",
]

ntk_bkg = []
ntk_sig = []
for fn in fns_bkg:
  f = uproot.open(fndir+fn+'.root')
  f = f["mfvJetTreer/tree_DV"]
  if len(f['evt'].array())==0:
    print( "no events!!!")
    continue
  ntk = np.array(ak.count(f['tk_pt'].array(), axis=1))
  ntk_bkg.append(ntk)

for fn in fns_signal:
  f = uproot.open(fndir+fn+'.root')
  f = f["mfvJetTreer/tree_DV"]
  if len(f['evt'].array())==0:
    print( "no events!!!")
    continue
  ntk = np.array(ak.count(f['tk_pt'].array(), axis=1))
  ntk_sig.append(ntk)

ntk_bkg = np.concatenate(ntk_bkg, axis = None)
ntk_sig = np.concatenate(ntk_sig, axis = None)

plt.hist(ntk_bkg,label='background',bins=350, range=(0,350),alpha=0.5,density=True)
Example #15
def test():
    nums = [
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 17 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
        [17, 11] + [8] * 16 + [0] * 36,
        [17, 11] + [8] * 18 + [0] * 36,
    ]
    sample = []
    for outer in nums:
        sample.append([])
        for inner in outer:
            sample[-1].append([0] * inner)

    assert ak.is_valid(ak.count(sample, axis=0))
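
For orientation, ak.count with axis=0 counts across the outer dimension, aligning the inner lists position by position. A much smaller sketch of what the assertion above exercises (the expected value in the comment is worked out by hand, not taken from the original test):

import awkward as ak

small = [[[0, 0], [0]], [[0]]]
print(ak.to_list(ak.count(small, axis=0)))   # expected: [[2, 1], [1]]
assert ak.is_valid(ak.count(small, axis=0))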
Example #16
    def process(self, events):
        output = self.accumulator.identity()

        dataset = events.metadata['dataset']

        isRealData = 'genWeight' not in events.fields
        if not isRealData:
            output['sumw'][dataset] += sum(events.genWeight)
            JECversion = JECversions[str(self.year)]['MC']
        else:
            output['nbtagmu'][dataset] += ak.count(events.event)
            JECversion = JECversions[str(
                self.year)]['Data'][dataset.split('BTagMu')[1]]

        ############
        # Some corrections
        weights = processor.Weights(len(events))
        corrections = {}
        if not isRealData:
            weights.add('genWeight', events.genWeight)
            weights.add(
                'pileup_weight',
                self.puReweight(self.puFile, self.nTrueFile,
                                dataset)(events.Pileup.nPU))

        events.FatJet = self.applyJEC(events.FatJet,
                                      events.fixedGridRhoFastjetAll,
                                      events.caches[0], 'AK8PFPuppi',
                                      isRealData, JECversion)

        cuts = processor.PackedSelection()

        ############
        # Trigger selection
        if self.year == 2016:
            if 'BTagMu_AK4Jet300_Mu5' not in events.HLT.fields:
                self.triggers = [
                    trigger.replace('AK4', '') for trigger in self.triggers
                ]
        elif self.year == 2018:
            for (i, trigger) in enumerate(self.triggers):
                if trigger.strip("HLT_") not in events.HLT.fields:
                    self.triggers[i] = trigger + "_noalgo"

        trig_arrs = [
            events.HLT[_trig.strip("HLT_")] for _trig in self.triggers
        ]
        req_trig = np.zeros(len(events), dtype='bool')
        for t in trig_arrs:
            req_trig = req_trig | t
        cuts.add('trigger', ak.to_numpy(req_trig))

        ############
        # Basic cuts
        ## Muon cuts
        # muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2
        events.Muon = events.Muon[(events.Muon.pt > 5)
                                  & (abs(events.Muon.eta) < 2.4) &
                                  (events.Muon.tightId != 1) &
                                  (events.Muon.pfRelIso04_all > 0.15)]
        events.Muon = ak.pad_none(events.Muon, 2, axis=1)

        ## Jet cuts  (not used)
        events.Jet = events.Jet[(events.Jet.pt > 25)
                                & (abs(events.Jet.eta) <= 2.5)]
        #req_jets = (ak.count(events.Jet.pt, axis=1) >= 2)

        ## FatJet cuts
        events.FatJet = events.FatJet[
            (events.FatJet.pt > self._mask_fatjets['basic']['pt_cut']) &
            (abs(events.FatJet.eta) <= self._mask_fatjets['basic']['eta_cut'])
            & (events.FatJet.jetId > self._mask_fatjets['basic']['jetId_cut'])
            & (ak.count(events.FatJet.subjets.pt, axis=2) >=
               2)]  ## subjet sel to crosscheck
        #print(events['FatJetSVs'])

        ## Event level variables
        eventVariables = {}
        eventVariables['nfatjet'] = ak.num(events.FatJet)

        ## Leading jet variables
        leadfatjet = ak.firsts(events.FatJet)
        leadfatjet['tau21'] = leadfatjet.tau2 / leadfatjet.tau1
        subjet1 = ak.pad_none(leadfatjet.subjets, 2)[:, 0]
        subjet2 = ak.pad_none(leadfatjet.subjets, 2)[:, 1]
        leadfatjet['nsv1'] = get_nsv(subjet1, events.SV)
        leadfatjet['nsv2'] = get_nsv(subjet2, events.SV)
        leadfatjet['nmusj1'] = ak.num(subjet1.delta_r(events.Muon) < 0.4)
        leadfatjet['nmusj2'] = ak.num(subjet2.delta_r(events.Muon) < 0.4)

        fatjet_mutag = (leadfatjet.nmusj1 >= 1) & (leadfatjet.nmusj2 >= 1)
        cuts.add('fatjet_mutag', ak.to_numpy(fatjet_mutag))

        for DDX in self._mask_DDX.keys():
            for wp, cut in self._mask_DDX[DDX].items():
                DDX_pass = (leadfatjet[f'btag{DDX}vLV2'] > cut)
                DDX_fail = (leadfatjet[f'btag{DDX}vLV2'] < cut)
                cuts.add(f'{DDX}_pass{wp}wp', ak.to_numpy(DDX_pass))
                cuts.add(f'{DDX}_fail{wp}wp', ak.to_numpy(DDX_fail))

        flavors = {}
        if not isRealData:
            flavors['b'] = (leadfatjet.hadronFlavour == 5)
            flavors['c'] = (leadfatjet.hadronFlavour == 4)
            flavors['l'] = (leadfatjet.hadronFlavour < 4)
            flavors['bb'] = (abs(leadfatjet.hadronFlavour) == 5) & (
                leadfatjet.nBHadrons >= 2)  # & (leadfatjet.nCHadrons == 0)
            flavors['cc'] = (abs(leadfatjet.hadronFlavour) == 4) & (
                leadfatjet.nBHadrons == 0) & (leadfatjet.nCHadrons >= 2)
            #flavors['ll'] = abs(leadfatjet.hadronFlavour < 4) & (leadfatjet.nBHadrons == 0) & (leadfatjet.nCHadrons == 0)
            flavors['b'] = flavors['b'] & ~flavors['bb']
            flavors['c'] = flavors['c'] & ~flavors['cc']
            flavors['l'] = flavors['l'] & ~flavors['bb'] & ~flavors[
                'cc'] & ~flavors['b'] & ~flavors['c']
            #flavors['others'] = ~flavors['l'] & ~flavors['bb'] & ~flavors['cc'] & ~flavors['b'] & ~flavors['c']
        else:
            flavors['Data'] = np.ones(len(events), dtype='bool')

        for selname, cut in self._mask_fatjets.items():

            sel = (leadfatjet.pt > cut['pt_cut']) & \
                    (leadfatjet.msoftdrop > cut['mass_cut']) & \
                    (abs(leadfatjet.eta) < cut['eta_cut']) & \
                    (leadfatjet.jetId >= cut['jetId_cut']) & \
                    (leadfatjet.tau21 < cut['tau21_cut'])
            #(leadfatjet.Hbb > cut['Hbb'])

            cuts.add(selname, ak.to_numpy(sel))

        selection = {}
        selection['basic'] = {'trigger', 'basic'}
        selection['pt350msd50'] = {'trigger', 'fatjet_mutag', 'pt350msd50'}
        selection['msd100tau06'] = {'trigger', 'fatjet_mutag', 'msd100tau06'}
        selection['pt400msd100tau06'] = {
            'trigger', 'fatjet_mutag', 'pt400msd100tau06'
        }
        for mask_f in self._final_mask:
            for DDX in self._mask_DDX.keys():
                for wp, cut in self._mask_DDX[DDX].items():
                    selection[f'{mask_f}{DDX}pass{wp}wp'] = selection[
                        mask_f].copy()
                    selection[f'{mask_f}{DDX}pass{wp}wp'].add(
                        f'{DDX}_pass{wp}wp')
                    selection[f'{mask_f}{DDX}fail{wp}wp'] = selection[
                        mask_f].copy()
                    selection[f'{mask_f}{DDX}fail{wp}wp'].add(
                        f'{DDX}_fail{wp}wp')

        for histname, h in output.items():
            sel = [r for r in selection.keys() if r in histname.split('_')]
            if ((histname in self.fatjet_hists) |
                ('hist2d_fatjet' in histname)):
                for flav, mask in flavors.items():
                    weight = weights.weight() * cuts.all(
                        *selection[sel[0]]) * ak.to_numpy(mask)
                    fields = {
                        k: ak.fill_none(leadfatjet[k], -9999)
                        for k in h.fields if k in dir(leadfatjet)
                    }
                    h.fill(dataset=dataset,
                           flavor=flav,
                           **fields,
                           weight=weight)
            if histname in self.event_hists:
                for flav, mask in flavors.items():
                    weight = weights.weight() * cuts.all(
                        *selection[sel[0]]) * ak.to_numpy(mask)
                    fields = {
                        k: ak.fill_none(eventVariables[k], -9999)
                        for k in h.fields if k in eventVariables.keys()
                    }
                    h.fill(dataset=dataset,
                           flavor=flav,
                           **fields,
                           weight=weight)

        return output
Example #17
 def process(self, events):
     output = self._accumulator.identity()
     jets=events.Jet
     jetSel = (jets.pt>30) & (abs(jets.eta)<2.4)
     tightJet = jets[jetSel]
     bJet = tightJet[tightJet.btagDeepFlavB > 0.642]
     muons = events.Muon
     muonSel = (muons.pt>30) & (abs(muons.eta)<2.4)
     tightMuon = muons[muonSel]
     ele = events.Electron
     eleSel = (ele.pt>35)&(abs(ele.eta)<2.4)
     tightEle = ele[eleSel]
     eventSel = (((ak.num(tightMuon)==1) | (ak.num(tightEle)==1)) &
         (ak.num(tightJet)>= 3) & (ak.num(bJet)>=1)
                )
     final = events[eventSel]
     
     
     #####GENPART MATCHING ######
     
     genPart = final.GenPart
     tops = genPart[abs(genPart.pdgId)==6]
     #The isLastCopy Flag filters out copy Genparticles:
     tops = tops[tops.hasFlags('isLastCopy')]
     tDecay = tops.distinctChildren
     tDecay = tDecay[tDecay.hasFlags('isLastCopy')]
     t_Events=tDecay[abs(tDecay.pdgId)==5]
     W = tDecay[abs(tDecay.pdgId)==24]
     W = W[W.hasFlags('isLastCopy')]
     WDecay = W.distinctChildren
     WDecay = WDecay[WDecay.hasFlags('isLastCopy')]
     #t_Events is the lone bottom quark, W_Events are the two quarks from the W decay
     #select the hadronically decaying W
     W_Events=ak.flatten(WDecay[ak.all(abs(WDecay.pdgId)<=8,axis=-1)],axis=3)
     #print(qqb)
     #hadW is a mask for W bosons decaying to quarks
     hadW = ak.num(W_Events,axis=2)==2
     #filters out t_Events that have a hadronically decaying W boson
     hadB = t_Events[hadW]
     hadB = ak.flatten(hadB,axis=2)
     W_quarks = W_Events[hadW]
     W_quarks = ak.flatten(W_quarks,axis=2)
     #concatenating these two arrays makes an array of events with the correctly decaying GenParticles
     qqb = ak.concatenate([hadB,W_quarks],axis=1)
     
     
     #####GEN JET MATCHING ######
     final=final[(ak.count(qqb.pdgId,axis=1)==3)]
     finaljets=final.Jet
     qqb=qqb[(ak.count(qqb.pdgId,axis=1)==3)]
     #Implementing Tight Jet Cuts on Training Data
     finaljetSel=(abs(finaljets.eta)<2.4)&(finaljets.pt>30)
     finalJets=finaljets[finaljetSel]
     #Match Gen part to gen jet
     matchedGenJets=qqb.nearest(final.GenJet)
     #match gen to reco
     matchedJets=matchedGenJets.nearest(finalJets)
 
     ### VALIDATION ###
     test=matchedJets.genJetIdx
     combs=ak.combinations(finalJets,3,replacement=False)
     t1=(combs['0'].genJetIdx==test[:,0])|(combs['0'].genJetIdx==test[:,1])|(combs['0'].genJetIdx==test[:,2])
     t2=(combs['1'].genJetIdx==test[:,0])|(combs['1'].genJetIdx==test[:,1])|(combs['1'].genJetIdx==test[:,2])
     t3=(combs['2'].genJetIdx==test[:,0])|(combs['2'].genJetIdx==test[:,1])|(combs['2'].genJetIdx==test[:,2])
     t=t1&t2&t3
     
     trutharray=ak.flatten(t)
     jetcombos=ak.flatten(combs)
     j1,j2,j3=ak.unzip(jetcombos)
     output["dR12"]+=processor.column_accumulator(ak.to_numpy(j1.delta_r(j2)))
     output["dR13"]+=processor.column_accumulator(ak.to_numpy(j1.delta_r(j3)))
     output["dR23"]+=processor.column_accumulator(ak.to_numpy(j2.delta_r(j3)))
     output["j1btag"]+=processor.column_accumulator(ak.to_numpy(j1.btagCSVV2))
     output["j2btag"]+=processor.column_accumulator(ak.to_numpy(j1.btagCSVV2))
     output["j3btag"]+=processor.column_accumulator(ak.to_numpy(j1.btagCSVV2))
     output["j1area"]+=processor.column_accumulator(ak.to_numpy(j1.area))
     output["j2area"]+=processor.column_accumulator(ak.to_numpy(j2.area))
     output["j3area"]+=processor.column_accumulator(ak.to_numpy(j3.area))
     output["j12deta"]+=processor.column_accumulator(ak.to_numpy(j1.eta-j2.eta))
     output["j23deta"]+=processor.column_accumulator(ak.to_numpy(j2.eta-j3.eta))
     output["j13deta"]+=processor.column_accumulator(ak.to_numpy(j1.eta-j3.eta))
     output["j12dphi"]+=processor.column_accumulator(ak.to_numpy(j1.phi-j2.phi))
     output["j23dphi"]+=processor.column_accumulator(ak.to_numpy(j2.phi-j3.phi))
     output["j13dphi"]+=processor.column_accumulator(ak.to_numpy(j1.phi-j3.phi))
     output["j1j2mass"]+=processor.column_accumulator(ak.to_numpy(j1.mass+j2.mass))
     output["j2j3mass"]+=processor.column_accumulator(ak.to_numpy(j2.mass+j3.mass))
     output["j1j3mass"]+=processor.column_accumulator(ak.to_numpy(j1.mass+j3.mass))
     output["j1pt"]+=processor.column_accumulator(ak.to_numpy(j1.pt))
     output["j1phi"]+=processor.column_accumulator(ak.to_numpy(j1.phi))
     output["j1eta"]+=processor.column_accumulator(ak.to_numpy(abs(j1.eta)))
     output["j1mass"]+=processor.column_accumulator(ak.to_numpy(j1.mass))
     output["j2pt"]+=processor.column_accumulator(ak.to_numpy(j2.pt))
     output["j2phi"]+=processor.column_accumulator(ak.to_numpy(j2.phi))
     output["j2eta"]+=processor.column_accumulator(ak.to_numpy(abs(j2.eta)))
     output["j2mass"]+=processor.column_accumulator(ak.to_numpy(j2.mass))
     output["j3pt"]+=processor.column_accumulator(ak.to_numpy(j3.pt))
     output["j3phi"]+=processor.column_accumulator(ak.to_numpy(j3.phi))
     output["j3eta"]+=processor.column_accumulator(ak.to_numpy(abs(j3.eta)))
     output["j3mass"]+=processor.column_accumulator(ak.to_numpy(j3.mass))
     output["event"]+=processor.column_accumulator(ak.to_numpy(ak.flatten(ak.broadcast_arrays(final.event,combs['0'].pt)[0])))
     output["truth"]+=processor.column_accumulator(ak.to_numpy(trutharray).astype(int))
     
     return output
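
The triplet building above is driven by ak.combinations plus ak.unzip; a small self-contained sketch of that step:

import awkward as ak

jet_pt = ak.Array([[50.0, 40.0, 30.0, 20.0], [60.0, 25.0]])
trios = ak.combinations(jet_pt, 3, replacement=False)
j1, j2, j3 = ak.unzip(trios)
assert ak.num(trios, axis=1).tolist() == [4, 0]   # C(4,3) = 4 triplets, C(2,3) = 0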
Example #18
def uproot_tree_to_numpy(fname,
                         MeanNormTuple,
                         inbranches_listlist,
                         nMaxslist,
                         nevents,
                         treename="ttree",
                         stop=None,
                         branches=None):

    # array = uproot_root2array(fname, treename, stop=stop, branches=branches)

    # Read in total number of events
    totallengthperjet = 0
    for i in range(len(nMaxslist)):
        if nMaxslist[i] >= 0:
            totallengthperjet += len(inbranches_listlist[i]) * nMaxslist[i]
        else:
            totallengthperjet += len(inbranches_listlist[i])  #flat branch
    # branches = [ak.fill_none(ak.pad_none(tree[barr, target=feature_length), 0.) for feature_length, arr in zip( nMaxslist, inbranches_listlist)]
    tree = u3.open(fname)[treename]
    branches = [
        ak.fill_none(
            ak.pad_none(tree[branch_name].array(),
                        target=feature_length,
                        axis=-1,
                        clip=True if feature_length > 1 else False), 0.)
        for feature_length, branch_list in zip(nMaxslist, inbranches_listlist)
        for branch_name in branch_list
    ]

    branchnames = [n for names in inbranches_listlist for n in names]
    feature_lengths = [
        f for branches, f in zip(inbranches_listlist, nMaxslist)
        for _ in branches
    ]
    means = [
        m[0] for branches, m in zip(inbranches_listlist, MeanNormTuple)
        for _ in branches
    ]
    norms = [
        m[1] for branches, m in zip(inbranches_listlist, MeanNormTuple)
        for _ in branches
    ]
    print("Debugigng means and norms")
    print(means)
    print(norms)

    print(branchnames)
    branches_numpy = []
    for br, brname, fl, mean, norm in zip(branches, branchnames,
                                          feature_lengths, means, norms):
        print("DBG {}".format(brname))
        print(br)
        print("Length: {}".format(len(br)))
        if brname == "TagVarCSV_trackJetDistVal":
            print("BONUS DEBUG!")
            print("Min: {}, Max: {}".format(ak.min(ak.count(br, axis=-1)),
                                            ak.max(ak.count(br, axis=-1))))
        if fl > 1:
            # branches_numpy.append( (ak.to_numpy( br ) - mean) / norm)
            branches_numpy.append((ak.to_numpy(br) - 0.) / 1.)
        elif fl == 1:
            # branches_numpy.append( (np.expand_dims( ak.to_numpy( br ), axis=-1) - mean)/norm  )
            branches_numpy.append(
                (np.expand_dims(ak.to_numpy(br), axis=-1) - 0.) / 1.)
    print("FINISHED THIS LOOP, YOU ARE PERFECT! :) ")

    numpyarray = np.concatenate(branches_numpy, axis=-1)
    print("\n" * 5)
    print("Some metrics about this numpy array")
    print(np.mean(numpyarray, axis=0))
    print(np.std(numpyarray, axis=0))
    print("Normalize array")
    numpyarray = (numpyarray - np.mean(numpyarray, axis=0)) / np.std(
        numpyarray, axis=0)
    print("Some metrics about this numpy array")
    print(np.mean(numpyarray, axis=0))
    print(np.std(numpyarray, axis=0))
    return numpyarray
Example #19
def mask_inf(var_array, var_name=None, var_inf_counter=None, raise_exception=True):
    """
    Mask inf values in `var_array` with None. If `var_inf_counter` is passed, append to it, in place, the fraction of inf values found for the given `var_name`.

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_inf_counter (optional, default=None): defaultdict(list), stores the fraction of inf values per variable
        - raise_exception (optional, default=True): bool, whether to raise an exception instead of masking inf values

    Returns:
        var_array with inf values masked to None
    """
    if np.any(is_inf_mask:=np.isinf(var_array)):
        if raise_exception:
            raise ValueError(f'Inf value detected in {var_name}')
        var_array = ak.mask(var_array, is_inf_mask, valid_when=False)
        if var_inf_counter is not None:
            var_inf_counter[var_name].append(np.sum(is_inf_mask) / ak.count(var_array))
    return var_array

def mask_nan(var_array, var_name=None, var_nan_counter=None, raise_exception=True):
    """
    Mask nan values in `var_array` with None. If var_nan_counter is passed, append there inplace for a given `var_name` the fraction of its nan values.
    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_nan_counter (optional, default=None): defaultdict(list), stores fraction of nan values for variables
        - raise_exception (optional, default=True): bool, whether to raise exception instead of masking NaN values 
    Returns
        var_array witn masked nan values to None
    """
    if np.any(is_nan_mask := np.isnan(var_array)):
        if raise_exception:
            raise ValueError(f'NaN value detected in {var_name}')
        var_array = ak.mask(var_array, is_nan_mask, valid_when=False)
        if var_nan_counter is not None:
            var_nan_counter[var_name].append(np.sum(is_nan_mask) / ak.count(var_array))
    return var_array
Example #20
def fill_aggregators(tree, var, var_type, file_i, file_name_i, cone_type, cone_definition_dict, cone_selection_dict, inf_counter, nan_counter,
                     selection_cut, aliases, sums, sums2, counts, fill_scaling_params=False, scaling_params=None, quantile_params=None):
    """
    Update the `sums`, `sums2` and `counts` dictionaries with the values of the `var` variable (belonging to `var_type`) taken from the input `tree`, either inclusively or exclusively for inner/outer cones (`cone_type` argument).
    In the latter case, derive `constituent_dR` with respect to the tau direction of flight and define the cones as:
        - inner: `constituent_dR` <= `dR_tau_signal_cone`
        - outer: `constituent_dR` > `dR_tau_signal_cone` and `constituent_dR` < `dR_tau_outer_cone`
    Then mask the constituents which fall in the given `cone_type` and update sums/sums2/counts using only those constituents.

    If `fill_scaling_params` is set to `True`, also update the `scaling_params` dictionary (i.e. make a "snapshot" of the scaling parameters based on the current state of sums/sums2/counts).
    If a `quantile_params` dictionary is provided, quantiles for the given `var` are computed per cone type and stored in this dictionary.

    Arguments:
        - tree: uproot TTree, input tree to read arrays from
        - var: string, variable name
        - var_type: string, variable type
        - file_i: int, index of the file being processed as enumerator of the input file list
        - file_name_i: int, index of the file being processed as extracted from the file name
        - cone_type: string, type of cone being processed, should be either inner or outer
        - cone_definition_dict: dict, parameters for inner/outer tau cones' definition, defined in training *.yaml cfg
        - cone_selection_dict: dict, per feature types configuration for cone splitting, defined in training *.yaml cfg
        - inf_counter: defaultdict(list), stores fraction of inf values for variables
        - nan_counter: defaultdict(list), stores fraction of nan values for variables
        - selection_cut: str, cut to be applied by uproot at the array reading step
        - aliases: dict, definitions of variables to be constructed by uproot at the array reading step
        - sums: dict, container for accumulating sums of features' values and to be filled based on the input `var_array`
        - sums2: dict, container for accumulating square sums of features' values and to be filled based on the input `var_array`
        - counts: dict, container for accumulating counts of features' values and to be filled based on the input `var_array`
        - fill_scaling_params (optional, default=False): bool, whether to update the `scaling_params` dictionary with the values from the current state of sums/sums2/counts
        - scaling_params (optional, default=None): dict, main dictionary storing scaling parameters per variable type/variable name/cone type. Used only if `fill_scaling_params` is set to `True`
        - quantile_params (optional, default=None): dict, if passed, quantile numbers for `var_array` (as returned by `get_quantiles()`) are stored in this dictionary for the given `file_i`

    Returns:
        None
    """
    constituent_eta_name, constituent_phi_name = cone_selection_dict[var_type]['var_names']['eta'], cone_selection_dict[var_type]['var_names']['phi']
    var_array, constituent_eta_array, constituent_phi_array = tree.arrays([var, constituent_eta_name, constituent_phi_name], cut=selection_cut, aliases=aliases, how=tuple)
    #var_array = mask_inf(var_array, var, inf_counter, raise_exception=True)
    #var_array = mask_nan(var_array, var, nan_counter, raise_exception=True)

    if cone_type == None:
        sums[var_type][var][file_i] += ak.sum(var_array)
        sums2[var_type][var][file_i] += ak.sum(var_array**2)
        counts[var_type][var][file_i] += ak.count(var_array)
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var], counts[var_type][var], aggregate=True)
            sqmean_ = compute_mean(sums2[var_type][var], counts[var_type][var], aggregate=True)
            std_ = compute_std(sums[var_type][var], sums2[var_type][var], counts[var_type][var], aggregate=True)

            scaling_params[var_type][var]['global']['num'] = int(counts[var_type][var].sum())
            if mean_ == None:
                print(f"Low statistics in {var} for mean computation")
                scaling_params[var_type][var]['global']['mean'] = None
            else:
                scaling_params[var_type][var]['global']['mean'] = float(format(mean_, '.4g')) # round to 4 significant digits
            if std_ == None:
                print(f"Low statistics in {var} for std computation")
                scaling_params[var_type][var]['global']['std'] = None
            else:
                scaling_params[var_type][var]['global']['std'] = float(format(std_, '.4g'))
            if sqmean_ == None:
                print(f"Low statistics in {var} for sqmean computation")
                scaling_params[var_type][var]['global']['sqmean'] = None
            else:
                scaling_params[var_type][var]['global']['sqmean'] = float(format(sqmean_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var]['global'][file_name_i] = get_quantiles(var_array)
            if None in quantile_params[var_type][var]['global'][file_name_i].values(): print(f"Low statistics in {var} for quantile computation")
    elif cone_type == 'inner' or cone_type == 'outer':
        tau_pt_name, tau_eta_name, tau_phi_name = cone_selection_dict['TauFlat']['var_names']['pt'], cone_selection_dict['TauFlat']['var_names']['eta'], cone_selection_dict['TauFlat']['var_names']['phi']
        tau_pt_array, tau_eta_array, tau_phi_array = tree.arrays([tau_pt_name, tau_eta_name, tau_phi_name], cut=None, aliases=None, how=tuple)
        dR_tau_signal_cone = dR_signal_cone(tau_pt_array, cone_definition_dict['inner']['min_pt'], cone_definition_dict['inner']['min_radius'], cone_definition_dict['inner']['opening_coef'])
        constituent_dR = dR(tau_eta_array - constituent_eta_array, tau_phi_array - constituent_phi_array)
        if cone_type == 'inner':
            cone_mask = constituent_dR <= dR_tau_signal_cone
        elif cone_type == 'outer':
            cone_mask = (constituent_dR > dR_tau_signal_cone) & (constituent_dR < cone_definition_dict['outer']['dR'])
        sums[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask])
        sums2[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask]**2)
        counts[var_type][var][cone_type][file_i] += ak.count(var_array[cone_mask])
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var][cone_type], counts[var_type][var][cone_type], aggregate=True)
            sqmean_ =  compute_mean(sums2[var_type][var][cone_type], counts[var_type][var][cone_type], aggregate=True)
            std_ = compute_std(sums[var_type][var][cone_type], sums2[var_type][var][cone_type], counts[var_type][var][cone_type], aggregate=True)

            scaling_params[var_type][var][cone_type]['num'] = int(counts[var_type][var][cone_type].sum())
            if mean_ == None:
                print(f"Low statistics in {var} for mean computation")
                scaling_params[var_type][var][cone_type]['mean'] = None
            else:
                scaling_params[var_type][var][cone_type]['mean'] = float(format(mean_, '.4g'))
            if std_ == None:
                print(f"Low statistics in {var} for std computation")
                scaling_params[var_type][var][cone_type]['std'] = None
            else:
                scaling_params[var_type][var][cone_type]['std'] = float(format(std_, '.4g'))
            if sqmean_ == None:
                print(f"Low statistics in {var} for sqmean computation")
                scaling_params[var_type][var][cone_type]['sqmean'] = None
            else:
                scaling_params[var_type][var][cone_type]['sqmean'] = float(format(sqmean_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var][cone_type][file_name_i] = get_quantiles(var_array[cone_mask])
            if None in quantile_params[var_type][var][cone_type][file_name_i].values(): print(f"Low statistics in {var} for quantile computation")
    else:
        raise ValueError(f'cone_type for {var_type} should be either inner, or outer')
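
The sums/sums2/counts bookkeeping above reduces each chunk to three numbers per file, from which the mean and standard deviation can be recovered later; compute_mean and compute_std are external helpers, so the toy code below only shows the underlying arithmetic:

import awkward as ak
import numpy as np

chunks = [ak.Array([[1.0, 2.0], [3.0]]), ak.Array([[4.0]])]   # toy "files"
sums, sums2, counts = np.zeros(2), np.zeros(2), np.zeros(2)
for i, arr in enumerate(chunks):
    sums[i] += ak.sum(arr)
    sums2[i] += ak.sum(arr ** 2)
    counts[i] += ak.count(arr)

mean = sums.sum() / counts.sum()                        # 10 / 4 = 2.5
std = np.sqrt(sums2.sum() / counts.sum() - mean ** 2)   # sqrt(7.5 - 6.25)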
Example #21
def fill_aggregators(var_array,
                     tau_eta_array,
                     tau_phi_array,
                     constituent_eta_array,
                     constituent_phi_array,
                     var,
                     var_type,
                     file_i,
                     file_name_id,
                     cone_type,
                     dR_tau_signal_cone,
                     dR_tau_outer_cone,
                     sums,
                     sums2,
                     counts,
                     fill_scaling_params=False,
                     scaling_params=None,
                     quantile_params=None):
    """
    Update the `sums`, `sums2` and `counts` dictionaries with the values from `var_array`,
    either inclusively (if `cone_type` is None) or exclusively for the inner/outer cones.
    In the latter case, derive `constituent_dR` with respect to the tau direction of flight and define the cones as:
        - inner: `constituent_dR` <= `dR_tau_signal_cone`
        - outer: `constituent_dR` > `dR_tau_signal_cone` and `constituent_dR` < `dR_tau_outer_cone`
    Then mask the constituents falling into the requested `cone_type` and update sums/sums2/counts using only those constituents.

    If `fill_scaling_params` is set to `True`, also update the `scaling_params` dictionary (i.e. take a "snapshot" of the scaling parameters based on the current state of sums/sums2/counts).

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - tau_eta_array: awkward array, eta values of taus
        - tau_phi_array: awkward array, phi values of taus
        - constituent_eta_array: awkward array, eta values of tau constituents
        - constituent_phi_array: awkward array, phi values of tau constituents
        - var: string, variable name
        - var_type: string, variable type
        - file_i: int, index of the file being processed in the input file list
        - file_name_id: int, index of the file being processed taken from the corresponding file name
        - cone_type: string, type of cone being processed, should be either inner or outer
        - dR_tau_signal_cone: awkward array, per tau dR values defining the signal cone
        - dR_tau_outer_cone: float, dR value defining the tau outer cone
        - sums: dict, container for accumulating sums of features' values and to be filled based on the input `var_array`
        - sums2: dict, container for accumulating square sums of features' values and to be filled based on the input `var_array`
        - counts: dict, container for accumulating counts of features' values and to be filled based on the input `var_array`
        - fill_scaling_params (optional, default=False): bool, whether to update the `scaling_params` dictionary with the values from the current state of sums/sums2/counts
        - scaling_params (optional, default=None): dict, main dictionary storing scaling parameters per variable type/variable name/cone type. Used only if `fill_scaling_params` is set to `True`
        - quantile_params (optional, default=None): dict, if passed, stores for a given `file_name_id` the quantile numbers of `var_array` as returned by the `get_quantiles()` function

    Returns:
        None
    """
    if cone_type is None:
        sums[var_type][var][file_i] += ak.sum(var_array)
        sums2[var_type][var][file_i] += ak.sum(var_array**2)
        counts[var_type][var][file_i] += ak.count(var_array)
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var],
                                 counts[var_type][var],
                                 aggregate=True)
            std_ = compute_std(sums[var_type][var],
                               sums2[var_type][var],
                               counts[var_type][var],
                               aggregate=True)
            scaling_params[var_type][var]['global']['mean'] = float(
                format(mean_, '.4g'))  # round to 4 significant digits
            scaling_params[var_type][var]['global']['std'] = float(
                format(std_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var]['global'][
                file_name_id] = get_quantiles(var_array)
    elif cone_type == 'inner' or cone_type == 'outer':
        constituent_dR = dR(tau_eta_array - constituent_eta_array,
                            tau_phi_array - constituent_phi_array)
        if cone_type == 'inner':
            cone_mask = constituent_dR <= dR_tau_signal_cone
        elif cone_type == 'outer':
            cone_mask = (constituent_dR > dR_tau_signal_cone) & (
                constituent_dR < dR_tau_outer_cone)
        sums[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask])
        sums2[var_type][var][cone_type][file_i] += ak.sum(
            var_array[cone_mask]**2)
        counts[var_type][var][cone_type][file_i] += ak.count(
            var_array[cone_mask])
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var][cone_type],
                                 counts[var_type][var][cone_type],
                                 aggregate=True)
            std_ = compute_std(sums[var_type][var][cone_type],
                               sums2[var_type][var][cone_type],
                               counts[var_type][var][cone_type],
                               aggregate=True)
            scaling_params[var_type][var][cone_type]['mean'] = float(
                format(mean_, '.4g'))
            scaling_params[var_type][var][cone_type]['std'] = float(
                format(std_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var][cone_type][
                file_name_id] = get_quantiles(var_array[cone_mask])
    else:
        raise ValueError(
            f"cone_type for {var_type} should be either 'inner' or 'outer'")
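A self-contained sketch of the cone masking described in the docstring above, using toy awkward arrays; the `dR` helper below is a simplified stand-in for the one assumed by `fill_aggregators`, and the scalar cone radii replace the per-tau arrays of the real code.
import awkward as ak
import numpy as np

def dR(deta, dphi):
    # simplified stand-in; the real helper presumably wraps dphi into [-pi, pi]
    return np.sqrt(deta**2 + dphi**2)

# one value per tau; constituents are jagged (one list per tau)
tau_eta = ak.Array([0.0, 1.0])
tau_phi = ak.Array([0.0, 0.0])
constituent_eta = ak.Array([[0.05, 0.8], [1.02]])
constituent_phi = ak.Array([[0.00, 0.1], [0.01]])
var_array = ak.Array([[1.0, 2.0], [3.0]])

dR_tau_signal_cone = 0.1   # per-tau awkward array in the real code
dR_tau_outer_cone = 1.0

constituent_dR = dR(tau_eta - constituent_eta, tau_phi - constituent_phi)
inner_mask = constituent_dR <= dR_tau_signal_cone
outer_mask = (constituent_dR > dR_tau_signal_cone) & (constituent_dR < dR_tau_outer_cone)

print(ak.sum(var_array[inner_mask]), ak.count(var_array[inner_mask]))  # 4.0 2
print(ak.sum(var_array[outer_mask]), ak.count(var_array[outer_mask]))  # 2.0 1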
Example #22
0
    def process(self, df):
        # print(df.fields)
        # numevents = len(df)
        # dataset = df.metadata["dataset"]
        output = pd.DataFrame({"event": df.Event.Number})

        output.index.name = "entry"

        output["dataset"] = df.metadata["dataset"]
        regions = df.metadata["regions"]
        # channels = df.metadata['channels']
        output["lumi_wgt"] = float(df.metadata["lumi_wgt"])
        output["mc_wgt"] = ak.to_pandas(df.Event.Weight)
        # There are multiple weights per event - need to figure this out
        # output['lhe_wgt'] = ak.to_pandas(df.Weight.Weight)
        output["year"] = "snowmass"

        # Select muons
        muons = df[parameters["muon_branch"]]
        muon_filter = ((muons.pt > parameters["muon_pt_cut"])
                       & (abs(muons.eta) < parameters["muon_eta_cut"])
                       & (muons.IsolationVar < parameters["muon_iso_cut"]))
        nmuons = ak.to_pandas(ak.count(muons[muon_filter].pt, axis=1))

        mu_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Charge": "charge"}
        muon_columns = ["PT", "Eta", "Phi", "Charge", "IsolationVar"]

        # Convert one column at a time to preserve event indices in Pandas
        muon_feature_list = []
        for col in muon_columns:
            muon_feature = df[parameters["muon_branch"]][col]
            val = ak.to_pandas(muon_feature[muon_filter])
            muon_feature_list.append(val)

        muons = pd.concat(muon_feature_list, axis=1)
        muons.columns = muon_columns
        muons.rename(columns=mu_map, inplace=True)

        mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
        mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
        mu1.index = mu1.index.droplevel("subentry")
        mu2.index = mu2.index.droplevel("subentry")
        pass_leading_pt = mu1.pt > parameters["muon_leading_pt"]
        fill_muons(output, mu1, mu2)

        output.mm_charge = output.mu1_charge * output.mu2_charge

        # Select electrons
        electrons = df[parameters["electron_branch"]]
        electrons = electrons[
            (electrons.pt > parameters["electron_pt_cut"])
            & (abs(electrons.eta) < parameters["electron_eta_cut"])]
        nelectrons = ak.to_pandas(ak.count(electrons.pt, axis=1))

        # Select jets
        jets = df[parameters["jet_branch"]]
        mu_for_clean = df[parameters["muon_branch"]]
        mu_for_clean = mu_for_clean[
            (mu_for_clean.pt > parameters["muon_pt_cut"])
            & (mu_for_clean.IsolationVar < parameters["muon_iso_cut"])]
        _, jet_mu_dr = jets.nearest(mu_for_clean, return_metric=True)
        jet_filter = (
            ak.fill_none(jet_mu_dr > parameters["min_dr_mu_jet"], True)
            & (jets.pt > parameters["jet_pt_cut"])
            & (abs(jets.eta) < parameters["jet_eta_cut"]))
        njets = ak.to_pandas(ak.count(jets[jet_filter].pt, axis=1))

        jet_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Mass": "mass"}
        jet_columns = ["PT", "Eta", "Phi", "Mass"]

        jet_feature_list = []
        for col in jet_columns:
            jet_feature = df[parameters["jet_branch"]][col]
            val = ak.to_pandas(jet_feature[jet_filter])
            jet_feature_list.append(val)

        jets = pd.concat(jet_feature_list, axis=1)
        jets.columns = jet_columns
        jets.rename(columns=jet_map, inplace=True)

        jets = jets.sort_values(["entry", "pt"], ascending=[True, False])
        jets.index = pd.MultiIndex.from_arrays(
            [jets.index.get_level_values(0),
             jets.groupby(level=0).cumcount()],
            names=["entry", "subentry"],
        )
        jet1 = jets.loc[pd.IndexSlice[:, 0], :]
        jet2 = jets.loc[pd.IndexSlice[:, 1], :]
        jet1.index = jet1.index.droplevel("subentry")
        jet2.index = jet2.index.droplevel("subentry")

        fill_jets(output, jet1, jet2)
        fill_gen_jets(df, output)

        # Event selection: two opposite-sign muons and no electrons
        output["nmuons"] = nmuons
        output["nelectrons"] = nelectrons
        output["njets"] = njets
        output[["nmuons", "nelectrons",
                "njets"]] = output[["nmuons", "nelectrons", "njets"]].fillna(0)

        output["event_selection"] = ((output.nmuons == 2)
                                     & (output.mm_charge == -1)
                                     & (output.nelectrons == 0)
                                     & pass_leading_pt)

        mass = output.dimuon_mass
        output["region"] = None
        output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
        output.loc[((mass > 110) & (mass < 115.03))
                   | ((mass > 135.03) & (mass < 150)), "region"] = "h-sidebands"
        output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"

        output = output.loc[output.event_selection, :]
        output = output.reindex(sorted(output.columns), axis=1)

        output = output[output.region.isin(regions)]
        """
        input_evts = numevents
        output_evts = output.shape[0]
        out_yield = output.lumi_wgt.sum()
        out_vbf = output[
            (output.jj_mass>400) & (output.jj_dEta>2.5) & (output.jet1_pt>35) & (output.njets>=2)
        ].lumi_wgt.sum()
        out_ggh = out_yield - out_vbf

        print(f"\n{dataset}:    {input_evts}  ->  {output_evts};    yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
        """

        to_return = None
        if self.apply_to_output is None:
            to_return = output
        else:
            self.apply_to_output(output)
            to_return = self.accumulator.identity()

        return to_return
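The object multiplicities above (`nmuons`, `nelectrons`, `njets`) come from `ak.count(..., axis=1)` converted to pandas. A minimal standalone illustration of that idiom with toy values (not the analysis code):
import awkward as ak

# jagged muon pT: one list per event
muon_pt = ak.Array([[25.0, 12.0], [], [40.0, 30.0]])
muon_filter = muon_pt > 20.0

# ak.count with axis=1 yields one integer per event, which can then be
# attached to the pandas output as done for nmuons above
nmuons = ak.to_pandas(ak.count(muon_pt[muon_filter], axis=1))
print(nmuons)  # counts per event: 1, 0, 2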
def test_highlevel():
    array = ak.Array(
        [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]], check_valid=True
    )

    assert ak.count(array) == 9
    assert ak.to_list(ak.count(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=2)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-1, keepdims=True)) == [
        [[3], [0], [2], [1]],
        [],
        [[2], [1]],
    ]
    assert ak.to_list(ak.count(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=1)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-2, keepdims=True)) == [
        [[3, 2, 1]],
        [[]],
        [[2, 1]],
    ]

    assert ak.count_nonzero(array) == 9
    assert ak.to_list(ak.count_nonzero(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count_nonzero(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]

    assert ak.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23
    assert ak.to_list(ak.sum(array, axis=-1)) == [
        [2 + 3 + 5, 0, 7 + 11, 13],
        [],
        [17 + 19, 23],
    ]
    assert ak.to_list(ak.sum(array, axis=-2)) == [
        [2 + 7 + 13, 3 + 11, 5],
        [],
        [17 + 23, 19],
    ]

    assert ak.prod(array) == 2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23
    assert ak.to_list(ak.prod(array, axis=-1)) == [
        [2 * 3 * 5, 1, 7 * 11, 13],
        [],
        [17 * 19, 23],
    ]
    assert ak.to_list(ak.prod(array, axis=-2)) == [
        [2 * 7 * 13, 3 * 11, 5],
        [],
        [17 * 23, 19],
    ]

    assert ak.min(array) == 2
    assert ak.to_list(ak.min(array, axis=-1)) == [[2, None, 7, 13], [], [17, 23]]
    assert ak.to_list(ak.min(array, axis=-2)) == [[2, 3, 5], [], [17, 19]]

    assert ak.max(array) == 23
    assert ak.to_list(ak.max(array, axis=-1)) == [[5, None, 11, 13], [], [19, 23]]
    assert ak.to_list(ak.max(array, axis=-2)) == [[13, 11, 5], [], [23, 19]]

    array = ak.Array(
        [
            [[True, False, True], [], [False, False], [True]],
            [],
            [[False, True], [True]],
        ],
        check_valid=True,
    )

    assert ak.any(array) == True
    assert ak.to_list(ak.any(array, axis=-1)) == [
        [True, False, False, True],
        [],
        [True, True],
    ]
    assert ak.to_list(ak.any(array, axis=-2)) == [[True, False, True], [], [True, True]]

    assert ak.all(array) == False
    assert ak.to_list(ak.all(array, axis=-1)) == [
        [False, True, False, True],
        [],
        [False, True],
    ]
    assert ak.to_list(ak.all(array, axis=-2)) == [
        [False, False, True],
        [],
        [False, True],
    ]
Example #24
0
    def _put_tracks_into_blob(self, blob, tracks, reco_identifier, n_tracks):

        """
        Put a certain type of "tracks" into the blob under a specific name.

        Parameters
        ----------
        tracks : awkward array
            The tracks object to be put into the blob. Can also be just the best tracks.
        reco_identifier : string
            A string used to name the kp table.
        n_tracks : int
            The number of tracks determined beforehand, used to distinguish between best and all tracks.

        """

        reco_tracks = dict(
            pos_x=tracks.pos_x,
            pos_y=tracks.pos_y,
            pos_z=tracks.pos_z,
            dir_x=tracks.dir_x,
            dir_y=tracks.dir_y,
            dir_z=tracks.dir_z,
            E=tracks.E,
            rec_type=tracks.rec_type,
            t=tracks.t,
            likelihood=tracks.lik,
            length=tracks.len,  # do all recos have this?
        )

        if n_tracks != 1:
            reco_tracks.update(
                id=tracks.id,
                idx=np.arange(n_tracks),
            )

        n_columns = max(km3io.definitions.fitparameters.values()) + 1
        fitinf_array = np.ma.filled(
            ak.to_numpy(ak.pad_none(tracks.fitinf, target=n_columns, axis=-1)),
            fill_value=np.nan,
        ).astype("float32")
        fitinf_split = np.split(fitinf_array, fitinf_array.shape[-1], axis=-1)

        if n_tracks == 1:
            for fitparam, idx in km3io.definitions.fitparameters.items():
                reco_tracks[fitparam] = fitinf_split[idx][0]

        else:
            for fitparam, idx in km3io.definitions.fitparameters.items():
                reco_tracks[fitparam] = fitinf_split[idx][:, 0]

        blob["Reco_" + reco_identifier] = kp.Table(
            reco_tracks,
            h5loc=f"/reco/" + reco_identifier,
            name="Reco " + reco_identifier,
            split_h5=self.split,
        )

        # write out the rec stages only once with all tracks
        if n_tracks != 1:

            _rec_stage = np.array(ak.flatten(tracks.rec_stages)._layout)
            _counts = ak.count(tracks.rec_stages, axis=1)
            _idx = np.repeat(np.arange(n_tracks), _counts)

            blob["RecStages"] = kp.Table(
                dict(rec_stage=_rec_stage, idx=_idx),
                # Just to save space, we specify smaller dtypes.
                # We assume there will be never more than 32767
                # reco tracks for a single reconstruction type.
                dtypes=[("rec_stage", np.int16), ("idx", np.uint16)],
                h5loc=f"/reco/rec_stages",
                name="Reconstruction Stages",
                split_h5=self.split,
            )
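The rec-stages table above flattens a jagged field and builds a parallel track index with `ak.count` and `np.repeat`. A toy sketch of the same pattern, using `ak.to_numpy(ak.flatten(...))` instead of the private `._layout` access used in the original:
import awkward as ak
import numpy as np

# per-track lists of reconstruction stages (toy values)
rec_stages = ak.Array([[1, 2, 3], [1], [1, 2]])
n_tracks = len(rec_stages)

# flatten the jagged field and build a parallel track index
_rec_stage = ak.to_numpy(ak.flatten(rec_stages))
_counts = ak.to_numpy(ak.count(rec_stages, axis=1))
_idx = np.repeat(np.arange(n_tracks), _counts)

print(_rec_stage)  # [1 2 3 1 1 2]
print(_idx)        # [0 0 0 1 2 2]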
Example #25
0
    def process(self, df):
        # Initialize timer
        if self.timer:
            self.timer.update()

        # Dataset name (see definitions in config/datasets.py)
        dataset = df.metadata["dataset"]
        is_mc = "data" not in dataset
        numevents = len(df)

        # ------------------------------------------------------------#
        # Apply HLT, lumimask, genweights, PU weights
        # and L1 prefiring weights
        # ------------------------------------------------------------#

        # All variables that we want to save
        # will be collected into the 'output' dataframe
        output = pd.DataFrame({"run": df.run, "event": df.event})
        output.index.name = "entry"
        output["npv"] = df.PV.npvs
        output["met"] = df.MET.pt

        # Separate dataframe to keep track on weights
        # and their systematic variations
        weights = Weights(output)

        if is_mc:
            # For MC: Apply gen.weights, pileup weights, lumi weights,
            # L1 prefiring weights
            mask = np.ones(numevents, dtype=bool)
            genweight = df.genWeight
            weights.add_weight("genwgt", genweight)
            weights.add_weight("lumi", self.lumi_weights[dataset])

            pu_wgts = pu_evaluator(
                self.pu_lookups,
                self.parameters,
                numevents,
                np.array(df.Pileup.nTrueInt),
                self.auto_pu,
            )
            weights.add_weight("pu_wgt", pu_wgts, how="all")

            if self.parameters["do_l1prefiring_wgts"]:
                if "L1PreFiringWeight" in df.fields:
                    l1pfw = l1pf_weights(df)
                    weights.add_weight("l1prefiring_wgt", l1pfw, how="all")
                else:
                    weights.add_weight("l1prefiring_wgt", how="dummy_vars")

        else:
            # For Data: apply Lumi mask
            lumi_info = LumiMask(self.parameters["lumimask"])
            mask = lumi_info(df.run, df.luminosityBlock)

        # Apply HLT to both Data and MC
        hlt_columns = [c for c in self.parameters["hlt"] if c in df.HLT.fields]
        hlt = ak.to_pandas(df.HLT[hlt_columns])
        if len(hlt_columns) == 0:
            hlt = False
        else:
            hlt = hlt[hlt_columns].sum(axis=1)

        if self.timer:
            self.timer.add_checkpoint("HLT, lumimask, PU weights")

        # ------------------------------------------------------------#
        # Update muon kinematics with Rochester correction,
        # FSR recovery and GeoFit correction
        # Raw pT and eta are stored to be used in event selection
        # ------------------------------------------------------------#

        # Save raw variables before computing any corrections
        df["Muon", "pt_raw"] = df.Muon.pt
        df["Muon", "eta_raw"] = df.Muon.eta
        df["Muon", "phi_raw"] = df.Muon.phi
        df["Muon", "pfRelIso04_all_raw"] = df.Muon.pfRelIso04_all

        # Rochester correction
        if self.do_roccor:
            apply_roccor(df, self.roccor_lookup, is_mc)
            df["Muon", "pt"] = df.Muon.pt_roch

            # variations will be in branches pt_roch_up and pt_roch_down
            # muons_pts = {
            #     'nominal': df.Muon.pt,
            #     'roch_up':df.Muon.pt_roch_up,
            #     'roch_down':df.Muon.pt_roch_down
            # }

        # for ...
        if True:  # indent reserved for loop over muon pT variations
            # According to HIG-19-006, these variations have negligible
            # effect on significance, but it's better to have them
            # implemented in the future

            # FSR recovery
            if self.do_fsr:
                has_fsr = fsr_recovery(df)
                df["Muon", "pt"] = df.Muon.pt_fsr
                df["Muon", "eta"] = df.Muon.eta_fsr
                df["Muon", "phi"] = df.Muon.phi_fsr
                df["Muon", "pfRelIso04_all"] = df.Muon.iso_fsr

            # if FSR was applied, 'pt_fsr' will be corrected pt
            # if FSR wasn't applied, just copy 'pt' to 'pt_fsr'
            df["Muon", "pt_fsr"] = df.Muon.pt

            # GeoFit correction
            if self.do_geofit and ("dxybs" in df.Muon.fields):
                apply_geofit(df, self.year, ~has_fsr)
                df["Muon", "pt"] = df.Muon.pt_fsr

            if self.timer:
                self.timer.add_checkpoint("Muon corrections")

            # --- conversion from awkward to pandas --- #
            muon_columns = [
                "pt",
                "pt_fsr",
                "eta",
                "phi",
                "charge",
                "ptErr",
                "mass",
                "pt_raw",
                "eta_raw",
                "pfRelIso04_all",
            ] + [self.parameters["muon_id"]]
            muons = ak.to_pandas(df.Muon[muon_columns])

            # --------------------------------------------------------#
            # Select muons that pass pT, eta, isolation cuts,
            # muon ID and quality flags
            # Select events with 2 OS muons, no electrons,
            # passing quality cuts and at least one good PV
            # --------------------------------------------------------#

            # Apply event quality flags
            flags = ak.to_pandas(df.Flag[self.parameters["event_flags"]])
            flags = flags[self.parameters["event_flags"]].product(axis=1)
            muons["pass_flags"] = True
            if self.parameters["muon_flags"]:
                muons["pass_flags"] = muons[
                    self.parameters["muon_flags"]].product(axis=1)

            # Define baseline muon selection (applied to pandas DF!)
            muons["selection"] = (
                (muons.pt_raw > self.parameters["muon_pt_cut"])
                & (abs(muons.eta_raw) < self.parameters["muon_eta_cut"])
                & (muons.pfRelIso04_all < self.parameters["muon_iso_cut"])
                & muons[self.parameters["muon_id"]]
                & muons.pass_flags)

            # Count muons
            nmuons = (muons[muons.selection].reset_index().groupby("entry")
                      ["subentry"].nunique())

            # Find opposite-sign muons
            mm_charge = muons.loc[muons.selection,
                                  "charge"].groupby("entry").prod()

            # Veto events with good quality electrons
            electrons = df.Electron[
                (df.Electron.pt > self.parameters["electron_pt_cut"])
                & (abs(df.Electron.eta) < self.parameters["electron_eta_cut"])
                & (df.Electron[self.parameters["electron_id"]] == 1)]
            electron_veto = ak.to_numpy(ak.count(electrons.pt, axis=1) == 0)

            # Find events with at least one good primary vertex
            good_pv = ak.to_pandas(df.PV).npvsGood > 0

            # Define baseline event selection
            output["two_muons"] = nmuons == 2
            output["event_selection"] = (mask
                                         & (hlt > 0)
                                         & (flags > 0)
                                         & (nmuons == 2)
                                         & (mm_charge == -1)
                                         & electron_veto
                                         & good_pv)

            # --------------------------------------------------------#
            # Select two leading-pT muons
            # --------------------------------------------------------#

            # Find pT-leading and subleading muons
            # This is slow for large chunk size.
            # Consider reimplementing using sort_values().groupby().nth()
            # or sort_values().drop_duplicates()
            # or using Numba
            # https://stackoverflow.com/questions/50381064/select-the-max-row-per-group-pandas-performance-issue
            muons = muons[muons.selection & (nmuons == 2)]
            mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
            mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
            mu1.index = mu1.index.droplevel("subentry")
            mu2.index = mu2.index.droplevel("subentry")

            # --------------------------------------------------------#
            # Select events with muons passing leading pT cut
            # and trigger matching (not applied in the final version)
            # --------------------------------------------------------#

            # Events where there is at least one muon passing
            # leading muon pT cut
            pass_leading_pt = mu1.pt_raw > self.parameters["muon_leading_pt"]

            # update event selection with leading muon pT cut
            output["pass_leading_pt"] = pass_leading_pt
            output["event_selection"] = (output.event_selection
                                         & output.pass_leading_pt)

            # --------------------------------------------------------#
            # Fill dimuon and muon variables
            # --------------------------------------------------------#

            fill_muons(self, output, mu1, mu2, is_mc)

            if self.timer:
                self.timer.add_checkpoint("Event & muon selection")

        # ------------------------------------------------------------#
        # Prepare jets
        # ------------------------------------------------------------#

        prepare_jets(df, is_mc)

        # ------------------------------------------------------------#
        # Apply JEC, get JEC and JER variations
        # ------------------------------------------------------------#

        jets = df.Jet

        self.do_jec = False

        # We only need to reapply JEC for 2018 data
        # (unless new versions of JEC are released)
        if ("data" in dataset) and ("2018" in self.year):
            self.do_jec = True

        jets = apply_jec(
            df,
            jets,
            dataset,
            is_mc,
            self.year,
            self.do_jec,
            self.do_jecunc,
            self.do_jerunc,
            self.jec_factories,
            self.jec_factories_data,
        )

        # ------------------------------------------------------------#
        # Calculate other event weights
        # ------------------------------------------------------------#

        if is_mc:
            do_nnlops = self.do_nnlops and ("ggh" in dataset)
            if do_nnlops:
                nnlopsw = nnlops_weights(df, numevents, self.parameters,
                                         dataset)
                weights.add_weight("nnlops", nnlopsw)
            else:
                weights.add_weight("nnlops", how="dummy")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            # do_zpt = ('dy' in dataset)
            #
            # if do_zpt:
            #     zpt_weight = np.ones(numevents, dtype=float)
            #     zpt_weight[two_muons] =\
            #         self.evaluator[self.zpt_path](
            #             output['dimuon_pt'][two_muons]
            #         ).flatten()
            #     weights.add_weight('zpt_wgt', zpt_weight)
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_musf = True
            if do_musf:
                muID, muIso, muTrig = musf_evaluator(self.musf_lookup,
                                                     self.year, numevents, mu1,
                                                     mu2)
                weights.add_weight("muID", muID, how="all")
                weights.add_weight("muIso", muIso, how="all")
                weights.add_weight("muTrig", muTrig, how="all")
            else:
                weights.add_weight("muID", how="dummy_all")
                weights.add_weight("muIso", how="dummy_all")
                weights.add_weight("muTrig", how="dummy_all")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_lhe = (("LHEScaleWeight" in df.fields)
                      and ("LHEPdfWeight" in df.fields)
                      and ("nominal" in self.pt_variations))
            if do_lhe:
                lhe_ren, lhe_fac = lhe_weights(df, output, dataset, self.year)
                weights.add_weight("LHERen", lhe_ren, how="only_vars")
                weights.add_weight("LHEFac", lhe_fac, how="only_vars")
            else:
                weights.add_weight("LHERen", how="dummy_vars")
                weights.add_weight("LHEFac", how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_thu = (("vbf" in dataset) and ("dy" not in dataset)
                      and ("nominal" in self.pt_variations)
                      and ("stage1_1_fine_cat_pTjet30GeV" in df.HTXS.fields))
            if do_thu:
                for i, name in enumerate(self.sths_names):
                    wgt_up = stxs_uncert(
                        i,
                        ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                        1.0,
                        self.stxs_acc_lookups,
                        self.powheg_xsec_lookup,
                    )
                    wgt_down = stxs_uncert(
                        i,
                        ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                        -1.0,
                        self.stxs_acc_lookups,
                        self.powheg_xsec_lookup,
                    )
                    thu_wgts = {"up": wgt_up, "down": wgt_down}
                    weights.add_weight("THU_VBF_" + name,
                                       thu_wgts,
                                       how="only_vars")
            else:
                for i, name in enumerate(self.sths_names):
                    weights.add_weight("THU_VBF_" + name, how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
            do_pdf = (self.do_pdf and ("nominal" in self.pt_variations)
                      and ("dy" in dataset or "ewk" in dataset
                           or "ggh" in dataset or "vbf" in dataset)
                      and ("mg" not in dataset))
            if "2016" in self.year:
                max_replicas = 0
                if "dy" in dataset:
                    max_replicas = 100
                elif "ewk" in dataset:
                    max_replicas = 33
                else:
                    max_replicas = 100
                if do_pdf:
                    pdf_wgts = df.LHEPdfWeight[
                        :, 0:self.parameters["n_pdf_variations"]]
                for i in range(100):
                    if (i < max_replicas) and do_pdf:
                        output[f"pdf_mcreplica{i}"] = pdf_wgts[:, i]
                    else:
                        output[f"pdf_mcreplica{i}"] = np.nan
            else:
                if do_pdf:
                    pdf_wgts = df.LHEPdfWeight[
                        :, 0:self.parameters["n_pdf_variations"]][0]
                    pdf_wgts = np.array(pdf_wgts)
                    pdf_vars = {
                        "up": (1 + 2 * pdf_wgts.std()),
                        "down": (1 - 2 * pdf_wgts.std()),
                    }
                    weights.add_weight("pdf_2rms", pdf_vars, how="only_vars")
                else:
                    weights.add_weight("pdf_2rms", how="dummy_vars")
            # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #

        if is_mc:
            output = fill_gen_jets(df, output)

        # ------------------------------------------------------------#
        # Loop over JEC variations and fill jet variables
        # ------------------------------------------------------------#

        output.columns = pd.MultiIndex.from_product(
            [output.columns, [""]], names=["Variable", "Variation"])

        if self.timer:
            self.timer.add_checkpoint("Jet preparation & event weights")

        for v_name in self.pt_variations:
            output_updated = self.jet_loop(
                v_name,
                is_mc,
                df,
                dataset,
                mask,
                muons,
                mu1,
                mu2,
                jets,
                weights,
                numevents,
                output,
            )
            if output_updated is not None:
                output = output_updated

        if self.timer:
            self.timer.add_checkpoint("Jet loop")

        # ------------------------------------------------------------#
        # Fill outputs
        # ------------------------------------------------------------#
        mass = output.dimuon_mass
        output["region"] = None
        output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
        output.loc[((mass > 110) & (mass < 115.03))
                   | ((mass > 135.03) & (mass < 150)), "region"] = "h-sidebands"
        output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"
        output["dataset"] = dataset
        output["year"] = int(self.year)

        for wgt in weights.df.columns:
            skip_saving = (("nominal" not in wgt) and ("up" not in wgt)
                           and ("down" not in wgt))
            if skip_saving:
                continue
            output[f"wgt_{wgt}"] = weights.get_weight(wgt)

        columns_to_save = [
            c for c in output.columns
            if (c[0] in self.vars_to_save) or ("wgt_" in c[0]) or (
                "mcreplica" in c[0]) or (c[0] in ["region", "dataset", "year"])
            or ("gjet" in c[0]) or ("gjj" in c[0])
        ]

        output = output.loc[output.event_selection, columns_to_save]
        output = output.reindex(sorted(output.columns), axis=1)

        output.columns = [
            " ".join(col).strip() for col in output.columns.values
        ]

        output = output[output.region.isin(self.regions)]
        """
        input_evts = numevents
        output_evts = output.shape[0]
        out_yield = output.wgt_nominal.sum()
        out_vbf = output[
                (output["jj_mass nominal"]>400) & (output["jj_dEta nominal"]>2.5) & (output["jet1_pt nominal"]>35)
            ].wgt_nominal.sum()
        out_ggh = out_yield - out_vbf

        print(f"\n{dataset}:    {input_evts}  ->  {output_evts};    yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
        """

        to_return = None
        if self.apply_to_output is None:
            to_return = output
        else:
            self.apply_to_output(output)
            to_return = self.accumulator.identity()

        if self.timer:
            self.timer.add_checkpoint("Saving outputs")
            self.timer.summary()

        return to_return
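The electron veto used in the selection above reduces to a single `ak.count(..., axis=1) == 0` per event. A minimal standalone sketch of that idiom with toy values:
import awkward as ak

# jagged electron pT per event (toy values)
electron_pt = ak.Array([[35.0], [], [8.0, 50.0]])
passing = electron_pt[electron_pt > 20.0]

# an event passes the veto when no electron survives the cuts
electron_veto = ak.to_numpy(ak.count(passing, axis=1) == 0)
print(electron_veto)  # [False  True False]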