def gen_jet_pair_mass(df):
    gjmass = None
    gjets = df.GenJet
    gleptons = df.GenPart[
        (abs(df.GenPart.pdgId) == 13)
        | (abs(df.GenPart.pdgId) == 11)
        | (abs(df.GenPart.pdgId) == 15)
    ]
    gl_pair = ak.cartesian({"jet": gjets, "lepton": gleptons}, axis=1, nested=True)
    _, _, dr_gl = delta_r(
        gl_pair["jet"].eta,
        gl_pair["lepton"].eta,
        gl_pair["jet"].phi,
        gl_pair["lepton"].phi,
    )
    isolated = ak.all((dr_gl > 0.3), axis=-1)
    if ak.count(gjets[isolated], axis=None) > 0:
        # TODO: convert only relevant fields!
        gjet1 = ak.to_pandas(gjets[isolated]).loc[
            pd.IndexSlice[:, 0], ["pt", "eta", "phi", "mass"]
        ]
        gjet2 = ak.to_pandas(gjets[isolated]).loc[
            pd.IndexSlice[:, 1], ["pt", "eta", "phi", "mass"]
        ]
        gjet1.index = gjet1.index.droplevel("subentry")
        gjet2.index = gjet2.index.droplevel("subentry")
        gjsum = p4_sum(gjet1, gjet2)
        gjmass = gjsum.mass
    return gjmass
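
# A minimal sketch (not from the original source) of the cartesian/isolation
# idiom used above, with toy eta/phi records. The `_delta_r` helper is a
# simplified stand-in for the analysis' `delta_r`, which returns a
# (deta, dphi, dr) tuple.
import awkward as ak
import numpy as np

def _delta_r(eta1, eta2, phi1, phi2):
    deta = eta1 - eta2
    dphi = (phi1 - phi2 + np.pi) % (2 * np.pi) - np.pi
    return np.sqrt(deta**2 + dphi**2)

jets = ak.Array([[{"eta": 0.1, "phi": 0.0}, {"eta": 2.0, "phi": 1.0}],
                 [{"eta": 0.5, "phi": 0.5}]])
leps = ak.Array([[{"eta": 0.1, "phi": 0.1}], []])
pairs = ak.cartesian({"jet": jets, "lepton": leps}, axis=1, nested=True)
dr = _delta_r(pairs["jet"].eta, pairs["lepton"].eta,
              pairs["jet"].phi, pairs["lepton"].phi)
# a jet is isolated if it is far from every lepton (empty lists give True)
isolated = ak.all(dr > 0.3, axis=-1)
assert ak.to_list(isolated) == [[False, True], [True]]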
def test_count():
    array = ak.Array(
        [
            [
                [np.datetime64("2022"), np.datetime64("2023"), np.datetime64("2025")],
                [],
                [np.datetime64("2027"), np.datetime64("2011")],
                [np.datetime64("2013")],
            ],
            [],
            [[np.datetime64("2017"), np.datetime64("2019")], [np.datetime64("2023")]],
        ],
        check_valid=True,
    )
    assert ak.count(array) == 9
    assert ak.to_list(ak.count(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=2)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-1, keepdims=True)) == [
        [[3], [0], [2], [1]],
        [],
        [[2], [1]],
    ]
    assert ak.to_list(ak.count(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=1)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-2, keepdims=True)) == [
        [[3, 2, 1]],
        [[]],
        [[2, 1]],
    ]
def get_sum_wgts(file):
    try:
        events = NanoEventsFactory.from_root(
            file, "Delphes", schemaclass=DelphesSchema).events()
        # result = (file, ak.sum(events.Event.Weight))
        result = (file, ak.count(events.Event.Number))
    except Exception:
        result = (file, 0)
    return result
def test_nested_collection(collection, subcollection, arr_type, element, events):
    assert ak.type(events[collection][subcollection])
    assert ak.type(events[collection][subcollection + "Counts"])
    assert (
        ak.type(events[collection][subcollection])
        .type.type.type.__str__()
        .startswith(arr_type)
    )
    if element is None:
        assert ak.all(
            events[collection][subcollection + "Counts"]
            == ak.count(events[collection][subcollection], axis=-1)
        )
    else:
        assert ak.all(
            events[collection][subcollection + "Counts"]
            == ak.count(events[collection][subcollection][element], axis=-1)
        )
def __iadd__(self, other):
    for branch, branch_data in other.data.items():
        if branch in self.data.keys():
            if ak.count(self.data[branch], axis=None) == 0:
                self.data[branch] = branch_data
            else:
                self.data[branch] = ak.concatenate(
                    [self.data[branch], branch_data])
        else:
            self.data[branch] = branch_data
    return self
def __iadd__(self, other):
    attrs = [
        a for a in dir(other)
        if not a.startswith('__') and not callable(getattr(other, a))
    ]
    for a in attrs:
        if hasattr(self, a):
            attr = getattr(self, a)
            if ak.count(attr, axis=None) == 0:
                setattr(self, a, getattr(other, a))
            else:
                setattr(self, a, ak.concatenate([attr, getattr(other, a)]))
        else:
            setattr(self, a, getattr(other, a))
    return self
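
# A minimal, self-contained sketch of the merge pattern shared by the two
# __iadd__ methods above: an empty branch (ak.count(..., axis=None) == 0) is
# simply overwritten, a non-empty one is extended with ak.concatenate. The
# `Holder` class is a hypothetical stand-in for the accumulator objects those
# methods belong to.
import awkward as ak

class Holder:
    def __init__(self, data):
        self.data = data

    def __iadd__(self, other):
        for branch, branch_data in other.data.items():
            if branch in self.data and ak.count(self.data[branch], axis=None) > 0:
                self.data[branch] = ak.concatenate([self.data[branch], branch_data])
            else:
                self.data[branch] = branch_data
        return self

a = Holder({"pt": ak.Array([[1.0, 2.0], []])})
b = Holder({"pt": ak.Array([[3.0]])})
a += b
assert ak.to_list(a.data["pt"]) == [[1.0, 2.0], [], [3.0]]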
def PassTrigger(triggerPass):
    indicesOfHighEffTrig = [11, 12, 13, 14, 67, 107, 108, 131, 8, 90, 98, 116]
    tPassedHEList = []
    tPassedList = []
    for evt in triggerPass:
        tPassed = []
        tPassedHE = []
        for tp in range(len(evt)):
            if evt[tp] == 1:
                if tp in indicesOfHighEffTrig:
                    tPassedHE.append(tp)
        tPassedList.append(tPassed)
        tPassedHEList.append(tPassedHE)
    tPassedList = ak.Array(tPassedList)
    tPassedHEList = ak.Array(tPassedHEList)
    return ak.count(tPassedHEList, axis=-1) > 0
def mask_inf(var_array, var_name=None, var_inf_counter=None):
    """
    Mask inf values in `var_array` with None. If `var_inf_counter` is passed,
    append to it in place, for the given `var_name`, the fraction of its inf values.

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_inf_counter (optional, default=None): defaultdict(list), stores
          fraction of inf values for variables

    Returns `var_array` with inf values masked to None.
    """
    if np.sum(np.isinf(var_array)) > 0:
        is_inf_mask = np.isinf(var_array)
        var_array = ak.mask(var_array, is_inf_mask, valid_when=False)
        if var_inf_counter is not None:
            var_inf_counter[var_name].append(
                np.sum(is_inf_mask) / ak.count(var_array))
    return var_array
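
# A minimal usage sketch for mask_inf (not from the original source), using a
# toy array rather than real tau features. Note that ak.count runs after
# masking, so the stored fraction is inf-count over the remaining valid values.
from collections import defaultdict
import numpy as np
import awkward as ak

counter = defaultdict(list)
arr = ak.Array([[1.0, np.inf], [2.0]])
masked = mask_inf(arr, var_name="toy_var", var_inf_counter=counter)
assert ak.to_list(masked) == [[1.0, None], [2.0]]
assert counter["toy_var"] == [0.5]  # one inf over two remaining valid values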
def test_reducers():
    # axis=None reducers are implemented in NumPy.
    assert ak.sum(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]])) == 6 + 6j
    assert ak.prod(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]])) == -12 + 12j

    # axis != None reducers are implemented in libawkward; this should be ReducerSum.
    assert ak.sum(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1).tolist() == [
        3 + 3j,
        0 + 0j,
        3 + 3j,
    ]

    # And this is in ReducerProd.
    assert ak.prod(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1).tolist() == [
        0 + 4j,
        1 + 0j,
        3 + 3j,
    ]

    # ReducerCount, ReducerCountNonzero, ReducerAny, and ReducerAll work.
    assert ak.count(
        ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1
    ).tolist() == [2, 0, 1]
    assert ak.count_nonzero(
        ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1
    ).tolist() == [2, 0, 1]
    assert ak.any(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1).tolist() == [
        True,
        False,
        True,
    ]
    assert ak.all(ak.from_iter([[1 + 1j, 2 + 2j], [], [3 + 3j]]), axis=1).tolist() == [
        True,
        True,
        True,
    ]
    assert ak.any(
        ak.from_iter([[1 + 1j, 2 + 2j, 0 + 0j], [], [3 + 3j]]), axis=1
    ).tolist() == [True, False, True]
    assert ak.all(
        ak.from_iter([[1 + 1j, 2 + 2j, 0 + 0j], [], [3 + 3j]]), axis=1
    ).tolist() == [False, True, True]
def lhe_weights(df, output, dataset, year):
    factor2 = ("dy_m105_160_amc" in dataset) and (("2017" in year) or ("2018" in year))
    if factor2:
        lhefactor = 2.0
    else:
        lhefactor = 1.0
    nLHEScaleWeight = ak.count(df.LHEScaleWeight, axis=1)
    lhe_df = pd.DataFrame(
        data=ak.to_numpy(nLHEScaleWeight),
        index=output.index,
        columns=["nLHEScaleWeight"],
    )
    for i in [1, 3, 4, 5, 6, 7, 15, 24, 34]:
        cut = lhe_df.nLHEScaleWeight > i
        cut_ak = nLHEScaleWeight > i
        lhe_df[f"LHE{i}"] = 1.0
        lhe_df.loc[cut, f"LHE{i}"] = ak.to_numpy(df.LHEScaleWeight[cut_ak][:, i])

    cut8 = lhe_df.nLHEScaleWeight > 8
    cut30 = lhe_df.nLHEScaleWeight > 30

    lhe_ren_up = lhe_df.LHE6 * lhefactor
    lhe_ren_up[cut8] = lhe_df.LHE7 * lhefactor
    lhe_ren_up[cut30] = lhe_df.LHE34 * lhefactor
    lhe_ren_down = lhe_df.LHE1 * lhefactor
    lhe_ren_down[cut8] = lhe_df.LHE1 * lhefactor
    lhe_ren_down[cut30] = lhe_df.LHE5 * lhefactor

    lhe_fac_up = lhe_df.LHE4 * lhefactor
    lhe_fac_up[cut8] = lhe_df.LHE5 * lhefactor
    lhe_fac_up[cut30] = lhe_df.LHE24 * lhefactor
    lhe_fac_down = lhe_df.LHE3 * lhefactor
    lhe_fac_down[cut8] = lhe_df.LHE3 * lhefactor
    lhe_fac_down[cut30] = lhe_df.LHE15 * lhefactor

    lhe_ren = {"up": lhe_ren_up, "down": lhe_ren_down}
    lhe_fac = {"up": lhe_fac_up, "down": lhe_fac_down}
    return lhe_ren, lhe_fac
def get_nsv(sj, sv, R=0.4):
    sv_dr = sj.delta_r(sv)
    nsv = ak.count(sv_dr[sv_dr < R], axis=1)
    return nsv
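
# A minimal sketch (not from the original source) of the count-within-radius
# idiom in get_nsv, using plain toy delta-R values instead of vector objects
# (sj.delta_r(sv) comes from coffea/vector behaviors in the original).
import awkward as ak

sv_dr = ak.Array([[0.1, 0.5, 0.3], [0.8], []])
nsv = ak.count(sv_dr[sv_dr < 0.4], axis=1)  # entries surviving the cut, per event
assert ak.to_list(nsv) == [2, 0, 0]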
def future_savez(dataset, currentfile):
    print('before selection ', len(events_slice))

    # select muons
    myMuon = events_slice.Muon[:]
    myMuon['istight'] = ((events_slice.Muon.tightId == 1)
                         & (events_slice.Muon.pfRelIso03_all < 0.15)
                         & (events_slice.Muon.pt > 20.))
    events_slice['Muon'] = myMuon[myMuon.istight]

    # select electrons
    myElectron = events_slice.Electron[:]
    myElectron['istight'] = ((events_slice.Electron.mvaFall17V1Iso_WP80 == 1)
                             & (events_slice.Electron.pt > 20.0))
    events_slice['Electron'] = myElectron[myElectron.istight]

    # select events with n tight leptons
    n_tight_leptons = ak.count(
        events_slice.Muon.pt[events_slice.Muon.istight], axis=-1) + ak.count(
            events_slice.Electron.pt[events_slice.Electron.istight], axis=-1)
    # the number of leptons can be larger than the required number
    events_selected = events_slice[n_tight_leptons >= options.n_leptons]
    print('after selection ', len(events_selected))

    muons = events_selected.Muon[events_selected.Muon.istight]
    electrons = events_selected.Electron[events_selected.Electron.istight]

    # mix leptons and sort according to pt
    leptons = ak.concatenate([muons, electrons], axis=1)
    leptons = leptons[ak.argsort(leptons.pt, axis=1, ascending=False)]
    # only want the first n_leptons_subtract leptons
    leptons = leptons[:, 0:int(options.n_leptons_subtract)]
    # print('number of leptons ', ak.count(leptons.pt, axis=-1))

    leptons_px = leptons.pt * np.cos(leptons.phi)
    leptons_py = leptons.pt * np.sin(leptons.phi)
    leptons_px = ak.sum(leptons_px, axis=1)
    leptons_py = ak.sum(leptons_py, axis=1)

    met_list = np.column_stack([
        events_selected.GenMET.pt * np.cos(events_selected.GenMET.phi) + leptons_px,
        events_selected.GenMET.pt * np.sin(events_selected.GenMET.phi) + leptons_py,
        events_selected.MET.pt * np.cos(events_selected.MET.phi) + leptons_px,
        events_selected.MET.pt * np.sin(events_selected.MET.phi) + leptons_py,
        events_selected.PuppiMET.pt * np.cos(events_selected.PuppiMET.phi) + leptons_px,
        events_selected.PuppiMET.pt * np.sin(events_selected.PuppiMET.phi) + leptons_py,
        events_selected.DeepMETResponseTune.pt * np.cos(events_selected.DeepMETResponseTune.phi) + leptons_px,
        events_selected.DeepMETResponseTune.pt * np.sin(events_selected.DeepMETResponseTune.phi) + leptons_py,
        events_selected.DeepMETResolutionTune.pt * np.cos(events_selected.DeepMETResolutionTune.phi) + leptons_px,
        events_selected.DeepMETResolutionTune.pt * np.sin(events_selected.DeepMETResolutionTune.phi) + leptons_py,
        events_selected.LHE.HT
    ])

    # remove the closest PF particle to each selected lepton
    overlap_removal = run_deltar_matching(events_selected.PFCands,
                                          leptons,
                                          drname='deltaR',
                                          radius=0.001,
                                          unique=True,
                                          sort=False)
    mask = ak.count(overlap_removal.deltaR, axis=-1) == 0
    # print(len(events_selected.PFCands.pt[0]))
    events_selected['PFCands'] = events_selected.PFCands[mask]
    # print(len(events_selected.PFCands.pt[0]))

    # save the rest of the PF candidates, padded to a fixed length per event
    nparticles_per_event = max(ak.num(events_selected.PFCands.pt, axis=1))
    print("max NPF in this range: ", nparticles_per_event)
    particle_list = ak.concatenate([
        [ak.fill_none(ak.pad_none(events_selected.PFCands.pt,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.eta,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.phi,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.d0,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.dz,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.mass,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.puppiWeight,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.pdgId,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.charge,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.fromPV,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.pvRef,
                                  nparticles_per_event, clip=True), -999)],
        [ak.fill_none(ak.pad_none(events_selected.PFCands.pvAssocQuality,
                                  nparticles_per_event, clip=True), -999)],
    ])

    npz_file = (os.environ['PWD'] + '/raw/' + dataset + '_file' + str(currentfile)
                + '_slice_' + str(i) + '_nevent_' + str(len(events_selected)))
    np.savez(npz_file, x=particle_list, y=met_list)
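
# A minimal sketch (not from the original source) of the pad_none/fill_none
# idiom used above to turn ragged per-event lists into a fixed-width array,
# with toy values standing in for PFCands fields.
import awkward as ak

pt = ak.Array([[10.0, 5.0], [7.0], []])
width = max(ak.num(pt, axis=1))  # longest event in this chunk
padded = ak.fill_none(ak.pad_none(pt, width, clip=True), -999.0)
assert ak.to_list(padded) == [[10.0, 5.0], [7.0, -999.0], [-999.0, -999.0]]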
def process(self, events):
    output = self.accumulator.identity()
    dataset = events.metadata['dataset']
    output['sumw'][dataset] += ak.sum(events.genWeight)

    ##############
    # Trigger level
    triggers = [
        "HLT_Mu12_TrkIsoVVL_Ele23_CaloIdL_TrackIdL_IsoVL_DZ",
        "HLT_Mu23_TrkIsoVVL_Ele12_CaloIdL_TrackIdL_IsoVL_DZ",
    ]
    trig_arrs = [events.HLT[_trig.strip("HLT_")] for _trig in triggers]
    req_trig = np.zeros(len(events), dtype='bool')
    for t in trig_arrs:
        req_trig = req_trig | t

    ############
    # Event level

    ## Muon cuts
    # muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2
    events.Muon = events.Muon[(events.Muon.pt > 30) &
                              (abs(events.Muon.eta) < 2.4)]  # & (events.Muon.tightId > .5)
    events.Muon = ak.pad_none(events.Muon, 1, axis=1)
    req_muon = (ak.count(events.Muon.pt, axis=1) == 1)

    ## Electron cuts
    # electron twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/CutBasedElectronIdentificationRun2
    events.Electron = events.Electron[(events.Electron.pt > 30) &
                                      (abs(events.Electron.eta) < 2.4)]
    events.Electron = ak.pad_none(events.Electron, 1, axis=1)
    req_ele = (ak.count(events.Electron.pt, axis=1) == 1)

    ## Jet cuts
    events.Jet = events.Jet[(events.Jet.pt > 25) & (abs(events.Jet.eta) <= 2.5)]
    req_jets = (ak.count(events.Jet.pt, axis=1) >= 2)

    req_opposite_charge = events.Electron[:, 0].charge * events.Muon[:, 0].charge == -1
    event_level = req_trig & req_muon & req_ele & req_opposite_charge & req_jets

    # Selected
    selev = events[event_level]

    #########
    # Per electron
    el_eta = (abs(selev.Electron.eta) <= 2.4)
    el_pt = selev.Electron.pt > 30
    el_level = el_eta & el_pt

    # Per muon
    mu_eta = (abs(selev.Muon.eta) <= 2.4)
    mu_pt = selev.Muon.pt > 30
    mu_level = mu_eta & mu_pt

    # Per jet
    jet_eta = (abs(selev.Jet.eta) <= 2.4)
    jet_pt = selev.Jet.pt > 25
    jet_pu = (((selev.Jet.puId > 6) & (selev.Jet.pt < 50)) | (selev.Jet.pt > 50))
    jet_id = selev.Jet.jetId >= 2
    # jet_id = selev.Jet.isTight() == 1 & selev.Jet.isTightLeptonVeto() == 0
    jet_level = jet_pu & jet_eta & jet_pt & jet_id

    # b-tag twiki: https://twiki.cern.ch/twiki/bin/viewauth/CMS/BtagRecommendation102X
    bjet_disc_t = selev.Jet.btagDeepB > 0.7264  # L=0.0494, M=0.2770, T=0.7264
    bjet_disc_m = selev.Jet.btagDeepB > 0.2770
    bjet_disc_l = selev.Jet.btagDeepB > 0.0494
    bjet_level_t = jet_level & bjet_disc_t
    bjet_level_m = jet_level & bjet_disc_m
    bjet_level_l = jet_level & bjet_disc_l

    sel = selev.Electron[el_level]
    smu = selev.Muon[mu_level]
    sjets = selev.Jet[jet_level]
    sbjets_t = selev.Jet[bjet_level_t]
    sbjets_m = selev.Jet[bjet_level_m]
    sbjets_l = selev.Jet[bjet_level_l]

    # output['pt'].fill(dataset=dataset, pt=selev.Jet.pt.flatten())
    # Fill histograms dynamically
    for histname, h in output.items():
        if (histname not in self.jet_hists) and (histname not in self.deepcsv_hists):
            continue
        # Get valid fields per histogram to fill
        fields = {k: ak.flatten(sjets[k], axis=None)
                  for k in h.fields if k in dir(sjets)}
        h.fill(dataset=dataset, **fields)

    def flatten(ar):  # flatten awkward into a 1d array to hist
        return ak.flatten(ar, axis=None)

    def num(ar):
        return ak.num(ak.fill_none(ar[~ak.is_none(ar)], 0), axis=0)

    output['njet'].fill(dataset=dataset, njet=flatten(ak.num(sjets)))
    output['nbjet_t'].fill(dataset=dataset, nbjet_t=flatten(ak.num(sbjets_t)))
    output['nbjet_m'].fill(dataset=dataset, nbjet_m=flatten(ak.num(sbjets_m)))
    output['nbjet_l'].fill(dataset=dataset, nbjet_l=flatten(ak.num(sbjets_l)))
    output['nel'].fill(dataset=dataset, nel=flatten(ak.num(sel)))
    output['nmu'].fill(dataset=dataset, nmu=flatten(ak.num(smu)))
    output['lelpt'].fill(dataset=dataset, lelpt=flatten(selev.Electron[:, 0].pt))
    output['lmupt'].fill(dataset=dataset, lmupt=flatten(selev.Muon[:, 0].pt))
    output['ljpt'].fill(dataset=dataset, ljpt=flatten(selev.Jet[:, 0].pt))
    output['sljpt'].fill(dataset=dataset, sljpt=flatten(selev.Jet[:, 1].pt))
    return output
"mfv_splitSUSY_tau000000300um_M2000_1900_2017", "mfv_splitSUSY_tau000001000um_M2000_1800_2017", "mfv_splitSUSY_tau000001000um_M2000_1900_2017", "mfv_splitSUSY_tau000010000um_M2000_1800_2017", "mfv_splitSUSY_tau000010000um_M2000_1900_2017", ] ntk_bkg = [] ntk_sig = [] for fn in fns_bkg: f = uproot.open(fndir+fn+'.root') f = f["mfvJetTreer/tree_DV"] if len(f['evt'].array())==0: print( "no events!!!") continue ntk = np.array(ak.count(f['tk_pt'].array(), axis=1)) ntk_bkg.append(ntk) for fn in fns_signal: f = uproot.open(fndir+fn+'.root') f = f["mfvJetTreer/tree_DV"] if len(f['evt'].array())==0: print( "no events!!!") continue ntk = np.array(ak.count(f['tk_pt'].array(), axis=1)) ntk_sig.append(ntk) ntk_bkg = np.concatenate(ntk_bkg, axis = None) ntk_sig = np.concatenate(ntk_sig, axis = None) plt.hist(ntk_bkg,label='background',bins=350, range=(0,350),alpha=0.5,density=True)
def test():
    nums = [
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [17, 11, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    ]
    sample = []
    for outer in nums:
        sample.append([])
        for inner in outer:
            sample[-1].append([0] * inner)
    assert ak.is_valid(ak.count(sample, axis=0))
def process(self, events):
    output = self.accumulator.identity()
    dataset = events.metadata['dataset']
    isRealData = 'genWeight' not in events.fields
    if not isRealData:
        output['sumw'][dataset] += sum(events.genWeight)
        JECversion = JECversions[str(self.year)]['MC']
    else:
        output['nbtagmu'][dataset] += ak.count(events.event)
        JECversion = JECversions[str(self.year)]['Data'][dataset.split('BTagMu')[1]]

    ############
    # Some corrections
    weights = processor.Weights(len(events))
    corrections = {}
    if not isRealData:
        weights.add('genWeight', events.genWeight)
        weights.add('pileup_weight',
                    self.puReweight(self.puFile, self.nTrueFile, dataset)(events.Pileup.nPU))
    events.FatJet = self.applyJEC(events.FatJet, events.fixedGridRhoFastjetAll,
                                  events.caches[0], 'AK8PFPuppi', isRealData,
                                  JECversion)
    cuts = processor.PackedSelection()

    ############
    # Trigger selection
    if self.year == 2016:
        if 'BTagMu_AK4Jet300_Mu5' not in events.HLT.fields:
            self.triggers = [trigger.replace('AK4', '') for trigger in self.triggers]
    elif self.year == 2018:
        for (i, trigger) in enumerate(self.triggers):
            if trigger.strip("HLT_") not in events.HLT.fields:
                self.triggers[i] = trigger + "_noalgo"
    trig_arrs = [events.HLT[_trig.strip("HLT_")] for _trig in self.triggers]
    req_trig = np.zeros(len(events), dtype='bool')
    for t in trig_arrs:
        req_trig = req_trig | t
    cuts.add('trigger', ak.to_numpy(req_trig))

    ############
    # Basic cuts

    ## Muon cuts
    # muon twiki: https://twiki.cern.ch/twiki/bin/view/CMS/SWGuideMuonIdRun2
    events.Muon = events.Muon[(events.Muon.pt > 5) &
                              (abs(events.Muon.eta) < 2.4) &
                              (events.Muon.tightId != 1) &
                              (events.Muon.pfRelIso04_all > 0.15)]
    events.Muon = ak.pad_none(events.Muon, 2, axis=1)

    ## Jet cuts (not used)
    events.Jet = events.Jet[(events.Jet.pt > 25) & (abs(events.Jet.eta) <= 2.5)]
    # req_jets = (ak.count(events.Jet.pt, axis=1) >= 2)

    ## FatJet cuts
    events.FatJet = events.FatJet[
        (events.FatJet.pt > self._mask_fatjets['basic']['pt_cut']) &
        (abs(events.FatJet.eta) <= self._mask_fatjets['basic']['eta_cut']) &
        (events.FatJet.jetId > self._mask_fatjets['basic']['jetId_cut']) &
        (ak.count(events.FatJet.subjets.pt, axis=2) >= 2)]  ## subjet sel to crosscheck
    # print(events['FatJetSVs'])

    ## Event level variables
    eventVariables = {}
    eventVariables['nfatjet'] = ak.num(events.FatJet)

    ## Leading jet variables
    leadfatjet = ak.firsts(events.FatJet)
    leadfatjet['tau21'] = leadfatjet.tau2 / leadfatjet.tau1
    subjet1 = ak.pad_none(leadfatjet.subjets, 2)[:, 0]
    subjet2 = ak.pad_none(leadfatjet.subjets, 2)[:, 1]
    leadfatjet['nsv1'] = get_nsv(subjet1, events.SV)
    leadfatjet['nsv2'] = get_nsv(subjet2, events.SV)
    leadfatjet['nmusj1'] = ak.num(subjet1.delta_r(events.Muon) < 0.4)
    leadfatjet['nmusj2'] = ak.num(subjet2.delta_r(events.Muon) < 0.4)

    fatjet_mutag = (leadfatjet.nmusj1 >= 1) & (leadfatjet.nmusj2 >= 1)
    cuts.add('fatjet_mutag', ak.to_numpy(fatjet_mutag))

    for DDX in self._mask_DDX.keys():
        for wp, cut in self._mask_DDX[DDX].items():
            DDX_pass = (leadfatjet[f'btag{DDX}vLV2'] > cut)
            DDX_fail = (leadfatjet[f'btag{DDX}vLV2'] < cut)
            cuts.add(f'{DDX}_pass{wp}wp', ak.to_numpy(DDX_pass))
            cuts.add(f'{DDX}_fail{wp}wp', ak.to_numpy(DDX_fail))

    flavors = {}
    if not isRealData:
        flavors['b'] = (leadfatjet.hadronFlavour == 5)
        flavors['c'] = (leadfatjet.hadronFlavour == 4)
        flavors['l'] = (leadfatjet.hadronFlavour < 4)
        flavors['bb'] = abs(leadfatjet.hadronFlavour == 5) & (leadfatjet.nBHadrons >= 2)  # & (leadfatjet.nCHadrons == 0)
        flavors['cc'] = abs(leadfatjet.hadronFlavour == 4) & (leadfatjet.nBHadrons == 0) & (leadfatjet.nCHadrons >= 2)
        # flavors['ll'] = abs(leadfatjet.hadronFlavour < 4) & (leadfatjet.nBHadrons == 0) & (leadfatjet.nCHadrons == 0)
        flavors['b'] = flavors['b'] & ~flavors['bb']
        flavors['c'] = flavors['c'] & ~flavors['cc']
        flavors['l'] = flavors['l'] & ~flavors['bb'] & ~flavors['cc'] & ~flavors['b'] & ~flavors['c']
        # flavors['others'] = ~flavors['l'] & ~flavors['bb'] & ~flavors['cc'] & ~flavors['b'] & ~flavors['c']
    else:
        flavors['Data'] = np.ones(len(events), dtype='bool')

    for selname, cut in self._mask_fatjets.items():
        sel = (leadfatjet.pt > cut['pt_cut']) & \
              (leadfatjet.msoftdrop > cut['mass_cut']) & \
              (abs(leadfatjet.eta) < cut['eta_cut']) & \
              (leadfatjet.jetId >= cut['jetId_cut']) & \
              (leadfatjet.tau21 < cut['tau21_cut'])
        # (leadfatjet.Hbb > cut['Hbb'])
        cuts.add(selname, ak.to_numpy(sel))

    selection = {}
    selection['basic'] = {'trigger', 'basic'}
    selection['pt350msd50'] = {'trigger', 'fatjet_mutag', 'pt350msd50'}
    selection['msd100tau06'] = {'trigger', 'fatjet_mutag', 'msd100tau06'}
    selection['pt400msd100tau06'] = {'trigger', 'fatjet_mutag', 'pt400msd100tau06'}

    for mask_f in self._final_mask:
        for DDX in self._mask_DDX.keys():
            for wp, cut in self._mask_DDX[DDX].items():
                selection[f'{mask_f}{DDX}pass{wp}wp'] = selection[mask_f].copy()
                selection[f'{mask_f}{DDX}pass{wp}wp'].add(f'{DDX}_pass{wp}wp')
                selection[f'{mask_f}{DDX}fail{wp}wp'] = selection[mask_f].copy()
                selection[f'{mask_f}{DDX}fail{wp}wp'].add(f'{DDX}_fail{wp}wp')

    for histname, h in output.items():
        sel = [r for r in selection.keys() if r in histname.split('_')]
        if ((histname in self.fatjet_hists) | ('hist2d_fatjet' in histname)):
            for flav, mask in flavors.items():
                weight = weights.weight() * cuts.all(*selection[sel[0]]) * ak.to_numpy(mask)
                fields = {k: ak.fill_none(leadfatjet[k], -9999)
                          for k in h.fields if k in dir(leadfatjet)}
                h.fill(dataset=dataset, flavor=flav, **fields, weight=weight)
        if histname in self.event_hists:
            for flav, mask in flavors.items():
                weight = weights.weight() * cuts.all(*selection[sel[0]]) * ak.to_numpy(mask)
                fields = {k: ak.fill_none(eventVariables[k], -9999)
                          for k in h.fields if k in eventVariables.keys()}
                h.fill(dataset=dataset, flavor=flav, **fields, weight=weight)

    return output
def process(self, events):
    output = self._accumulator.identity()
    jets = events.Jet
    jetSel = (jets.pt > 30) & (abs(jets.eta) < 2.4)
    tightJet = jets[jetSel]
    bJet = tightJet[tightJet.btagDeepFlavB > 0.642]
    muons = events.Muon
    muonSel = (muons.pt > 30) & (abs(muons.eta) < 2.4)
    tightMuon = muons[muonSel]
    ele = events.Electron
    eleSel = (ele.pt > 35) & (abs(ele.eta) < 2.4)
    tightEle = ele[eleSel]
    eventSel = (((ak.num(tightMuon) == 1) | (ak.num(tightEle) == 1)) &
                (ak.num(tightJet) >= 3) & (ak.num(bJet) >= 1))
    final = events[eventSel]

    ##### GENPART MATCHING ######
    genPart = final.GenPart
    tops = genPart[abs(genPart.pdgId) == 6]
    # the isLastCopy flag filters out copied GenParticles
    tops = tops[tops.hasFlags('isLastCopy')]
    tDecay = tops.distinctChildren
    tDecay = tDecay[tDecay.hasFlags('isLastCopy')]
    t_Events = tDecay[abs(tDecay.pdgId) == 5]
    W = tDecay[abs(tDecay.pdgId) == 24]
    W = W[W.hasFlags('isLastCopy')]
    WDecay = W.distinctChildren
    WDecay = WDecay[WDecay.hasFlags('isLastCopy')]
    # t_Events is the lone bottom, W_Events is the -> two jets
    # select the hadronically decaying W
    W_Events = ak.flatten(WDecay[ak.all(abs(WDecay.pdgId) <= 8, axis=-1)], axis=3)
    # print(qqb)
    # hadW is a mask for quark-decaying W bosons
    hadW = ak.num(W_Events, axis=2) == 2
    # filter out t_Events that have a hadronically decaying W boson
    hadB = t_Events[hadW]
    hadB = ak.flatten(hadB, axis=2)
    W_quarks = W_Events[hadW]
    W_quarks = ak.flatten(W_quarks, axis=2)
    # concatenating these two arrays makes an array of events with the
    # correctly decaying GenParticles
    qqb = ak.concatenate([hadB, W_quarks], axis=1)

    ##### GEN JET MATCHING ######
    final = final[(ak.count(qqb.pdgId, axis=1) == 3)]
    finaljets = final.Jet
    qqb = qqb[(ak.count(qqb.pdgId, axis=1) == 3)]
    # implement tight jet cuts on training data
    finaljetSel = (abs(finaljets.eta) < 2.4) & (finaljets.pt > 30)
    finalJets = finaljets[finaljetSel]
    # match gen parts to gen jets
    matchedGenJets = qqb.nearest(final.GenJet)
    # match gen to reco
    matchedJets = matchedGenJets.nearest(finalJets)

    ### VALIDATION ###
    test = matchedJets.genJetIdx
    combs = ak.combinations(finalJets, 3, replacement=False)
    t1 = ((combs['0'].genJetIdx == test[:, 0]) | (combs['0'].genJetIdx == test[:, 1]) |
          (combs['0'].genJetIdx == test[:, 2]))
    t2 = ((combs['1'].genJetIdx == test[:, 0]) | (combs['1'].genJetIdx == test[:, 1]) |
          (combs['1'].genJetIdx == test[:, 2]))
    t3 = ((combs['2'].genJetIdx == test[:, 0]) | (combs['2'].genJetIdx == test[:, 1]) |
          (combs['2'].genJetIdx == test[:, 2]))
    t = t1 & t2 & t3
    trutharray = ak.flatten(t)
    jetcombos = ak.flatten(combs)
    j1, j2, j3 = ak.unzip(jetcombos)

    output["dR12"] += processor.column_accumulator(ak.to_numpy(j1.delta_r(j2)))
    output["dR13"] += processor.column_accumulator(ak.to_numpy(j1.delta_r(j3)))
    output["dR23"] += processor.column_accumulator(ak.to_numpy(j2.delta_r(j3)))
    output["j1btag"] += processor.column_accumulator(ak.to_numpy(j1.btagCSVV2))
    output["j2btag"] += processor.column_accumulator(ak.to_numpy(j2.btagCSVV2))
    output["j3btag"] += processor.column_accumulator(ak.to_numpy(j3.btagCSVV2))
    output["j1area"] += processor.column_accumulator(ak.to_numpy(j1.area))
    output["j2area"] += processor.column_accumulator(ak.to_numpy(j2.area))
    output["j3area"] += processor.column_accumulator(ak.to_numpy(j3.area))
    output["j12deta"] += processor.column_accumulator(ak.to_numpy(j1.eta - j2.eta))
    output["j23deta"] += processor.column_accumulator(ak.to_numpy(j2.eta - j3.eta))
    output["j13deta"] += processor.column_accumulator(ak.to_numpy(j1.eta - j3.eta))
    output["j12dphi"] += processor.column_accumulator(ak.to_numpy(j1.phi - j2.phi))
    output["j23dphi"] += processor.column_accumulator(ak.to_numpy(j2.phi - j3.phi))
    output["j13dphi"] += processor.column_accumulator(ak.to_numpy(j1.phi - j3.phi))
    output["j1j2mass"] += processor.column_accumulator(ak.to_numpy(j1.mass + j2.mass))
    output["j2j3mass"] += processor.column_accumulator(ak.to_numpy(j2.mass + j3.mass))
    output["j1j3mass"] += processor.column_accumulator(ak.to_numpy(j1.mass + j3.mass))
    output["j1pt"] += processor.column_accumulator(ak.to_numpy(j1.pt))
    output["j1phi"] += processor.column_accumulator(ak.to_numpy(j1.phi))
    output["j1eta"] += processor.column_accumulator(ak.to_numpy(abs(j1.eta)))
    output["j1mass"] += processor.column_accumulator(ak.to_numpy(j1.mass))
    output["j2pt"] += processor.column_accumulator(ak.to_numpy(j2.pt))
    output["j2phi"] += processor.column_accumulator(ak.to_numpy(j2.phi))
    output["j2eta"] += processor.column_accumulator(ak.to_numpy(abs(j2.eta)))
    output["j2mass"] += processor.column_accumulator(ak.to_numpy(j2.mass))
    output["j3pt"] += processor.column_accumulator(ak.to_numpy(j3.pt))
    output["j3phi"] += processor.column_accumulator(ak.to_numpy(j3.phi))
    output["j3eta"] += processor.column_accumulator(ak.to_numpy(abs(j3.eta)))
    output["j3mass"] += processor.column_accumulator(ak.to_numpy(j3.mass))
    output["event"] += processor.column_accumulator(
        ak.to_numpy(ak.flatten(ak.broadcast_arrays(final.event, combs['0'].pt)[0])))
    output["truth"] += processor.column_accumulator(
        ak.to_numpy(trutharray).astype(int))
    return output
def uproot_tree_to_numpy(fname, MeanNormTuple, inbranches_listlist, nMaxslist,
                         nevents, treename="ttree", stop=None, branches=None):
    # array = uproot_root2array(fname, treename, stop=stop, branches=branches)

    # Compute the total number of values per jet
    totallengthperjet = 0
    for i in range(len(nMaxslist)):
        if nMaxslist[i] >= 0:
            totallengthperjet += len(inbranches_listlist[i]) * nMaxslist[i]
        else:
            totallengthperjet += len(inbranches_listlist[i])  # flat branch

    # branches = [ak.fill_none(ak.pad_none(tree[barr, target=feature_length), 0.) for feature_length, arr in zip(nMaxslist, inbranches_listlist)]
    tree = u3.open(fname)[treename]
    branches = [
        ak.fill_none(
            ak.pad_none(tree[branch_name].array(),
                        target=feature_length,
                        axis=-1,
                        clip=True if feature_length > 1 else False), 0.)
        for feature_length, branch_list in zip(nMaxslist, inbranches_listlist)
        for branch_name in branch_list
    ]
    branchnames = [n for names in inbranches_listlist for n in names]
    feature_lengths = [
        f for branches, f in zip(inbranches_listlist, nMaxslist) for _ in branches
    ]
    means = [
        m[0] for branches, m in zip(inbranches_listlist, MeanNormTuple)
        for _ in branches
    ]
    norms = [
        m[1] for branches, m in zip(inbranches_listlist, MeanNormTuple)
        for _ in branches
    ]
    print("Debugging means and norms")
    print(means)
    print(norms)
    print(branchnames)

    branches_numpy = []
    for br, brname, fl, mean, norm in zip(branches, branchnames,
                                          feature_lengths, means, norms):
        print("DBG {}".format(brname))
        print(br)
        print("Length: {}".format(len(br)))
        if brname == "TagVarCSV_trackJetDistVal":
            print("BONUS DEBUG!")
            print("Min: {}, Max: {}".format(ak.min(ak.count(br, axis=-1)),
                                            ak.max(ak.count(br, axis=-1))))
        if fl > 1:
            # branches_numpy.append((ak.to_numpy(br) - mean) / norm)
            branches_numpy.append((ak.to_numpy(br) - 0.) / 1.)
        elif fl == 1:
            # branches_numpy.append((np.expand_dims(ak.to_numpy(br), axis=-1) - mean) / norm)
            branches_numpy.append((np.expand_dims(ak.to_numpy(br), axis=-1) - 0.) / 1.)
    print("FINISHED THIS LOOP, YOU ARE PERFECT! :)")

    numpyarray = np.concatenate(branches_numpy, axis=-1)
    print("\n" * 5)
    print("Some metrics about this numpy array")
    print(np.mean(numpyarray, axis=0))
    print(np.std(numpyarray, axis=0))
    print("Normalize array")
    numpyarray = (numpyarray - np.mean(numpyarray, axis=0)) / np.std(numpyarray, axis=0)
    print("Some metrics about this numpy array")
    print(np.mean(numpyarray, axis=0))
    print(np.std(numpyarray, axis=0))
    return numpyarray
    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_inf_counter (optional, default=None): defaultdict(list), stores
          fraction of inf values for variables
        - raise_exception (optional, default=True): bool, whether to raise an
          exception instead of masking inf values

    Returns `var_array` with inf values masked to None.
    """
    if np.any(is_inf_mask := np.isinf(var_array)):
        if raise_exception:
            raise ValueError(f'Inf value detected in {var_name}')
        var_array = ak.mask(var_array, is_inf_mask, valid_when=False)
        if var_inf_counter is not None:
            var_inf_counter[var_name].append(np.sum(is_inf_mask) / ak.count(var_array))
    return var_array

def mask_nan(var_array, var_name=None, var_nan_counter=None, raise_exception=True):
    """
    Mask nan values in `var_array` with None. If `var_nan_counter` is passed,
    append to it in place, for the given `var_name`, the fraction of its nan values.

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - var_name (optional, default=None): string, variable name
        - var_nan_counter (optional, default=None): defaultdict(list), stores
          fraction of nan values for variables
        - raise_exception (optional, default=True): bool, whether to raise an
          exception instead of masking NaN values

    Returns `var_array` with nan values masked to None.
    """
    if np.any(is_nan_mask := np.isnan(var_array)):
        if raise_exception:
            raise ValueError(f'NaN value detected in {var_name}')
        var_array = ak.mask(var_array, is_nan_mask, valid_when=False)
        if var_nan_counter is not None:
            var_nan_counter[var_name].append(np.sum(is_nan_mask) / ak.count(var_array))
    return var_array
def fill_aggregators(tree, var, var_type, file_i, file_name_i,
                     cone_type, cone_definition_dict, cone_selection_dict,
                     inf_counter, nan_counter,
                     selection_cut, aliases,
                     sums, sums2, counts,
                     fill_scaling_params=False, scaling_params=None,
                     quantile_params=None):
    """
    Update `sums`, `sums2` and `counts` dictionaries with the values of the `var`
    variable (belonging to `var_type`) taken from the input `tree`, either
    inclusively or exclusively for inner/outer cones (`cone_type` argument).
    In the latter case, derive `constituent_dR` with respect to the tau direction
    of flight and define the cones as:

        - inner: `constituent_dR` <= `dR_tau_signal_cone`
        - outer: `constituent_dR` > `dR_tau_signal_cone` and `constituent_dR` < `dR_tau_outer_cone`

    Then mask constituents which appear in the `cone_type` and update
    sums/sums2/counts using only those constituents which enter the given cone.

    If `fill_scaling_params` is set to `True`, also update the `scaling_params`
    dictionary (i.e. make a "snapshot" of scaling parameters based on the current
    state of sums/sums2/counts).

    If a `quantile_params` dictionary is provided, compute quantiles for the given
    `var` per cone type and store them in this dictionary.

    Arguments:
        - tree: uproot TTree, input tree to read arrays from
        - var: string, variable name
        - var_type: string, variable type
        - file_i: int, index of the file being processed as enumerator of the input file list
        - file_name_i: int, index of the file being processed as extracted from the file name
        - cone_type: string, type of cone being processed, should be either inner or outer
        - cone_definition_dict: dict, parameters for inner/outer tau cones' definition, defined in the training *.yaml cfg
        - cone_selection_dict: dict, per feature type configuration for cone splitting, defined in the training *.yaml cfg
        - inf_counter: defaultdict(list), stores fraction of inf values for variables
        - nan_counter: defaultdict(list), stores fraction of nan values for variables
        - selection_cut: str, cut to be applied by uproot at the array reading step
        - aliases: dict, definitions of variables to be constructed by uproot at the array reading step
        - sums: dict, container for accumulating sums of features' values, filled based on the input `var_array`
        - sums2: dict, container for accumulating square sums of features' values, filled based on the input `var_array`
        - counts: dict, container for accumulating counts of features' values, filled based on the input `var_array`
        - fill_scaling_params (optional, default=False): bool, whether to update the `scaling_params` dictionary with the values from the current state of sums/sums2/counts
        - scaling_params (optional, default=None): dict, main dictionary storing scaling parameters per variable type/variable name/cone type. Used only if `fill_scaling_params` is set to `True`
        - quantile_params (optional, default=None): dict, if passed, will store in this dictionary for a given `file_i` the quantile numbers for `var_array` as returned by the `get_quantiles()` function

    Returns: None
    """
    constituent_eta_name = cone_selection_dict[var_type]['var_names']['eta']
    constituent_phi_name = cone_selection_dict[var_type]['var_names']['phi']
    var_array, constituent_eta_array, constituent_phi_array = tree.arrays(
        [var, constituent_eta_name, constituent_phi_name],
        cut=selection_cut, aliases=aliases, how=tuple)
    # var_array = mask_inf(var_array, var, inf_counter, raise_exception=True)
    # var_array = mask_nan(var_array, var, nan_counter, raise_exception=True)

    if cone_type == None:
        sums[var_type][var][file_i] += ak.sum(var_array)
        sums2[var_type][var][file_i] += ak.sum(var_array**2)
        counts[var_type][var][file_i] += ak.count(var_array)
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var], counts[var_type][var], aggregate=True)
            sqmean_ = compute_mean(sums2[var_type][var], counts[var_type][var], aggregate=True)
            std_ = compute_std(sums[var_type][var], sums2[var_type][var],
                               counts[var_type][var], aggregate=True)
            scaling_params[var_type][var]['global']['num'] = int(counts[var_type][var].sum())
            if mean_ == None:
                print(f"Low statistics in {var} for mean computation")
                scaling_params[var_type][var]['global']['mean'] = None
            else:
                # round to 4 significant digits
                scaling_params[var_type][var]['global']['mean'] = float(format(mean_, '.4g'))
            if std_ == None:
                print(f"Low statistics in {var} for std computation")
                scaling_params[var_type][var]['global']['std'] = None
            else:
                scaling_params[var_type][var]['global']['std'] = float(format(std_, '.4g'))
            if sqmean_ == None:
                print(f"Low statistics in {var} for sqmean computation")
                scaling_params[var_type][var]['global']['sqmean'] = None
            else:
                scaling_params[var_type][var]['global']['sqmean'] = float(format(sqmean_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var]['global'][file_name_i] = get_quantiles(var_array)
            if None in quantile_params[var_type][var]['global'][file_name_i].values():
                print(f"Low statistics in {var} for quantile computation")
    elif cone_type == 'inner' or cone_type == 'outer':
        tau_pt_name = cone_selection_dict['TauFlat']['var_names']['pt']
        tau_eta_name = cone_selection_dict['TauFlat']['var_names']['eta']
        tau_phi_name = cone_selection_dict['TauFlat']['var_names']['phi']
        tau_pt_array, tau_eta_array, tau_phi_array = tree.arrays(
            [tau_pt_name, tau_eta_name, tau_phi_name],
            cut=None, aliases=None, how=tuple)
        dR_tau_signal_cone = dR_signal_cone(tau_pt_array,
                                            cone_definition_dict['inner']['min_pt'],
                                            cone_definition_dict['inner']['min_radius'],
                                            cone_definition_dict['inner']['opening_coef'])
        constituent_dR = dR(tau_eta_array - constituent_eta_array,
                            tau_phi_array - constituent_phi_array)
        if cone_type == 'inner':
            cone_mask = constituent_dR <= dR_tau_signal_cone
        elif cone_type == 'outer':
            cone_mask = ((constituent_dR > dR_tau_signal_cone) &
                         (constituent_dR < cone_definition_dict['outer']['dR']))
        sums[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask])
        sums2[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask]**2)
        counts[var_type][var][cone_type][file_i] += ak.count(var_array[cone_mask])
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var][cone_type],
                                 counts[var_type][var][cone_type], aggregate=True)
            sqmean_ = compute_mean(sums2[var_type][var][cone_type],
                                   counts[var_type][var][cone_type], aggregate=True)
            std_ = compute_std(sums[var_type][var][cone_type],
                               sums2[var_type][var][cone_type],
                               counts[var_type][var][cone_type], aggregate=True)
            scaling_params[var_type][var][cone_type]['num'] = int(counts[var_type][var][cone_type].sum())
            if mean_ == None:
                print(f"Low statistics in {var} for mean computation")
                scaling_params[var_type][var][cone_type]['mean'] = None
            else:
                scaling_params[var_type][var][cone_type]['mean'] = float(format(mean_, '.4g'))
            if std_ == None:
                print(f"Low statistics in {var} for std computation")
                scaling_params[var_type][var][cone_type]['std'] = None
            else:
                scaling_params[var_type][var][cone_type]['std'] = float(format(std_, '.4g'))
            if sqmean_ == None:
                print(f"Low statistics in {var} for sqmean computation")
                scaling_params[var_type][var][cone_type]['sqmean'] = None
            else:
                scaling_params[var_type][var][cone_type]['sqmean'] = float(format(sqmean_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var][cone_type][file_name_i] = get_quantiles(var_array[cone_mask])
            if None in quantile_params[var_type][var][cone_type][file_name_i].values():
                print(f"Low statistics in {var} for quantile computation")
    else:
        raise ValueError(f'cone_type for {var_type} should be either inner, or outer')
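
# A minimal sketch (not from the original source) of the aggregation idea
# behind fill_aggregators: accumulate sum, sum of squares, and count, then
# derive mean and std. The original's compute_mean/compute_std work on
# per-file accumulators; this toy version works on plain scalars.
import awkward as ak
import numpy as np

var_array = ak.Array([[1.0, 2.0], [], [3.0]])
s = ak.sum(var_array)      # 6.0
s2 = ak.sum(var_array**2)  # 14.0
n = ak.count(var_array)    # 3
mean = s / n
std = np.sqrt(s2 / n - mean**2)  # population standard deviation
assert mean == 2.0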
def fill_aggregators(var_array, tau_eta_array, tau_phi_array,
                     constituent_eta_array, constituent_phi_array,
                     var, var_type, file_i, file_name_id,
                     cone_type, dR_tau_signal_cone, dR_tau_outer_cone,
                     sums, sums2, counts,
                     fill_scaling_params=False, scaling_params=None,
                     quantile_params=None):
    """
    Update `sums`, `sums2` and `counts` dictionaries with the values from
    `var_array`, either inclusively or exclusively (based on the `cone_type`
    argument) for inner/outer cones. In the latter case, derive `constituent_dR`
    with respect to the tau direction of flight and define the cones as:

        - inner: `constituent_dR` <= `dR_tau_signal_cone`
        - outer: `constituent_dR` > `dR_tau_signal_cone` and `constituent_dR` < `dR_tau_outer_cone`

    Then mask constituents which appear in the `cone_type` and update
    sums/sums2/counts using only those constituents which enter the given cone.

    If `fill_scaling_params` is set to `True`, also update the `scaling_params`
    dictionary (i.e. make a "snapshot" of scaling parameters based on the current
    state of sums/sums2/counts).

    Arguments:
        - var_array: awkward array, values of a given feature for a given set of taus
        - tau_eta_array: awkward array, eta values of taus
        - tau_phi_array: awkward array, phi values of taus
        - constituent_eta_array: awkward array, eta values of tau constituents
        - constituent_phi_array: awkward array, phi values of tau constituents
        - var: string, variable name
        - var_type: string, variable type
        - file_i: int, index of the file being processed in the input file list
        - file_name_id: int, index of the file being processed taken from the corresponding file name
        - cone_type: string, type of cone being processed, should be either inner or outer
        - dR_tau_signal_cone: awkward array, per-tau dR values defining the signal cone
        - dR_tau_outer_cone: float, dR value defining the tau outer cone
        - sums: dict, container for accumulating sums of features' values, filled based on the input `var_array`
        - sums2: dict, container for accumulating square sums of features' values, filled based on the input `var_array`
        - counts: dict, container for accumulating counts of features' values, filled based on the input `var_array`
        - fill_scaling_params (optional, default=False): bool, whether to update the `scaling_params` dictionary with the values from the current state of sums/sums2/counts
        - scaling_params (optional, default=None): dict, main dictionary storing scaling parameters per variable type/variable name/cone type. Used only if `fill_scaling_params` is set to `True`
        - quantile_params (optional, default=None): dict, if passed, will store in this dictionary for a given `file_i` the quantile numbers for `var_array` as returned by the `get_quantiles()` function

    Returns: None
    """
    if cone_type == None:
        sums[var_type][var][file_i] += ak.sum(var_array)
        sums2[var_type][var][file_i] += ak.sum(var_array**2)
        counts[var_type][var][file_i] += ak.count(var_array)
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var], counts[var_type][var], aggregate=True)
            std_ = compute_std(sums[var_type][var], sums2[var_type][var],
                               counts[var_type][var], aggregate=True)
            # round to 4 significant digits
            scaling_params[var_type][var]['global']['mean'] = float(format(mean_, '.4g'))
            scaling_params[var_type][var]['global']['std'] = float(format(std_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var]['global'][file_name_id] = get_quantiles(var_array)
    elif cone_type == 'inner' or cone_type == 'outer':
        constituent_dR = dR(tau_eta_array - constituent_eta_array,
                            tau_phi_array - constituent_phi_array)
        if cone_type == 'inner':
            cone_mask = constituent_dR <= dR_tau_signal_cone
        elif cone_type == 'outer':
            cone_mask = ((constituent_dR > dR_tau_signal_cone) &
                         (constituent_dR < dR_tau_outer_cone))
        sums[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask])
        sums2[var_type][var][cone_type][file_i] += ak.sum(var_array[cone_mask]**2)
        counts[var_type][var][cone_type][file_i] += ak.count(var_array[cone_mask])
        if fill_scaling_params:
            mean_ = compute_mean(sums[var_type][var][cone_type],
                                 counts[var_type][var][cone_type], aggregate=True)
            std_ = compute_std(sums[var_type][var][cone_type],
                               sums2[var_type][var][cone_type],
                               counts[var_type][var][cone_type], aggregate=True)
            scaling_params[var_type][var][cone_type]['mean'] = float(format(mean_, '.4g'))
            scaling_params[var_type][var][cone_type]['std'] = float(format(std_, '.4g'))
        if quantile_params:
            quantile_params[var_type][var][cone_type][file_name_id] = get_quantiles(var_array[cone_mask])
    else:
        raise ValueError(f'cone_type for {var_type} should be either inner, or outer')
def process(self, df):
    # print(df.fields)
    # numevents = len(df)
    # dataset = df.metadata["dataset"]
    output = pd.DataFrame({"event": df.Event.Number})
    output.index.name = "entry"
    output["dataset"] = df.metadata["dataset"]
    regions = df.metadata["regions"]
    # channels = df.metadata['channels']
    output["lumi_wgt"] = float(df.metadata["lumi_wgt"])
    output["mc_wgt"] = ak.to_pandas(df.Event.Weight)
    # There are multiple weights per event - need to figure this out
    # output['lhe_wgt'] = ak.to_pandas(df.Weight.Weight)
    output["year"] = "snowmass"

    # Select muons
    muons = df[parameters["muon_branch"]]
    muon_filter = ((muons.pt > parameters["muon_pt_cut"]) &
                   (abs(muons.eta) < parameters["muon_eta_cut"]) &
                   (muons.IsolationVar < parameters["muon_iso_cut"]))
    nmuons = ak.to_pandas(ak.count(muons[muon_filter].pt, axis=1))

    mu_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Charge": "charge"}
    muon_columns = ["PT", "Eta", "Phi", "Charge", "IsolationVar"]

    # Convert one column at a time to preserve event indices in Pandas
    muon_feature_list = []
    for col in muon_columns:
        muon_feature = df[parameters["muon_branch"]][col]
        val = ak.to_pandas(muon_feature[muon_filter])
        muon_feature_list.append(val)
    muons = pd.concat(muon_feature_list, axis=1)
    muons.columns = muon_columns
    muons.rename(columns=mu_map, inplace=True)

    mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
    mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
    mu1.index = mu1.index.droplevel("subentry")
    mu2.index = mu2.index.droplevel("subentry")
    pass_leading_pt = mu1.pt > parameters["muon_leading_pt"]

    fill_muons(output, mu1, mu2)
    output.mm_charge = output.mu1_charge * output.mu2_charge

    # Select electrons
    electrons = df[parameters["electron_branch"]]
    electrons = electrons[(electrons.pt > parameters["electron_pt_cut"]) &
                          (abs(electrons.eta) < parameters["electron_eta_cut"])]
    nelectrons = ak.to_pandas(ak.count(electrons.pt, axis=1))

    # Select jets
    jets = df[parameters["jet_branch"]]
    mu_for_clean = df[parameters["muon_branch"]]
    mu_for_clean = mu_for_clean[(mu_for_clean.pt > parameters["muon_pt_cut"]) &
                                (mu_for_clean.IsolationVar < parameters["muon_iso_cut"])]
    _, jet_mu_dr = jets.nearest(mu_for_clean, return_metric=True)
    jet_filter = (ak.fill_none(jet_mu_dr > parameters["min_dr_mu_jet"], True) &
                  (jets.pt > parameters["jet_pt_cut"]) &
                  (abs(jets.eta) < parameters["jet_eta_cut"]))
    njets = ak.to_pandas(ak.count(jets[jet_filter].pt, axis=1))

    jet_map = {"PT": "pt", "Eta": "eta", "Phi": "phi", "Mass": "mass"}
    jet_columns = ["PT", "Eta", "Phi", "Mass"]
    jet_feature_list = []
    for col in jet_columns:
        jet_feature = df[parameters["jet_branch"]][col]
        val = ak.to_pandas(jet_feature[jet_filter])
        jet_feature_list.append(val)
    jets = pd.concat(jet_feature_list, axis=1)
    jets.columns = jet_columns
    jets.rename(columns=jet_map, inplace=True)

    jets = jets.sort_values(["entry", "pt"], ascending=[True, False])
    jets.index = pd.MultiIndex.from_arrays(
        [jets.index.get_level_values(0), jets.groupby(level=0).cumcount()],
        names=["entry", "subentry"],
    )
    jet1 = jets.loc[pd.IndexSlice[:, 0], :]
    jet2 = jets.loc[pd.IndexSlice[:, 1], :]
    jet1.index = jet1.index.droplevel("subentry")
    jet2.index = jet2.index.droplevel("subentry")

    fill_jets(output, jet1, jet2)
    fill_gen_jets(df, output)

    # Event selection: two opposite-sign muons and no electrons
    output["nmuons"] = nmuons
    output["nelectrons"] = nelectrons
    output["njets"] = njets
    output[["nmuons", "nelectrons", "njets"]] = output[
        ["nmuons", "nelectrons", "njets"]].fillna(0)
    output["event_selection"] = ((output.nmuons == 2) &
                                 (output.mm_charge == -1) &
                                 (output.nelectrons == 0) &
                                 pass_leading_pt)

    mass = output.dimuon_mass
    output["region"] = None
    output.loc[((mass > 76) & (mass < 106)), "region"] = "z-peak"
    output.loc[((mass > 110) & (mass < 115.03)) |
               ((mass > 135.03) & (mass < 150)), "region"] = "h-sidebands"
    output.loc[((mass > 115.03) & (mass < 135.03)), "region"] = "h-peak"

    output = output.loc[output.event_selection, :]
    output = output.reindex(sorted(output.columns), axis=1)
    output = output[output.region.isin(regions)]

    """
    input_evts = numevents
    output_evts = output.shape[0]
    out_yield = output.lumi_wgt.sum()
    out_vbf = output[
        (output.jj_mass > 400) & (output.jj_dEta > 2.5) &
        (output.jet1_pt > 35) & (output.njets >= 2)
    ].lumi_wgt.sum()
    out_ggh = out_yield - out_vbf
    print(f"\n{dataset}: {input_evts} -> {output_evts}; yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}")
    """

    to_return = None
    if self.apply_to_output is None:
        to_return = output
    else:
        self.apply_to_output(output)
        to_return = self.accumulator.identity()
    return to_return
def test_highlevel():
    array = ak.Array(
        [[[2, 3, 5], [], [7, 11], [13]], [], [[17, 19], [23]]], check_valid=True
    )
    assert ak.count(array) == 9
    assert ak.to_list(ak.count(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=2)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-1, keepdims=True)) == [
        [[3], [0], [2], [1]],
        [],
        [[2], [1]],
    ]
    assert ak.to_list(ak.count(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=1)) == [[3, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count(array, axis=-2, keepdims=True)) == [
        [[3, 2, 1]],
        [[]],
        [[2, 1]],
    ]

    assert ak.count_nonzero(array) == 9
    assert ak.to_list(ak.count_nonzero(array, axis=-1)) == [[3, 0, 2, 1], [], [2, 1]]
    assert ak.to_list(ak.count_nonzero(array, axis=-2)) == [[3, 2, 1], [], [2, 1]]

    assert ak.sum(array) == 2 + 3 + 5 + 7 + 11 + 13 + 17 + 19 + 23
    assert ak.to_list(ak.sum(array, axis=-1)) == [
        [2 + 3 + 5, 0, 7 + 11, 13],
        [],
        [17 + 19, 23],
    ]
    assert ak.to_list(ak.sum(array, axis=-2)) == [
        [2 + 7 + 13, 3 + 11, 5],
        [],
        [17 + 23, 19],
    ]

    assert ak.prod(array) == 2 * 3 * 5 * 7 * 11 * 13 * 17 * 19 * 23
    assert ak.to_list(ak.prod(array, axis=-1)) == [
        [2 * 3 * 5, 1, 7 * 11, 13],
        [],
        [17 * 19, 23],
    ]
    assert ak.to_list(ak.prod(array, axis=-2)) == [
        [2 * 7 * 13, 3 * 11, 5],
        [],
        [17 * 23, 19],
    ]

    assert ak.min(array) == 2
    assert ak.to_list(ak.min(array, axis=-1)) == [[2, None, 7, 13], [], [17, 23]]
    assert ak.to_list(ak.min(array, axis=-2)) == [[2, 3, 5], [], [17, 19]]

    assert ak.max(array) == 23
    assert ak.to_list(ak.max(array, axis=-1)) == [[5, None, 11, 13], [], [19, 23]]
    assert ak.to_list(ak.max(array, axis=-2)) == [[13, 11, 5], [], [23, 19]]

    array = ak.Array(
        [
            [[True, False, True], [], [False, False], [True]],
            [],
            [[False, True], [True]],
        ],
        check_valid=True,
    )
    assert ak.any(array) == True
    assert ak.to_list(ak.any(array, axis=-1)) == [
        [True, False, False, True],
        [],
        [True, True],
    ]
    assert ak.to_list(ak.any(array, axis=-2)) == [[True, False, True], [], [True, True]]

    assert ak.all(array) == False
    assert ak.to_list(ak.all(array, axis=-1)) == [
        [False, True, False, True],
        [],
        [False, True],
    ]
    assert ak.to_list(ak.all(array, axis=-2)) == [
        [False, False, True],
        [],
        [False, True],
    ]
def _put_tracks_into_blob(self, blob, tracks, reco_identifier, n_tracks):
    """
    Put a certain type of "tracks" in the blob under a specific name.

    Parameters
    ----------
    tracks : awkward array
        The tracks object to be put in the blob eventually.
        Can be only the best tracks.
    reco_identifier : string
        A string to name the kp table.
    n_tracks : int
        The number of tracks from before. Used to distinguish between
        best and all tracks.
    """
    reco_tracks = dict(
        pos_x=tracks.pos_x,
        pos_y=tracks.pos_y,
        pos_z=tracks.pos_z,
        dir_x=tracks.dir_x,
        dir_y=tracks.dir_y,
        dir_z=tracks.dir_z,
        E=tracks.E,
        rec_type=tracks.rec_type,
        t=tracks.t,
        likelihood=tracks.lik,
        length=tracks.len,  # do all recos have this?
    )
    if n_tracks != 1:
        reco_tracks.update(
            id=tracks.id,
            idx=np.arange(n_tracks),
        )

    n_columns = max(km3io.definitions.fitparameters.values()) + 1
    fitinf_array = np.ma.filled(
        ak.to_numpy(ak.pad_none(tracks.fitinf, target=n_columns, axis=-1)),
        fill_value=np.nan,
    ).astype("float32")
    fitinf_split = np.split(fitinf_array, fitinf_array.shape[-1], axis=-1)

    if n_tracks == 1:
        for fitparam, idx in km3io.definitions.fitparameters.items():
            reco_tracks[fitparam] = fitinf_split[idx][0]
    else:
        for fitparam, idx in km3io.definitions.fitparameters.items():
            reco_tracks[fitparam] = fitinf_split[idx][:, 0]

    blob["Reco_" + reco_identifier] = kp.Table(
        reco_tracks,
        h5loc="/reco/" + reco_identifier,
        name="Reco " + reco_identifier,
        split_h5=self.split,
    )

    # write out the rec stages only once with all tracks
    if n_tracks != 1:
        _rec_stage = np.array(ak.flatten(tracks.rec_stages)._layout)
        _counts = ak.count(tracks.rec_stages, axis=1)
        _idx = np.repeat(np.arange(n_tracks), _counts)
        blob["RecStages"] = kp.Table(
            dict(rec_stage=_rec_stage, idx=_idx),
            # Just to save space, we specify smaller dtypes.
            # We assume there will never be more than 32767
            # reco tracks for a single reconstruction type.
            dtypes=[("rec_stage", np.int16), ("idx", np.uint16)],
            h5loc="/reco/rec_stages",
            name="Reconstruction Stages",
            split_h5=self.split,
        )
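
# A minimal sketch (not from the original source) of the flatten-plus-repeat
# indexing pattern above: ak.count per row gives the lengths, and np.repeat
# builds a parent index for the flattened values. Toy rec_stages values, not
# real km3io data.
import awkward as ak
import numpy as np

rec_stages = ak.Array([[1, 3, 5], [1], [1, 5]])
flat = ak.to_numpy(ak.flatten(rec_stages))
counts = ak.to_numpy(ak.count(rec_stages, axis=1))
idx = np.repeat(np.arange(len(rec_stages)), counts)  # which row each value came from
assert idx.tolist() == [0, 0, 0, 1, 2, 2]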
def process(self, df): # Initialize timer if self.timer: self.timer.update() # Dataset name (see definitions in config/datasets.py) dataset = df.metadata["dataset"] is_mc = "data" not in dataset numevents = len(df) # ------------------------------------------------------------# # Apply HLT, lumimask, genweights, PU weights # and L1 prefiring weights # ------------------------------------------------------------# # All variables that we want to save # will be collected into the 'output' dataframe output = pd.DataFrame({"run": df.run, "event": df.event}) output.index.name = "entry" output["npv"] = df.PV.npvs output["met"] = df.MET.pt # Separate dataframe to keep track on weights # and their systematic variations weights = Weights(output) if is_mc: # For MC: Apply gen.weights, pileup weights, lumi weights, # L1 prefiring weights mask = np.ones(numevents, dtype=bool) genweight = df.genWeight weights.add_weight("genwgt", genweight) weights.add_weight("lumi", self.lumi_weights[dataset]) pu_wgts = pu_evaluator( self.pu_lookups, self.parameters, numevents, np.array(df.Pileup.nTrueInt), self.auto_pu, ) weights.add_weight("pu_wgt", pu_wgts, how="all") if self.parameters["do_l1prefiring_wgts"]: if "L1PreFiringWeight" in df.fields: l1pfw = l1pf_weights(df) weights.add_weight("l1prefiring_wgt", l1pfw, how="all") else: weights.add_weight("l1prefiring_wgt", how="dummy_vars") else: # For Data: apply Lumi mask lumi_info = LumiMask(self.parameters["lumimask"]) mask = lumi_info(df.run, df.luminosityBlock) # Apply HLT to both Data and MC hlt_columns = [c for c in self.parameters["hlt"] if c in df.HLT.fields] hlt = ak.to_pandas(df.HLT[hlt_columns]) if len(hlt_columns) == 0: hlt = False else: hlt = hlt[hlt_columns].sum(axis=1) if self.timer: self.timer.add_checkpoint("HLT, lumimask, PU weights") # ------------------------------------------------------------# # Update muon kinematics with Rochester correction, # FSR recovery and GeoFit correction # Raw pT and eta are stored to be used in event selection # ------------------------------------------------------------# # Save raw variables before computing any corrections df["Muon", "pt_raw"] = df.Muon.pt df["Muon", "eta_raw"] = df.Muon.eta df["Muon", "phi_raw"] = df.Muon.phi df["Muon", "pfRelIso04_all_raw"] = df.Muon.pfRelIso04_all # Rochester correction if self.do_roccor: apply_roccor(df, self.roccor_lookup, is_mc) df["Muon", "pt"] = df.Muon.pt_roch # variations will be in branches pt_roch_up and pt_roch_down # muons_pts = { # 'nominal': df.Muon.pt, # 'roch_up':df.Muon.pt_roch_up, # 'roch_down':df.Muon.pt_roch_down # } # for ... 
if True: # indent reserved for loop over muon pT variations # According to HIG-19-006, these variations have negligible # effect on significance, but it's better to have them # implemented in the future # FSR recovery if self.do_fsr: has_fsr = fsr_recovery(df) df["Muon", "pt"] = df.Muon.pt_fsr df["Muon", "eta"] = df.Muon.eta_fsr df["Muon", "phi"] = df.Muon.phi_fsr df["Muon", "pfRelIso04_all"] = df.Muon.iso_fsr # if FSR was applied, 'pt_fsr' will be corrected pt # if FSR wasn't applied, just copy 'pt' to 'pt_fsr' df["Muon", "pt_fsr"] = df.Muon.pt # GeoFit correction if self.do_geofit and ("dxybs" in df.Muon.fields): apply_geofit(df, self.year, ~has_fsr) df["Muon", "pt"] = df.Muon.pt_fsr if self.timer: self.timer.add_checkpoint("Muon corrections") # --- conversion from awkward to pandas --- # muon_columns = [ "pt", "pt_fsr", "eta", "phi", "charge", "ptErr", "mass", "pt_raw", "eta_raw", "pfRelIso04_all", ] + [self.parameters["muon_id"]] muons = ak.to_pandas(df.Muon[muon_columns]) # --------------------------------------------------------# # Select muons that pass pT, eta, isolation cuts, # muon ID and quality flags # Select events with 2 OS muons, no electrons, # passing quality cuts and at least one good PV # --------------------------------------------------------# # Apply event quality flags flags = ak.to_pandas(df.Flag[self.parameters["event_flags"]]) flags = flags[self.parameters["event_flags"]].product(axis=1) muons["pass_flags"] = True if self.parameters["muon_flags"]: muons["pass_flags"] = muons[ self.parameters["muon_flags"]].product(axis=1) # Define baseline muon selection (applied to pandas DF!) muons["selection"] = ( (muons.pt_raw > self.parameters["muon_pt_cut"]) & (abs(muons.eta_raw) < self.parameters["muon_eta_cut"]) & (muons.pfRelIso04_all < self.parameters["muon_iso_cut"]) & muons[self.parameters["muon_id"]] & muons.pass_flags) # Count muons nmuons = (muons[muons.selection].reset_index().groupby("entry") ["subentry"].nunique()) # Find opposite-sign muons mm_charge = muons.loc[muons.selection, "charge"].groupby("entry").prod() # Veto events with good quality electrons electrons = df.Electron[ (df.Electron.pt > self.parameters["electron_pt_cut"]) & (abs(df.Electron.eta) < self.parameters["electron_eta_cut"]) & (df.Electron[self.parameters["electron_id"]] == 1)] electron_veto = ak.to_numpy(ak.count(electrons.pt, axis=1) == 0) # Find events with at least one good primary vertex good_pv = ak.to_pandas(df.PV).npvsGood > 0 # Define baseline event selection output["two_muons"] = nmuons == 2 output["event_selection"] = (mask & (hlt > 0) & (flags > 0) & (nmuons == 2) & (mm_charge == -1) & electron_veto & good_pv) # --------------------------------------------------------# # Select two leading-pT muons # --------------------------------------------------------# # Find pT-leading and subleading muons # This is slow for large chunk size. 
        muons = muons[muons.selection & (nmuons == 2)]
        mu1 = muons.loc[muons.pt.groupby("entry").idxmax()]
        mu2 = muons.loc[muons.pt.groupby("entry").idxmin()]
        mu1.index = mu1.index.droplevel("subentry")
        mu2.index = mu2.index.droplevel("subentry")

        # --------------------------------------------------------#
        # Select events with muons passing the leading-pT cut
        # and trigger matching (trigger matching is not done
        # in the final version)
        # --------------------------------------------------------#

        # Events where there is at least one muon passing
        # the leading muon pT cut
        pass_leading_pt = mu1.pt_raw > self.parameters["muon_leading_pt"]

        # Update event selection with the leading muon pT cut
        output["pass_leading_pt"] = pass_leading_pt
        output["event_selection"] = (
            output.event_selection & output.pass_leading_pt
        )

        # --------------------------------------------------------#
        # Fill dimuon and muon variables
        # --------------------------------------------------------#

        fill_muons(self, output, mu1, mu2, is_mc)

        if self.timer:
            self.timer.add_checkpoint("Event & muon selection")

    # ------------------------------------------------------------#
    # Prepare jets
    # ------------------------------------------------------------#

    prepare_jets(df, is_mc)

    # ------------------------------------------------------------#
    # Apply JEC, get JEC and JER variations
    # ------------------------------------------------------------#

    jets = df.Jet

    # We only need to reapply JEC for 2018 data
    # (unless new versions of JEC are released)
    self.do_jec = ("data" in dataset) and ("2018" in self.year)

    jets = apply_jec(
        df,
        jets,
        dataset,
        is_mc,
        self.year,
        self.do_jec,
        self.do_jecunc,
        self.do_jerunc,
        self.jec_factories,
        self.jec_factories_data,
    )

    # ------------------------------------------------------------#
    # Calculate other event weights
    # ------------------------------------------------------------#

    if is_mc:
        do_nnlops = self.do_nnlops and ("ggh" in dataset)
        if do_nnlops:
            nnlopsw = nnlops_weights(df, numevents, self.parameters, dataset)
            weights.add_weight("nnlops", nnlopsw)
        else:
            weights.add_weight("nnlops", how="dummy")

        # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
        # do_zpt = ('dy' in dataset)
        #
        # if do_zpt:
        #     zpt_weight = np.ones(numevents, dtype=float)
        #     zpt_weight[two_muons] =\
        #         self.evaluator[self.zpt_path](
        #             output['dimuon_pt'][two_muons]).flatten()
        #     weights.add_weight('zpt_wgt', zpt_weight)
        # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #

        do_musf = True
        if do_musf:
            muID, muIso, muTrig = musf_evaluator(
                self.musf_lookup, self.year, numevents, mu1, mu2
            )
            weights.add_weight("muID", muID, how="all")
            weights.add_weight("muIso", muIso, how="all")
            weights.add_weight("muTrig", muTrig, how="all")
        else:
            weights.add_weight("muID", how="dummy_all")
            weights.add_weight("muIso", how="dummy_all")
            weights.add_weight("muTrig", how="dummy_all")

        # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
        do_lhe = (
            ("LHEScaleWeight" in df.fields)
            and ("LHEPdfWeight" in df.fields)
            and ("nominal" in self.pt_variations)
        )
        if do_lhe:
            lhe_ren, lhe_fac = lhe_weights(df, output, dataset, self.year)
            weights.add_weight("LHERen", lhe_ren, how="only_vars")
            weights.add_weight("LHEFac", lhe_fac, how="only_vars")
        else:
            weights.add_weight("LHERen", how="dummy_vars")
            weights.add_weight("LHEFac", how="dummy_vars")

        # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
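        # Note on the 'how' argument of Weights.add_weight as used in this
        # processor (inferred from usage here, not an authoritative list):
        #   "all"       - store the nominal weight and its up/down variations
        #   "only_vars" - store only the up/down variations
        #   "dummy*"    - fill placeholders so the output schema stays uniform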
        do_thu = (
            ("vbf" in dataset)
            and ("dy" not in dataset)
            and ("nominal" in self.pt_variations)
            and ("stage1_1_fine_cat_pTjet30GeV" in df.HTXS.fields)
        )
        if do_thu:
            for i, name in enumerate(self.sths_names):
                wgt_up = stxs_uncert(
                    i,
                    ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                    1.0,
                    self.stxs_acc_lookups,
                    self.powheg_xsec_lookup,
                )
                wgt_down = stxs_uncert(
                    i,
                    ak.to_numpy(df.HTXS.stage1_1_fine_cat_pTjet30GeV),
                    -1.0,
                    self.stxs_acc_lookups,
                    self.powheg_xsec_lookup,
                )
                thu_wgts = {"up": wgt_up, "down": wgt_down}
                weights.add_weight(
                    "THU_VBF_" + name, thu_wgts, how="only_vars"
                )
        else:
            for i, name in enumerate(self.sths_names):
                weights.add_weight("THU_VBF_" + name, how="dummy_vars")

        # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
        do_pdf = (
            self.do_pdf
            and ("nominal" in self.pt_variations)
            and (
                "dy" in dataset
                or "ewk" in dataset
                or "ggh" in dataset
                or "vbf" in dataset
            )
            and ("mg" not in dataset)
        )
        if "2016" in self.year:
            # Save per-event PDF replica weights
            # (100 replicas for most samples, 33 for EWK)
            if "dy" in dataset:
                max_replicas = 100
            elif "ewk" in dataset:
                max_replicas = 33
            else:
                max_replicas = 100
            if do_pdf:
                pdf_wgts = df.LHEPdfWeight[
                    :, 0 : self.parameters["n_pdf_variations"]
                ]
            for i in range(100):
                if (i < max_replicas) and do_pdf:
                    output[f"pdf_mcreplica{i}"] = pdf_wgts[:, i]
                else:
                    output[f"pdf_mcreplica{i}"] = np.nan
        else:
            if do_pdf:
                # Use the replica spread of the first event to define
                # a flat +/- 2*RMS variation
                pdf_wgts = df.LHEPdfWeight[
                    :, 0 : self.parameters["n_pdf_variations"]
                ][0]
                pdf_wgts = np.array(pdf_wgts)
                pdf_vars = {
                    "up": (1 + 2 * pdf_wgts.std()),
                    "down": (1 - 2 * pdf_wgts.std()),
                }
                weights.add_weight("pdf_2rms", pdf_vars, how="only_vars")
            else:
                weights.add_weight("pdf_2rms", how="dummy_vars")

    # --- --- --- --- --- --- --- --- --- --- --- --- --- --- #
    if is_mc:
        output = fill_gen_jets(df, output)

    # ------------------------------------------------------------#
    # Loop over JEC variations and fill jet variables
    # ------------------------------------------------------------#

    output.columns = pd.MultiIndex.from_product(
        [output.columns, [""]], names=["Variable", "Variation"]
    )

    if self.timer:
        self.timer.add_checkpoint("Jet preparation & event weights")

    for v_name in self.pt_variations:
        output_updated = self.jet_loop(
            v_name,
            is_mc,
            df,
            dataset,
            mask,
            muons,
            mu1,
            mu2,
            jets,
            weights,
            numevents,
            output,
        )
        if output_updated is not None:
            output = output_updated

    if self.timer:
        self.timer.add_checkpoint("Jet loop")

    # ------------------------------------------------------------#
    # Fill outputs
    # ------------------------------------------------------------#

    mass = output.dimuon_mass

    output["region"] = None
    output.loc[(mass > 76) & (mass < 106), "region"] = "z-peak"
    output.loc[
        ((mass > 110) & (mass < 115.03)) | ((mass > 135.03) & (mass < 150)),
        "region",
    ] = "h-sidebands"
    output.loc[(mass > 115.03) & (mass < 135.03), "region"] = "h-peak"
    output["dataset"] = dataset
    output["year"] = int(self.year)

    # Save only the nominal weight and up/down variations
    for wgt in weights.df.columns:
        skip_saving = (
            ("nominal" not in wgt)
            and ("up" not in wgt)
            and ("down" not in wgt)
        )
        if skip_saving:
            continue
        output[f"wgt_{wgt}"] = weights.get_weight(wgt)

    columns_to_save = [
        c
        for c in output.columns
        if (c[0] in self.vars_to_save)
        or ("wgt_" in c[0])
        or ("mcreplica" in c[0])
        or (c[0] in ["region", "dataset", "year"])
        or ("gjet" in c[0])
        or ("gjj" in c[0])
    ]

    output = output.loc[output.event_selection, columns_to_save]
    output = output.reindex(sorted(output.columns), axis=1)

    # Flatten the (Variable, Variation) column MultiIndex into strings
    output.columns = [" ".join(col).strip() for col in output.columns.values]

    output = output[output.region.isin(self.regions)]
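    # After flattening, (Variable, Variation) pairs become strings such as
    # "dimuon_mass" (empty variation) or "jj_mass nominal" - the naming the
    # commented-out yield printout below relies on.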
    """
    input_evts = numevents
    output_evts = output.shape[0]
    out_yield = output.wgt_nominal.sum()
    out_vbf = output[
        (output["jj_mass nominal"] > 400)
        & (output["jj_dEta nominal"] > 2.5)
        & (output["jet1_pt nominal"] > 35)
    ].wgt_nominal.sum()
    out_ggh = out_yield - out_vbf
    print(
        f"\n{dataset}: {input_evts} -> {output_evts}; "
        f"yield = {out_ggh} (ggH) + {out_vbf} (VBF) = {out_yield}"
    )
    """

    to_return = None
    if self.apply_to_output is None:
        to_return = output
    else:
        self.apply_to_output(output)
        to_return = self.accumulator.identity()

    if self.timer:
        self.timer.add_checkpoint("Saving outputs")
        self.timer.summary()

    return to_return
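# A minimal sketch of how a processor like the one above is typically run
# with coffea's (0.6/0.7-style) run_uproot_job. "DimuonProcessor" and the
# file paths are hypothetical placeholders; the real class and dataset
# definitions live elsewhere in this repository.
from coffea import processor
from coffea.nanoevents import NanoAODSchema


def run_processor_example():
    fileset = {"data_A_2018": ["/path/to/nanoaod_file.root"]}  # hypothetical
    return processor.run_uproot_job(
        fileset,
        treename="Events",
        processor_instance=DimuonProcessor(),  # hypothetical class name
        executor=processor.futures_executor,
        executor_args={"schema": NanoAODSchema, "workers": 4},
    )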