def get_chunking(filelist, chunksize, treename="Events", workers=12, skip_bad_files=False):
    """
    Return 2-tuple of
    - chunks: triplets of (filename,entrystart,entrystop) calculated with input `chunksize` and `filelist`
    - total_nevents: total event count over `filelist`
    """
    import uproot
    import awkward
    from tqdm.auto import tqdm
    import concurrent.futures

    chunksize = int(chunksize)
    chunks = []
    nevents = 0
    if skip_bad_files:
        # slightly slower (serial loop), but can skip bad files
        for fname in tqdm(filelist):
            try:
                items = uproot.numentries(fname, treename, total=False).items()
            except (IndexError, ValueError) as e:
                print("Skipping bad file", fname)
                continue
            for fn, nentries in items:
                nevents += nentries
                for index in range(nentries // chunksize + 1):
                    chunks.append((fn, chunksize * index, min(chunksize * (index + 1), nentries)))
    elif filelist[0].endswith(".awkd"):
        for fname in tqdm(filelist):
            f = awkward.load(fname, whitelist=awkward.persist.whitelist + [['blosc', 'decompress']])
            nentries = len(f["run"])
            nevents += nentries
            for index in range(nentries // chunksize + 1):
                chunks.append((fname, chunksize * index, min(chunksize * (index + 1), nentries)))
    else:
        executor = None if len(filelist) < 5 else concurrent.futures.ThreadPoolExecutor(min(workers, len(filelist)))
        for fn, nentries in uproot.numentries(filelist, treename, total=False, executor=executor).items():
            nevents += nentries
            for index in range(nentries // chunksize + 1):
                if nentries <= 0:
                    continue
                chunks.append((fn, chunksize * index, min(chunksize * (index + 1), nentries)))
    return chunks, nevents
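# Minimal usage sketch for the helper above, assuming uproot 3.x (uproot.numentries
# was removed in uproot 4) and placeholder file names containing an "Events" tree.
filelist = ["skim_1.root", "skim_2.root"]
chunks, total_nevents = get_chunking(filelist, chunksize=250000, treename="Events")
for filename, entrystart, entrystop in chunks:
    print(filename, entrystart, entrystop)
print("total events:", total_nevents)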
def check_entries_uproot(files, tree_names, no_empty, confirm_tree=True, list_branches=False, ignore_inaccessible=False):
    no_empty = no_empty or confirm_tree
    if not isinstance(tree_names, (tuple, list)):
        tree_names = [tree_names]

    if ignore_inaccessible:
        files = [f for f in files if os.access(f, os.R_OK)]

    if not no_empty:
        n_entries = {tree: uproot.numentries(files, tree) for tree in tree_names}
    else:
        n_entries = {tree: 0 for tree in tree_names}
        missing_trees = defaultdict(list)
        for tree in tree_names:
            totals = uproot.numentries(files, tree, total=False)
            for name, entries in totals.items():
                n_entries[tree] += entries
                if no_empty and entries <= 0:
                    files.remove(name)
                if confirm_tree and entries == 0:
                    if tree not in uproot.open(name):
                        missing_trees[tree].append(name)

        if missing_trees:
            files = set(sum((list(v) for v in missing_trees.values()), []))
            msg = "Missing at least one tree (%s) for %d file(s): %s"
            # count the affected files, not the number of missing tree names
            msg = msg % (", ".join(missing_trees), len(files), ", ".join(files))
            raise RuntimeError(msg)

    branches = {}
    if list_branches:
        for tree in tree_names:
            open_files = (uproot.open(f) for f in files)
            all_branches = (f[tree].keys(recursive=True) for f in open_files if tree in f)
            branches[tree] = dict(Counter(sum(all_branches, [])))

    if len(n_entries) == 1:
        n_entries = list(n_entries.values())[0]

    return files, n_entries, branches
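# Hypothetical call of the checker above (placeholder paths; os, uproot,
# defaultdict and Counter are assumed to be imported at module level).
files, n_entries, branches = check_entries_uproot(
    ["ntuple_1.root", "ntuple_2.root"],
    tree_names="Events",
    no_empty=True,
    confirm_tree=True,
    list_branches=True,
)
print(n_entries)  # a single count, because only one tree name was given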
def _get_chunking(filelist, treename, chunksize):
    items = []
    for fn in filelist:
        nentries = uproot.numentries(fn, treename)
        for index in range(nentries // chunksize + 1):
            items.append((fn, chunksize, index))
    return items
def __len__(self):
    if self.cutfunc is None:
        return uproot.numentries(self.file_names, self.tree_name, total=True)
    else:
        return len(self.upTree)
def _get_metadata(item, skipbadfiles=False, retries=0, xrootdtimeout=None):
    import warnings
    out = set_accumulator()
    retry_count = 0
    while retry_count <= retries:
        try:
            # add timeout option according to modified uproot numentries defaults
            xrootdsource = {"timeout": xrootdtimeout, "chunkbytes": 32 * 1024, "limitbytes": 1024**2, "parallel": False}
            nentries = uproot.numentries(item.filename, item.treename, xrootdsource=xrootdsource)
            out = set_accumulator([FileMeta(item.dataset, item.filename, item.treename, nentries)])
            break
        except OSError as e:
            if not skipbadfiles:
                raise e
            else:
                w_str = 'Bad file source %s.' % item.filename
                if retries:
                    w_str += ' Attempt %d of %d.' % (retry_count + 1, retries + 1)
                    if retry_count + 1 < retries:
                        w_str += ' Will retry.'
                    else:
                        w_str += ' Skipping.'
                else:
                    w_str += ' Skipping.'
                warnings.warn(w_str)
        except Exception as e:
            if retries == retry_count:
                raise e
            w_str = 'Attempt %d of %d. Will retry.' % (retry_count + 1, retries + 1)
            warnings.warn(w_str)
        retry_count += 1
    return out
def _get_chunking(filelist, treename, chunksize, workers=1):
    items = []
    executor = None if len(filelist) < 5 else concurrent.futures.ThreadPoolExecutor(workers)
    for fn, nentries in uproot.numentries(filelist, treename, total=False, executor=executor).items():
        for index in range(nentries // chunksize + 1):
            items.append((fn, chunksize, index))
    return items
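# Sketch of how a (filename, chunksize, index) triplet from the helper above can be
# turned into an explicit entry range for an uproot 3.x read. The file path, the
# "Events" tree and the "run" branch are placeholders, not part of the original snippet.
import uproot

items = _get_chunking(["skim_1.root"], "Events", chunksize=100000)
fn, chunksize, index = items[0]
entrystart = chunksize * index
entrystop = chunksize * (index + 1)  # uproot clips this to the last entry of the tree
arrays = uproot.open(fn)["Events"].arrays(["run"], entrystart=entrystart, entrystop=entrystop)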
def main(args):
    mods = []
    #============================================================================#
    #-------------------------   Run PostProcessor   ---------------------------#
    #============================================================================#
    files = []
    if len(args.inputfile) > 5 and args.inputfile[0:5] == "file:":
        # This is just a single test input file
        files.append(args.inputfile[5:])
    else:
        # this is a file list
        with open(args.inputfile) as f:
            files = [line.strip() for line in f]

    nsplit = args.nSplit + 2
    for file in files:
        nevt = uproot.numentries(file, "Events")
        ran = np.linspace(0, nevt + 1, 5, dtype=int)
        print(file, nevt, ran)
        plist = []
        for i in range(len(ran) - 1):
            print(i, ran[i], ran[i + 1])
            p = Process(target=RunPost, args=(args.outputfile, file, ran, i))
            p.start()
            plist.append(p)
        exit_codes = [p.join() for p in plist]
def nevents_in_file(self, path):
    if path in self._nevents_in_file_cache:
        nevents = self._nevents_in_file_cache[path]
        nblocks = int((nevents - 1) / self.nevents_per_block + 1)
    else:
        # Try to open root file with standard memmap with uproot. Use
        # localsource option if it fails
        nevents = uproot.numentries(path, self._treename_of_files_map[path])
        self._nevents_in_file_cache[path] = nevents
        nblocks = int((nevents - 1) / self.nevents_per_block + 1)
    return nblocks
def get_chunking_dask(filelist, chunksize, client=None, treename="Events"):
    import uproot
    chunks, chunksize, nevents = [], int(chunksize), 0
    info = client.gather(client.map(lambda x: (x, uproot.numentries(x, treename)), filelist))
    for fn, nentries in info:
        nevents += nentries
        for index in range(nentries // chunksize + 1):
            chunks.append((fn, chunksize * index, min(chunksize * (index + 1), nentries)))
    return chunks, nevents
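# Minimal driver sketch for the dask-based variant above. The scheduler address
# and file names are placeholders; dask.distributed must be installed.
from dask.distributed import Client

client = Client()  # or Client("scheduler-address:8786") for an existing cluster
filelist = ["skim_1.root", "skim_2.root"]
chunks, nevents = get_chunking_dask(filelist, chunksize=500000, client=client, treename="Events")
print(len(chunks), "chunks covering", nevents, "events")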
def CheckMCProcess(quenue, filelist, outname, outdir):
    for i in range(0, quenue):
        inputlist_ = filelist.replace("$(Process)", str(i))
        outputname_ = outname.replace("$(Process)", str(i))
        inputlist = "FileList/%s" % inputlist_
        outputname = "root://cmseos.fnal.gov/%s/%s" % (outdir, outputname_)
        if not os.path.exists(inputlist):
            continue
        ncnt_dict = GetNEventFromList(inputlist)
        ncnt_list = sum(ncnt_dict.values())
        outcnt = 0
        try:
            outcnt = uproot.numentries(outputname, TreeName)
        except Exception:
            print("Something wrong with file")
        if ncnt_list != outcnt:
            print("Production failed for %s : input file %s , outputname %s" % (condorfile, inputlist, outputname))
def read_file(path, sample): start = time.time() print("\tProcessing: " + sample) data_all = pd.DataFrame() mc = uproot.open(path)["mini"] numevents = uproot.numentries(path, "mini") for data in mc.iterate([ "photon_n", "photon_pt", "photon_eta", "photon_phi", "photon_etcone20", "photon_ptcone30", "photon_isTightID" ], flatten=False, entrysteps=2500000, outputtype=pd.DataFrame, entrystop=numevents * fraction): nIn = len(data.index) # Calculate reconstructed diphoton invariant mass data['myy'] = np.vectorize(calc_myy)(data.photon_pt, data.photon_eta, data.photon_phi) # Cut on number of photons fail = data[np.vectorize(HyyCuts.cut_photon_n)(data.photon_n)].index data.drop(fail, inplace=True) # Cut on pseudorapidity outside fiducial region fail = data[np.vectorize(HyyCuts.cut_photon_eta_fiducial)( data.photon_eta)].index data.drop(fail, inplace=True) # Cut on pseudorapidity inside barrel/end-cap transition region fail = data[np.vectorize(HyyCuts.cut_photon_eta_transition)( data.photon_eta)].index data.drop(fail, inplace=True) # Cut on transverse momentum of the photons fail = data[np.vectorize(HyyCuts.cut_photon_pt)(data.photon_pt)].index data.drop(fail, inplace=True) # Cut on photon reconstruction fail = data[np.vectorize(HyyCuts.cut_photon_reconstruction)( data.photon_isTightID)].index data.drop(fail, inplace=True) # Cut on energy isolation fail = data[np.vectorize(HyyCuts.cut_isolation_et)( data.photon_etcone20)].index data.drop(fail, inplace=True) # Cut on lower limit of reconstructed invariant mass fail = data[np.vectorize(HyyCuts.cut_mass_lower)(data.myy)].index data.drop(fail, inplace=True) # Cut on upper limit of reconsructed invariant mass fail = data[np.vectorize(HyyCuts.cut_mass_upper)(data.myy)].index data.drop(fail, inplace=True) # dataframe contents can be printed at any stage like this #print(data) # dataframe column can be printed at any stage like this #print(data['photon_pt']) # dataframe columns can be printed at any stage like this #print(data[['photon_pt','photon_eta']]) nOut = len(data.index) data_all = data_all.append(data) elapsed = time.time() - start print("\t\tTime taken: " + str(elapsed) + ", nIn: " + str(nIn) + ", nOut: " + str(nOut)) return data_all
def GetNEvent(file):
    return (file, uproot.numentries(file, TTreeName))
def PoolStopfile(file):
    return (file, uproot.numentries(file, TreeName))
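# The two one-argument helpers above are written to be mapped over a file list by a
# process pool. A minimal sketch, assuming the module-level TreeName/TTreeName globals
# are defined and the paths below are placeholders.
from multiprocessing import Pool

files = ["ntuple_1.root", "ntuple_2.root"]
with Pool(processes=4) as pool:
    counts = dict(pool.map(PoolStopfile, files))  # {filename: nentries}
print("total entries:", sum(counts.values()))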
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    if 'data' in sample:
        fraction_MC = 1
    else:
        fraction_MC = fraction
    for data in mc.iterate(["lep_pt", "lep_eta", "lep_phi", "lep_type", "lep_charge",
                            "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                            # add more variables here if you make cuts on them
                            "scaleFactor_LepTRIGGER"],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction_MC):
        nIn = len(data.index)

        # label for each mc type
        data['type'] = np.vectorize(mc_type)(sample)
        # label for channel (ee, mm or em)
        data['Channel'] = np.vectorize(channel)(data.lep_type)
        data['SumCharges'] = np.vectorize(sum_charge)(data.lep_charge)
        data['MinmOCST'] = np.vectorize(calc_min_mOCST)(data.lep_pt, data.lep_eta, data.lep_phi,
                                                        data.lep_charge, data.lep_type)
        data['LepPt0'] = np.vectorize(lep_pt_0)(data.lep_pt)
        data['LepPt1'] = np.vectorize(lep_pt_1)(data.lep_pt)
        data['LepPt2'] = np.vectorize(lep_pt_2)(data.lep_pt)
        data['LepPt3'] = np.vectorize(lep_pt_3)(data.lep_pt)
        data['MZ1'] = np.vectorize(calc_m_Z1)(data.lep_pt, data.lep_eta, data.lep_phi,
                                              data.lep_charge, data.lep_type)
        data['Z1_pair'] = np.vectorize(find_Z1_pair)(data.lep_pt, data.lep_eta, data.lep_phi,
                                                     data.lep_charge, data.lep_type)
        data['MZ2'] = np.vectorize(calc_m_Z2)(data.lep_pt, data.lep_eta, data.lep_phi,
                                              data.lep_charge, data.lep_type, data.Z1_pair)
        data['Mllll'] = np.vectorize(calc_mllll)(data.lep_pt, data.lep_eta, data.lep_phi)

        if 'data' not in sample:
            data['weight'] = np.vectorize(calc_weight)(data.mcWeight, data.scaleFactor_PILEUP,
                                                       data.scaleFactor_ELE, data.scaleFactor_MUON,
                                                       data.scaleFactor_LepTRIGGER)
            data['weight'] = np.vectorize(get_xsec_weight)(data.weight, sample) / fraction
        else:
            data['weight'] = 1

        data.drop(["LepPt3", "MinmOCST", "Z1_pair", "lep_pt", "lep_eta", "lep_phi", "lep_type",
                   "lep_charge", "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE",
                   "scaleFactor_MUON", "scaleFactor_LepTRIGGER"], axis=1, inplace=True)

        data = data[data.weight != 0]

        #print(data[['lep_eta']])
        #print(data)

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " + str(nIn) + ", nOut: " + str(nOut))
    return data_all
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample)
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    for data in mc.iterate(["scaleFactor_ELE", "scaleFactor_MUON", "scaleFactor_LepTRIGGER",
                            "scaleFactor_PILEUP", "mcWeight", "trigE", "trigM", "lep_n", "lep_pt",
                            "lep_eta", "lep_phi", "lep_E", "lep_z0", "lep_type", "lep_isTightID",
                            "lep_ptcone30", "lep_etcone20", "lep_charge", "lep_trackd0pvunbiased",
                            "lep_tracksigd0pvunbiased", "jet_n"],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction):
        try:
            nIn = len(data.index)

            if 'data' not in sample:
                data['totalWeight'] = np.vectorize(calc_weight)(data.mcWeight, data.scaleFactor_PILEUP,
                                                                data.scaleFactor_ELE, data.scaleFactor_MUON,
                                                                data.scaleFactor_LepTRIGGER)
                data['totalWeight'] = np.vectorize(get_xsec_weight)(data.totalWeight, sample)
                data.drop(["mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                           "scaleFactor_LepTRIGGER"], axis=1, inplace=True)

            # Cut on number of leptons
            fail = data[np.vectorize(ZBosonCuts.cut_lep_n)(data.lep_n)].index
            data.drop(fail, inplace=True)

            # Preselection cut for electron/muon trigger
            fail = data[np.vectorize(ZBosonCuts.lepton_trigger)(data.trigE, data.trigM)].index
            data.drop(fail, inplace=True)

            # Both leptons are tight
            fail = data[np.vectorize(ZBosonCuts.lepton_is_tight)(data.lep_isTightID)].index
            data.drop(fail, inplace=True)

            # Both leptons are isolated and hard pT
            fail = data[np.vectorize(ZBosonCuts.lepton_isolated_hard_pt)(data.lep_pt, data.lep_ptcone30,
                                                                         data.lep_etcone20)].index
            data.drop(fail, inplace=True)

            # electron and muon selection
            fail = data[np.vectorize(ZBosonCuts.lepton_selection)(data.lep_type, data.lep_pt, data.lep_eta,
                                                                  data.lep_phi, data.lep_E,
                                                                  data.lep_trackd0pvunbiased,
                                                                  data.lep_tracksigd0pvunbiased,
                                                                  data.lep_z0)].index
            data.drop(fail, inplace=True)

            # Cut on oppositely charged leptons
            fail = data[np.vectorize(ZBosonCuts.cut_opposite_charge)(data.lep_charge)].index
            data.drop(fail, inplace=True)

            # Cut on leptons of same flavour
            fail = data[np.vectorize(ZBosonCuts.cut_same_flavour)(data.lep_type)].index
            data.drop(fail, inplace=True)

            # Calculate invariant mass
            data['mll'] = np.vectorize(calc_mll)(data.lep_pt, data.lep_eta, data.lep_phi, data.lep_E)

            # Cut on invariant mass
            fail = data[np.vectorize(ZBosonCuts.cut_invariant_mass)(data.mll)].index
            data.drop(fail, inplace=True)

            # jet cut
            fail = data[np.vectorize(ZBosonCuts.cut_jet_n)(data.jet_n)].index
            data.drop(fail, inplace=True)

            nOut = len(data.index)
            data_all = data_all.append(data)
            elapsed = time.time() - start
            print("\t\tTime taken: " + str(elapsed) + ", nIn: " + str(nIn) + ", nOut: " + str(nOut))
        except ValueError:
            print("ValueError. Probably vectorizing on zero-length input")
            continue
    return data_all
def _get_metadata(item):
    nentries = uproot.numentries(item.filename, item.treename)
    return set_accumulator([FileMeta(item.dataset, item.filename, item.treename, nentries)])
dataset = os.path.basename(directory)
if not any(fnmatch.fnmatch(dataset, pattern) for pattern in patterns):
    continue
print(directory)
flist = fnmatch.filter(xrdls(directory), "*.root")
if len(flist) == 0:
    print(" NO FILES")
    continue
nbytes = executor.map(lambda path: xrdfstat(path)['size'], flist)
nbytes = np.array(list(nbytes))
urllist = [fnaleos + path for path in flist]
if getentries:
    nentries = uproot.numentries(urllist, "Events", total=False, executor=executor)
    nentries = np.array(list(nentries.values()))
print(" # Files:", len(flist))
print(" Total bytes: %d" % nbytes.sum())
print(" Avg. bytes: %.0f" % (nbytes.sum() / nbytes.size))
if dataset in xsections:
    xs = xsections[dataset]
elif dataset not in xsections and 'Run201' not in dataset:
    nearest = list(xsections.keys())
    nearest.sort(key=lambda s: difflib.SequenceMatcher(None, s, dataset).ratio())
    print(" ", dataset, " missing xsection, taking closest name:", nearest[-1])
    xs = xsections[nearest[-1]]
def _get_chunking_lazy(filelist, treename, chunksize):
    for fn in filelist:
        nentries = uproot.numentries(fn, treename)
        for index in range(nentries // chunksize + 1):
            yield (fn, chunksize, index)
import ROOT
from ROOT import TGenPhaseSpace, TLorentzVector, TVector3, TGraph
from ROOT import TMath, TCanvas, TH1F, TH2F, TComplex, TStyle, TColor, TChain, TTree, TLegend, TList, TLatex, THStack
import uproot
import uproot_methods
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from array import array

events = uproot.open("/home/camsdiaz/Documents/Tandem/Lb_Tuple_presel.root")["tree"]
num_entries = uproot.numentries("/home/camsdiaz/Documents/Tandem/Lb_Tuple_presel.root", "tree")

L = 80559
D = 191256

# reconstructed momenta
px_proton_reco, py_proton_reco, pz_proton_reco = events.arrays(
    ["hplus_PX", "hplus_PY", "hplus_PZ"], outputtype=tuple)
px_pion_reco, py_pion_reco, pz_pion_reco = events.arrays(
    ["hminus_PX", "hminus_PY", "hminus_PZ"], outputtype=tuple)

# true momenta
px_proton_true, py_proton_true, pz_proton_true = events.arrays(
    ["hplus_TRUEP_X", "hplus_TRUEP_Y", "hplus_TRUEP_Z"], outputtype=tuple)
px_pion_true, py_pion_true, pz_pion_true = events.arrays(
    ["hminus_TRUEP_X", "hminus_TRUEP_Y", "hminus_TRUEP_Z"], outputtype=tuple)

# Track type
tracktype_proton, tracktype_pion = events.arrays(
    ["hplus_TRACK_Type", "hminus_TRACK_Type"], outputtype=tuple)
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    if 'data' in sample:
        fraction_MC = fraction
    else:
        fraction_MC = fraction * MC_to_data_ratio
    entrystart = 0
    entrystop = numevents * fraction_MC
    for data in mc.iterate(["lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge", "lep_type",
                            "lep_isTightID", "lep_ptcone30", "lep_etcone20", "lep_z0",
                            "lep_trackd0pvunbiased", "lep_tracksigd0pvunbiased", "jet_n", "jet_pt",
                            "jet_eta", "jet_jvt", "jet_MV2c10", "met_et", "met_phi", "mcWeight",
                            "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                            # add more variables here if you make cuts on them
                            "scaleFactor_LepTRIGGER"],
                           flatten=False,
                           entrysteps=2212282,
                           outputtype=pd.DataFrame,
                           entrystart=entrystart,
                           entrystop=entrystop):
        nIn = len(data.index)

        data['good_lep_0_index'] = np.vectorize(find_good_lep_0_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta, data.lep_ptcone30,
            data.lep_etcone20, data.lep_isTightID, data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased)
        data['good_lep_1_index'] = np.vectorize(find_good_lep_1_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta, data.lep_ptcone30,
            data.lep_etcone20, data.lep_isTightID, data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased, data.good_lep_0_index)
        data['good_lep_2_index'] = np.vectorize(find_good_lep_2_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta, data.lep_ptcone30,
            data.lep_etcone20, data.lep_isTightID, data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased, data.good_lep_1_index)
        data['good_jet_0_index'] = np.vectorize(find_good_jet_0_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt)
        data['good_jet_1_index'] = np.vectorize(find_good_jet_1_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_0_index)
        data['good_jet_2_index'] = np.vectorize(find_good_jet_2_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_1_index)
        data['good_jet_3_index'] = np.vectorize(find_good_jet_3_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_2_index)
        data['good_jet_4_index'] = np.vectorize(find_good_jet_4_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_3_index)
        data['good_jet_5_index'] = np.vectorize(find_good_jet_5_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_4_index)
        data['good_jet_6_index'] = np.vectorize(find_good_jet_6_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_5_index)
        data['good_jet_7_index'] = np.vectorize(find_good_jet_7_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_6_index)
        data['good_jet_8_index'] = np.vectorize(find_good_jet_8_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt, data.good_jet_7_index)

        # throw away events where number of leptons isn't 2
        fail = data[np.vectorize(cut_good_lep_n)(data.good_lep_1_index, data.good_lep_2_index)].index
        data.drop(fail, inplace=True)

        # throw away events where leptons aren't oppositely charged
        fail = data[np.vectorize(cut_lep_charge)(data.lep_charge, data.good_lep_0_index,
                                                 data.good_lep_1_index)].index
        data.drop(fail, inplace=True)

        # cut on channel
        fail = data[np.vectorize(channel_cut)(data.lep_type, data.good_lep_0_index,
                                              data.good_lep_1_index)].index
        data.drop(fail, inplace=True)

        fail = data[np.vectorize(cut_met_et)(data.met_et)].index
        data.drop(fail, inplace=True)

        # label for each mc type
        data['type'] = np.vectorize(mc_type)(sample)
        # label for channel (ee, mm or em)
        #data['Channel'] = np.vectorize(channel)(data.lep_type,data.good_lep_0_index,data.good_lep_1_index)

        # number of jets
        data['NJets'] = np.vectorize(calc_good_jet_n)(
            data.jet_pt, data.good_jet_0_index, data.good_jet_1_index, data.good_jet_2_index,
            data.good_jet_3_index, data.good_jet_4_index, data.good_jet_5_index,
            data.good_jet_6_index, data.good_jet_7_index, data.good_jet_8_index)

        # MET
        data['MET'] = round(data['met_et'] / 1000, 2)

        # calculation of 2-lepton invariant mass
        data['Mll'] = np.vectorize(calc_mll)(data.lep_pt, data.lep_eta, data.lep_phi, data.lep_E,
                                             data.good_lep_0_index, data.good_lep_1_index)
        fail = data[np.vectorize(Mll_cut_lower)(data.Mll)].index
        data.drop(fail, inplace=True)
        fail = data[np.vectorize(Mll_cut_upper)(data.Mll)].index
        data.drop(fail, inplace=True)

        # transverse mass
        data['TransMass'] = np.vectorize(calc_Mt)(data.lep_pt, data.lep_eta, data.lep_E, data.met_et,
                                                  data.good_lep_0_index, data.good_lep_1_index)
        fail = data[np.vectorize(cut_Mt_upper)(data.TransMass)].index
        data.drop(fail, inplace=True)

        # Angular separation between leptons
        data['LepDeltaPhi'] = np.vectorize(calc_dPhiLL)(data.lep_phi, data.good_lep_0_index,
                                                        data.good_lep_1_index)

        # Angular separation between leptons and MET dPhi(MET,ll)
        data['METLLDeltaPhi'] = np.vectorize(calc_dPhiLLmet)(data.lep_pt, data.lep_phi, data.met_phi,
                                                             data.good_lep_0_index, data.good_lep_1_index)

        # Sum of lepton pt
        data['SumLepPt'] = np.vectorize(calc_ptLL)(data.lep_pt, data.lep_phi, data.good_lep_0_index,
                                                   data.good_lep_1_index)
        fail = data[np.vectorize(cut_SumLepPt)(data.SumLepPt)].index
        data.drop(fail, inplace=True)

        # whether at least 1 jet is btagged
        data['BTags'] = np.vectorize(bjets)(
            data.jet_MV2c10, data.good_jet_0_index, data.good_jet_1_index, data.good_jet_2_index,
            data.good_jet_3_index, data.good_jet_4_index, data.good_jet_5_index,
            data.good_jet_6_index, data.good_jet_7_index, data.good_jet_8_index)

        if 'data' not in sample:
            data['weight'] = np.vectorize(calc_weight)(data.mcWeight, data.scaleFactor_PILEUP,
                                                       data.scaleFactor_ELE, data.scaleFactor_MUON,
                                                       data.scaleFactor_LepTRIGGER)
            data['weight'] = np.vectorize(get_xsec_weight)(data.weight, sample)
            # throw away events with weight < 0.00005
            fail = data[np.vectorize(cut_weight)(data.weight)].index
            data.drop(fail, inplace=True)
        else:
            data['weight'] = 1

        data.drop(["lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge", "lep_type",
                   "lep_isTightID", "lep_ptcone30", "lep_etcone20", "lep_z0",
                   "lep_trackd0pvunbiased", "lep_tracksigd0pvunbiased", "jet_n", "jet_pt",
                   "jet_eta", "jet_jvt", "jet_MV2c10", "met_et", "met_phi", "mcWeight",
                   "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                   "scaleFactor_LepTRIGGER", "good_lep_0_index", "good_lep_1_index",
                   "good_lep_2_index", "good_jet_0_index", "good_jet_1_index", "good_jet_2_index",
                   "good_jet_3_index", "good_jet_4_index", "good_jet_5_index", "good_jet_6_index",
                   "good_jet_7_index", "good_jet_8_index"], axis=1, inplace=True)

        #print(data[['LepDeltaPhi','METLLDeltaPhi','TransMass','weight']])
        #print(data['weight'])

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " + str(nIn) + ", nOut: " + str(nOut))
    return data_all
def read_file(path, sample):
    start = time.time()  # start the clock
    print("\tProcessing: " + sample)  # print which sample is being processed
    data_all = pd.DataFrame()  # define empty pandas DataFrame to hold all data for this sample
    tree = uproot.open(path)["mini"]  # open the tree called mini
    numevents = uproot.numentries(path, "mini")  # number of events
    for data in tree.iterate(["lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge", "lep_type",
                              "lep_ptcone30", "lep_etcone20",  # add more variables here if you make cuts on them
                              "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                              "scaleFactor_LepTRIGGER"],  # variables to calculate Monte Carlo weight
                             entrysteps=2500000,  # number of events in a batch to process
                             outputtype=pd.DataFrame,  # choose output type as pandas DataFrame
                             entrystop=numevents * fraction):  # process up to numevents*fraction
        nIn = len(data.index)  # number of events in this batch
        print('\t initial number of events:\t\t\t', nIn)

        if 'data' not in sample:  # only do this for Monte Carlo simulation files
            # multiply all Monte Carlo weights and scale factors together to give total weight
            data['totalWeight'] = np.vectorize(calc_weight)(data.mcWeight, data.scaleFactor_PILEUP,
                                                            data.scaleFactor_ELE, data.scaleFactor_MUON,
                                                            data.scaleFactor_LepTRIGGER)
            # incorporate the cross-section weight into the total weight
            data['totalWeight'] = np.vectorize(get_xsec_weight)(data.totalWeight, sample)
            # drop the columns we don't need anymore from the dataframe
            data.drop(["mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                       "scaleFactor_LepTRIGGER"], axis=1, inplace=True)

        # cut on number of leptons using the function cut_lep_n defined above
        fail = data[np.vectorize(cut_lep_n)(data.lep_n)].index
        data.drop(fail, inplace=True)
        print('\t after requiring 4 leptons:\t\t\t', len(data.index))

        # cut on lepton charge using the function cut_lep_charge defined above
        fail = data[np.vectorize(cut_lep_charge)(data.lep_charge)].index
        data.drop(fail, inplace=True)
        print('\t after requiring zero net charge:\t\t', len(data.index))

        # cut on lepton type using the function cut_lep_type defined above
        fail = data[np.vectorize(cut_lep_type)(data.lep_type)].index
        data.drop(fail, inplace=True)
        print('\t after requiring lepton pairs of same type:\t', len(data.index))

        # cut on the transverse momentum of the leptons using the function cut_lep_pt_012 defined above
        #fail = data[np.vectorize(cut_lep_pt_012)(data.lep_pt)].index
        #data.drop(fail, inplace=True)
        #print('\t after requirements on lepton pt:\t\t', len(data.index))

        # calculation of 4-lepton invariant mass using the function calc_mllll defined above
        data['mllll'] = np.vectorize(calc_mllll)(data.lep_pt, data.lep_eta, data.lep_phi, data.lep_E)

        # return the individual lepton transverse momenta in GeV
        #data['lep_pt_0'] = np.vectorize(lep_pt_0)(data.lep_pt)
        data['lep_pt_1'] = np.vectorize(lep_pt_1)(data.lep_pt)
        data['lep_pt_2'] = np.vectorize(lep_pt_2)(data.lep_pt)
        #data['lep_pt_3'] = np.vectorize(lep_pt_3)(data.lep_pt)

        # dataframe contents can be printed at any stage like this
        #print(data)
        # dataframe column can be printed at any stage like this
        #print(data['lep_pt'])
        # multiple dataframe columns can be printed at any stage like this
        #print(data[['lep_pt','lep_eta']])

        nOut = len(data.index)  # number of events passing cuts in this batch
        data_all = data_all.append(data)  # append dataframe from this batch to the dataframe for the whole sample
        elapsed = time.time() - start  # time taken to process
        print("\t\t nIn: " + str(nIn) + ",\t nOut: \t" + str(nOut) + "\t in " + str(round(elapsed, 1)) + "s")  # events before and after

    return data_all  # return dataframe containing events passing all cuts
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    for data in mc.iterate(["lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge", "lep_type",
                            "lep_ptcone30", "lep_etcone20", "mcWeight", "scaleFactor_PILEUP",
                            "scaleFactor_ELE", "scaleFactor_MUON", "scaleFactor_LepTRIGGER"],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction):
        nIn = len(data.index)

        if 'data' not in sample:
            data['totalWeight'] = np.vectorize(calc_weight)(data.mcWeight, data.scaleFactor_PILEUP,
                                                            data.scaleFactor_ELE, data.scaleFactor_MUON,
                                                            data.scaleFactor_LepTRIGGER)
            data['totalWeight'] = np.vectorize(get_xsec_weight)(data.totalWeight, sample)
            data.drop(["mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
                       "scaleFactor_LepTRIGGER"], axis=1, inplace=True)

        # cut on number of leptons
        fail = data[np.vectorize(HZZCuts.cut_n_lep)(data.lep_n)].index
        data.drop(fail, inplace=True)

        # cut on lepton charge
        fail = data[np.vectorize(HZZCuts.cut_lep_charge)(data.lep_charge)].index
        data.drop(fail, inplace=True)

        # cut on the transverse momentum of the leptons
        fail = data[np.vectorize(HZZCuts.cut_lep_pt_012)(data.lep_pt)].index
        data.drop(fail, inplace=True)

        # cut on lepton type
        fail = data[np.vectorize(HZZCuts.cut_lep_type)(data.lep_type)].index
        data.drop(fail, inplace=True)

        # cut on lepton momentum isolation
        #fail = data[np.vectorize(HZZCuts.cut_lep_ptcone)(data.lep_ptcone30, data.lep_pt)].index
        #data.drop(fail, inplace=True)

        # cut on lepton energy isolation
        #fail = data[np.vectorize(HZZCuts.cut_lep_etcone)(data.lep_etcone20, data.lep_pt)].index
        #data.drop(fail, inplace=True)

        data['mllll'] = np.vectorize(calc_mllll)(data.lep_pt, data.lep_eta, data.lep_phi)

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " + str(nIn) + ", nOut: " + str(nOut))
    return data_all
def do_cut(tree_name, files, supercuts, proposedBranches, output_directory, eventWeightBranch, pids):
    position = -1
    if pids is not None:
        # handle pid registration
        if os.getpid() not in pids:
            pids[np.argmax(pids == 0)] = os.getpid()
        # this gives us the position of this particular process in our list of processes
        position = np.where(pids == os.getpid())[0][0]

    start = clock()
    try:
        branches = []
        aliases = {}
        missingBranches = False
        for fname in files:
            with uproot.open(fname) as f:
                tree = f[tree_name]
                for branch in proposedBranches:
                    if branch in tree:
                        branches.append(branch)
                    else:
                        if branch in tree.aliases:
                            aliases[branch.decode()] = formulate.from_auto(tree.aliases[branch].decode())
                            branches.extend(extract_branch_names(tree.aliases[branch]))
                        else:
                            logger.error('branch {} not found in {} for {}'.format(branch, tree_name, fname))
                            missingBranches |= True
        if missingBranches:
            sys.exit(1)

        for alias, alias_expr in aliases.items():
            alias_expr = expand_definition(alias_expr, aliases)
            branches.extend(extract_branch_names(alias_expr.to_numexpr()))
            aliases[alias] = alias_expr

        branches = set(branches)
        eventWeightBranch = expand_selection(eventWeightBranch, aliases)
        supercuts = expand_supercuts(supercuts, aliases)

        # iterate over the cuts available
        cuts = defaultdict(lambda: {'raw': 0, 'weighted': 0})

        events_tqdm = tqdm(
            total=uproot.numentries(files, tree_name),
            disable=(position == -1),
            position=2 * position + 1,
            leave=False,
            mininterval=5,
            maxinterval=10,
            unit="events",
            dynamic_ncols=True,
        )
        for file, start, stop, events in uproot.iterate(
            files,
            tree_name,
            branches=branches,
            namedecode='utf-8',
            reportfile=True,
            reportentries=True,
        ):
            events_tqdm.set_description("({1:d}) Working on {0:s}".format(tree_name.decode('utf-8'), 2 * position + 1))
            for cut in tqdm(
                get_cut(copy.deepcopy(supercuts)),
                desc="({1:d}) Applying cuts to {0:s}".format(file.name.decode('utf-8'), 2 * position + 2),
                total=get_n_cuts(supercuts),
                disable=(position == -1),
                position=2 * position + 2,
                leave=False,
                unit="cuts",
                miniters=10,
                dynamic_ncols=True,
            ):
                cut_hash = get_cut_hash(cut)
                rawEvents, weightedEvents = apply_cuts(events, cut, eventWeightBranch)
                cuts[cut_hash]['raw'] += rawEvents
                cuts[cut_hash]['weighted'] += weightedEvents
            events_tqdm.update(stop - start)

        with open("{0:s}/{1:s}.json".format(output_directory, tree_name.decode('utf-8')), "w+") as f:
            f.write(json.dumps(cuts, sort_keys=True, indent=4))
        result = True
    except:
        logger.exception("Caught an error - skipping {0:s}".format(tree_name.decode('utf-8')))
        result = False

    end = clock()
    return (result, end - start)
def derive_chunks(filename, treename, chunksize):
    import uproot
    nentries = uproot.numentries(filename, treename)
    return [(filename, chunksize, index) for index in range(nentries // chunksize + 1)]
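# Minimal sketch: derive chunk triplets for several files in parallel with a thread
# pool. The file paths and the "Events" tree name are placeholders, not part of the
# original snippet.
import concurrent.futures
import itertools

filelist = ["file_1.root", "file_2.root"]
with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
    per_file = pool.map(lambda fn: derive_chunks(fn, "Events", 100000), filelist)
    chunks = list(itertools.chain.from_iterable(per_file))
print(len(chunks), "chunks derived")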