Example #1
def get_chunking(filelist,
                 chunksize,
                 treename="Events",
                 workers=12,
                 skip_bad_files=False):
    """
    Return 2-tuple of
    - chunks: triplets of (filename, entrystart, entrystop) calculated with input `chunksize` and `filelist`
    - total_nevents: total event count over `filelist`
    """
    import uproot
    import awkward
    from tqdm.auto import tqdm
    import concurrent.futures
    chunksize = int(chunksize)
    chunks = []
    nevents = 0
    if skip_bad_files:
        # slightly slower (serial loop), but can skip bad files
        for fname in tqdm(filelist):
            try:
                items = uproot.numentries(fname, treename, total=False).items()
            except (IndexError, ValueError) as e:
                print("Skipping bad file", fname)
                continue
            for fn, nentries in items:
                nevents += nentries
                for index in range(nentries // chunksize + 1):
                    chunks.append((fn, chunksize * index,
                                   min(chunksize * (index + 1), nentries)))
    elif filelist[0].endswith(".awkd"):
        for fname in tqdm(filelist):
            f = awkward.load(fname,
                             whitelist=awkward.persist.whitelist +
                             [['blosc', 'decompress']])
            nentries = len(f["run"])
            nevents += nentries
            for index in range(nentries // chunksize + 1):
                chunks.append((fname, chunksize * index,
                               min(chunksize * (index + 1), nentries)))
    else:
        executor = (None if len(filelist) < 5 else
                    concurrent.futures.ThreadPoolExecutor(
                        min(workers, len(filelist))))
        for fn, nentries in uproot.numentries(filelist,
                                              treename,
                                              total=False,
                                              executor=executor).items():
            nevents += nentries
            if nentries <= 0:
                continue
            for index in range(nentries // chunksize + 1):
                chunks.append((fn, chunksize * index,
                               min(chunksize * (index + 1), nentries)))
    return chunks, nevents
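Usage note: the (filename, entrystart, entrystop) triplets returned above plug directly into uproot 3's entrystart/entrystop arguments. A minimal sketch, assuming filelist names readable ROOT files with an "Events" tree and a "run" branch (file and branch names are placeholders):

import uproot

filelist = ["nano_1.root", "nano_2.root"]  # placeholder file names
chunks, total_nevents = get_chunking(filelist, chunksize=500000)
for fname, start, stop in chunks:
    tree = uproot.open(fname)["Events"]
    # read one chunk of the (placeholder) "run" branch
    arrays = tree.arrays(["run"], entrystart=start, entrystop=stop)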
Example #2
def check_entries_uproot(files,
                         tree_names,
                         no_empty,
                         confirm_tree=True,
                         list_branches=False,
                         ignore_inaccessible=False):
    no_empty = no_empty or confirm_tree
    if not isinstance(tree_names, (tuple, list)):
        tree_names = [tree_names]

    if ignore_inaccessible:
        files = [f for f in files if os.access(f, os.R_OK)]

    if not no_empty:
        n_entries = {
            tree: uproot.numentries(files, tree)
            for tree in tree_names
        }
    else:
        n_entries = {tree: 0 for tree in tree_names}
        missing_trees = defaultdict(list)
        for tree in tree_names:
            totals = uproot.numentries(files, tree, total=False)
            for name, entries in totals.items():
                n_entries[tree] += entries
                if no_empty and entries <= 0:
                    files.remove(name)
                if confirm_tree and entries == 0:
                    if tree not in uproot.open(name):
                        missing_trees[tree].append(name)
        if missing_trees:
            files = set(sum((list(v) for v in missing_trees.values()), []))
            msg = "Missing at least one tree (%s) for %d file(s): %s"
        msg = msg % (", ".join(missing_trees), len(files),
                         ", ".join(files))
            raise RuntimeError(msg)

    branches = {}
    if list_branches:
        for tree in tree_names:
            open_files = (uproot.open(f) for f in files)
            all_branches = (f[tree].keys(recursive=True) for f in open_files
                            if tree in f)
            branches[tree] = dict(Counter(sum(all_branches, [])))

    if len(n_entries) == 1:
        n_entries = list(n_entries.values())[0]
    return files, n_entries, branches
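A hedged usage sketch for check_entries_uproot; the file names are placeholders, and the snippet's module is assumed to import os, uproot, and collections' Counter/defaultdict:

files = ["a.root", "b.root"]  # placeholder paths
files, n_entries, branches = check_entries_uproot(
    files, "Events", no_empty=True, confirm_tree=True, list_branches=True)
print(n_entries, sorted(branches.get("Events", {})))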
Example #3
def _get_chunking(filelist, treename, chunksize):
    items = []
    for fn in filelist:
        nentries = uproot.numentries(fn, treename)
        for index in range(nentries // chunksize + 1):
            items.append((fn, chunksize, index))
    return items
Example #4
 def __len__(self):
     if self.cutfunc is None:
         return uproot.numentries(self.file_names,
                                  self.tree_name,
                                  total=True)
     else:
         return len(self.upTree)
Example #5
def _get_metadata(item, skipbadfiles=False, retries=0, xrootdtimeout=None):
    import warnings
    out = set_accumulator()
    retry_count = 0
    while retry_count <= retries:
        try:
            # add timeout option according to modified uproot numentries defaults
            xrootdsource = {"timeout": xrootdtimeout, "chunkbytes": 32 * 1024, "limitbytes": 1024**2, "parallel": False}
            nentries = uproot.numentries(item.filename, item.treename, xrootdsource=xrootdsource)
            out = set_accumulator([FileMeta(item.dataset, item.filename, item.treename, nentries)])
            break
        except OSError as e:
            if not skipbadfiles:
                raise e
            else:
                w_str = 'Bad file source %s.' % item.filename
                if retries:
                    w_str += ' Attempt %d of %d.' % (retry_count + 1, retries + 1)
                    if retry_count < retries:
                        w_str += ' Will retry.'
                    else:
                        w_str += ' Skipping.'
                else:
                    w_str += ' Skipping.'
                warnings.warn(w_str)
        except Exception as e:
            if retries == retry_count:
                raise e
            w_str = 'Attempt %d of %d. Will retry.' % (retry_count + 1, retries + 1)
            warnings.warn(w_str)
        retry_count += 1
    return out
Example #6
def _get_chunking(filelist, treename, chunksize, workers=1):
    items = []
    executor = None if len(filelist) < 5 else concurrent.futures.ThreadPoolExecutor(workers)
    for fn, nentries in uproot.numentries(filelist, treename, total=False, executor=executor).items():
        for index in range(nentries // chunksize + 1):
            items.append((fn, chunksize, index))
    return items
Example #7
def main(args):
    mods = []

    #============================================================================#
    #-------------------------     Run PostProcessor     ------------------------#
    #============================================================================#
    files = []
    if args.inputfile.startswith("file:"):
        # a single test input file
        files.append(args.inputfile[5:])
    else:
        # a file list, one path per line
        with open(args.inputfile) as f:
            files = [line.strip() for line in f]

    nsplit = args.nSplit + 2
    for file in files:
        nevt = uproot.numentries(file, "Events")
        # split the event range into nsplit boundaries
        ran = np.linspace(0, nevt + 1, nsplit, dtype=int)
        print(file, nevt, ran)
        plist = []
        for i in range(len(ran) - 1):
            print(i, ran[i], ran[i + 1])
            p = Process(target=RunPost, args=(args.outputfile, file, ran, i))
            p.start()
            plist.append(p)
        # Process.join() returns None; collect exit codes after joining
        for p in plist:
            p.join()
        exit_codes = [p.exitcode for p in plist]
Example #8
 def nevents_in_file(self, path):
     if path in self._nevents_in_file_cache:
         nevents = self._nevents_in_file_cache[path]
     else:
         # count entries with uproot and cache the result per file
         nevents = uproot.numentries(path,
                                     self._treename_of_files_map[path])
         self._nevents_in_file_cache[path] = nevents
     # integer ceiling: number of blocks needed to cover the events
     nblocks = (nevents - 1) // self.nevents_per_block + 1
     return nblocks
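The block count uses the integer-ceiling idiom (n - 1) // k + 1. A quick self-contained check that it agrees with math.ceil for positive counts:

import math

nevents_per_block = 1000
for nevents in (1, 999, 1000, 1001):
    nblocks = (nevents - 1) // nevents_per_block + 1
    assert nblocks == math.ceil(nevents / nevents_per_block)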
Example #9
def get_chunking_dask(filelist, chunksize, client=None, treename="Events"):
    import uproot
    chunks, chunksize, nevents = [], int(chunksize), 0
    info = client.gather(
        client.map(lambda x: (x, uproot.numentries(x, treename)), filelist))
    for fn, nentries in info:
        nevents += nentries
        for index in range(nentries // chunksize + 1):
            chunks.append(
                (fn, chunksize * index, min(chunksize * (index + 1),
                                            nentries)))
    return chunks, nevents
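A minimal driving sketch for get_chunking_dask, assuming a local dask.distributed cluster and placeholder file names:

from dask.distributed import Client

client = Client()  # local cluster; a remote scheduler address would also work
chunks, nevents = get_chunking_dask(["a.root", "b.root"], 200000, client=client)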
Example #10
def CheckMCProcess(queue, filelist, outname, outdir):
    for i in range(0, queue):
        inputlist_ = filelist.replace("$(Process)", str(i))
        outputname_ = outname.replace("$(Process)", str(i))
        inputlist = "FileList/%s" % inputlist_
        outputname = "root://cmseos.fnal.gov/%s/%s" % (outdir, outputname_)
        if not os.path.exists(inputlist):
            continue
        ncnt_dict = GetNEventFromList(inputlist)
        ncnt_total = sum(ncnt_dict.values())
        outcnt = 0
        try:
            outcnt = uproot.numentries(outputname, TreeName)
        except Exception:
            print("Something wrong with file %s" % outputname)
        if ncnt_total != outcnt:
            print("Production failed: input list %s, output %s" %
                  (inputlist, outputname))
Example #11
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample)
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    for data in mc.iterate([
            "photon_n", "photon_pt", "photon_eta", "photon_phi",
            "photon_etcone20", "photon_ptcone30", "photon_isTightID"
    ],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction):

        nIn = len(data.index)

        # Calculate reconstructed diphoton invariant mass
        data['myy'] = np.vectorize(calc_myy)(data.photon_pt, data.photon_eta,
                                             data.photon_phi)

        # Cut on number of photons
        fail = data[np.vectorize(HyyCuts.cut_photon_n)(data.photon_n)].index
        data.drop(fail, inplace=True)

        # Cut on pseudorapidity outside fiducial region
        fail = data[np.vectorize(HyyCuts.cut_photon_eta_fiducial)(
            data.photon_eta)].index
        data.drop(fail, inplace=True)

        # Cut on pseudorapidity inside barrel/end-cap transition region
        fail = data[np.vectorize(HyyCuts.cut_photon_eta_transition)(
            data.photon_eta)].index
        data.drop(fail, inplace=True)

        # Cut on transverse momentum of the photons
        fail = data[np.vectorize(HyyCuts.cut_photon_pt)(data.photon_pt)].index
        data.drop(fail, inplace=True)

        # Cut on photon reconstruction
        fail = data[np.vectorize(HyyCuts.cut_photon_reconstruction)(
            data.photon_isTightID)].index
        data.drop(fail, inplace=True)

        # Cut on energy isolation
        fail = data[np.vectorize(HyyCuts.cut_isolation_et)(
            data.photon_etcone20)].index
        data.drop(fail, inplace=True)

        # Cut on lower limit of reconstructed invariant mass
        fail = data[np.vectorize(HyyCuts.cut_mass_lower)(data.myy)].index
        data.drop(fail, inplace=True)

        # Cut on upper limit of reconstructed invariant mass
        fail = data[np.vectorize(HyyCuts.cut_mass_upper)(data.myy)].index
        data.drop(fail, inplace=True)

        # dataframe contents can be printed at any stage like this
        #print(data)

        # dataframe column can be printed at any stage like this
        #print(data['photon_pt'])

        # dataframe columns can be printed at any stage like this
        #print(data[['photon_pt','photon_eta']])

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\tTime taken: " + str(elapsed) + ", nIn: " + str(nIn) +
              ", nOut: " + str(nOut))

    return data_all
Example #12
def GetNEvent(file):
    return (file, uproot.numentries(file, TTreeName))
Example #13
def PoolStopfile(file):
    return (file, uproot.numentries(file, TreeName))
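PoolStopfile is shaped as a multiprocessing worker that returns (filename, entry count) pairs. A hedged sketch, assuming TreeName is defined at module level as in the snippet and the file names are placeholders:

from multiprocessing import Pool

files = ["out_1.root", "out_2.root"]  # placeholder paths
with Pool(processes=4) as pool:
    counts = dict(pool.map(PoolStopfile, files))  # {filename: nentries}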
Example #14
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    if 'data' in sample: fraction_MC = 1
    else: fraction_MC = fraction
    for data in mc.iterate(
        [
            "lep_pt",
            "lep_eta",
            "lep_phi",
            "lep_type",
            "lep_charge",
            "mcWeight",
            "scaleFactor_PILEUP",
            "scaleFactor_ELE",
            "scaleFactor_MUON",  # add more variables here if you make cuts on them ,  
            "scaleFactor_LepTRIGGER"
        ],
            flatten=False,
            entrysteps=2500000,
            outputtype=pd.DataFrame,
            entrystop=numevents * fraction_MC):

        nIn = len(data.index)

        # label for each mc type
        data['type'] = np.vectorize(mc_type)(sample)

        # label for channel (ee, mm or em)
        data['Channel'] = np.vectorize(channel)(data.lep_type)

        data['SumCharges'] = np.vectorize(sum_charge)(data.lep_charge)
        data['MinmOCST'] = np.vectorize(calc_min_mOCST)(data.lep_pt,
                                                        data.lep_eta,
                                                        data.lep_phi,
                                                        data.lep_charge,
                                                        data.lep_type)
        data['LepPt0'] = np.vectorize(lep_pt_0)(data.lep_pt)
        data['LepPt1'] = np.vectorize(lep_pt_1)(data.lep_pt)
        data['LepPt2'] = np.vectorize(lep_pt_2)(data.lep_pt)
        data['LepPt3'] = np.vectorize(lep_pt_3)(data.lep_pt)
        data['MZ1'] = np.vectorize(calc_m_Z1)(data.lep_pt, data.lep_eta,
                                              data.lep_phi, data.lep_charge,
                                              data.lep_type)
        data['Z1_pair'] = np.vectorize(find_Z1_pair)(data.lep_pt, data.lep_eta,
                                                     data.lep_phi,
                                                     data.lep_charge,
                                                     data.lep_type)
        data['MZ2'] = np.vectorize(calc_m_Z2)(data.lep_pt, data.lep_eta,
                                              data.lep_phi, data.lep_charge,
                                              data.lep_type, data.Z1_pair)
        data['Mllll'] = np.vectorize(calc_mllll)(data.lep_pt, data.lep_eta,
                                                 data.lep_phi)

        if 'data' not in sample:
            data['weight'] = np.vectorize(calc_weight)(
                data.mcWeight, data.scaleFactor_PILEUP, data.scaleFactor_ELE,
                data.scaleFactor_MUON, data.scaleFactor_LepTRIGGER)
            data['weight'] = np.vectorize(get_xsec_weight)(data.weight,
                                                           sample) / fraction
        else:
            data['weight'] = 1

        data.drop([
            "LepPt3", "MinmOCST", "Z1_pair", "lep_pt", "lep_eta", "lep_phi",
            "lep_type", "lep_charge", "mcWeight", "scaleFactor_PILEUP",
            "scaleFactor_ELE", "scaleFactor_MUON", "scaleFactor_LepTRIGGER"
        ],
                  axis=1,
                  inplace=True)

        data = data[data.weight != 0]

        #print(data[['lep_eta']])
        #print(data)

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " +
              str(nIn) + ", nOut: " + str(nOut))

    return data_all
Example #15
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample)
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")

    for data in mc.iterate([
            "scaleFactor_ELE", "scaleFactor_MUON", "scaleFactor_LepTRIGGER",
            "scaleFactor_PILEUP", "mcWeight", "trigE", "trigM", "lep_n",
            "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_z0", "lep_type",
            "lep_isTightID", "lep_ptcone30", "lep_etcone20", "lep_charge",
            "lep_trackd0pvunbiased", "lep_tracksigd0pvunbiased", "jet_n"
    ],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction):

        try:
            nIn = len(data.index)

            if 'data' not in sample:
                data['totalWeight'] = np.vectorize(calc_weight)(
                    data.mcWeight, data.scaleFactor_PILEUP,
                    data.scaleFactor_ELE, data.scaleFactor_MUON,
                    data.scaleFactor_LepTRIGGER)
                data['totalWeight'] = np.vectorize(get_xsec_weight)(
                    data.totalWeight, sample)

            data.drop([
                "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE",
                "scaleFactor_MUON", "scaleFactor_LepTRIGGER"
            ],
                      axis=1,
                      inplace=True)

            # Cut on number of leptons
            fail = data[np.vectorize(ZBosonCuts.cut_lep_n)(data.lep_n)].index
            data.drop(fail, inplace=True)

            # Preselection cut for electron/muon trigger
            fail = data[np.vectorize(ZBosonCuts.lepton_trigger)(
                data.trigE, data.trigM)].index
            data.drop(fail, inplace=True)

            # Both leptons are tight
            fail = data[np.vectorize(ZBosonCuts.lepton_is_tight)(
                data.lep_isTightID)].index
            data.drop(fail, inplace=True)

            # Both leptons are isolated and hard pT
            fail = data[np.vectorize(ZBosonCuts.lepton_isolated_hard_pt)(
                data.lep_pt, data.lep_ptcone30, data.lep_etcone20)].index
            data.drop(fail, inplace=True)

            # electron and muon selection
            fail = data[np.vectorize(ZBosonCuts.lepton_selection)(
                data.lep_type, data.lep_pt, data.lep_eta, data.lep_phi,
                data.lep_E, data.lep_trackd0pvunbiased,
                data.lep_tracksigd0pvunbiased, data.lep_z0)].index
            data.drop(fail, inplace=True)

            # Cut on oppositely charged leptons
            fail = data[np.vectorize(ZBosonCuts.cut_opposite_charge)(
                data.lep_charge)].index
            data.drop(fail, inplace=True)

            # Cut on leptons of same flavour
            fail = data[np.vectorize(ZBosonCuts.cut_same_flavour)(
                data.lep_type)].index
            data.drop(fail, inplace=True)

            # Calculate invariant mass
            data['mll'] = np.vectorize(calc_mll)(data.lep_pt, data.lep_eta,
                                                 data.lep_phi, data.lep_E)

            # Cut on invariant mass
            fail = data[np.vectorize(ZBosonCuts.cut_invariant_mass)(
                data.mll)].index
            data.drop(fail, inplace=True)

            # jet cut
            fail = data[np.vectorize(ZBosonCuts.cut_jet_n)(data.jet_n)].index
            data.drop(fail, inplace=True)

            nOut = len(data.index)
            data_all = data_all.append(data)
            elapsed = time.time() - start
            print("\t\tTime taken: " + str(elapsed) + ", nIn: " + str(nIn) +
                  ", nOut: " + str(nOut))
        except ValueError:
            print("ValueError. Probably vectorizing on zero-length input")
            continue

    return data_all
Example #16
def _get_metadata(item):
    nentries = uproot.numentries(item.filename, item.treename)
    return set_accumulator([FileMeta(item.dataset, item.filename, item.treename, nentries)])
Example #17
        dataset = os.path.basename(directory)
        if not any(fnmatch.fnmatch(dataset, pattern) for pattern in patterns):
            continue
        print(directory)
        flist = fnmatch.filter(xrdls(directory), "*.root")
        if len(flist) == 0:
            print("    NO FILES")
            continue

        nbytes = executor.map(lambda path: xrdfstat(path)['size'], flist)
        nbytes = np.array(list(nbytes))

        urllist = [fnaleos + path for path in flist]
        if getentries:
            nentries = uproot.numentries(urllist,
                                         "Events",
                                         total=False,
                                         executor=executor)
            nentries = np.array(list(nentries.values()))

        print("    # Files:", len(flist))
        print("    Total bytes: %d" % nbytes.sum())
        print("    Avg. bytes: %.0f" % (nbytes.sum() / nbytes.size))
        if dataset in xsections:
            xs = xsections[dataset]
        elif 'Run201' not in dataset:
            nearest = list(xsections.keys())
            nearest.sort(key=lambda s: difflib.SequenceMatcher(
                None, s, dataset).ratio())
            print("    ", dataset, " missing xsection, taking closest name:",
                  nearest[-1])
            xs = xsections[nearest[-1]]
Example #18
def _get_chunking_lazy(filelist, treename, chunksize):
    for fn in filelist:
        nentries = uproot.numentries(fn, treename)
        for index in range(nentries // chunksize + 1):
            yield (fn, chunksize, index)
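Because _get_chunking_lazy is a generator, files are only opened as chunks are consumed. A sketch, with filelist as a placeholder list of ROOT files:

import itertools

first_ten = list(itertools.islice(
    _get_chunking_lazy(filelist, "Events", 100000), 10))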
Example #19
import ROOT
from ROOT import TGenPhaseSpace, TLorentzVector, TVector3, TGraph
from ROOT import TMath, TCanvas, TH1F, TH2F, TComplex, TStyle, TColor, TChain, TTree, TLegend, TList, TLatex, THStack
import uproot
import uproot_methods
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
from array import array

events = uproot.open(
    "/home/camsdiaz/Documents/Tandem/Lb_Tuple_presel.root")["tree"]
num_entries = uproot.numentries(
    "/home/camsdiaz/Documents/Tandem/Lb_Tuple_presel.root", "tree")
L = 80559
D = 191256
#reconstructed momenta
px_proton_reco, py_proton_reco, pz_proton_reco = events.arrays(
    ["hplus_PX", "hplus_PY", "hplus_PZ"], outputtype=tuple)
px_pion_reco, py_pion_reco, pz_pion_reco = events.arrays(
    ["hminus_PX", "hminus_PY", "hminus_PZ"], outputtype=tuple)

#true momenta
px_proton_true, py_proton_true, pz_proton_true = events.arrays(
    ["hplus_TRUEP_X", "hplus_TRUEP_Y", "hplus_TRUEP_Z"], outputtype=tuple)
px_pion_true, py_pion_true, pz_pion_true = events.arrays(
    ["hminus_TRUEP_X", "hminus_TRUEP_Y", "hminus_TRUEP_Z"], outputtype=tuple)

#Track type
tracktype_proton, tracktype_pion = events.arrays(
    ["hplus_TRACK_Type", "hminus_TRACK_Type"], outputtype=tuple)
Example #20
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    if 'data' in sample: fraction_MC = fraction
    else: fraction_MC = fraction * MC_to_data_ratio
    entrystart = 0
    entrystop = numevents * fraction_MC
    for data in mc.iterate(
        [
            "lep_n",
            "lep_pt",
            "lep_eta",
            "lep_phi",
            "lep_E",
            "lep_charge",
            "lep_type",
            "lep_isTightID",
            "lep_ptcone30",
            "lep_etcone20",
            "lep_z0",
            "lep_trackd0pvunbiased",
            "lep_tracksigd0pvunbiased",
            "jet_n",
            "jet_pt",
            "jet_eta",
            "jet_jvt",
            "jet_MV2c10",
            "met_et",
            "met_phi",
            "mcWeight",
            "scaleFactor_PILEUP",
            "scaleFactor_ELE",
            "scaleFactor_MUON",  # add more variables here if you make cuts on them ,              
            "scaleFactor_LepTRIGGER"
        ],
            flatten=False,
            entrysteps=2212282,
            outputtype=pd.DataFrame,
            entrystart=entrystart,
            entrystop=entrystop):

        nIn = len(data.index)

        data['good_lep_0_index'] = np.vectorize(find_good_lep_0_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta,
            data.lep_ptcone30, data.lep_etcone20, data.lep_isTightID,
            data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased)
        data['good_lep_1_index'] = np.vectorize(find_good_lep_1_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta,
            data.lep_ptcone30, data.lep_etcone20, data.lep_isTightID,
            data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased, data.good_lep_0_index)
        data['good_lep_2_index'] = np.vectorize(find_good_lep_2_index)(
            data.lep_n, data.lep_type, data.lep_pt, data.lep_eta,
            data.lep_ptcone30, data.lep_etcone20, data.lep_isTightID,
            data.lep_z0, data.lep_trackd0pvunbiased,
            data.lep_tracksigd0pvunbiased, data.good_lep_1_index)
        data['good_jet_0_index'] = np.vectorize(find_good_jet_0_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt)
        data['good_jet_1_index'] = np.vectorize(find_good_jet_1_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_0_index)
        data['good_jet_2_index'] = np.vectorize(find_good_jet_2_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_1_index)
        data['good_jet_3_index'] = np.vectorize(find_good_jet_3_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_2_index)
        data['good_jet_4_index'] = np.vectorize(find_good_jet_4_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_3_index)
        data['good_jet_5_index'] = np.vectorize(find_good_jet_5_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_4_index)
        data['good_jet_6_index'] = np.vectorize(find_good_jet_6_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_5_index)
        data['good_jet_7_index'] = np.vectorize(find_good_jet_7_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_6_index)
        data['good_jet_8_index'] = np.vectorize(find_good_jet_8_index)(
            data.jet_n, data.jet_pt, data.jet_eta, data.jet_jvt,
            data.good_jet_7_index)

        # throw away events where number of leptons isn't 2
        fail = data[np.vectorize(cut_good_lep_n)(data.good_lep_1_index,
                                                 data.good_lep_2_index)].index
        data.drop(fail, inplace=True)

        # throw away events where leptons aren't oppositely charged
        fail = data[np.vectorize(cut_lep_charge)(data.lep_charge,
                                                 data.good_lep_0_index,
                                                 data.good_lep_1_index)].index
        data.drop(fail, inplace=True)

        # cut on channel
        fail = data[np.vectorize(channel_cut)(data.lep_type,
                                              data.good_lep_0_index,
                                              data.good_lep_1_index)].index
        data.drop(fail, inplace=True)

        fail = data[np.vectorize(cut_met_et)(data.met_et)].index
        data.drop(fail, inplace=True)

        # label for each mc type
        data['type'] = np.vectorize(mc_type)(sample)

        # label for channel (ee, mm or em)
        #data['Channel'] = np.vectorize(channel)(data.lep_type,data.good_lep_0_index,data.good_lep_1_index)

        # number of jets
        data['NJets'] = np.vectorize(calc_good_jet_n)(
            data.jet_pt, data.good_jet_0_index, data.good_jet_1_index,
            data.good_jet_2_index, data.good_jet_3_index,
            data.good_jet_4_index, data.good_jet_5_index,
            data.good_jet_6_index, data.good_jet_7_index,
            data.good_jet_8_index)

        # MET
        data['MET'] = round(data['met_et'] / 1000, 2)

        # calculation of 2-lepton invariant mass
        data['Mll'] = np.vectorize(calc_mll)(data.lep_pt, data.lep_eta,
                                             data.lep_phi, data.lep_E,
                                             data.good_lep_0_index,
                                             data.good_lep_1_index)
        fail = data[np.vectorize(Mll_cut_lower)(data.Mll)].index
        data.drop(fail, inplace=True)
        fail = data[np.vectorize(Mll_cut_upper)(data.Mll)].index
        data.drop(fail, inplace=True)

        # transverse mass
        data['TransMass'] = np.vectorize(calc_Mt)(data.lep_pt, data.lep_eta,
                                                  data.lep_E, data.met_et,
                                                  data.good_lep_0_index,
                                                  data.good_lep_1_index)
        fail = data[np.vectorize(cut_Mt_upper)(data.TransMass)].index
        data.drop(fail, inplace=True)

        # Angular separation between leptons
        data['LepDeltaPhi'] = np.vectorize(calc_dPhiLL)(data.lep_phi,
                                                        data.good_lep_0_index,
                                                        data.good_lep_1_index)

        # Angular separation between leptons and MET dPhi(MET,ll)
        data['METLLDeltaPhi'] = np.vectorize(calc_dPhiLLmet)(
            data.lep_pt, data.lep_phi, data.met_phi, data.good_lep_0_index,
            data.good_lep_1_index)

        # Sum of lepton pt
        data['SumLepPt'] = np.vectorize(calc_ptLL)(data.lep_pt, data.lep_phi,
                                                   data.good_lep_0_index,
                                                   data.good_lep_1_index)
        fail = data[np.vectorize(cut_SumLepPt)(data.SumLepPt)].index
        data.drop(fail, inplace=True)

        # whether at least 1 jet is btagged
        data['BTags'] = np.vectorize(bjets)(
            data.jet_MV2c10, data.good_jet_0_index, data.good_jet_1_index,
            data.good_jet_2_index, data.good_jet_3_index,
            data.good_jet_4_index, data.good_jet_5_index,
            data.good_jet_6_index, data.good_jet_7_index,
            data.good_jet_8_index)

        if 'data' not in sample:
            data['weight'] = np.vectorize(calc_weight)(
                data.mcWeight, data.scaleFactor_PILEUP, data.scaleFactor_ELE,
                data.scaleFactor_MUON, data.scaleFactor_LepTRIGGER)
            data['weight'] = np.vectorize(get_xsec_weight)(data.weight, sample)
            # throw away events with weight < 0.00005
            fail = data[np.vectorize(cut_weight)(data.weight)].index
            data.drop(fail, inplace=True)
        else:
            data['weight'] = 1

        data.drop([
            "lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge",
            "lep_type", "lep_isTightID", "lep_ptcone30", "lep_etcone20",
            "lep_z0", "lep_trackd0pvunbiased", "lep_tracksigd0pvunbiased",
            "jet_n", "jet_pt", "jet_eta", "jet_jvt", "jet_MV2c10", "met_et",
            "met_phi", "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE",
            "scaleFactor_MUON", "scaleFactor_LepTRIGGER", "good_lep_0_index",
            "good_lep_1_index", "good_lep_2_index", "good_jet_0_index",
            "good_jet_1_index", "good_jet_2_index", "good_jet_3_index",
            "good_jet_4_index", "good_jet_5_index", "good_jet_6_index",
            "good_jet_7_index", "good_jet_8_index"
        ],
                  axis=1,
                  inplace=True)

        #print(data[['LepDeltaPhi','METLLDeltaPhi','TransMass','weight']])
        #print(data['weight'])

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " +
              str(nIn) + ", nOut: " + str(nOut))

    return data_all
Example #21
def read_file(path,sample):
    start = time.time() # start the clock
    print("\tProcessing: "+sample) # print which sample is being processed
    data_all = pd.DataFrame() # define empty pandas DataFrame to hold all data for this sample
    tree = uproot.open(path)["mini"] # open the tree called mini
    numevents = uproot.numentries(path, "mini") # number of events
    for data in tree.iterate(["lep_n","lep_pt","lep_eta","lep_phi","lep_E","lep_charge","lep_type","lep_ptcone30",
                            "lep_etcone20", # add more variables here if you make cuts on them 
                            "mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON",
                            "scaleFactor_LepTRIGGER"], # variables to calculate Monte Carlo weight
                           entrysteps=2500000, # number of events in a batch to process
                           outputtype=pd.DataFrame, # choose output type as pandas DataFrame
                           entrystop=numevents*fraction): # process up to numevents*fraction

        nIn = len(data.index) # number of events in this batch
        print('\t initial number of events:\t\t\t',nIn)

        if 'data' not in sample: # only do this for Monte Carlo simulation files
            # multiply all Monte Carlo weights and scale factors together to give total weight
            data['totalWeight'] = np.vectorize(calc_weight)(data.mcWeight,data.scaleFactor_PILEUP,
                                                            data.scaleFactor_ELE,data.scaleFactor_MUON,
                                                            data.scaleFactor_LepTRIGGER)
            # incorporate the cross-section weight into the total weight
            data['totalWeight'] = np.vectorize(get_xsec_weight)(data.totalWeight,sample)
            
        # drop the columns we don't need anymore from the dataframe
        data.drop(["mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON","scaleFactor_LepTRIGGER"], 
                  axis=1, inplace=True)

        # cut on number of leptons using the function cut_lep_n defined above
        fail = data[ np.vectorize(cut_lep_n)(data.lep_n)].index
        data.drop(fail, inplace=True)
        print('\t after requiring 4 leptons:\t\t\t',len(data.index))

        # cut on lepton charge using the function cut_lep_charge defined above
        fail = data[ np.vectorize(cut_lep_charge)(data.lep_charge) ].index
        data.drop(fail, inplace=True)
        print('\t after requiring zero net charge:\t\t',len(data.index))

        # cut on lepton type using the function cut_lep_type defined above
        fail = data[ np.vectorize(cut_lep_type)(data.lep_type) ].index
        data.drop(fail, inplace=True)
        print('\t after requiring lepton pairs of same type:\t',len(data.index))

        #cut on the transverse momentum of the leptons using the function cut_lep_pt_012 defined above
        #fail =data[ np.vectorize(cut_lep_pt_012)(data.lep_pt)].index
        #data.drop(fail,inplace=True)
        #print('\t after requirements on lepton pt:\t\t',len(data.index))

        # calculation of 4-lepton invariant mass using the function calc_mllll defined above
        data['mllll'] = np.vectorize(calc_mllll)(data.lep_pt,data.lep_eta,data.lep_phi,data.lep_E)
        
        # return the individual lepton transverse momenta in GeV
        #data['lep_pt_0'] = np.vectorize(lep_pt_0)(data.lep_pt)
        data['lep_pt_1'] = np.vectorize(lep_pt_1)(data.lep_pt)
        data['lep_pt_2'] = np.vectorize(lep_pt_2)(data.lep_pt)
        #data['lep_pt_3'] = np.vectorize(lep_pt_3)(data.lep_pt)

        # dataframe contents can be printed at any stage like this
        #print(data)

        # dataframe column can be printed at any stage like this
        #print(data['lep_pt'])

        # multiple dataframe columns can be printed at any stage like this
        #print(data[['lep_pt','lep_eta']])

        nOut = len(data.index) # number of events passing cuts in this batch
        data_all = data_all.append(data) # append dataframe from this batch to the dataframe for the whole sample
        elapsed = time.time() - start # time taken to process
        print("\t\t nIn: "+str(nIn)+",\t nOut: \t"+str(nOut)+"\t in "+str(round(elapsed,1))+"s") # events before and after
    
    return data_all # return dataframe containing events passing all cuts
Example #22
def read_file(path, sample):
    start = time.time()
    print("\tProcessing: " + sample + " file")
    data_all = pd.DataFrame()
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    for data in mc.iterate([
            "lep_n", "lep_pt", "lep_eta", "lep_phi", "lep_E", "lep_charge",
            "lep_type", "lep_ptcone30", "lep_etcone20", "mcWeight",
            "scaleFactor_PILEUP", "scaleFactor_ELE", "scaleFactor_MUON",
            "scaleFactor_LepTRIGGER"
    ],
                           flatten=False,
                           entrysteps=2500000,
                           outputtype=pd.DataFrame,
                           entrystop=numevents * fraction):

        nIn = len(data.index)

        if 'data' not in sample:
            data['totalWeight'] = np.vectorize(calc_weight)(
                data.mcWeight, data.scaleFactor_PILEUP, data.scaleFactor_ELE,
                data.scaleFactor_MUON, data.scaleFactor_LepTRIGGER)
            data['totalWeight'] = np.vectorize(get_xsec_weight)(
                data.totalWeight, sample)

        data.drop([
            "mcWeight", "scaleFactor_PILEUP", "scaleFactor_ELE",
            "scaleFactor_MUON", "scaleFactor_LepTRIGGER"
        ],
                  axis=1,
                  inplace=True)

        # cut on number of leptons
        fail = data[np.vectorize(HZZCuts.cut_n_lep)(data.lep_n)].index
        data.drop(fail, inplace=True)

        # cut on lepton charge
        fail = data[np.vectorize(HZZCuts.cut_lep_charge)(
            data.lep_charge)].index
        data.drop(fail, inplace=True)

        #cut on the transverse momentum of the leptons
        fail = data[np.vectorize(HZZCuts.cut_lep_pt_012)(data.lep_pt)].index
        data.drop(fail, inplace=True)

        # cut on lepton type
        fail = data[np.vectorize(HZZCuts.cut_lep_type)(data.lep_type)].index
        data.drop(fail, inplace=True)

        # cut on lepton momentum isolation
        #fail = data[ np.vectorize(HZZCuts.cut_lep_ptcone)(data.lep_ptcone30,data.lep_pt) ].index
        #data.drop(fail, inplace=True)

        # cut on lepton energy isolation
        #fail = data[ np.vectorize(HZZCuts.cut_lep_etcone)(data.lep_etcone20,data.lep_pt) ].index
        #data.drop(fail, inplace=True)

        data['mllll'] = np.vectorize(calc_mllll)(data.lep_pt, data.lep_eta,
                                                 data.lep_phi)

        nOut = len(data.index)
        data_all = data_all.append(data)
        elapsed = time.time() - start
        print("\t\t" + sample + " time taken: " + str(elapsed) + "s, nIn: " +
              str(nIn) + ", nOut: " + str(nOut))

    return data_all
Example #23
def do_cut(
    tree_name,
    files,
    supercuts,
    proposedBranches,
    output_directory,
    eventWeightBranch,
    pids,
):

    position = -1
    if pids is not None:
        # handle pid registration
        if os.getpid() not in pids:
            pids[np.argmax(pids == 0)] = os.getpid()
        # this gives us the position of this particular process in our list of processes
        position = np.where(pids == os.getpid())[0][0]

    start = clock()
    try:
        branches = []
        aliases = {}
        missingBranches = False
        for fname in files:
            with uproot.open(fname) as f:
                tree = f[tree_name]
                for branch in proposedBranches:
                    if branch in tree:
                        branches.append(branch)
                    else:
                        if branch in tree.aliases:
                            aliases[branch.decode()] = formulate.from_auto(
                                tree.aliases[branch].decode())
                            branches.extend(
                                extract_branch_names(tree.aliases[branch]))
                        else:
                            logger.error(
                                'branch {} not found in {} for {}'.format(
                                    branch, tree_name, fname))
                            missingBranches |= True
        if missingBranches:
            sys.exit(1)

        for alias, alias_expr in aliases.items():
            alias_expr = expand_definition(alias_expr, aliases)
            branches.extend(extract_branch_names(alias_expr.to_numexpr()))
            aliases[alias] = alias_expr

        branches = set(branches)
        eventWeightBranch = expand_selection(eventWeightBranch, aliases)
        supercuts = expand_supercuts(supercuts, aliases)

        # iterate over the cuts available
        cuts = defaultdict(lambda: {'raw': 0, 'weighted': 0})

        events_tqdm = tqdm(
            total=uproot.numentries(files, tree_name),
            disable=(position == -1),
            position=2 * position + 1,
            leave=False,
            mininterval=5,
            maxinterval=10,
            unit="events",
            dynamic_ncols=True,
        )
        for file, start, stop, events in uproot.iterate(
                files,
                tree_name,
                branches=branches,
                namedecode='utf-8',
                reportfile=True,
                reportentries=True,
        ):
            events_tqdm.set_description("({1:d}) Working on {0:s}".format(
                tree_name.decode('utf-8'), 2 * position + 1))
            for cut in tqdm(
                    get_cut(copy.deepcopy(supercuts)),
                    desc="({1:d}) Applying cuts to {0:s}".format(
                        file.name.decode('utf-8'), 2 * position + 2),
                    total=get_n_cuts(supercuts),
                    disable=(position == -1),
                    position=2 * position + 2,
                    leave=False,
                    unit="cuts",
                    miniters=10,
                    dynamic_ncols=True,
            ):
                cut_hash = get_cut_hash(cut)
                rawEvents, weightedEvents = apply_cuts(events, cut,
                                                       eventWeightBranch)
                cuts[cut_hash]['raw'] += rawEvents
                cuts[cut_hash]['weighted'] += weightedEvents

            events_tqdm.update(stop - start)

        with open(
                "{0:s}/{1:s}.json".format(output_directory,
                                          tree_name.decode('utf-8')),
                "w+") as f:
            f.write(json.dumps(cuts, sort_keys=True, indent=4))
            result = True
    except:
        logger.exception("Caught an error - skipping {0:s}".format(
            tree_name.decode('utf-8')))
        result = False
    end = clock()
    return (result, end - start)
Example #24
def derive_chunks(filename, treename, chunksize):
    import uproot
    nentries = uproot.numentries(filename, treename)
    return [(filename, chunksize, index)
            for index in range(nentries // chunksize + 1)]
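The (filename, chunksize, index) triplets from derive_chunks translate into entry ranges as below; a sketch with placeholder file, tree, and branch names (uproot 3 clips entrystop at the end of the tree, so the final partial chunk is safe):

import uproot

for fn, csize, index in derive_chunks("file.root", "Events", 100000):
    start, stop = index * csize, (index + 1) * csize
    arrays = uproot.open(fn)["Events"].arrays(["run"],
                                              entrystart=start,
                                              entrystop=stop)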