Example No. 1
def dataframe_to_ttree(df,
                       filename,
                       treename="t",
                       chunksize=1e6,
                       compression=uproot3.LZ4(1),
                       progress=True):
    """
    Write a ROOT file containing one TTree built from the input pandas DataFrame.

    df: input pandas DataFrame
    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    t = uproot3.newtree(df.dtypes)
    with uproot3.recreate(filename, compression=compression) as f:
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            f[treename].extend({k: chunk[k].values for k in chunk.columns})
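A minimal usage sketch for the helper above (the DataFrame and file name are illustrative; assumes uproot3, numpy and pandas are installed):

import numpy as np
import pandas as pd

# toy DataFrame with one float and one integer column
df = pd.DataFrame({"x": np.random.normal(size=1000),
                   "n": np.random.randint(0, 10, size=1000)})
dataframe_to_ttree(df, "example.root", treename="events", progress=False)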
Example No. 2
def create_file(file_name, distributions, weights, labels, extra_weights=None):
    if extra_weights is None:
        extra_weights = []
    n_events = len(weights[0])
    with uproot.recreate(file_name) as f:
        # write the predicted processes
        for i, label in enumerate(labels):
            lep_charge = create_lepton_charge(n_events)
            if label == "background":
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                    "weight_up": "float64",
                    "weight_down": "float64",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                    "weight_up": extra_weights[0],
                    "weight_down": extra_weights[1],
                })
            else:
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int"
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                })
Example No. 3
def create_ntuple(fname, treename, varname, var_array, weightname,
                  weight_array):
    with uproot.recreate(fname) as f:
        f[treename] = uproot.newtree({
            varname: "float64",
            weightname: "float64"
        })
        f[treename].extend({varname: var_array, weightname: weight_array})
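For instance, writing a one-variable ntuple with per-event weights (the arrays and names are illustrative):

import numpy as np

create_ntuple("toy.root", "events", "mass", np.random.normal(91.0, 2.5, 5000),
              "weight", np.ones(5000))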
Example No. 4
def to_root_multi(filename, d):
    with uproot3.recreate(filename) as f:
        for treename in d.keys():
            df = d[treename]
            f[treename] = uproot3.newtree(
                {col: df[col].dtype
                 for col in df.columns})
            f[treename].extend(dict(df))
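A usage sketch, assuming uproot3 and pandas are available; each dictionary entry becomes one TTree in the output file (names and values are illustrative):

import pandas as pd

trees = {
    "signal": pd.DataFrame({"pt": [10.0, 20.0, 30.0], "eta": [0.1, -0.5, 1.2]}),
    "background": pd.DataFrame({"pt": [5.0, 7.5], "eta": [2.0, -1.0]}),
}
to_root_multi("multi_tree.root", trees)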
Example No. 5
def save_template(templates, out_name, parameters):
    import uproot3

    out_file = uproot3.recreate(out_name)
    for tmp in templates:
        out_file[tmp._fName] = tmp
    out_file.close()
    return
Example No. 6
def merge_root(rootfiles, outputfile, incrementRunId=False):
    """
    Merge root files in output files
    """
    try:
        import uproot3 as uproot
    except ImportError:
        print("uproot3 is mandatory to merge ROOT files. Please do:")
        print("pip install uproot3")
        raise

    out = uproot.recreate(outputfile)

    # Previous ID values, used to increment runId or eventId
    previousId = {}

    # Create the dict by reading all input ROOT files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        root_keys = unicity(root.keys())
        for tree in root_keys:
            if hasattr(root[tree], 'keys'):
                if not tree in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                    previousId[tree] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if not branch in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(
                                array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        if (not incrementRunId and
                                branch.decode('utf-8').startswith('eventID')
                            ) or (incrementRunId and
                                  branch.decode('utf-8').startswith('runID')):
                            if not branch in previousId[tree]:
                                previousId[tree][branch] = 0
                            array += previousId[tree][branch]
                            previousId[tree][branch] = max(array) + 1
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # Write the accumulated dict to the output ROOT file
    for tree in trees:
        if not trees[tree]["rootDictValue"] == {} or not trees[tree][
                "rootDictType"] == {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
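A usage sketch (file names are illustrative; numpy, tqdm and the unicity helper come from the surrounding module):

merge_root(["run1.root", "run2.root"], "merged.root", incrementRunId=False)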
Example No. 7
def to_root(
        df,
        filename,
        treename="t",
        chunksize=20e3,
        compression=uproot3.ZLIB(1),
        compression_jagged=uproot3.ZLIB(1),
        progress=False,
):
    """
    Write a ROOT file containing one TTree built from the input pandas DataFrame.

    df: input pandas DataFrame
    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    compression_jagged: compression object for jagged (variable-length) branches
    progress: show tqdm progress bar?
    """
    tree_dtypes = dict()
    jagged_branches = []
    for bname, dtype in df.dtypes.items():
        if "fletcher" in str(dtype):
            dtype = np.dtype(dtype.arrow_dtype.value_type.to_pandas_dtype())
            tree_dtypes[bname] = uproot3.newbranch(
                dtype, size=bname + "_varn", compression=compression_jagged)
            jagged_branches.append(bname)
        elif "object" in str(dtype):
            raise RuntimeError(
                f"Don't know how to serialize column {bname} with object dtype."
            )
        else:
            dtype = str(dtype).lstrip("u")
            tree_dtypes[bname] = dtype
    with uproot3.recreate(filename, compression=compression) as f:
        t = uproot3.newtree(tree_dtypes)
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            basket = dict()
            for column in chunk.columns:
                if column in jagged_branches:
                    arr = chunk[column].ak(version=0)
                    arr = maybe_unmask_jagged_array(arr)
                    # profiling says 30% of the time is spent checking if jagged __getitem__ is given a string
                    # this is not needed for writing out TTree branches, so free speedup.
                    arr._util_isstringslice = lambda x: False
                    basket[column] = arr
                    basket[column + "_varn"] = arr.counts.astype("int32")
                else:
                    basket[column] = chunk[column].values
            f[treename].extend(basket)
Example No. 8
def _write_root(file, table, treename='Events', compression=-1, step=1048576):
    if compression == -1:
        compression = uproot3.write.compress.LZ4(4)
    with uproot3.recreate(file, compression=compression) as fout:
        fout[treename] = uproot3.newtree(
            {k: v.dtype
             for k, v in table.items()})
        start = 0
        while start < len(list(table.values())[0]) - 1:
            fout[treename].extend(
                {k: v[start:start + step]
                 for k, v in table.items()})
            start += step
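A usage sketch for the helper above; table is a plain dict of equal-length numpy arrays (names and values are illustrative):

import numpy as np

table = {"pt": np.random.exponential(50.0, 10000).astype("float32"),
         "eta": np.random.uniform(-2.5, 2.5, 10000).astype("float32")}
_write_root("events.root", table, treename="Events", step=4096)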
Example No. 9
def create_file_pseudodata(file_name, pseudodata):
    n_events = len(pseudodata)
    with uproot.recreate(file_name) as f:
        # write pseudodata
        lep_charge = create_lepton_charge(n_events)
        f["pseudodata"] = uproot.newtree({
            "jet_pt": "float64",
            "lep_charge": "int"
        })
        f["pseudodata"].extend({
            "jet_pt": pseudodata,
            "lep_charge": lep_charge
        })
Example No. 10
def main():
    np.random.seed(0)

    data_hist = make_data_hist(hists)
    hists["data"] = {"counts": data_hist.tolist(), "bins": _bins}

    with open("example.json", "w") as serialization:
        json.dump(hists, serialization)

    with uproot3.recreate("example.root",
                          compression=uproot3.ZLIB(4)) as outfile:
        for key in hists.keys():
            outfile[key] = (
                np.array(hists[key]["counts"]),
                np.array(hists[key]["bins"]),
            )
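The assignments above work because uproot3 accepts a (counts, bin_edges) pair, as returned by numpy.histogram, and writes it out as a TH1; a minimal standalone sketch along the same lines (file name and data are illustrative):

import numpy as np
import uproot3

with uproot3.recreate("hist_example.root", compression=uproot3.ZLIB(4)) as outfile:
    # a single Gaussian toy histogram written as a TH1
    outfile["gauss"] = np.histogram(np.random.normal(size=1000), bins=50)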
Example No. 11
def write_spectrum_to_root(ff, pp, filename, center=0, title=''):
    class MyTH1(uproot3_methods.classes.TH1.Methods, list):
        def __init__(self, low, high, values, title=""):
            self._fXaxis = types.SimpleNamespace()
            self._fXaxis._fNbins = len(values)
            self._fXaxis._fXmin = low
            self._fXaxis._fXmax = high
            # pad with underflow and overflow bins
            values.insert(0, 0)
            values.append(0)
            for x in values:
                self.append(float(x))
            self._fTitle = title
            self._classname = "TH1F"

    th1f = MyTH1(center + ff[0], center + ff[-1], pp.tolist(), title=title)
    file = uproot3.recreate(filename, compression=uproot3.ZLIB(4))
    file["th1f"] = th1f
Example No. 12
def pandas_to_tree(data, file_name, tree_name):
    """
    Save pandas dataframe as a ROOT TTree.

    :param pandas.DataFrame data: dataframe to be stored
    :param str file_name: path and name of the output file
    :param str tree_name: name of the result TTree
    """
    branch_dict = {
        data.columns[i]: data.dtypes[i]
        for i in range(0, len(data.columns))
    }
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend({
            data.columns[i]: data[data.columns[i]].to_numpy()
            for i in range(0, len(data.columns))
        })
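A usage sketch (the DataFrame is illustrative; assumes pandas and uproot3 are installed):

import pandas as pd

data = pd.DataFrame({"px": [1.0, 2.0, 3.0], "py": [0.5, 0.2, 0.9]})
pandas_to_tree(data, "pandas_tree.root", "tree")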
Example No. 13
def writexml(spec, specdir, data_rootdir, resultprefix):
    global _ROOT_DATA_FILE

    shutil.copyfile(
        pkg_resources.resource_filename(__name__,
                                        'schemas/HistFactorySchema.dtd'),
        Path(specdir).parent.joinpath('HistFactorySchema.dtd'),
    )
    combination = ET.Element("Combination",
                             OutputFilePrefix=str(
                                 Path(specdir).joinpath(resultprefix)))

    with uproot.recreate(str(
            Path(data_rootdir).joinpath('data.root'))) as _ROOT_DATA_FILE:
        for channelspec in spec['channels']:
            channelfilename = str(
                Path(specdir).joinpath(
                    f'{resultprefix}_{channelspec["name"]}.xml'))
            with open(channelfilename, 'w') as channelfile:
                channel = build_channel(spec, channelspec,
                                        spec.get('observations'))
                indent(channel)
                channelfile.write(
                    "<!DOCTYPE Channel SYSTEM '../HistFactorySchema.dtd'>\n\n")
                channelfile.write(
                    ET.tostring(channel, encoding='utf-8').decode('utf-8'))

            inp = ET.Element("Input")
            inp.text = channelfilename
            combination.append(inp)

    # need information about modifier types to get the right prefix in measurement
    mixin = _ChannelSummaryMixin(channels=spec['channels'])

    for measurement in spec['measurements']:
        combination.append(
            build_measurement(measurement, dict(mixin.modifiers)))
    indent(combination)
    return b"<!DOCTYPE Combination  SYSTEM 'HistFactorySchema.dtd'>\n\n" + ET.tostring(
        combination, encoding='utf-8')
Example No. 14
def pandas_to_tree(data, file_name, tree_name):
    """
      Parameters
      ----------
      data : pandas.DataFrame
          Data frame which should be stored as TTree
      file_name : str
          Path and name of root file
      tree_name : str
          Name of TTree
    """
    branch_dict = {
        data.columns[i]: data.dtypes[i]
        for i in range(0, len(data.columns))
    }
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend({
            data.columns[i]: data[data.columns[i]].to_numpy()
            for i in range(0, len(data.columns))
        })
Example No. 15
def save_dict_to_root(dic, file_name, tree_name=None):
    """
    This function stores data arrays in the form of a dictionary into a root file.
    It provides a convenient interface to ``uproot``.

    :param dic: Dictionary of data
    :param file_name: String
    :param tree_name: String. By default it's "tree".
    """
    if file_name[-5:] == ".root":
        file_name = file_name[:-5]
    if isinstance(dic, dict):
        dic = [dic]
    if tree_name is None:
        tree_name = "DataTree"
    Ndic = len(dic)
    if isinstance(tree_name, list):
        assert len(tree_name) == Ndic
    else:
        t = []
        for i in range(Ndic):
            t.append(tree_name + str(i))
        tree_name = t

    with uproot.recreate(file_name + ".root") as f:
        for d, t in zip(dic, tree_name):
            branch_type = {}
            branch_data = {}
            for i in d:
                j = (i.replace("(", "_").replace(")", "_").replace(
                    " ", "_").replace("*",
                                      "star").replace("+",
                                                      "p").replace("-", "m"))
                branch_data[j] = np.array(d[i])
                branch_type[j] = branch_data[j].dtype.name
            f[t] = uproot.newtree(branch_type)
            f[t].extend(branch_data)
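A usage sketch; note that the loop above sanitizes branch names containing characters such as "(", ")", "*", "+" or "-" (the dictionary contents are illustrative):

import numpy as np

data = {"m(K pi)": np.random.normal(0.9, 0.05, 1000),
        "pt+": np.random.exponential(1.0, 1000)}
save_dict_to_root(data, "dict_data.root")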
Example No. 16
def merge_root(rootfiles, outputfile):
    """
    Merge root files in output files
    """
    out = uproot.recreate(outputfile)

    # Create the dict by reading all input ROOT files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        for tree in root.keys():
            if hasattr(root[tree], 'keys'):
                if not tree in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if not branch in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(
                                array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # Write the accumulated dict to the output ROOT file
    for tree in trees:
        if not trees[tree]["rootDictValue"] == {} or not trees[tree][
                "rootDictType"] == {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
Example No. 17
                templates[f'{sname}_{pf}_{syst}'][{
                    'genflavor': s[:1:sum]
                }] for sname in samples
            ])

            matched_name = f"catp2_{pf}_{syst}"
            unmatched_name = f"catp1_{pf}_{syst}"
            if syst == 'nominal':
                data_name = f"data_obs_{pf}_{syst}"
                merged_dict[data_name] = data

            merged_dict[matched_name] = matched
            merged_dict[unmatched_name] = unmatched

    print(f'Will save templates to {template_file}')
    fout = uproot3.recreate(template_file)
    for name, h_obj in tqdm(merged_dict.items(), desc='Writing templates'):
        if np.sum(h_obj.values()) <= 0.:
            print(f'Template {name} is empty')
        if args.clip:
            h_obj = h_obj[40j:145j]
        fout[name] = export1d(h_obj)
    fout.close()

    if args.plot:
        hep.style.use("CMS")

        for clip in [True, False]:
            for pf in ["Pass", "Fail"]:
                # Make template plots
                fig, ax = plt.subplots()
Example No. 18
def get_bkg_templates(tmp_rname):
    """
    Function that writes linearized mtt vs costheta distributions to root file.
    """
    ## variables that only need to be defined/evaluated once
    hdict = plt_tools.add_coffea_files(
        bkg_fnames) if len(bkg_fnames) > 1 else load(bkg_fnames[0])

    # get correct hist and rebin
    hname_to_use = "mtt_vs_tlep_ctstar_abs"
    if hname_to_use not in hdict.keys():
        raise ValueError("%s not found in file" % hname_to_use)
    xrebinning, yrebinning = linearize_binning
    histo = hdict[hname_to_use][
        Plotter.
        nonsignal_samples]  # process, sys, jmult, leptype, btag, lepcat

    xaxis_name = histo.dense_axes()[0].name
    yaxis_name = histo.dense_axes()[1].name
    ## rebin x axis
    if isinstance(xrebinning, np.ndarray):
        new_xbins = hist.Bin(xaxis_name, xaxis_name, xrebinning)
    elif isinstance(xrebinning, float) or isinstance(xrebinning, int):
        new_xbins = xrebinning
    histo = histo.rebin(xaxis_name, new_xbins)
    ## rebin y axis
    if isinstance(yrebinning, np.ndarray):
        new_ybins = hist.Bin(yaxis_name, yaxis_name, yrebinning)
    elif isinstance(yrebinning, float) or isinstance(yrebinning, int):
        new_ybins = yrebinning
    rebin_histo = histo.rebin(yaxis_name, new_ybins)

    ## scale ttJets events, split by reconstruction type, by normal ttJets lumi correction
    ttJets_permcats = [
        "*right", "*matchable", "*unmatchable", "*sl_tau", "*other"
    ]
    names = [
        dataset
        for dataset in sorted(set([key[0] for key in histo.values().keys()]))
    ]  # get dataset names in hists
    ttJets_cats = [
        name for name in names
        if any([fnmatch.fnmatch(name, cat) for cat in ttJets_permcats])
    ]  # gets ttJets(_PS)_other, ...

    ## make groups based on process
    process = hist.Cat("process", "Process", sorting="placement")
    process_cat = "dataset"

    # need to save coffea hist objects to file so they can be opened by uproot in the proper format
    upfout = uproot3.recreate(tmp_rname, compression=uproot3.ZLIB(
        4)) if os.path.isfile(tmp_rname) else uproot3.create(tmp_rname)

    if "3Jets" in njets_to_run:
        histo_dict_3j = processor.dict_accumulator({
            "Muon": {},
            "Electron": {}
        })
    if "4PJets" in njets_to_run:
        histo_dict_4pj = processor.dict_accumulator({
            "Muon": {},
            "Electron": {}
        })

    for lep in ["Muon", "Electron"]:
        orig_lepdir = "muNJETS" if lep == "Muon" else "eNJETS"

        #set_trace()
        ## make groups based on process
        process_groups = plt_tools.make_dataset_groups(lep,
                                                       args.year,
                                                       samples=names,
                                                       gdict="templates")
        #process_groups = plt_tools.make_dataset_groups(lep, args.year, samples=names, gdict="dataset")

        lumi_correction = lumi_corr_dict[args.year]["%ss" % lep]
        # scale ttJets events, split by reconstruction type, by normal ttJets lumi correction
        if len(ttJets_cats) > 0:
            for tt_cat in ttJets_cats:
                ttJets_lumi_topo = "_".join(tt_cat.split(
                    "_")[:-2]) if "sl_tau" in tt_cat else "_".join(
                        tt_cat.split("_")
                        [:-1])  # gets ttJets[SL, Had, DiLep] or ttJets_PS
                ttJets_eff_lumi = lumi_correction[ttJets_lumi_topo]
                lumi_correction.update({tt_cat: ttJets_eff_lumi})

        histo = rebin_histo.copy()
        histo.scale(lumi_correction, axis="dataset")
        histo = histo.group(process_cat, process,
                            process_groups)[:, :, :,
                                            lep, :, :].integrate("leptype")

        #set_trace()
        systs = sorted(set([key[1] for key in histo.values().keys()]))
        systs.insert(0, systs.pop(
            systs.index("nosys")))  # move "nosys" to the front

        # loop over each jet multiplicity
        for jmult in njets_to_run:
            lepdir = orig_lepdir.replace("NJETS", jmult.lower())

            # get sideband and signal region hists
            cen_sb_histo = Plotter.linearize_hist(
                histo[:, "nosys", jmult,
                      btag_reg_names_dict["Central"]["reg"]].integrate(
                          "jmult").integrate("btag").integrate("sys"))
            #up_sb_histo = histo[:, "nosys", jmult, btag_reg_names_dict["Up"]["reg"]].integrate("jmult").integrate("btag")
            #dw_sb_histo = histo[:, "nosys", jmult, btag_reg_names_dict["Down"]["reg"]].integrate("jmult").integrate("btag")
            sig_histo = Plotter.linearize_hist(
                histo[:, :, jmult,
                      btag_reg_names_dict["Signal"]["reg"]].integrate(
                          "jmult").integrate("btag"))

            # loop over each systematic
            for sys in systs:
                if sys not in systematics.template_sys_to_name[
                        args.year].keys():
                    continue

                sys_histo = sig_histo[:, sys].integrate(
                    "sys") if sys in systematics.ttJets_sys.values(
                    ) else Plotter.BKG_Est(
                        sig_reg=sig_histo[:, sys].integrate("sys"),
                        sb_reg=cen_sb_histo,
                        norm_type="SigMC",
                        sys=sys,
                        ignore_uncs=True)

                ## write nominal and systematic variations for each topology to file
                #for proc in sorted(set([key[0] for key in sig_histo.values().keys()])):
                for proc in sorted(
                        set([key[0] for key in sys_histo.values().keys()])):
                    if ("tt" not in proc) and (
                            sys in systematics.ttJets_sys.values()):
                        continue
                    #if (proc != "tt") and (sys in systematics.ttJets_sys.values()): continue
                    if (proc == "data_obs") and not (sys == "nosys"): continue
                    if not sys_histo[proc].values().keys():
                        #if not sig_histo[proc, sys].values().keys():
                        print(
                            f"Systematic {sys} for {lep} {jmult} {proc} not found, skipping"
                        )
                        continue

                    print(args.year, lep, jmult, sys, proc)
                    #set_trace()
                    outhname = "_".join(
                        list(
                            filter(None, [
                                proc, systematics.template_sys_to_name[
                                    args.year][sys][0], lepdir,
                                (args.year)[-2:]
                            ])))
                    if "LEP" in outhname:
                        outhname = outhname.replace(
                            "LEP",
                            "muon") if lep == "Muon" else outhname.replace(
                                "LEP", "electron")

                    template_histo = sys_histo[proc].integrate("process")
                    #template_histo = sig_histo[proc, sys].integrate("process").integrate("sys")

                    #set_trace()
                    ## save template histos to coffea dict
                    if jmult == "3Jets":
                        histo_dict_3j[lep][
                            f"{proc}_{sys}"] = template_histo.copy()
                    if jmult == "4PJets":
                        histo_dict_4pj[lep][
                            f"{proc}_{sys}"] = template_histo.copy()

                        ## save template histo to root file
                    upfout[outhname] = hist.export1d(template_histo)

    if "3Jets" in njets_to_run:
        coffea_out_3j = os.path.join(
            outdir,
            f"test_raw_templates_lj_3Jets_bkg_{args.year}_{jobid}.coffea")
        save(histo_dict_3j, coffea_out_3j)
        print(f"{coffea_out_3j} written")
    if "4PJets" in njets_to_run:
        coffea_out_4pj = os.path.join(
            outdir,
            f"test_raw_templates_lj_4PJets_bkg_{args.year}_{jobid}.coffea")
        save(histo_dict_4pj, coffea_out_4pj)
        print(f"{coffea_out_4pj} written")

    upfout.close()
    print(f"{tmp_rname} written")
Example No. 19
def read_file(path, sample, branches=branches):
    print("=====")
    print("Processing {0} file".format(sample))
    mem = psutil.virtual_memory()
    mem_at_start = mem.available / (1024 ** 2)
    print(f'Available Memory: {mem_at_start:.0f} MB')
    count = 0
    hists = {}
    start = time.time()
    batch_num = 0
    with uproot.open(path) as file:
        tree = file['mini']
        numevents = tree.num_entries
        print(f'Total number of events in file: {numevents}')

        for batch in tree.iterate(branches, step_size='30 MB', library='np'):
            print('==============')
            df = pandas.DataFrame.from_dict(batch)
            del batch
            num_before_cuts = len(df.index)
            print("Events before cuts: {0}".format(num_before_cuts))
            count += num_before_cuts
            if 'Data' not in sample:
                df['totalWeight'] = np.vectorize(calc_weight)(df.mcWeight, df.scaleFactor_ELE, df.scaleFactor_MUON,
                                                              df.scaleFactor_PILEUP, df.scaleFactor_TRIGGER,
                                                              df.scaleFactor_ZVERTEX)
                df["totalWeight"] = np.vectorize(get_xsec_weight)(df.totalWeight, sample)
            else:
                df['totalWeight'] = [1 for item in range(len(df.index))]

            df.drop(["mcWeight", "scaleFactor_ELE", "scaleFactor_MUON",
                     "scaleFactor_PILEUP", "scaleFactor_TRIGGER", 'scaleFactor_ZVERTEX'],
                    axis=1,
                    inplace=True)

            # Standard selection cuts
            df = df.query("trigE or trigM")
            df = df.query('passGRL')
            df = df.query('hasGoodVertex')

            df.drop(["trigE", "trigM", "passGRL", "hasGoodVertex"],
                    axis=1,
                    inplace=True)

            # Lepton requirements
            df['good_lepton'] = np.vectorize(WCuts.cut_GoodLepton)(df.lep_flag, df.lep_pt,
                                                                   df.lep_ptcone30, df.lep_etcone20,
                                                                   df.lep_n, df.lep_type)
            df = df.query('good_lepton > -1')
            for column in df.columns:
                if 'lep' in column and column not in ['lep_n', 'good_lepton']:
                    df[column] = np.vectorize(extract_good_lepton)(df[column], df['good_lepton'])

            # W transverse mass
            df['mtw'] = np.vectorize(calc_mtw)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)
            df = df.query('mtw > 30000.')
            df = df.query('met_et > 30000.')

            # Convert MeV to GeV
            df['lep_pt'] = df['lep_pt'] / 1000
            df['lep_E'] = df['lep_E'] / 1000
            df['met_et'] = df['met_et'] / 1000
            df['mtw'] = df['mtw'] / 1000

            df['mtw_enu'] = df.query('lep_type == 11')['mtw']
            df['mtw_munu'] = df.query('lep_type == 13')['mtw']

            df['WT_phi'] = np.vectorize(calc_W_phi)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)

            df['jet_n'] = df['alljet_n']
            df.drop(['alljet_n'], axis=1, inplace=True)

            # Asymmetry related histograms
            df['pos_ele_eta'] = df.query('lep_type == 11 and lep_charge == 1')['lep_eta']
            df['pos_ele_eta'] = np.vectorize(abs_value)(df.pos_ele_eta)

            df['neg_ele_eta'] = df.query('lep_type == 11 and lep_charge == -1')['lep_eta']
            df['neg_ele_eta'] = np.vectorize(abs_value)(df.neg_ele_eta)

            df['pos_mu_eta'] = df.query('lep_type == 13 and lep_charge == 1')['lep_eta']
            df['pos_mu_eta'] = np.vectorize(abs_value)(df.pos_mu_eta)

            df['neg_mu_eta'] = df.query('lep_type == 13 and lep_charge == -1')['lep_eta']
            df['neg_mu_eta'] = np.vectorize(abs_value)(df.neg_mu_eta)

            df['lep_pt_j0'] = df.query('jet_n == 0')['lep_pt']
            df['lep_pt_j1'] = df.query('jet_n == 1')['lep_pt']
            df['lep_pt_j2'] = df.query('jet_n > 1')['lep_pt']

            df['mtw_j0'] = df.query('jet_n == 0')['mtw']
            df['mtw_j1'] = df.query('jet_n == 1')['mtw']
            df['mtw_j2'] = df.query('jet_n > 1')['mtw']

            df['met_et_j0'] = df.query('jet_n == 0')['met_et']
            df['met_et_j1'] = df.query('jet_n == 1')['met_et']
            df['met_et_j2'] = df.query('jet_n > 1')['met_et']

            df['lep_eta_j0'] = df.query('jet_n == 0')['lep_eta']
            df['lep_eta_j1'] = df.query('jet_n == 1')['lep_eta']
            df['lep_eta_j2'] = df.query('jet_n > 1')['lep_eta']

            if len(df.loc[df['jet_n'] > 0].index) > 0:
                temp_df = pandas.DataFrame()
                temp_df['eventNumber'] = df.loc[df['jet_n'] > 0]['eventNumber']
                for column in df.columns:
                    if 'jet' in column and column != 'jet_n':
                        temp_df[f'lead_{column}'] = np.vectorize(find_lead_jet)(df.loc[df['jet_n'] > 0]['jet_pt'],
                                                                                df.loc[df['jet_n'] > 0][column])
                temp_df['lead_jet_pt'] = temp_df['lead_jet_pt'] / 1000.

                temp_df['lj_phi_diff'] = np.vectorize(calc_delta_phi)(df.loc[df['jet_n'] > 0]['lep_phi'],
                                                                      temp_df['lead_jet_phi'])
                temp_df['abs_lj_phi_diff'] = np.vectorize(abs_value)(temp_df.lj_phi_diff)

                temp_df['Wj_phi_diff'] = np.vectorize(calc_delta_phi)(df.loc[df['jet_n'] > 0]['WT_phi'],
                                                                      temp_df['lead_jet_phi'])
                temp_df['abs_Wj_phi_diff'] = np.vectorize(abs_value)(temp_df.Wj_phi_diff)
                df = pandas.merge(left=df, right=temp_df, left_on='eventNumber', right_on='eventNumber', how='left')

            num_after_cuts = len(df.index)
            print("Number of events after cuts: {0}".format(num_after_cuts))
            print(f'Currently at {(count * 100 / numevents):.0f}% of events ({count}/{numevents})')

            for key, hist in hist_dicts.items():
                h_bin_width = hist["bin_width"]
                h_num_bins = hist["numbins"]
                h_xmin = hist["xmin"]

                x_var = hist["xvariable"]

                bins = [h_xmin + x * h_bin_width for x in range(h_num_bins + 1)]
                data_x, binning = np.histogram(df[x_var].values, bins=bins, weights=df.totalWeight.values)
                data_x = data_x.astype('float64')
                histo = uproot3_methods.classes.TH1.from_numpy((data_x, binning))
                if key not in hists.keys():
                    hists[key] = histo
                else:
                    for i in range(len(hists[key])):
                        hists[key][i] += histo[i]
            if not os.path.exists(f'../DataForFit_8TeV/{sample}/'):
                os.mkdir(f'../DataForFit_8TeV/{sample}')
            f = uproot3.recreate(f'../DataForFit_8TeV/{sample}/{sample}_{batch_num}.root')

            f['FitTree'] = uproot3.newtree({'mtw': uproot3.newbranch(np.float64, 'mtw'),
                                            'jet_n': uproot3.newbranch(np.int32, 'jet_n'),
                                            'totalWeight': uproot3.newbranch(np.float64, 'totalWeight')})

            f['FitTree'].extend({'mtw': df['mtw'].to_numpy(dtype=np.float64),
                                 'jet_n': df['jet_n'].to_numpy(dtype=np.int32),
                                 'totalWeight': df['totalWeight'].to_numpy(dtype=np.float64)})
            f.close()
            batch_num += 1
            del df
            gc.collect()
            # diagnostics
            mem = psutil.virtual_memory()
            actual_mem = mem.available / (1024 ** 2)
            print(f'Current available memory {actual_mem:.0f} MB '
                  f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')

    file = uproot3.recreate(f'../Output_8TeV/{sample}.root', uproot3.ZLIB(4))

    for key, hist in hists.items():
        file[key] = hist
        print(f'{key} histogram')
        file[key].show()

    file.close()

    mem = psutil.virtual_memory()
    actual_mem = mem.available / (1024 ** 2)
    print(f'Current available memory {actual_mem:.0f} MB '
          f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')
    print('Finished!')
    print(f'Time elapsed: {time.time() - start} seconds')
    return None
Example No. 20
                SUEP_NTuple(
                    options.isMC,
                    str(options.era),
                    do_syst=1,
                    syst_var=sys + var,
                    weight_syst=True,
                    sample=options.dataset  #,
                    #                   haddFileName=f"tree_{options.jobNum}_{sys}{var}.root",
                ))

for i in modules_era:
    print("modules : ", i)

print("Selection : ", pre_selection)
tstart = time.time()
f = uproot.recreate("tree_%s_WS.root" % str(options.jobNum))
for instance in modules_era:
    output = run_uproot_job({instance.sample: [options.infile]},
                            treename='Events',
                            processor_instance=instance,
                            executor=futures_executor,
                            executor_args={'workers': 10},
                            chunksize=500000)
    for h, hist in output.items():
        f[h] = export1d(hist)
        #print(f'wrote {h} to tree_{options.jobNum}_WS.root')

modules_gensum = []

if options.isMC:
    modules_gensum.append(
Example No. 21
def AddFFcorr(infname,
              intreename,
              outfname,
              outtreename,
              Lcstate,
              leptname,
              q2True_branchname,
              costhlTrue_branchname,
              nentries_to_read=1000000000,
              chunksize=10000):

    TH1.AddDirectory(kFALSE)

    perfname = None
    q2factor = None
    if Lcstate == 'Lc':
        perfname = './CorrectionTables/LcFFratios.root'
        q2factor = 1.
    elif (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
        perfname = './CorrectionTables/LcstFFratios.root'
        q2factor = 1e-6
    else:
        raise Exception('Lc state not recognised', Lcstate)

    if leptname != 'mu' and leptname != 'tau':
        raise Exception('Lepton name not recognised', leptname)

    print('Using the histname', Lcstate + leptname + "_ratio")

    #variables to get from file
    varsdf = ['runNumber', 'eventNumber']
    varsdf += ['Lb_TRUEP_X', 'Lb_TRUEP_Y', 'Lb_TRUEP_Z', 'Lb_TRUEP_E']
    varsdf += ['Lc_TRUEP_X', 'Lc_TRUEP_Y', 'Lc_TRUEP_Z', 'Lc_TRUEP_E']
    varsdf += [
        'Lb_True' + leptname.capitalize() + '_PX',
        'Lb_True' + leptname.capitalize() + '_PY',
        'Lb_True' + leptname.capitalize() + '_PZ',
        'Lb_True' + leptname.capitalize() + '_PE'
    ]
    varsdf += [
        'Lb_TrueNeutrino_PX', 'Lb_TrueNeutrino_PY', 'Lb_TrueNeutrino_PZ',
        'Lb_TrueNeutrino_PE'
    ]

    File = TFile.Open(perfname, "read")
    Histg = File.Get(Lcstate + leptname + "_ratio")
    perfHist = Histg.Clone(Lcstate + leptname + "_rationew")
    File.Close()
    Xmin = perfHist.GetXaxis().GetXmin()
    Xmax = perfHist.GetXaxis().GetXmax()
    Ymin = perfHist.GetYaxis().GetXmin()
    Ymax = perfHist.GetYaxis().GetXmax()
    Limits = (Xmin, Xmax, Ymin, Ymax)
    print(Limits, perfHist.Integral())

    #variables to store in the new ttree
    varstoStore = {
        'runNumber': np.int64,
        'eventNumber': np.int64,
        'Event_FFcorr': np.float64,
        costhlTrue_branchname: np.float64,
        q2True_branchname: np.float64
    }

    aliases = {}
    #create a new rootfile
    with uproot3.recreate(outfname) as f:
        f[outtreename] = uproot3.newtree(varstoStore)

        #loop over the old rootfile chunkwise
        events_read = 0
        if chunksize >= nentries_to_read: chunksize = nentries_to_read
        for df_data in uproot4.iterate(infname + ':' + intreename,
                                       varsdf,
                                       aliases=aliases,
                                       cut=None,
                                       library="pd",
                                       step_size=chunksize):
            if events_read >= nentries_to_read: break

            #Compute q2 and cosThetaL
            pxl = df_data['Lb_True' + leptname.capitalize() + '_PX']
            pxnu = df_data['Lb_TrueNeutrino_PX']
            pyl = df_data['Lb_True' + leptname.capitalize() + '_PY']
            pynu = df_data['Lb_TrueNeutrino_PY']
            pzl = df_data['Lb_True' + leptname.capitalize() + '_PZ']
            pznu = df_data['Lb_TrueNeutrino_PZ']
            pel = df_data['Lb_True' + leptname.capitalize() + '_PE']
            penu = df_data['Lb_TrueNeutrino_PE']
            if (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
                # this should be the Lc* momentum: Lb minus lepton minus neutrino
                pxlc = df_data['Lb_TRUEP_X'] - pxl - pxnu
                pylc = df_data['Lb_TRUEP_Y'] - pyl - pynu
                pzlc = df_data['Lb_TRUEP_Z'] - pzl - pznu
                pelc = df_data['Lb_TRUEP_E'] - pel - penu
            elif Lcstate == 'Lc':
                pxlc = df_data['Lc_TRUEP_X']
                pylc = df_data['Lc_TRUEP_Y']
                pzlc = df_data['Lc_TRUEP_Z']
                pelc = df_data['Lc_TRUEP_E']

            PLc_lab = LorentzVector(
                Vector(pxlc, pylc,
                       pzlc), pelc)  #Format of LorentzVector(Vector(X,Y,Z), E)
            Pl_lab = LorentzVector(Vector(pxl, pyl, pzl), pel)
            PNu_lab = LorentzVector(Vector(pxnu, pynu, pznu), penu)
            PLb_lab = PLc_lab + Pl_lab + PNu_lab
            qsq, cthl = return_phasespace(PLb_lab, PLc_lab, Pl_lab)
            #print(qsq,cthl)
            df_data[q2True_branchname] = qsq
            df_data[costhlTrue_branchname] = cthl

            #get the corrections
            applyvars = [q2True_branchname, costhlTrue_branchname
                         ]  #has to be in correct order like in histogram
            df_data['Event_FFcorr'] = df_data[applyvars].apply(
                storeeff2D, args=[perfHist, Limits, q2factor], axis=1)

            #get only the things that need to be stored and write them to the file
            branch_dict = {
                vartostore: df_data[vartostore].to_numpy()
                for vartostore in list(varstoStore.keys())
            }
            f[outtreename].extend(branch_dict)
            events_read += df_data.shape[0]
            print('Events read', events_read)
Example No. 22
#Create dictionaries to be cut and pushed into files
SD = dict((k, A[k]) for k in ['MET',"METPhi","j1PT","mjj","mjj_13","mjj_23","mjjoptimized","j1Eta","j2Eta","j3Eta","j1Phi","j2Phi","j3Phi","j2PT","j3PT","weight"]
                                       if k in A)
SDEvents = pd.DataFrame.from_dict(SD)
EWKBD = dict((k, B[k]) for k in ['MET',"METPhi","j1PT","mjj","mjj_13","mjj_23","mjjoptimized","j1Eta","j2Eta","j3Eta","j1Phi","j2Phi","j3Phi","j2PT","j3PT","weight"]
                                       if k in B)
EWKBDEvents = pd.DataFrame.from_dict(EWKBD)
QCDBD = dict((k, C[k]) for k in ['MET',"METPhi","j1PT","mjj","mjj_13","mjj_23","mjjoptimized","j1Eta","j2Eta","j3Eta","j1Phi","j2Phi","j3Phi","j2PT","j3PT","weight"]
                                       if k in C)
QCDBDEvents = pd.DataFrame.from_dict(QCDBD)



# In[6]:
#Create file of events with no cuts applied
file1 = uproot.recreate("Combined Signal Ntuples.root")
file1["Signal"] = SDEvents 
file2 = uproot.recreate("Combined EWKBackground Ntuples.root")
file2["EWKBackground"] = EWKBDEvents
file3 = uproot.recreate("Combined QCDBackground Ntuples.root")
file3["QCDBackground"] = QCDBDEvents


# In[7]:
#Create file of events with mjj>1000, MET>200, and 3 jets.
file1 = uproot.recreate("Combined Signal Ntuples, mjj>1000, MET>200, 3 jets.root")
#The j3Eta cut removes j3Eta values in the -1000 range, which indicate an error; most values are around 1.
file1["Signal"] = SDEvents.loc[(SDEvents['mjjoptimized'] > 1000) & (SDEvents['MET'] > 200) & (SDEvents['j3Eta'] > -500)]
file2 = uproot.recreate("Combined EWKBackground Ntuples, mjj>1000, MET>200, 3 jets.root")
file2["EWKBackground"] = EWKBDEvents.loc[(EWKBDEvents['mjjoptimized'] > 1000) & (EWKBDEvents['MET'] > 200) & (EWKBDEvents['j3Eta'] > -500)]
file3 = uproot.recreate("Combined QCDBackground Ntuples, mjj>1000, MET>200, 3 jets.root")
Example No. 23
def get_bkg_templates(tmp_rname):
    """
    Function that writes linearized mtt vs costheta distributions to root file.
    """
    # define variables to get histogram for background
    bkg_fnmatch = "%s.coffea" % base_template_name.replace(
        "NJETS", njets_regex).replace("SIG", "bkg")
    bkg_fnames = fnmatch.filter(os.listdir(inputdir), bkg_fnmatch)

    if "3Jets" in njets_to_run:
        histo_dict_3j = processor.dict_accumulator({
            "Muon": {},
            "Electron": {}
        })
    if "4PJets" in njets_to_run:
        histo_dict_4pj = processor.dict_accumulator({
            "Muon": {},
            "Electron": {}
        })

        # need to save coffea hist objects to file so they can be opened by uproot3 in the proper format
    upfout = uproot3.recreate(tmp_rname, compression=uproot3.ZLIB(
        4)) if os.path.isfile(tmp_rname) else uproot3.create(tmp_rname)

    for bkg_file in bkg_fnames:
        hdict = load(os.path.join(inputdir, bkg_file))
        jmult = "3Jets" if "3Jets" in bkg_file else "4PJets"
        for lep in hdict.keys():
            lepdir = "mujets" if lep == "Muon" else "ejets"
            for tname in hdict[lep].keys():
                #set_trace()
                template_histo = hdict[lep][tname]
                proc = tname.split(
                    "_")[0] if not "data_obs" in tname else "data_obs"
                sys = "_".join(tname.split("_")
                               [1:]) if not "data_obs" in tname else "nosys"
                if not sys in sys_to_use.keys():
                    continue
                #if "RENORM" in sys: set_trace()
                sysname, onlyTT = sys_to_use[sys]

                name = proc + lepdir if proc == "QCD" else proc
                print(lep, jmult, sys, name)
                outhname = "_".join([jmult, lepdir, name
                                     ]) if sys == "nosys" else "_".join(
                                         [jmult, lepdir, name, sysname])

                if (sys != "nosys") and (args.smooth) and (
                        templates_to_smooth[proc]):
                    template_histo = smoothing(
                        nominal=histo_dict_3j[lep][proc]
                        if jmult == "3Jets" else histo_dict_4pj[lep][proc],
                        template=template_histo,
                        nbinsx=len(linearize_binning[0]) - 1,
                        nbinsy=len(linearize_binning[1]) - 1)
                    #set_trace()

                    ## save template histos to coffea dict
                if jmult == "3Jets":
                    histo_dict_3j[lep][proc if sys == "nosys" else "%s_%s" %
                                       (proc, sysname)] = template_histo
                if jmult == "4PJets":
                    histo_dict_4pj[lep][proc if sys == "nosys" else "%s_%s" %
                                        (proc, sysname)] = template_histo

                    ## save template histo to root file
                upfout[outhname] = hist.export1d(template_histo)

    #set_trace()
    if "3Jets" in njets_to_run:
        coffea_out_3j = os.path.join(
            outdir,
            f"templates_lj_3Jets_bkg_smoothed_{jobid}_{args.year}.coffea"
            if args.smooth else
            f"templates_lj_3Jets_bkg_{jobid}_{args.year}.coffea")
        save(histo_dict_3j, coffea_out_3j)
        print(f"{coffea_out_3j} written")
    if "4PJets" in njets_to_run:
        coffea_out_4pj = os.path.join(
            outdir,
            f"templates_lj_4PJets_bkg_smoothed_{jobid}_{args.year}.coffea"
            if args.smooth else
            f"templates_lj_4PJets_bkg_{jobid}_{args.year}.coffea")
        save(histo_dict_4pj, coffea_out_4pj)
        print(f"{coffea_out_4pj} written")

    upfout.close()
    print(f"{tmp_rname} written")
Example No. 24
def dump_generated_events(arr: ak.Array):

    fn = f"wd/{conf.tag}/output.root"
    with uproot.recreate(fn) as file:
        file["tree1"] = uproot.newtree(dict(arr.type.type))
        file["tree1"].extend({branch: arr[branch] for branch in arr.fields})
Example No. 25
def to_root(df, filename, treename):
    with uproot3.recreate(filename) as f:
        f[treename] = uproot3.newtree(
            {col: df[col].dtype
             for col in df.columns})
        f[treename].extend(dict(df))
                    default="converted_trees")
args = parser.parse_args()

base_dir = args.output
os.makedirs(base_dir, exist_ok=True)

infile = args.infile
name = infile.split("/")[-1].split(".")[0]

print("Processing Datafile: {}".format(name))
if args.offline is True:
    online_tree = u3.open(infile)["deepntuplizer"]["tree"]
else:
    online_tree = u3.open(infile)["btagana"]["ttree"]

out_file = u3.recreate(os.path.join(base_dir, "{}.root".format(name)),
                       compression=u3.ZLIB(4))

kt = "PuppiJet.TagVar_trackPtRel"
track_key_indicator = "PuppiJet.TagVar_"

N_jets = -1
print("running on {} events!".format(N_jets))

branch_dict = {}
branch_registration = {}

tracking_index_low = online_tree['PuppiJet.Jet_nFirstTrkTagVar'].array()
tracking_index_high = online_tree['PuppiJet.Jet_nLastTrkTagVar'].array()

track_eta_index_low = online_tree[
    'PuppiJet.Jet_nFirstTrkEtaRelTagVarCSV'].array()
Example No. 27
                    branch_dicts[i_split]["branch_dict"]["{}_counts".format(
                        online_key)] = counts
                # branch_dict["{}_counts".format(online_key)] = counts
            else:
                arr = online_tree[online_key].array()[on_mask].flatten(
                )[:N_jets]
                dtype = np.dtype("f4")

            for i_split, (n_start, n_end) in enumerate(zip(n_starts, n_ends)):
                branch_dicts[i_split]["branch_registration"][
                    online_key] = dtype
                branch_dicts[i_split]["branch_dict"][online_key] = arr[
                    n_start:n_end]
            # branch_registration[online_key] = dtype
            # branch_dict[online_key] = arr

for i_split in range(SPLITS):
    out_path = os.path.join(base_dir,
                            "{}_{}_{}.root".format(name, branch_key, i_split))
    print("Creating new tree: {}".format(out_path))
    with u3.recreate(out_path, compression=u3.ZLIB(6)) as out_file:
        print("Creating new tree for {}".format(branch_key))
        out_file["ttree"] = u3.newtree(
            branch_dicts[i_split]["branch_registration"])
        print("Creating branches")
        # s_time = time.time()
        out_file["ttree"].extend(branch_dicts[i_split]["branch_dict"])
        # from IPython import embed;embed()
        # e_time = time.time()
        # print("Total time needed for {0} events:\n{1:1.1f}".format(N_jets, e_time - s_time))
Example No. 28
def main(args):

    processed_nano = "tmva_xgboost_reproducers/lead_processed_nano_uncorr.root"
    df = pd.read_parquet(args.input_dataframe)
    print("Read input dataframe:\n{}".format(df))

    inputs = {
        "lead_energyRaw": "SCRawE", 
        "lead_r9": "r9", 
        "lead_sieie":"sigmaIetaIeta", 
        "lead_etaWidth": "etaWidth", 
        "lead_phiWidth": "phiWidth", 
        "lead_sieip": "covIEtaIPhi", 
        "lead_s4": "s4", 
        "lead_pfPhoIso03": "phoIso03", 
        "lead_pfChargedIsoPFPV": "chgIsoWrtChosenVtx", 
        "lead_pfChargedIsoWorstVtx": "chgIsoWrtWorstVtx",
        "lead_eta": "scEta", 
        "lead_fixedGridRhoAll": "rho"
        }

    # This is needed just to avoid hardcoding the branch type later
    arr_dict = {}
    for name in inputs.keys():
        name_orig = name
        # Since I don't remember which ones have the suffix _nano
        if name_orig not in list(df.columns):
            name += "_nano"
        arr_dict[name_orig] = df[name]
    ak_arr = ak.Array(arr_dict)
    print(ak_arr.type)

    # Explicitly recompute the XGBoost one as well
    print("Recomputing MVA with XGBoost")
    mva = xgboost.Booster()
    mva.load_model(args.xgboost_model)
    var_order = list(arr_dict.keys())
    bdt_inputs = np.column_stack([ak.to_numpy(ak_arr[name]) for name in var_order])
    tempmatrix = xgboost.DMatrix(bdt_inputs, feature_names=var_order)
    lead_idmva_xgboost = mva.predict(tempmatrix)
    # Thomas workflow
    lead_idmva_xgboost = -np.log(1./lead_idmva_xgboost - 1.)
    lead_idmva_xgboost = 2. / (1. + np.exp(-2.*lead_idmva_xgboost)) - 1.

    # Dump nanoaod inputs to a TTree
    with uproot3.recreate(processed_nano) as f:
        branchdict = {}
        arraydict = {}
    
        for nano_name, model_name in inputs.items():
            #branchdict[model_name] = str(ak_arr[nano_name].type.type).replace('?', '')
            branchdict[model_name] = "float32"
            arraydict[model_name] = ak_arr[nano_name]
    
        f["Events"] = uproot3.newtree(branchdict)
        f["Events"].extend(arraydict)

    # TMVA with RDataFrame
    ROOT.gInterpreter.ProcessLine('''
    TMVA::Experimental::RReader model("{}");
    computeModel = TMVA::Experimental::Compute<{}, float>(model);
    '''.format(args.tmva_model, len(ak_arr.fields)))

    rdf = ROOT.RDataFrame("Events", processed_nano)
    rdf = rdf.Define("lead_idmva_tmva", ROOT.computeModel, ROOT.model.GetVariableNames())
    print("Running RDF event loop")
    dct = rdf.AsNumpy(columns=["lead_idmva_tmva"])
    lead_idmva_tmva = np.array([v[0] for v in dct["lead_idmva_tmva"]])

    # Plot
    print("Plotting to {}".format(args.output_dir))
    bins = 100
    rng = (-1, 1)

    fig, (up, down) = plt.subplots(
        nrows=2,
        ncols=1,
        gridspec_kw={"height_ratios": (1, 1)}
        )

    up.hist(lead_idmva_xgboost, bins=bins, range=rng, histtype="step", label="XGBoost", linewidth=2)
    up.hist(lead_idmva_tmva, bins=bins, range=rng, histtype="step", label="TMVA", linewidth=2)

    up.set_xlabel("lead PhoIDMVA after corrections")
    up.legend(fontsize=18, loc="upper left")

    down.hist(100 * (lead_idmva_xgboost - lead_idmva_tmva) / lead_idmva_tmva, 
              bins=500,
              range=(-100, 100),
              histtype="step",
              density=True,
              color="black",
              linewidth=2
             )
    down.set_xlabel("$(XGB - TMVA)/TMVA$ [%]")
    down.set_yscale("log")

    fig.tight_layout()

    fig.savefig("{}/lead_xgb_tmva.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/lead_xgb_tmva.pdf".format(args.output_dir), bbox_inches='tight')

    fig, ax = plt.subplots()
    ax.scatter(lead_idmva_xgboost, lead_idmva_tmva)
    ax.set_xlabel("XGBoost")
    ax.set_ylabel("TMVA")

    fig.savefig("{}/xgb_tmva_scatter.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/xgb_tmva_scatter.pdf".format(args.output_dir), bbox_inches='tight')
Example No. 29
def makeCardFromHist(out_cache,
                     hist_name,
                     nonprompt_scale=1,
                     signal_scale=1,
                     bkg_scale=1,
                     overflow='all',
                     ext='',
                     systematics=True):
    print("Writing cards using histogram:", hist_name)
    card_dir = os.path.expandvars('$TWHOME/data/cards/')
    if not os.path.isdir(card_dir):
        os.makedirs(card_dir)

    data_card = card_dir + hist_name + ext + '_card.txt'
    shape_file = card_dir + hist_name + ext + '_shapes.root'

    histogram = out_cache[hist_name].copy()
    #histogram = histogram.rebin('mass', bins[hist_name]['bins'])

    # scale some processes
    scales = {
        'ttbar': nonprompt_scale,
        'topW_v2': signal_scale,
        'TTW': bkg_scale,  # only scale the most important backgrounds
        'TTZ': bkg_scale,
        'TTH': bkg_scale,
    }
    histogram.scale(scales, axis='dataset')

    ## making a histogram for pseudo-observation; this is clumsy, but right now it seems to be the best option
    data_counts = np.asarray(
        np.round(
            histogram[notdata].integrate('dataset').values(
                overflow=overflow)[()], 0), int)
    data_hist = histogram['topW_v2']
    data_hist.clear()
    data_hist_bins = data_hist.axes()[1]
    for i, edge in enumerate(data_hist_bins.edges(overflow=overflow)):
        if i >= len(data_counts): break
        for y in range(data_counts[i]):
            data_hist.fill(**{
                'dataset': 'data',
                data_hist_bins.name: edge + 0.0001
            })

    other_sel = re.compile('(TTTT|diboson|DY|rare)')
    ##observation = hist.export1d(histogram['pseudodata'].integrate('dataset'), overflow=overflow)
    #observation = hist.export1d(data_hist['data'].integrate('dataset'), overflow=overflow)
    observation = hist.export1d(histogram[notdata].integrate('dataset'),
                                overflow=overflow)
    tw = hist.export1d(histogram['topW_v2'].integrate('dataset'),
                       overflow=overflow)
    ttw = hist.export1d(histogram['TTW'].integrate('dataset'),
                        overflow=overflow)
    ttz = hist.export1d(histogram['TTZ'].integrate('dataset'),
                        overflow=overflow)
    tth = hist.export1d(histogram['TTH'].integrate('dataset'),
                        overflow=overflow)
    rare = hist.export1d(histogram[other_sel].integrate('dataset'),
                         overflow=overflow)
    nonprompt = hist.export1d(histogram['ttbar'].integrate('dataset'),
                              overflow=overflow)

    fout = uproot3.recreate(shape_file)

    fout["signal"] = tw
    fout["nonprompt"] = nonprompt
    fout["ttw"] = ttw
    fout["ttz"] = ttz
    fout["tth"] = tth
    fout["rare"] = rare
    fout["data_obs"] = observation
    fout.close()

    # Get the total yields to write into a data card
    totals = {}

    totals['signal'] = histogram['topW_v2'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    totals['ttw'] = histogram['TTW'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    totals['ttz'] = histogram['TTZ'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    totals['tth'] = histogram['TTH'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    totals['rare'] = histogram['rare'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    totals['nonprompt'] = histogram['ttbar'].integrate('dataset').values(
        overflow=overflow)[()].sum()
    ##totals['observation'] = histogram['pseudodata'].integrate('dataset').values(overflow=overflow)[()].sum()
    #totals['observation'] = int(sum(data_hist['data'].sum('dataset').values(overflow=overflow)[()]))
    totals['observation'] = histogram[notdata].integrate('dataset').values(
        overflow=overflow)[()].sum()

    print("{:30}{:.2f}".format("Signal expectation:", totals['signal']))
    print("{:30}{:.2f}".format("Non-prompt background:", totals['nonprompt']))
    print("{:30}{:.2f}".format(
        "t(t)X(X)/rare background:",
        totals['ttw'] + totals['ttz'] + totals['tth'] + totals['rare']))
    print("{:30}{:.2f}".format("Observation:", totals['observation']))

    # set up the card
    card = dataCard()
    card.reset()
    card.setPrecision(3)

    # add the uncertainties (just flat ones for now)
    card.addUncertainty('lumi', 'lnN')
    card.addUncertainty('ttw_norm', 'lnN')
    card.addUncertainty('ttz_norm', 'lnN')
    card.addUncertainty('tth_norm', 'lnN')
    card.addUncertainty('rare_norm', 'lnN')
    card.addUncertainty('fake', 'lnN')

    # add the single bin
    card.addBin('Bin0', ['ttw', 'ttz', 'tth', 'rare', 'nonprompt'], 'Bin0')
    card.specifyExpectation('Bin0', 'signal', totals['signal'])
    card.specifyExpectation('Bin0', 'ttw', totals['ttw'])
    card.specifyExpectation('Bin0', 'ttz', totals['ttz'])
    card.specifyExpectation('Bin0', 'tth', totals['tth'])
    card.specifyExpectation('Bin0', 'rare', totals['rare'])
    card.specifyExpectation('Bin0', 'nonprompt', totals['nonprompt'])

    # set uncertainties
    if systematics:
        card.specifyUncertainty('ttw_norm', 'Bin0', 'ttw', 1.15)
        card.specifyUncertainty('ttz_norm', 'Bin0', 'ttz', 1.10)
        card.specifyUncertainty('tth_norm', 'Bin0', 'tth', 1.20)
        card.specifyUncertainty('rare_norm', 'Bin0', 'rare', 1.20)
        card.specifyUncertainty('fake', 'Bin0', 'nonprompt', 1.25)
        card.specifyFlatUncertainty('lumi', 1.03)

    ## observation
    #card.specifyObservation('Bin0', int(round(totals['observation'],0)))
    card.specifyObservation('Bin0', totals['observation'])

    print("Done.\n")

    return card.writeToFile(data_card, shapeFile=shape_file)
Example No. 30
            fdf = btdf
            region = "totalr"
        else:
            fdf = sbdf
            region = "sideband"
    else:
        fdf = sbdf

    print("    number of passing events ",len(fdf))
    #print("number of btag passing events ",len(btdf))

    #lets make some histograms.
    rootfilename  = go.makeOutFile(samp,'upout_'+region+'_'+btaggr,'.root',str(zptcut),str(hptcut),str(metcut),str(btagwp))#need to update for btagger
    npfilename    = go.makeOutFile(samp,'totalevents_'+region+'_'+btaggr,'.npy',str(zptcut),str(hptcut),str(metcut),str(btagwp))
    pklfilename   = go.makeOutFile(samp,'selected_errors_'+region+'_'+btaggr,'.pkl',str(zptcut),str(hptcut),str(metcut),str(btagwp))
    rootOutFile   = up3.recreate(rootfilename,compression = None)
    npOutFile     = open(npfilename,'wb')

    rootOutFile["h_z_pt"]    = np.histogram(fdf['ZCandidate_pt'],bins=80,range=(0,800),weights=fdf['event_weight'])
    #rootOutFile["h_z_phi"]   = np.histogram(fdf['ZCandidate_phi'],bins=100,range=(0,3.14159),weights=fdf['event_weight'])#needs to fit range
    rootOutFile["h_z_eta"]   = np.histogram(fdf['ZCandidate_eta'],bins=100,range=(-5,5),weights=fdf['event_weight'])
    rootOutFile["h_z_m"]     = np.histogram(fdf['ZCandidate_m'],bins=100,range=(40,140),weights=fdf['event_weight'])
    rootOutFile["h_h_pt"]    = np.histogram(fdf['hCandidate_pt'],bins=40,range=(200,1200),weights=fdf['event_weight'])
    #rootOutFile["h_h_phi"]   = np.histogram(fdf['hCandidate_phi'],bins=100,range=(0,3.14159))#needs to fit range
    rootOutFile["h_h_eta"]   = np.histogram(fdf['hCandidate_eta'],bins=100,range=(-5,5),weights=fdf['event_weight'])
    rootOutFile["h_h_m"]     = np.histogram(fdf['hCandidate_m'],bins=80,range=(0,400),weights=fdf['event_weight'])
    rootOutFile["h_h_sd"]    = np.histogram(fdf['hCandidate_sd'],bins=80,range=(0,400),weights=fdf['event_weight'])
    rootOutFile["h_met"]     = np.histogram(fdf['METclean'],bins=78,range=(50,2000),weights=fdf['event_weight'])
    #rootOutFile["h_met_phi"] = np.histogram(fdf['METPhiclean'],bins=100,range=(0,3.14159))#needs to fit range
    rootOutFile["h_zp_jigm"] = np.histogram(fdf['ZPrime_mass_est'],bins=100,range=(500,5000),weights=fdf['event_weight'])
    rootOutFile["h_nd_jigm"] = np.histogram(fdf['ND_mass_est'],bins=70,range=(100,800),weights=fdf['event_weight'])