Example #1
def create_file(file_name, distributions, weights, labels, extra_weights=None):
    # extra_weights must hold the [up, down] weight variations whenever one of
    # the labels is "background"; otherwise it is unused
    if extra_weights is None:
        extra_weights = []
    n_events = len(weights[0])
    with uproot.recreate(file_name) as f:
        # write the predicted processes
        for i, label in enumerate(labels):
            lep_charge = create_lepton_charge(n_events)
            if label == "background":
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                    "weight_up": "float64",
                    "weight_down": "float64",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                    "weight_up": extra_weights[0],
                    "weight_down": extra_weights[1],
                })
            else:
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int"
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                })
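These snippets use the uproot3-era writing API: declare the branches with newtree, then append baskets with extend. A minimal self-contained sketch of the same pattern (file and branch names are made up; assumes uproot3 is installed, imported here as uproot just like in the example):

import numpy as np
import uproot3 as uproot

with uproot.recreate("demo.root") as f:
    # declare the branch layout first ...
    f["signal"] = uproot.newtree({"jet_pt": "float64", "weight": "float64"})
    # ... then append one basket; all arrays must have the same length
    f["signal"].extend({"jet_pt": np.random.exponential(50.0, 1000),
                        "weight": np.ones(1000)})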
Example #2
def dataframe_to_ttree(df,
                       filename,
                       treename="t",
                       chunksize=1e6,
                       compression=uproot3.LZ4(1),
                       progress=True):
    """
    Writes ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    t = uproot3.newtree(df.dtypes)
    with uproot3.recreate(filename, compression=compression) as f:
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            f[treename].extend({k: chunk[k].values for k in chunk.columns})
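A hypothetical invocation with a toy DataFrame (names are placeholders):

import numpy as np
import pandas as pd

df = pd.DataFrame({"met": np.random.exponential(30.0, 10000),
                   "njet": np.random.randint(0, 5, 10000)})
# 10000 rows written in baskets of 2048 rows each
dataframe_to_ttree(df, "events.root", treename="t", chunksize=2048, progress=False)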
Example #3
def create_ntuple(fname, treename, varname, var_array, weightname,
                  weight_array):
    with uproot.recreate(fname) as f:
        f[treename] = uproot.newtree({
            varname: "float64",
            weightname: "float64"
        })
        f[treename].extend({varname: var_array, weightname: weight_array})
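For instance (toy arrays; the file, tree, and branch names are made up):

import numpy as np

var_array = np.random.normal(125.0, 2.0, 5000)
weight_array = np.ones(5000)
create_ntuple("ntuple.root", "events", "mass", var_array, "weight", weight_array)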
Example #4
def to_root_multi(filename, d):
    with uproot3.recreate(filename) as f:
        for treename in d.keys():
            df = d[treename]
            f[treename] = uproot3.newtree(
                {col: df[col].dtype
                 for col in df.columns})
            f[treename].extend(dict(df))
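Each dictionary key becomes one TTree, e.g. (toy data; assumes numpy and pandas are imported):

d = {
    "signal": pd.DataFrame({"x": np.random.normal(0.0, 1.0, 1000)}),
    "background": pd.DataFrame({"x": np.random.uniform(-5.0, 5.0, 5000)}),
}
to_root_multi("two_trees.root", d)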
Example #5
def merge_root(rootfiles, outputfile, incrementRunId=False):
    """
    Merge root files in output files
    """
    try:
        import uproot3 as uproot
    except:
        print("uproot3 is mandatory to merge root file. Please, do:")
        print("pip install uproot3")

    out = uproot.recreate(outputfile)

    #Previous ID values to be able to increment runIn or EventId
    previousId = {}

    #create the dict reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        root_keys = unicity(root.keys())
        for tree in root_keys:
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                    previousId[tree] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        # byte-string branches cannot be merged numerically,
                        # so replace them with zeros
                        if isinstance(array[0], bytes):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        branch_name = branch.decode('utf-8')
                        # offset event/run IDs so they remain unique across
                        # the merged files
                        if ((not incrementRunId and branch_name.startswith('eventID'))
                                or (incrementRunId and branch_name.startswith('runID'))):
                            if branch not in previousId[tree]:
                                previousId[tree][branch] = 0
                            array += previousId[tree][branch]
                            previousId[tree][branch] = max(array) + 1
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    #write the merged arrays into the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] or trees[tree]["rootDictType"]:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
    out.close()
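A quick way to verify the merge is to read the output back with uproot3 (a sketch; the input file names are placeholders and the tree names depend on the inputs):

import uproot3 as uproot

merge_root(["run1.root", "run2.root"], "merged.root", incrementRunId=True)
merged = uproot.open("merged.root")
for name in merged.keys():
    print(name, merged[name].numentries)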
Example #6
def to_root(
        df,
        filename,
        treename="t",
        chunksize=20e3,
        compression=uproot3.ZLIB(1),
        compression_jagged=uproot3.ZLIB(1),
        progress=False,
):
    """
    Writes ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    tree_dtypes = dict()
    jagged_branches = []
    for bname, dtype in df.dtypes.items():
        if "fletcher" in str(dtype):
            dtype = np.dtype(dtype.arrow_dtype.value_type.to_pandas_dtype())
            tree_dtypes[bname] = uproot3.newbranch(
                dtype, size=bname + "_varn", compression=compression_jagged)
            jagged_branches.append(bname)
        elif "object" in str(dtype):
            raise RuntimeError(
                f"Don't know how to serialize column {bname} with object dtype."
            )
        else:
            # treat unsigned dtypes as signed (e.g. uint32 -> int32)
            dtype = str(dtype).lstrip("u")
            tree_dtypes[bname] = dtype
    with uproot3.recreate(filename, compression=compression) as f:
        t = uproot3.newtree(tree_dtypes)
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            basket = dict()
            for column in chunk.columns:
                if column in jagged_branches:
                    arr = chunk[column].ak(version=0)
                    arr = maybe_unmask_jagged_array(arr)
                    # profiling says 30% of the time is spent checking if jagged __getitem__ is given a string
                    # this is not needed for writing out TTree branches, so free speedup.
                    arr._util_isstringslice = lambda x: False
                    basket[column] = arr
                    basket[column + "_varn"] = arr.counts.astype("int32")
                else:
                    basket[column] = chunk[column].values
            f[treename].extend(basket)
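The jagged ("fletcher") columns above rely on uproot3's counter-branch convention: declare the branch with newbranch(dtype, size=<counter branch name>) and pass both the jagged array and its counts to extend. A minimal stand-alone sketch with an awkward0 JaggedArray (file and branch names are made up):

import awkward0
import numpy as np
import uproot3

jagged = awkward0.fromiter([[1.0, 2.0], [], [3.0, 4.0, 5.0]])
with uproot3.recreate("jagged.root") as f:
    f["t"] = uproot3.newtree({"x": uproot3.newbranch(np.dtype("f8"), size="x_varn")})
    f["t"].extend({"x": jagged, "x_varn": jagged.counts})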
Example #7
def _write_root(file, table, treename='Events', compression=-1, step=1048576):
    if compression == -1:
        compression = uproot3.write.compress.LZ4(4)
    with uproot3.recreate(file, compression=compression) as fout:
        fout[treename] = uproot3.newtree(
            {k: v.dtype
             for k, v in table.items()})
        start = 0
        while start < len(list(table.values())[0]):
            fout[treename].extend(
                {k: v[start:start + step]
                 for k, v in table.items()})
            start += step
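For example, with a plain dict of equal-length numpy arrays (hypothetical names):

import numpy as np

table = {"pt": np.random.exponential(20.0, 100000),
         "eta": np.random.normal(0.0, 2.5, 100000)}
# write in baskets of 2**15 rows instead of the default 2**20
_write_root("table.root", table, treename="Events", step=32768)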
Example #8
def create_file_pseudodata(file_name, pseudodata):
    n_events = len(pseudodata)
    with uproot.recreate(file_name) as f:
        # write pseudodata
        lep_charge = create_lepton_charge(n_events)
        f["pseudodata"] = uproot.newtree({
            "jet_pt": "float64",
            "lep_charge": "int"
        })
        f["pseudodata"].extend({
            "jet_pt": pseudodata,
            "lep_charge": lep_charge
        })
Example #9
def pandas_to_tree(data, file_name, tree_name):
    """
    Save pandas dataframe as a ROOT TTree.

    :param pandas.DataFrame data: dataframe to be stored
    :param str file_name: path and name of the output file
    :param str tree_name: name of the result TTree
    """
    branch_dict = {
        data.columns[i]: data.dtypes[i]
        for i in range(0, len(data.columns))
    }
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend({
            data.columns[i]: data[data.columns[i]].to_numpy()
            for i in range(0, len(data.columns))
        })
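Typical use (toy frame; note that Example #10 below is the same implementation with a fuller docstring):

import numpy as np
import pandas as pd

data = pd.DataFrame({"pt": np.random.exponential(30.0, 1000),
                     "eta": np.random.normal(0.0, 2.0, 1000)})
pandas_to_tree(data, "frame.root", "events")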
Example #10
def pandas_to_tree(data, file_name, tree_name):
    """
      Parameters
      ----------
      data : pandas.DataFrame
          Data frame which should be stored as TTree
      file_name : str
          Path and name of root file
      tree_name : str
          Name of TTree
    """
    branch_dict = {
        data.columns[i]: data.dtypes[i]
        for i in range(0, len(data.columns))
    }
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend({
            data.columns[i]: data[data.columns[i]].to_numpy()
            for i in range(0, len(data.columns))
        })
Example #11
def merge_root(rootfiles, outputfile):
    """
    Merge root files in output files
    """
    out = uproot.recreate(outputfile)

    #create the dict reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        for tree in root.keys():
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        # byte-string branches cannot be merged numerically,
                        # so replace them with zeros
                        if isinstance(array[0], bytes):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    #write the merged arrays into the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] or trees[tree]["rootDictType"]:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
    out.close()
Example #12
def save_dict_to_root(dic, file_name, tree_name=None):
    """
    This function stores data arrays in the form of a dictionary into a root file.
    It provides a convenient interface to ``uproot``.

    :param dic: Dictionary of data (or a list of dictionaries, one per tree)
    :param file_name: String; ".root" is appended if missing
    :param tree_name: String or list of strings. By default it's "DataTree".
    """
    if file_name[-5:] == ".root":
        file_name = file_name[:-5]
    if isinstance(dic, dict):
        dic = [dic]
    if tree_name is None:
        tree_name = "DataTree"
    Ndic = len(dic)
    if isinstance(tree_name, list):
        assert len(tree_name) == Ndic
    else:
        t = []
        for i in range(Ndic):
            t.append(tree_name + str(i))
        tree_name = t

    with uproot.recreate(file_name + ".root") as f:
        for d, t in zip(dic, tree_name):
            branch_type = {}
            branch_data = {}
            for i in d:
                # sanitize branch names: parentheses and spaces become "_",
                # "*" -> "star", "+" -> "p", "-" -> "m"
                j = (i.replace("(", "_").replace(")", "_").replace(" ", "_")
                     .replace("*", "star").replace("+", "p").replace("-", "m"))
                branch_data[j] = np.array(d[i])
                branch_type[j] = branch_data[j].dtype.name
            f[t] = uproot.newtree(branch_type)
            f[t].extend(branch_data)
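The renaming matters because ROOT branch names cannot contain spaces, parentheses, or operator characters. A hypothetical call:

import numpy as np

# the key "m(K+ pi-)" is stored as branch "m_Kp_pim_"; since tree_name is a
# plain string, an index is appended and the single tree is named "fit0"
save_dict_to_root({"m(K+ pi-)": np.random.rand(100)}, "data.root", "fit")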
Example #13
arr = ak.to_awkward0(
    ak.Array([np.zeros(20, dtype=np.dtype("f8"))[:np.random.randint(4, 20)]
              for i in range(100)]))

# jagged branch "key-a" declared with the counter branch "n"
out_file["ttree"] = ur.newtree({"key-a": ur.newbranch(np.dtype("f8"), size="n")})
out_file["ttree"].extend({"key-a": arr, "n": arr.counts})

print("Trying to load the file with root_numpy")
nparr = rn.root2array([f_path], treename="ttree", stop=None, branches=keys)
Example #14
def to_root(df, filename, treename):
    with uproot3.recreate(filename) as f:
        f[treename] = uproot3.newtree(
            {col: df[col].dtype
             for col in df.columns})
        f[treename].extend(dict(df))
Example #15
def AddFFcorr(infname,
              intreename,
              outfname,
              outtreename,
              Lcstate,
              leptname,
              q2True_branchname,
              costhlTrue_branchname,
              nentries_to_read=1000000000,
              chunksize=10000):

    TH1.AddDirectory(kFALSE)

    perfname = None
    q2factor = None
    if Lcstate == 'Lc':
        perfname = './CorrectionTables/LcFFratios.root'
        q2factor = 1.
    elif (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
        perfname = './CorrectionTables/LcstFFratios.root'
        q2factor = 1e-6
    else:
        raise Exception('Lc state not recognised', Lcstate)

    if leptname != 'mu' and leptname != 'tau':
        raise Exception('Lepton name not recognised', leptname)

    print('Using the histname', Lcstate + leptname + "_ratio")

    #variables to get from file
    varsdf = ['runNumber', 'eventNumber']
    varsdf += ['Lb_TRUEP_X', 'Lb_TRUEP_Y', 'Lb_TRUEP_Z', 'Lb_TRUEP_E']
    varsdf += ['Lc_TRUEP_X', 'Lc_TRUEP_Y', 'Lc_TRUEP_Z', 'Lc_TRUEP_E']
    varsdf += [
        'Lb_True' + leptname.capitalize() + '_PX',
        'Lb_True' + leptname.capitalize() + '_PY',
        'Lb_True' + leptname.capitalize() + '_PZ',
        'Lb_True' + leptname.capitalize() + '_PE'
    ]
    varsdf += [
        'Lb_TrueNeutrino_PX', 'Lb_TrueNeutrino_PY', 'Lb_TrueNeutrino_PZ',
        'Lb_TrueNeutrino_PE'
    ]

    File = TFile.Open(perfname, "read")
    Histg = File.Get(Lcstate + leptname + "_ratio")
    perfHist = Histg.Clone(Lcstate + leptname + "_rationew")
    File.Close()
    Xmin = perfHist.GetXaxis().GetXmin()
    Xmax = perfHist.GetXaxis().GetXmax()
    Ymin = perfHist.GetYaxis().GetXmin()
    Ymax = perfHist.GetYaxis().GetXmax()
    Limits = (Xmin, Xmax, Ymin, Ymax)
    print(Limits, perfHist.Integral())

    #variables to store in the new ttree (np.int64: np.int was removed in NumPy 1.24)
    varstoStore = {
        'runNumber': np.int64,
        'eventNumber': np.int64,
        'Event_FFcorr': np.float64,
        costhlTrue_branchname: np.float64,
        q2True_branchname: np.float64
    }

    aliases = {}
    #create a new rootfile
    with uproot3.recreate(outfname) as f:
        f[outtreename] = uproot3.newtree(varstoStore)

        #loop over the old rootfile chunkwise
        events_read = 0
        if chunksize >= nentries_to_read: chunksize = nentries_to_read
        for df_data in uproot4.iterate(infname + ':' + intreename,
                                       varsdf,
                                       aliases=aliases,
                                       cut=None,
                                       library="pd",
                                       step_size=chunksize):
            if events_read >= nentries_to_read: break

            #Compute q2 and cosThetaL
            pxl = df_data['Lb_True' + leptname.capitalize() + '_PX']
            pxnu = df_data['Lb_TrueNeutrino_PX']
            pyl = df_data['Lb_True' + leptname.capitalize() + '_PY']
            pynu = df_data['Lb_TrueNeutrino_PY']
            pzl = df_data['Lb_True' + leptname.capitalize() + '_PZ']
            pznu = df_data['Lb_TrueNeutrino_PZ']
            pel = df_data['Lb_True' + leptname.capitalize() + '_PE']
            penu = df_data['Lb_TrueNeutrino_PE']
            if (Lcstate == 'Lc2595' or Lcstate == 'Lc2625'):
                #Lc* momentum = Lb momentum minus lepton and neutrino momenta
                pxlc = df_data['Lb_TRUEP_X'] - pxl - pxnu
                pylc = df_data['Lb_TRUEP_Y'] - pyl - pynu
                pzlc = df_data['Lb_TRUEP_Z'] - pzl - pznu
                pelc = df_data['Lb_TRUEP_E'] - pel - penu
            elif Lcstate == 'Lc':
                pxlc = df_data['Lc_TRUEP_X']
                pylc = df_data['Lc_TRUEP_Y']
                pzlc = df_data['Lc_TRUEP_Z']
                pelc = df_data['Lc_TRUEP_E']

            #format: LorentzVector(Vector(X, Y, Z), E)
            PLc_lab = LorentzVector(Vector(pxlc, pylc, pzlc), pelc)
            Pl_lab = LorentzVector(Vector(pxl, pyl, pzl), pel)
            PNu_lab = LorentzVector(Vector(pxnu, pynu, pznu), penu)
            PLb_lab = PLc_lab + Pl_lab + PNu_lab
            qsq, cthl = return_phasespace(PLb_lab, PLc_lab, Pl_lab)
            #print(qsq,cthl)
            df_data[q2True_branchname] = qsq
            df_data[costhlTrue_branchname] = cthl

            #get the corrections (order has to match the histogram axes)
            applyvars = [q2True_branchname, costhlTrue_branchname]
            df_data['Event_FFcorr'] = df_data[applyvars].apply(
                storeeff2D, args=[perfHist, Limits, q2factor], axis=1)

            #get only the things that need to be stored and write them to the file
            branch_dict = {
                vartostore: df_data[vartostore].to_numpy()
                for vartostore in list(varstoStore.keys())
            }
            f[outtreename].extend(branch_dict)
            events_read += df_data.shape[0]
            print('Events read', events_read)
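The chunked read-with-uproot4 / write-with-uproot3 pattern used above can be distilled to a few lines (a sketch, assuming an input file in.root containing a tree t with a float64 branch x):

import numpy as np
import uproot3
import uproot4

with uproot3.recreate("skim.root") as fout:
    fout["t"] = uproot3.newtree({"x": np.float64})
    for df in uproot4.iterate("in.root:t", ["x"], library="pd", step_size=10000):
        # each chunk read becomes one basket in the output tree
        fout["t"].extend({"x": df["x"].to_numpy()})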
Example #16
def main(args):

    processed_nano = "tmva_xgboost_reproducers/lead_processed_nano_uncorr.root"
    df = pd.read_parquet(args.input_dataframe)
    print("Read input dataframe:\n{}".format(df))

    inputs = {
        "lead_energyRaw": "SCRawE", 
        "lead_r9": "r9", 
        "lead_sieie":"sigmaIetaIeta", 
        "lead_etaWidth": "etaWidth", 
        "lead_phiWidth": "phiWidth", 
        "lead_sieip": "covIEtaIPhi", 
        "lead_s4": "s4", 
        "lead_pfPhoIso03": "phoIso03", 
        "lead_pfChargedIsoPFPV": "chgIsoWrtChosenVtx", 
        "lead_pfChargedIsoWorstVtx": "chgIsoWrtWorstVtx",
        "lead_eta": "scEta", 
        "lead_fixedGridRhoAll": "rho"
        }

    # This is needed just so we don't hardcode the branch type later
    arr_dict = {}
    for name in inputs.keys():
        name_orig = name
        # Since I don't remember which ones have the suffix _nano
        if name_orig not in list(df.columns):
            name += "_nano"
        arr_dict[name_orig] = df[name]
    ak_arr = ak.Array(arr_dict)
    print(ak_arr.type)

    # Explicitly recompute the XGBoost one too, just because
    print("Recomputing MVA with XGBoost")
    mva = xgboost.Booster()
    mva.load_model(args.xgboost_model)
    var_order = list(arr_dict.keys())
    bdt_inputs = np.column_stack([ak.to_numpy(ak_arr[name]) for name in var_order])
    tempmatrix = xgboost.DMatrix(bdt_inputs, feature_names=var_order)
    lead_idmva_xgboost = mva.predict(tempmatrix)
    # Thomas workflow
    lead_idmva_xgboost = -np.log(1./lead_idmva_xgboost - 1.)
    lead_idmva_xgboost = 2. / (1. + np.exp(-2.*lead_idmva_xgboost)) - 1.

    # Dump nanoaod inputs to a TTree
    with uproot3.recreate(processed_nano) as f:
        branchdict = {}
        arraydict = {}
    
        for nano_name, model_name in inputs.items():
            #branchdict[model_name] = str(ak_arr[nano_name].type.type).replace('?', '')
            branchdict[model_name] = "float32"
            arraydict[model_name] = ak_arr[nano_name]
    
        f["Events"] = uproot3.newtree(branchdict)
        f["Events"].extend(arraydict)

    # TMVA with RDataFrame
    ROOT.gInterpreter.ProcessLine('''
    TMVA::Experimental::RReader model("{}");
    computeModel = TMVA::Experimental::Compute<{}, float>(model);
    '''.format(args.tmva_model, len(ak_arr.fields)))

    rdf = ROOT.RDataFrame("Events", processed_nano)
    rdf = rdf.Define("lead_idmva_tmva", ROOT.computeModel, ROOT.model.GetVariableNames())
    print("Running RDF event loop")
    dct = rdf.AsNumpy(columns=["lead_idmva_tmva"])
    lead_idmva_tmva = np.array([v[0] for v in dct["lead_idmva_tmva"]])

    # Plot
    print("Plotting to {}".format(args.output_dir))
    bins = 100
    rng = (-1, 1)

    fig, (up, down) = plt.subplots(
        nrows=2,
        ncols=1,
        gridspec_kw={"height_ratios": (1, 1)}
        )

    up.hist(lead_idmva_xgboost, bins=bins, range=rng, histtype="step", label="XGBoost", linewidth=2)
    up.hist(lead_idmva_tmva, bins=bins, range=rng, histtype="step", label="TMVA", linewidth=2)

    up.set_xlabel("lead PhoIDMVA after corrections")
    up.legend(fontsize=18, loc="upper left")

    down.hist(100 * (lead_idmva_xgboost - lead_idmva_tmva) / lead_idmva_tmva, 
              bins=500,
              range=(-100, 100),
              histtype="step",
              density=True,
              color="black",
              linewidth=2
             )
    down.set_xlabel("$(XGB - TMVA)/TMVA$ [%]")
    down.set_yscale("log")

    fig.tight_layout()

    fig.savefig("{}/lead_xgb_tmva.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/lead_xgb_tmva.pdf".format(args.output_dir), bbox_inches='tight')

    fig, ax = plt.subplots()
    ax.scatter(lead_idmva_xgboost, lead_idmva_tmva)
    ax.set_xlabel("XGBoost")
    ax.set_ylabel("TMVA")

    fig.savefig("{}/xgb_tmva_scatter.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/xgb_tmva_scatter.pdf".format(args.output_dir), bbox_inches='tight')
Example #17
def dump_generated_events(arr: ak.Array):
    fn = f"wd/{conf.tag}/output.root"
    with uproot.recreate(fn) as file:
        # dict(arr.type.type) maps each record field to its type, which
        # newtree uses to declare the branches
        file["tree1"] = uproot.newtree(dict(arr.type.type))
        file["tree1"].extend({branch: arr[branch] for branch in arr.fields})
Example #18
def read_file(path, sample, branches=branches):
    print("=====")
    print("Processing {0} file".format(sample))
    mem = psutil.virtual_memory()
    mem_at_start = mem.available / (1024 ** 2)
    print(f'Available Memory: {mem_at_start:.0f} MB')
    count = 0
    hists = {}
    start = time.time()
    batch_num = 0
    with uproot.open(path) as file:
        tree = file['mini']
        numevents = tree.num_entries
        print(f'Total number of events in file: {numevents}')

        for batch in tree.iterate(branches, step_size='30 MB', library='np'):
            print('==============')
            df = pandas.DataFrame.from_dict(batch)
            del batch
            num_before_cuts = len(df.index)
            print("Events before cuts: {0}".format(num_before_cuts))
            count += num_before_cuts
            if 'Data' not in sample:
                df['totalWeight'] = np.vectorize(calc_weight)(df.mcWeight, df.scaleFactor_ELE, df.scaleFactor_MUON,
                                                              df.scaleFactor_PILEUP, df.scaleFactor_TRIGGER,
                                                              df.scaleFactor_ZVERTEX)
                df["totalWeight"] = np.vectorize(get_xsec_weight)(df.totalWeight, sample)
            else:
                df['totalWeight'] = [1 for item in range(len(df.index))]

            df.drop(["mcWeight", "scaleFactor_ELE", "scaleFactor_MUON",
                     "scaleFactor_PILEUP", "scaleFactor_TRIGGER", 'scaleFactor_ZVERTEX'],
                    axis=1,
                    inplace=True)

            # Standard selection cuts
            df = df.query("trigE or trigM")
            df = df.query('passGRL')
            df = df.query('hasGoodVertex')

            df.drop(["trigE", "trigM", "passGRL", "hasGoodVertex"],
                    axis=1,
                    inplace=True)

            # Lepton requirements
            df['good_lepton'] = np.vectorize(WCuts.cut_GoodLepton)(df.lep_flag, df.lep_pt,
                                                                   df.lep_ptcone30, df.lep_etcone20,
                                                                   df.lep_n, df.lep_type)
            df = df.query('good_lepton > -1')
            for column in df.columns:
                if 'lep' in column and column not in ['lep_n', 'good_lepton']:
                    df[column] = np.vectorize(extract_good_lepton)(df[column], df['good_lepton'])

            # W transverse mass
            df['mtw'] = np.vectorize(calc_mtw)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)
            df = df.query('mtw > 30000.')
            df = df.query('met_et > 30000.')

            # Convert MeV to GeV
            df['lep_pt'] = df['lep_pt'] / 1000
            df['lep_E'] = df['lep_E'] / 1000
            df['met_et'] = df['met_et'] / 1000
            df['mtw'] = df['mtw'] / 1000

            df['mtw_enu'] = df.query('lep_type == 11')['mtw']
            df['mtw_munu'] = df.query('lep_type == 13')['mtw']

            df['WT_phi'] = np.vectorize(calc_W_phi)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)

            df['jet_n'] = df['alljet_n']
            df.drop(['alljet_n'], axis=1, inplace=True)

            # Asymmetry related histograms
            df['pos_ele_eta'] = df.query('lep_type == 11 and lep_charge == 1')['lep_eta']
            df['pos_ele_eta'] = np.vectorize(abs_value)(df.pos_ele_eta)

            df['neg_ele_eta'] = df.query('lep_type == 11 and lep_charge == -1')['lep_eta']
            df['neg_ele_eta'] = np.vectorize(abs_value)(df.neg_ele_eta)

            df['pos_mu_eta'] = df.query('lep_type == 13 and lep_charge == 1')['lep_eta']
            df['pos_mu_eta'] = np.vectorize(abs_value)(df.pos_mu_eta)

            df['neg_mu_eta'] = df.query('lep_type == 13 and lep_charge == -1')['lep_eta']
            df['neg_mu_eta'] = np.vectorize(abs_value)(df.neg_mu_eta)

            df['lep_pt_j0'] = df.query('jet_n == 0')['lep_pt']
            df['lep_pt_j1'] = df.query('jet_n == 1')['lep_pt']
            df['lep_pt_j2'] = df.query('jet_n > 1')['lep_pt']

            df['mtw_j0'] = df.query('jet_n == 0')['mtw']
            df['mtw_j1'] = df.query('jet_n == 1')['mtw']
            df['mtw_j2'] = df.query('jet_n > 1')['mtw']

            df['met_et_j0'] = df.query('jet_n == 0')['met_et']
            df['met_et_j1'] = df.query('jet_n == 1')['met_et']
            df['met_et_j2'] = df.query('jet_n > 1')['met_et']

            df['lep_eta_j0'] = df.query('jet_n == 0')['lep_eta']
            df['lep_eta_j1'] = df.query('jet_n == 1')['lep_eta']
            df['lep_eta_j2'] = df.query('jet_n > 1')['lep_eta']

            if len(df.loc[df['jet_n'] > 0].index) > 0:
                temp_df = pandas.DataFrame()
                temp_df['eventNumber'] = df.loc[df['jet_n'] > 0]['eventNumber']
                for column in df.columns:
                    if 'jet' in column and column != 'jet_n':
                        temp_df[f'lead_{column}'] = np.vectorize(find_lead_jet)(df.loc[df['jet_n'] > 0]['jet_pt'],
                                                                                df.loc[df['jet_n'] > 0][column])
                temp_df['lead_jet_pt'] = temp_df['lead_jet_pt'] / 1000.

                temp_df['lj_phi_diff'] = np.vectorize(calc_delta_phi)(df.loc[df['jet_n'] > 0]['lep_phi'],
                                                                      temp_df['lead_jet_phi'])
                temp_df['abs_lj_phi_diff'] = np.vectorize(abs_value)(temp_df.lj_phi_diff)

                temp_df['Wj_phi_diff'] = np.vectorize(calc_delta_phi)(df.loc[df['jet_n'] > 0]['WT_phi'],
                                                                      temp_df['lead_jet_phi'])
                temp_df['abs_Wj_phi_diff'] = np.vectorize(abs_value)(temp_df.Wj_phi_diff)
                df = pandas.merge(left=df, right=temp_df, left_on='eventNumber', right_on='eventNumber', how='left')

            num_after_cuts = len(df.index)
            print("Number of events after cuts: {0}".format(num_after_cuts))
            print(f'Currently at {(count * 100 / numevents):.0f}% of events ({count}/{numevents})')

            for key, hist in hist_dicts.items():
                h_bin_width = hist["bin_width"]
                h_num_bins = hist["numbins"]
                h_xmin = hist["xmin"]

                x_var = hist["xvariable"]

                bins = [h_xmin + x * h_bin_width for x in range(h_num_bins + 1)]
                data_x, binning = np.histogram(df[x_var].values, bins=bins, weights=df.totalWeight.values)
                data_x = data_x.astype('float64')
                histo = uproot3_methods.classes.TH1.from_numpy((data_x, binning))
                if key not in hists.keys():
                    hists[key] = histo
                else:
                    for i in range(len(hists[key])):
                        hists[key][i] += histo[i]
            os.makedirs(f'../DataForFit_8TeV/{sample}', exist_ok=True)
            f = uproot3.recreate(f'../DataForFit_8TeV/{sample}/{sample}_{batch_num}.root')

            f['FitTree'] = uproot3.newtree({'mtw': uproot3.newbranch(np.float64, 'mtw'),
                                            'jet_n': uproot3.newbranch(np.int32, 'jet_n'),
                                            'totalWeight': uproot3.newbranch(np.float64, 'totalWeight')})

            f['FitTree'].extend({'mtw': df['mtw'].to_numpy(dtype=np.float64),
                                 'jet_n': df['jet_n'].to_numpy(dtype=np.int32),
                                 'totalWeight': df['totalWeight'].to_numpy(dtype=np.float64)})
            f.close()
            batch_num += 1
            del df
            gc.collect()
            # diagnostics
            mem = psutil.virtual_memory()
            actual_mem = mem.available / (1024 ** 2)
            print(f'Current available memory {actual_mem:.0f} MB '
                  f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')

    file = uproot3.recreate(f'../Output_8TeV/{sample}.root', uproot3.ZLIB(4))

    for key, hist in hists.items():
        file[key] = hist
        print(f'{key} histogram')
        file[key].show()

    file.close()

    mem = psutil.virtual_memory()
    actual_mem = mem.available / (1024 ** 2)
    print(f'Current available memory {actual_mem:.0f} MB '
          f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')
    print('Finished!')
    print(f'Time elapsed: {time.time() - start} seconds')
    return None
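The histogram side of this function uses uproot3_methods' TH1.from_numpy, which wraps a numpy histogram so it can be assigned into an output file. In isolation (a sketch with toy data and a made-up key):

import numpy as np
import uproot3
import uproot3_methods.classes.TH1

contents, edges = np.histogram(np.random.normal(size=10000), bins=50)
hist = uproot3_methods.classes.TH1.from_numpy((contents.astype("float64"), edges))
with uproot3.recreate("hists.root") as f:
    f["h_demo"] = hist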
Example #19
# This example survives only as two overlapping fragments of the same script.

# Fragment A: per-split branch dictionaries, one output file per split.
                    branch_dicts[i_split]["branch_dict"]["{}_counts".format(
                        online_key)] = counts
            else:
                arr = online_tree[online_key].array()[on_mask].flatten()[:N_jets]
                dtype = np.dtype("f4")

            for i_split, (n_start, n_end) in enumerate(zip(n_starts, n_ends)):
                branch_dicts[i_split]["branch_registration"][online_key] = dtype
                branch_dicts[i_split]["branch_dict"][online_key] = arr[n_start:n_end]

for i_split in range(SPLITS):
    out_path = os.path.join(base_dir,
                            "{}_{}_{}.root".format(name, branch_key, i_split))
    print("Creating new tree: {}".format(out_path))
    with u3.recreate(out_path, compression=u3.ZLIB(6)) as out_file:
        print("Creating new tree for {}".format(branch_key))
        out_file["ttree"] = u3.newtree(branch_dicts[i_split]["branch_registration"])
        print("Creating branches")
        out_file["ttree"].extend(branch_dicts[i_split]["branch_dict"])

# Fragment B (starts mid-`if`): single-file variant; jagged branches are
# declared with a counter branch ("<name>_counts") via u3.newbranch(..., size=...).
        dtype = u3.newbranch(
            np.dtype("f8"),
            size="{}_counts".format(online_key),
        )
        counts = arr.counts
        branch_dict["{}_counts".format(online_key)] = counts
    elif track_key_indicator in online_key:
        arr = track_var_to_flat(online_tree[online_key].array(),
                                tracking_index_low,
                                tracking_index_high)[:N_jets]
        dtype = u3.newbranch(
            np.dtype("f8"),
            size="{}_counts".format(online_key),
        )
        counts = arr.counts
        branch_dict["{}_counts".format(online_key)] = counts
    else:
        arr = online_tree[online_key].array().flatten()[:N_jets]
        dtype = np.float32

    branch_registration[online_key] = dtype
    branch_dict[online_key] = arr

print("Creating new tree")
out_file["ttree"] = u3.newtree(branch_registration)
print("Creating branches")
s_time = time.time()
out_file["ttree"].extend(branch_dict)
e_time = time.time()
print("Total time needed for {0} events:\n{1:1.1f}".format(N_jets, e_time - s_time))