def create_file(file_name, distributions, weights, labels, extra_weights=None):
    if extra_weights is None:
        extra_weights = []
    n_events = len(weights[0])
    with uproot.recreate(file_name) as f:
        # write the predicted processes
        for i, label in enumerate(labels):
            lep_charge = create_lepton_charge(n_events)
            if label == "background":
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                    "weight_up": "float64",
                    "weight_down": "float64",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                    "weight_up": extra_weights[0],
                    "weight_down": extra_weights[1],
                })
            else:
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                })
def dataframe_to_ttree(df, filename, treename="t", chunksize=1e6,
                       compression=uproot3.LZ4(1), progress=True):
    """
    Writes a ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    t = uproot3.newtree(df.dtypes)
    with uproot3.recreate(filename, compression=compression) as f:
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            # write one basket per chunk of rows
            chunk = df.iloc[i:i + chunksize]
            f[treename].extend({k: chunk[k].values for k in chunk.columns})
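# A minimal usage sketch for dataframe_to_ttree above (not from the original
# source): the column names and values are made up for illustration, and it
# assumes pandas, numpy, and uproot3 are importable.
import numpy as np
import pandas as pd

df_demo = pd.DataFrame({
    "met": np.random.exponential(30.0, 10000),
    "njet": np.random.randint(0, 8, 10000),
})
dataframe_to_ttree(df_demo, "events.root", treename="events", chunksize=2000)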
def create_ntuple(fname, treename, varname, var_array, weightname, weight_array):
    with uproot.recreate(fname) as f:
        f[treename] = uproot.newtree({varname: "float64", weightname: "float64"})
        f[treename].extend({varname: var_array, weightname: weight_array})
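# Hypothetical call to create_ntuple (illustration only): any two equal-length
# 1-D numpy arrays work as the variable and weight columns.
import numpy as np

create_ntuple("ntuple.root", "tree", "jet_pt",
              np.random.exponential(50.0, 1000),
              "weight", np.ones(1000))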
def to_root_multi(filename, d):
    with uproot3.recreate(filename) as f:
        for treename, df in d.items():
            f[treename] = uproot3.newtree({col: df[col].dtype for col in df.columns})
            f[treename].extend(dict(df))
def merge_root(rootfiles, outputfile, incrementRunId=False):
    """
    Merge several ROOT files into a single output file.
    """
    try:
        import uproot3 as uproot
    except ImportError:
        print("uproot3 is mandatory to merge root files. Please do:")
        print("pip install uproot3")
        return

    out = uproot.recreate(outputfile)

    # previous ID values, needed to increment runID or eventID across files
    previousId = {}

    # build the dict by reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        root_keys = unicity(root.keys())
        for tree in root_keys:
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                    previousId[tree] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        # byte-string branches cannot be concatenated; replace with zeros
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        # shift eventID (or runID) so merged files keep unique IDs
                        if (not incrementRunId and branch.decode('utf-8').startswith('eventID')) or \
                           (incrementRunId and branch.decode('utf-8').startswith('runID')):
                            if branch not in previousId[tree]:
                                previousId[tree][branch] = 0
                            array += previousId[tree][branch]
                            previousId[tree][branch] = max(array) + 1
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # write the accumulated dict into the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] != {} or trees[tree]["rootDictType"] != {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
def to_root(
    df,
    filename,
    treename="t",
    chunksize=20e3,
    compression=uproot3.ZLIB(1),
    compression_jagged=uproot3.ZLIB(1),
    progress=False,
):
    """
    Writes a ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    compression_jagged: compression object for jagged branches
    progress: show tqdm progress bar?
    """
    tree_dtypes = dict()
    jagged_branches = []
    for bname, dtype in df.dtypes.items():
        if "fletcher" in str(dtype):
            # jagged (list-valued) column backed by a fletcher/Arrow dtype:
            # declare a branch with a companion "<name>_varn" counts branch
            dtype = np.dtype(dtype.arrow_dtype.value_type.to_pandas_dtype())
            tree_dtypes[bname] = uproot3.newbranch(
                dtype, size=bname + "_varn", compression=compression_jagged)
            jagged_branches.append(bname)
        elif "object" in str(dtype):
            raise RuntimeError(
                f"Don't know how to serialize column {bname} with object dtype.")
        else:
            # write unsigned integer columns as signed ("uint32" -> "int32")
            dtype = str(dtype).lstrip("u")
            tree_dtypes[bname] = dtype
    with uproot3.recreate(filename, compression=compression) as f:
        t = uproot3.newtree(tree_dtypes)
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            basket = dict()
            for column in chunk.columns:
                if column in jagged_branches:
                    arr = chunk[column].ak(version=0)
                    arr = maybe_unmask_jagged_array(arr)
                    # profiling says 30% of the time is spent checking if jagged
                    # __getitem__ is given a string; this is not needed for
                    # writing out TTree branches, so free speedup.
                    arr._util_isstringslice = lambda x: False
                    basket[column] = arr
                    basket[column + "_varn"] = arr.counts.astype("int32")
                else:
                    basket[column] = chunk[column].values
            f[treename].extend(basket)
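# The jagged-branch mechanism that to_root relies on, shown as a standalone
# sketch (assumes the awkward0 package is installed): a jagged branch declared
# with uproot3.newbranch(..., size="n") needs its counts passed alongside the
# contents in extend().
import awkward0
import numpy as np
import uproot3

jagged = awkward0.fromiter([[1.0, 2.0], [], [3.0]])
with uproot3.recreate("jagged.root") as f:
    f["t"] = uproot3.newtree({"x": uproot3.newbranch(np.dtype("f8"), size="n")})
    f["t"].extend({"x": jagged, "n": jagged.counts})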
def _write_root(file, table, treename='Events', compression=-1, step=1048576):
    if compression == -1:
        compression = uproot3.write.compress.LZ4(4)
    with uproot3.recreate(file, compression=compression) as fout:
        fout[treename] = uproot3.newtree({k: v.dtype for k, v in table.items()})
        # write the table in baskets of `step` rows
        start = 0
        while start < len(list(table.values())[0]) - 1:
            fout[treename].extend({k: v[start:start + step] for k, v in table.items()})
            start += step
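# Illustrative call to _write_root with an in-memory table of equal-length
# numpy arrays (names and values are made up):
import numpy as np

table = {
    "pt": np.random.exponential(40.0, 5000).astype("f4"),
    "eta": np.random.uniform(-2.5, 2.5, 5000).astype("f4"),
}
_write_root("events.root", table, treename="Events", step=1024)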
def create_file_pseudodata(file_name, pseudodata):
    n_events = len(pseudodata)
    with uproot.recreate(file_name) as f:
        # write pseudodata
        lep_charge = create_lepton_charge(n_events)
        f["pseudodata"] = uproot.newtree({
            "jet_pt": "float64",
            "lep_charge": "int",
        })
        f["pseudodata"].extend({
            "jet_pt": pseudodata,
            "lep_charge": lep_charge,
        })
def pandas_to_tree(data, file_name, tree_name):
    """
    Save a pandas DataFrame as a ROOT TTree.

    :param pandas.DataFrame data: dataframe to be stored
    :param str file_name: path and name of the output file
    :param str tree_name: name of the resulting TTree
    """
    branch_dict = dict(zip(data.columns, data.dtypes))
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend(
            {column: data[column].to_numpy() for column in data.columns})
def pandas_to_tree(data, file_name, tree_name):
    """
    Parameters
    ----------
    data : pandas.DataFrame
        Data frame which should be stored as a TTree
    file_name : str
        Path and name of the ROOT file
    tree_name : str
        Name of the TTree
    """
    branch_dict = dict(zip(data.columns, data.dtypes))
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend(
            {column: data[column].to_numpy() for column in data.columns})
def merge_root(rootfiles, outputfile):
    """
    Merge several ROOT files into a single output file.
    """
    out = uproot.recreate(outputfile)

    # build the dict by reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        for tree in root.keys():
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        # byte-string branches cannot be concatenated; replace with zeros
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # write the accumulated dict into the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] != {} or trees[tree]["rootDictType"] != {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
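# Hypothetical usage of merge_root (file names are illustrative):
merge_root(["output_run1.root", "output_run2.root"], "merged.root")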
def save_dict_to_root(dic, file_name, tree_name=None):
    """
    Store data arrays from a dictionary in a ROOT file. It provides a
    convenient interface to ``uproot``.

    :param dic: Dictionary of data, or a list of such dictionaries
    :param file_name: String
    :param tree_name: String or list of strings. By default "DataTree{i}" is used.
    """
    if file_name.endswith(".root"):
        file_name = file_name[:-5]
    if isinstance(dic, dict):
        dic = [dic]
    if tree_name is None:
        tree_name = "DataTree"
    Ndic = len(dic)
    if isinstance(tree_name, list):
        assert len(tree_name) == Ndic
    else:
        tree_name = [tree_name + str(i) for i in range(Ndic)]
    with uproot.recreate(file_name + ".root") as f:
        for d, t in zip(dic, tree_name):
            branch_type = {}
            branch_data = {}
            for i in d:
                # sanitize branch names: replace (, ), spaces, *, +, -
                j = (i.replace("(", "_").replace(")", "_").replace(" ", "_")
                     .replace("*", "star").replace("+", "p").replace("-", "m"))
                branch_data[j] = np.array(d[i])
                branch_type[j] = branch_data[j].dtype.name
            f[t] = uproot.newtree(branch_type)
            f[t].extend(branch_data)
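# Usage sketch for save_dict_to_root: branch names are sanitized before
# writing, e.g. the key "m(K+ pi-)" is stored as branch "m_Kp_pim_".
import numpy as np

data = {"m(K+ pi-)": np.random.normal(0.892, 0.05, 1000)}
save_dict_to_root(data, "masses")  # writes masses.root with tree "DataTree0"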
# counts = arr.counts
# branch_dict["{}-counts".format(key)] = counts
# branch_reg[key] = dtype
# branch_dict[key] = arr
# out_file["ttree"] = ur.newtree(branch_reg)
# out_file["ttree"].extend(branch_dict)

# a jagged awkward0 array of doubles with between 4 and 20 entries per event
arr = ak.to_awkward0(
    ak.Array(
        [np.zeros(20, dtype=np.dtype("f8"))[:np.random.randint(4, 20)]
         for i in range(100)]
    )
)
# arr = ak.to_awkward0(ak.Array([np.zeros(20) for i in range(100)]))
# arr = ak.to_awkward0([np.zeros(20, np.dtype("f4")) for i in range(100)])

# write the jagged branch together with its counts branch
out_file["ttree"] = ur.newtree({"key-a": ur.newbranch(np.dtype("f8"), size="n")})
out_file["ttree"].extend({"key-a": arr, "n": arr.counts})
# out_file["ttree"] = ur.newtree({"key-a": np.float32})
# out_file["ttree"].extend({"key-a": np.zeros((200, 20), dtype=np.float32)})

print("Trying to load the file with root_numpy")
nparr = rn.root2array([f_path], treename="ttree", stop=None, branches=keys)
from IPython import embed; embed()
# for ar in nparr:
#     print(ar)
# print(nparr)
def to_root(df, filename, treename):
    with uproot3.recreate(filename) as f:
        f[treename] = uproot3.newtree({col: df[col].dtype for col in df.columns})
        f[treename].extend(dict(df))
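# Round-trip sketch for the simple to_root above (not from the original
# source): write a small frame, then read one branch back with uproot3.
import numpy as np
import pandas as pd
import uproot3

df_rt = pd.DataFrame({"x": np.arange(5, dtype="f8"), "y": np.ones(5)})
to_root(df_rt, "simple.root", "t")
assert np.allclose(uproot3.open("simple.root")["t"].array("x"), df_rt["x"].values)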
def AddFFcorr(infname, intreename, outfname, outtreename, Lcstate, leptname,
              q2True_branchname, costhlTrue_branchname,
              nentries_to_read=1000000000, chunksize=10000):
    TH1.AddDirectory(kFALSE)

    if Lcstate == 'Lc':
        perfname = './CorrectionTables/LcFFratios.root'
        q2factor = 1.
    elif Lcstate in ('Lc2595', 'Lc2625'):
        perfname = './CorrectionTables/LcstFFratios.root'
        q2factor = 1e-6
    else:
        raise Exception('Lc state not recognised', Lcstate)

    if leptname != 'mu' and leptname != 'tau':
        raise Exception('Lepton name not recognised', leptname)

    print('Using the histname', Lcstate + leptname + "_ratio")

    # variables to read from the input file
    varsdf = ['runNumber', 'eventNumber']
    varsdf += ['Lb_TRUEP_X', 'Lb_TRUEP_Y', 'Lb_TRUEP_Z', 'Lb_TRUEP_E']
    varsdf += ['Lc_TRUEP_X', 'Lc_TRUEP_Y', 'Lc_TRUEP_Z', 'Lc_TRUEP_E']
    varsdf += [
        'Lb_True' + leptname.capitalize() + '_PX',
        'Lb_True' + leptname.capitalize() + '_PY',
        'Lb_True' + leptname.capitalize() + '_PZ',
        'Lb_True' + leptname.capitalize() + '_PE'
    ]
    varsdf += [
        'Lb_TrueNeutrino_PX', 'Lb_TrueNeutrino_PY',
        'Lb_TrueNeutrino_PZ', 'Lb_TrueNeutrino_PE'
    ]

    File = TFile.Open(perfname, "read")
    Histg = File.Get(Lcstate + leptname + "_ratio")
    perfHist = Histg.Clone(Lcstate + leptname + "_rationew")
    File.Close()
    Xmin = perfHist.GetXaxis().GetXmin()
    Xmax = perfHist.GetXaxis().GetXmax()
    Ymin = perfHist.GetYaxis().GetXmin()
    Ymax = perfHist.GetYaxis().GetXmax()
    Limits = (Xmin, Xmax, Ymin, Ymax)
    print(Limits, perfHist.Integral())

    # variables to store in the new ttree
    # (np.int is deprecated in modern numpy; np.int64 keeps a 64-bit branch)
    varstoStore = {
        'runNumber': np.int64,
        'eventNumber': np.int64,
        'Event_FFcorr': np.float64,
        costhlTrue_branchname: np.float64,
        q2True_branchname: np.float64
    }
    aliases = {}

    # create a new rootfile
    with uproot3.recreate(outfname) as f:
        f[outtreename] = uproot3.newtree(varstoStore)
        # loop over the old rootfile chunkwise
        events_read = 0
        if chunksize >= nentries_to_read:
            chunksize = nentries_to_read
        for df_data in uproot4.iterate(infname + ':' + intreename, varsdf,
                                       aliases=aliases, cut=None, library="pd",
                                       step_size=chunksize):
            if events_read >= nentries_to_read:
                break
            # compute q2 and cosThetaL
            pxl = df_data['Lb_True' + leptname.capitalize() + '_PX']
            pxnu = df_data['Lb_TrueNeutrino_PX']
            pyl = df_data['Lb_True' + leptname.capitalize() + '_PY']
            pynu = df_data['Lb_TrueNeutrino_PY']
            pzl = df_data['Lb_True' + leptname.capitalize() + '_PZ']
            pznu = df_data['Lb_TrueNeutrino_PZ']
            pel = df_data['Lb_True' + leptname.capitalize() + '_PE']
            penu = df_data['Lb_TrueNeutrino_PE']
            if Lcstate in ('Lc2595', 'Lc2625'):
                # this should be the Lcstar momentum: Lb minus lepton and
                # neutrino, component by component (the original erroneously
                # used Lb_TRUEP_X for all four components)
                pxlc = df_data['Lb_TRUEP_X'] - pxl - pxnu
                pylc = df_data['Lb_TRUEP_Y'] - pyl - pynu
                pzlc = df_data['Lb_TRUEP_Z'] - pzl - pznu
                pelc = df_data['Lb_TRUEP_E'] - pel - penu
            elif Lcstate == 'Lc':
                pxlc = df_data['Lc_TRUEP_X']
                pylc = df_data['Lc_TRUEP_Y']
                pzlc = df_data['Lc_TRUEP_Z']
                pelc = df_data['Lc_TRUEP_E']

            # format of LorentzVector(Vector(X, Y, Z), E)
            PLc_lab = LorentzVector(Vector(pxlc, pylc, pzlc), pelc)
            Pl_lab = LorentzVector(Vector(pxl, pyl, pzl), pel)
            PNu_lab = LorentzVector(Vector(pxnu, pynu, pznu), penu)
            PLb_lab = PLc_lab + Pl_lab + PNu_lab
            qsq, cthl = return_phasespace(PLb_lab, PLc_lab, Pl_lab)
            # print(qsq, cthl)
            df_data[q2True_branchname] = qsq
            df_data[costhlTrue_branchname] = cthl

            # get the corrections; order has to match the histogram axes
            applyvars = [q2True_branchname, costhlTrue_branchname]
            df_data['Event_FFcorr'] = df_data[applyvars].apply(
                storeeff2D, args=[perfHist, Limits, q2factor], axis=1)

            # get only the things that need to be stored and write them to the file
            branch_dict = {
                vartostore: df_data[vartostore].to_numpy()
                for vartostore in list(varstoStore.keys())
            }
            f[outtreename].extend(branch_dict)
            events_read += df_data.shape[0]
            print('Events read', events_read)
def main(args):
    processed_nano = "tmva_xgboost_reproducers/lead_processed_nano_uncorr.root"

    df = pd.read_parquet(args.input_dataframe)
    print("Read input dataframe:\n{}".format(df))

    inputs = {
        "lead_energyRaw": "SCRawE",
        "lead_r9": "r9",
        "lead_sieie": "sigmaIetaIeta",
        "lead_etaWidth": "etaWidth",
        "lead_phiWidth": "phiWidth",
        "lead_sieip": "covIEtaIPhi",
        "lead_s4": "s4",
        "lead_pfPhoIso03": "phoIso03",
        "lead_pfChargedIsoPFPV": "chgIsoWrtChosenVtx",
        "lead_pfChargedIsoWorstVtx": "chgIsoWrtWorstVtx",
        "lead_eta": "scEta",
        "lead_fixedGridRhoAll": "rho"
    }

    # This is needed just to not hardcode the branch type later
    arr_dict = {}
    for name in inputs.keys():
        name_orig = name
        # since I don't remember which ones have the suffix _nano
        if name_orig not in list(df.columns):
            name += "_nano"
        arr_dict[name_orig] = df[name]
    ak_arr = ak.Array(arr_dict)
    print(ak_arr.type)

    # Explicitly recompute also the XGBoost one, just because
    print("Recomputing MVA with XGBoost")
    mva = xgboost.Booster()
    mva.load_model(args.xgboost_model)
    var_order = list(arr_dict.keys())
    bdt_inputs = np.column_stack([ak.to_numpy(ak_arr[name]) for name in var_order])
    tempmatrix = xgboost.DMatrix(bdt_inputs, feature_names=var_order)
    lead_idmva_xgboost = mva.predict(tempmatrix)
    # Thomas' workflow
    lead_idmva_xgboost = -np.log(1. / lead_idmva_xgboost - 1.)
    lead_idmva_xgboost = 2. / (1. + np.exp(-2. * lead_idmva_xgboost)) - 1.

    # Dump nanoaod inputs to a TTree
    with uproot3.recreate(processed_nano) as f:
        branchdict = {}
        arraydict = {}
        for nano_name, model_name in inputs.items():
            #branchdict[model_name] = str(ak_arr[nano_name].type.type).replace('?', '')
            branchdict[model_name] = "float32"
            arraydict[model_name] = ak_arr[nano_name]
        f["Events"] = uproot3.newtree(branchdict)
        f["Events"].extend(arraydict)

    # TMVA with RDataFrame
    ROOT.gInterpreter.ProcessLine('''
    TMVA::Experimental::RReader model("{}");
    computeModel = TMVA::Experimental::Compute<{}, float>(model);
    '''.format(args.tmva_model, len(ak_arr.fields)))
    rdf = ROOT.RDataFrame("Events", processed_nano)
    rdf = rdf.Define("lead_idmva_tmva", ROOT.computeModel,
                     ROOT.model.GetVariableNames())
    print("Running RDF event loop")
    dct = rdf.AsNumpy(columns=["lead_idmva_tmva"])
    lead_idmva_tmva = np.array([v[0] for v in dct["lead_idmva_tmva"]])

    # Plot
    print("Plotting to {}".format(args.output_dir))
    bins = 100
    rng = (-1, 1)
    fig, (up, down) = plt.subplots(
        nrows=2, ncols=1, gridspec_kw={"height_ratios": (1, 1)})

    up.hist(lead_idmva_xgboost, bins=bins, range=rng, histtype="step",
            label="XGBoost", linewidth=2)
    up.hist(lead_idmva_tmva, bins=bins, range=rng, histtype="step",
            label="TMVA", linewidth=2)
    up.set_xlabel("lead PhoIDMVA after corrections")
    up.legend(fontsize=18, loc="upper left")

    down.hist(100 * (lead_idmva_xgboost - lead_idmva_tmva) / lead_idmva_tmva,
              bins=500, range=(-100, 100), histtype="step", density=True,
              color="black", linewidth=2)
    down.set_xlabel("$(XGB - TMVA)/TMVA$ [%]")
    down.set_yscale("log")

    fig.tight_layout()
    fig.savefig("{}/lead_xgb_tmva.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/lead_xgb_tmva.pdf".format(args.output_dir), bbox_inches='tight')

    fig, ax = plt.subplots()
    ax.scatter(lead_idmva_xgboost, lead_idmva_tmva)
    ax.set_xlabel("XGBoost")
    ax.set_ylabel("TMVA")
    fig.savefig("{}/xgb_tmva_scatter.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/xgb_tmva_scatter.pdf".format(args.output_dir), bbox_inches='tight')
def dump_generated_events(arr: ak.Array):
    fn = f"wd/{conf.tag}/output.root"
    with uproot.recreate(fn) as file:
        file["tree1"] = uproot.newtree(dict(arr.type.type))
        file["tree1"].extend({branch: arr[branch] for branch in arr.fields})
def read_file(path, sample, branches=branches):
    print("=====")
    print("Processing {0} file".format(sample))
    mem = psutil.virtual_memory()
    mem_at_start = mem.available / (1024 ** 2)
    print(f'Available Memory: {mem_at_start:.0f} MB')
    count = 0
    hists = {}
    start = time.time()
    batch_num = 0
    with uproot.open(path) as file:
        tree = file['mini']
        numevents = tree.num_entries
        print(f'Total number of events in file: {numevents}')
        for batch in tree.iterate(branches, step_size='30 MB', library='np'):
            print('==============')
            df = pandas.DataFrame.from_dict(batch)
            del batch
            num_before_cuts = len(df.index)
            print("Events before cuts: {0}".format(num_before_cuts))
            count += num_before_cuts
            if 'Data' not in sample:
                df['totalWeight'] = np.vectorize(calc_weight)(
                    df.mcWeight, df.scaleFactor_ELE, df.scaleFactor_MUON,
                    df.scaleFactor_PILEUP, df.scaleFactor_TRIGGER,
                    df.scaleFactor_ZVERTEX)
                df["totalWeight"] = np.vectorize(get_xsec_weight)(df.totalWeight,
                                                                  sample)
            else:
                df['totalWeight'] = [1 for item in range(len(df.index))]
            df.drop(["mcWeight", "scaleFactor_ELE", "scaleFactor_MUON",
                     "scaleFactor_PILEUP", "scaleFactor_TRIGGER",
                     'scaleFactor_ZVERTEX'], axis=1, inplace=True)

            # Standard selection cuts
            df = df.query("trigE or trigM")
            df = df.query('passGRL')
            df = df.query('hasGoodVertex')
            df.drop(["trigE", "trigM", "passGRL", "hasGoodVertex"], axis=1,
                    inplace=True)

            # Lepton requirements
            df['good_lepton'] = np.vectorize(WCuts.cut_GoodLepton)(
                df.lep_flag, df.lep_pt, df.lep_ptcone30, df.lep_etcone20,
                df.lep_n, df.lep_type)
            df = df.query('good_lepton > -1')
            for column in df.columns:
                if 'lep' in column and column not in ['lep_n', 'good_lepton']:
                    df[column] = np.vectorize(extract_good_lepton)(
                        df[column], df['good_lepton'])

            # W transverse mass
            df['mtw'] = np.vectorize(calc_mtw)(df.lep_pt, df.met_et,
                                               df.lep_phi, df.met_phi)
            df = df.query('mtw > 30000.')
            df = df.query('met_et > 30000.')

            # Convert MeV to GeV
            df['lep_pt'] = df['lep_pt'] / 1000
            df['lep_E'] = df['lep_E'] / 1000
            df['met_et'] = df['met_et'] / 1000
            df['mtw'] = df['mtw'] / 1000
            df['mtw_enu'] = df.query('lep_type == 11')['mtw']
            df['mtw_munu'] = df.query('lep_type == 13')['mtw']
            df['WT_phi'] = np.vectorize(calc_W_phi)(df.lep_pt, df.met_et,
                                                    df.lep_phi, df.met_phi)
            df['jet_n'] = df['alljet_n']
            df.drop(['alljet_n'], axis=1, inplace=True)

            # Asymmetry related histograms
            df['pos_ele_eta'] = df.query('lep_type == 11 and lep_charge == 1')['lep_eta']
            df['pos_ele_eta'] = np.vectorize(abs_value)(df.pos_ele_eta)
            df['neg_ele_eta'] = df.query('lep_type == 11 and lep_charge == -1')['lep_eta']
            df['neg_ele_eta'] = np.vectorize(abs_value)(df.neg_ele_eta)
            df['pos_mu_eta'] = df.query('lep_type == 13 and lep_charge == 1')['lep_eta']
            df['pos_mu_eta'] = np.vectorize(abs_value)(df.pos_mu_eta)
            df['neg_mu_eta'] = df.query('lep_type == 13 and lep_charge == -1')['lep_eta']
            df['neg_mu_eta'] = np.vectorize(abs_value)(df.neg_mu_eta)
            df['lep_pt_j0'] = df.query('jet_n == 0')['lep_pt']
            df['lep_pt_j1'] = df.query('jet_n == 1')['lep_pt']
            df['lep_pt_j2'] = df.query('jet_n > 1')['lep_pt']
            df['mtw_j0'] = df.query('jet_n == 0')['mtw']
            df['mtw_j1'] = df.query('jet_n == 1')['mtw']
            df['mtw_j2'] = df.query('jet_n > 1')['mtw']
            df['met_et_j0'] = df.query('jet_n == 0')['met_et']
            df['met_et_j1'] = df.query('jet_n == 1')['met_et']
            df['met_et_j2'] = df.query('jet_n > 1')['met_et']
            df['lep_eta_j0'] = df.query('jet_n == 0')['lep_eta']
            df['lep_eta_j1'] = df.query('jet_n == 1')['lep_eta']
            df['lep_eta_j2'] = df.query('jet_n > 1')['lep_eta']

            if len(df.loc[df['jet_n'] > 0].index) > 0:
                temp_df = pandas.DataFrame()
                temp_df['eventNumber'] = df.loc[df['jet_n'] > 0]['eventNumber']
                for column in df.columns:
                    if 'jet' in column and column != 'jet_n':
                        temp_df[f'lead_{column}'] = np.vectorize(find_lead_jet)(
                            df.loc[df['jet_n'] > 0]['jet_pt'],
                            df.loc[df['jet_n'] > 0][column])
                temp_df['lead_jet_pt'] = temp_df['lead_jet_pt'] / 1000.
                temp_df['lj_phi_diff'] = np.vectorize(calc_delta_phi)(
                    df.loc[df['jet_n'] > 0]['lep_phi'], temp_df['lead_jet_phi'])
                temp_df['abs_lj_phi_diff'] = np.vectorize(abs_value)(temp_df.lj_phi_diff)
                temp_df['Wj_phi_diff'] = np.vectorize(calc_delta_phi)(
                    df.loc[df['jet_n'] > 0]['WT_phi'], temp_df['lead_jet_phi'])
                temp_df['abs_Wj_phi_diff'] = np.vectorize(abs_value)(temp_df.Wj_phi_diff)
                df = pandas.merge(left=df, right=temp_df, left_on='eventNumber',
                                  right_on='eventNumber', how='left')

            num_after_cuts = len(df.index)
            print("Number of events after cuts: {0}".format(num_after_cuts))
            print(f'Currently at {(count * 100 / numevents):.0f}% of events '
                  f'({count}/{numevents})')

            for key, hist in hist_dicts.items():
                h_bin_width = hist["bin_width"]
                h_num_bins = hist["numbins"]
                h_xmin = hist["xmin"]
                x_var = hist["xvariable"]
                bins = [h_xmin + x * h_bin_width for x in range(h_num_bins + 1)]
                data_x, binning = np.histogram(df[x_var].values, bins=bins,
                                               weights=df.totalWeight.values)
                data_x = data_x.astype('float64')
                histo = uproot3_methods.classes.TH1.from_numpy((data_x, binning))
                if key not in hists.keys():
                    hists[key] = histo
                else:
                    for i in range(len(hists[key])):
                        hists[key][i] += histo[i]

            if not os.path.exists(f'../DataForFit_8TeV/{sample}/'):
                os.mkdir(f'../DataForFit_8TeV/{sample}')
            f = uproot3.recreate(f'../DataForFit_8TeV/{sample}/{sample}_{batch_num}.root')
            f['FitTree'] = uproot3.newtree(
                {'mtw': uproot3.newbranch(np.float64, 'mtw'),
                 'jet_n': uproot3.newbranch(np.int32, 'jet_n'),
                 'totalWeight': uproot3.newbranch(np.float64, 'totalWeight')})
            f['FitTree'].extend(
                {'mtw': df['mtw'].to_numpy(dtype=np.float64),
                 'jet_n': df['jet_n'].to_numpy(dtype=np.int32),
                 'totalWeight': df['totalWeight'].to_numpy(dtype=np.float64)})
            f.close()
            batch_num += 1
            del df
            gc.collect()

            # diagnostics
            mem = psutil.virtual_memory()
            actual_mem = mem.available / (1024 ** 2)
            print(f'Current available memory {actual_mem:.0f} MB '
                  f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')

    file = uproot3.recreate(f'../Output_8TeV/{sample}.root', uproot3.ZLIB(4))
    for key, hist in hists.items():
        file[key] = hist
        print(f'{key} histogram')
        file[key].show()
    file.close()
    mem = psutil.virtual_memory()
    actual_mem = mem.available / (1024 ** 2)
    print(f'Current available memory {actual_mem:.0f} MB '
          f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')
    print('Finished!')
    print(f'Time elapsed: {time.time() - start} seconds')
    return None
                    branch_dicts[i_split]["branch_dict"][
                        "{}_counts".format(online_key)] = counts
                    # branch_dict["{}_counts".format(online_key)] = counts
            else:
                arr = online_tree[online_key].array()[on_mask].flatten()[:N_jets]
                dtype = np.dtype("f4")
                for i_split, (n_start, n_end) in enumerate(zip(n_starts, n_ends)):
                    branch_dicts[i_split]["branch_registration"][online_key] = dtype
                    branch_dicts[i_split]["branch_dict"][online_key] = arr[n_start:n_end]
            # branch_registration[online_key] = dtype
            # branch_dict[online_key] = arr

    for i_split in range(SPLITS):
        out_path = os.path.join(base_dir,
                                "{}_{}_{}.root".format(name, branch_key, i_split))
        print("Creating new tree: {}".format(out_path))
        with u3.recreate(out_path, compression=u3.ZLIB(6)) as out_file:
            print("Creating new tree for {}".format(branch_key))
            out_file["ttree"] = u3.newtree(
                branch_dicts[i_split]["branch_registration"])
            print("Creating branches")
            # s_time = time.time()
            out_file["ttree"].extend(branch_dicts[i_split]["branch_dict"])
            # from IPython import embed;embed()
            # e_time = time.time()
            # print("Total time needed for {0} events:\n{1:1.1f}".format(N_jets, e_time - s_time))
np.dtype("f8"), size="{}_counts".format(online_key), ) counts = arr.counts branch_dict["{}_counts".format(online_key)] = counts elif track_key_indicator in online_key: arr = track_var_to_flat(online_tree[online_key].array(), tracking_index_low, tracking_index_high)[:N_jets] dtype = u3.newbranch( np.dtype("f8"), size="{}_counts".format(online_key), ) counts = arr.counts branch_dict["{}_counts".format(online_key)] = counts else: arr = online_tree[online_key].array().flatten()[:N_jets] dtype = np.float32 branch_registration[online_key] = dtype branch_dict[online_key] = arr print("Creating new tree") out_file["ttree"] = u3.newtree(branch_registration) print("Creating branches") s_time = time.time() out_file["ttree"].extend(branch_dict) e_time = time.time() print("Total time needed for {0} events:\n{1:1.1f}".format( N_jets, e_time - s_time))