def dataframe_to_ttree(df, filename, treename="t", chunksize=1e6,
                       compression=uproot3.LZ4(1), progress=True):
    """
    Writes a ROOT file containing one TTree built from the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    t = uproot3.newtree(df.dtypes)
    with uproot3.recreate(filename, compression=compression) as f:
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            from tqdm.auto import tqdm
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            f[treename].extend({k: chunk[k].values for k in chunk.columns})
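# Usage sketch for dataframe_to_ttree above (not from the original source;
# assumes uproot3 and pandas are installed, and the DataFrame and file name
# are invented for illustration):
import numpy as np
import pandas as pd

df = pd.DataFrame({"x": np.random.normal(size=1000),
                   "n": np.arange(1000, dtype=np.int32)})
dataframe_to_ttree(df, "example.root", treename="events", chunksize=500)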
def create_file(file_name, distributions, weights, labels, extra_weights=None):
    if extra_weights is None:
        extra_weights = []
    n_events = len(weights[0])
    with uproot.recreate(file_name) as f:
        # write the predicted processes
        for i, label in enumerate(labels):
            lep_charge = create_lepton_charge(n_events)
            if label == "background":
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                    "weight_up": "float64",
                    "weight_down": "float64",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                    "weight_up": extra_weights[0],
                    "weight_down": extra_weights[1],
                })
            else:
                f[label] = uproot.newtree({
                    "jet_pt": "float64",
                    "weight": "float64",
                    "lep_charge": "int",
                })
                f[label].extend({
                    "jet_pt": distributions[i],
                    "weight": weights[i],
                    "lep_charge": lep_charge,
                })
def create_ntuple(fname, treename, varname, var_array, weightname, weight_array):
    with uproot.recreate(fname) as f:
        f[treename] = uproot.newtree({varname: "float64", weightname: "float64"})
        f[treename].extend({varname: var_array, weightname: weight_array})
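# Hypothetical call of create_ntuple (file, tree, and branch names invented;
# requires numpy and an uproot3-style `uproot` providing newtree/recreate):
import numpy as np

create_ntuple("ntuple.root", "events", "obs", np.random.normal(size=100),
              "weight", np.ones(100))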
def to_root_multi(filename, d):
    with uproot3.recreate(filename) as f:
        for treename in d.keys():
            df = d[treename]
            f[treename] = uproot3.newtree({col: df[col].dtype for col in df.columns})
            f[treename].extend(dict(df))
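# Usage sketch for to_root_multi: one TTree per dictionary key (the
# DataFrames and file name below are invented for illustration):
import pandas as pd

to_root_multi("multi.root", {
    "signal": pd.DataFrame({"mass": [125.0, 124.5]}),
    "background": pd.DataFrame({"mass": [90.1, 91.2]}),
})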
def save_template(templates, out_name, parameters):
    import uproot3
    out_file = uproot3.recreate(out_name)
    for tmp in templates:
        out_file[tmp._fName] = tmp
    out_file.close()
    return
def merge_root(rootfiles, outputfile, incrementRunId=False):
    """
    Merge root files into one output file.
    """
    try:
        import uproot3 as uproot
    except ImportError:
        print("uproot3 is mandatory to merge root files. Please do:")
        print("pip install uproot3")
        raise

    out = uproot.recreate(outputfile)

    # previous ID values, needed to be able to increment runID or eventID
    previousId = {}

    # build the dict by reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        root_keys = unicity(root.keys())
        for tree in root_keys:
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                    previousId[tree] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        # string branches cannot be concatenated this way; zero them out
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        if (not incrementRunId and branch.decode('utf-8').startswith('eventID')) or \
                                (incrementRunId and branch.decode('utf-8').startswith('runID')):
                            if branch not in previousId[tree]:
                                previousId[tree][branch] = 0
                            array += previousId[tree][branch]
                            previousId[tree][branch] = max(array) + 1
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # write the dict into the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] != {} or trees[tree]["rootDictType"] != {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
def to_root(
    df,
    filename,
    treename="t",
    chunksize=20e3,
    compression=uproot3.ZLIB(1),
    compression_jagged=uproot3.ZLIB(1),
    progress=False,
):
    """
    Writes a ROOT file containing one TTree with the input pandas DataFrame.

    filename: name of output file
    treename: name of output TTree
    chunksize: number of rows per basket
    compression: uproot compression object (LZ4, ZLIB, LZMA, or None)
    progress: show tqdm progress bar?
    """
    tree_dtypes = dict()
    jagged_branches = []
    for bname, dtype in df.dtypes.items():
        if "fletcher" in str(dtype):
            dtype = np.dtype(dtype.arrow_dtype.value_type.to_pandas_dtype())
            tree_dtypes[bname] = uproot3.newbranch(dtype, size=bname + "_varn",
                                                   compression=compression_jagged)
            jagged_branches.append(bname)
        elif "object" in str(dtype):
            raise RuntimeError(
                f"Don't know how to serialize column {bname} with object dtype.")
        else:
            dtype = str(dtype).lstrip("u")
            tree_dtypes[bname] = dtype
    with uproot3.recreate(filename, compression=compression) as f:
        t = uproot3.newtree(tree_dtypes)
        f[treename] = t
        chunksize = int(chunksize)
        iterable = range(0, len(df), chunksize)
        if progress:
            iterable = tqdm(iterable)
        for i in iterable:
            chunk = df.iloc[i:i + chunksize]
            basket = dict()
            for column in chunk.columns:
                if column in jagged_branches:
                    arr = chunk[column].ak(version=0)
                    arr = maybe_unmask_jagged_array(arr)
                    # profiling says 30% of the time is spent checking if jagged
                    # __getitem__ is given a string; this is not needed for
                    # writing out TTree branches, so free speedup.
                    arr._util_isstringslice = lambda x: False
                    basket[column] = arr
                    basket[column + "_varn"] = arr.counts.astype("int32")
                else:
                    basket[column] = chunk[column].values
            f[treename].extend(basket)
def _write_root(file, table, treename='Events', compression=-1, step=1048576):
    if compression == -1:
        compression = uproot3.write.compress.LZ4(4)
    with uproot3.recreate(file, compression=compression) as fout:
        fout[treename] = uproot3.newtree({k: v.dtype for k, v in table.items()})
        start = 0
        while start < len(list(table.values())[0]) - 1:
            fout[treename].extend({k: v[start:start + step] for k, v in table.items()})
            start += step
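# Sketch of a call to _write_root (not from the original source): all arrays
# must have equal length; `step` sets how many entries go into each basket.
import numpy as np

table = {"px": np.random.normal(size=10000),
         "evt": np.arange(10000, dtype=np.int64)}
_write_root("events.root", table, treename="Events", step=4096)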
def create_file_pseudodata(file_name, pseudodata):
    n_events = len(pseudodata)
    with uproot.recreate(file_name) as f:
        # write pseudodata
        lep_charge = create_lepton_charge(n_events)
        f["pseudodata"] = uproot.newtree({"jet_pt": "float64", "lep_charge": "int"})
        f["pseudodata"].extend({"jet_pt": pseudodata, "lep_charge": lep_charge})
def main():
    np.random.seed(0)
    data_hist = make_data_hist(hists)
    hists["data"] = {"counts": data_hist.tolist(), "bins": _bins}
    with open("example.json", "w") as serialization:
        json.dump(hists, serialization)
    with uproot3.recreate("example.root", compression=uproot3.ZLIB(4)) as outfile:
        for key in hists.keys():
            outfile[key] = (
                np.array(hists[key]["counts"]),
                np.array(hists[key]["bins"]),
            )
def write_spectrum_to_root(ff, pp, filename, center=0, title=''):
    class MyTH1(uproot3_methods.classes.TH1.Methods, list):
        def __init__(self, low, high, values, title=""):
            self._fXaxis = types.SimpleNamespace()
            self._fXaxis._fNbins = len(values)
            self._fXaxis._fXmin = low
            self._fXaxis._fXmax = high
            # pad with empty underflow and overflow bins
            values.insert(0, 0)
            values.append(0)
            for x in values:
                self.append(float(x))
            self._fTitle = title
            self._classname = "TH1F"

    th1f = MyTH1(center + ff[0], center + ff[-1], pp.tolist(), title=title)
    file = uproot3.recreate(filename, compression=uproot3.ZLIB(4))
    file["th1f"] = th1f
def pandas_to_tree(data, file_name, tree_name):
    """
    Save a pandas DataFrame as a ROOT TTree.

    :param pandas.DataFrame data: dataframe to be stored
    :param str file_name: path and name of the output file
    :param str tree_name: name of the resulting TTree
    """
    branch_dict = {col: dtype for col, dtype in zip(data.columns, data.dtypes)}
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend(
            {col: data[col].to_numpy() for col in data.columns})
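# Hypothetical call of pandas_to_tree above (DataFrame and names invented):
# each column becomes one branch of the TTree "nominal".
import pandas as pd

data = pd.DataFrame({"pt": [10.3, 22.1], "charge": [1, -1]})
pandas_to_tree(data, "out.root", "nominal")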
def writexml(spec, specdir, data_rootdir, resultprefix):
    global _ROOT_DATA_FILE

    shutil.copyfile(
        pkg_resources.resource_filename(__name__, 'schemas/HistFactorySchema.dtd'),
        Path(specdir).parent.joinpath('HistFactorySchema.dtd'),
    )
    combination = ET.Element(
        "Combination", OutputFilePrefix=str(Path(specdir).joinpath(resultprefix)))

    with uproot.recreate(str(Path(data_rootdir).joinpath('data.root'))) as _ROOT_DATA_FILE:
        for channelspec in spec['channels']:
            channelfilename = str(
                Path(specdir).joinpath(f'{resultprefix}_{channelspec["name"]}.xml'))
            with open(channelfilename, 'w') as channelfile:
                channel = build_channel(spec, channelspec, spec.get('observations'))
                indent(channel)
                channelfile.write(
                    "<!DOCTYPE Channel SYSTEM '../HistFactorySchema.dtd'>\n\n")
                channelfile.write(
                    ET.tostring(channel, encoding='utf-8').decode('utf-8'))

            inp = ET.Element("Input")
            inp.text = channelfilename
            combination.append(inp)

    # need information about modifier types to get the right prefix in the measurement
    mixin = _ChannelSummaryMixin(channels=spec['channels'])
    for measurement in spec['measurements']:
        combination.append(build_measurement(measurement, dict(mixin.modifiers)))
    indent(combination)
    return b"<!DOCTYPE Combination SYSTEM 'HistFactorySchema.dtd'>\n\n" + ET.tostring(
        combination, encoding='utf-8')
def pandas_to_tree(data, file_name, tree_name):
    """
    Parameters
    ----------
    data : pandas.DataFrame
        Data frame which should be stored as a TTree
    file_name : str
        Path and name of the root file
    tree_name : str
        Name of the TTree
    """
    branch_dict = {col: dtype for col, dtype in zip(data.columns, data.dtypes)}
    with uproot3.recreate(file_name) as file_output:
        file_output[tree_name] = uproot3.newtree(branches=branch_dict,
                                                 title=tree_name)
        file_output[tree_name].extend(
            {col: data[col].to_numpy() for col in data.columns})
def save_dict_to_root(dic, file_name, tree_name=None):
    """
    Store data arrays, given as a dictionary, in a ROOT file. This provides a
    convenient interface to ``uproot``.

    :param dic: Dictionary of data, or a list of such dictionaries
    :param file_name: String
    :param tree_name: String or list of strings. By default "DataTree" plus an
        index is used.
    """
    if file_name[-5:] == ".root":
        file_name = file_name[:-5]
    if isinstance(dic, dict):
        dic = [dic]
    if tree_name is None:
        tree_name = "DataTree"
    Ndic = len(dic)
    if isinstance(tree_name, list):
        assert len(tree_name) == Ndic
    else:
        tree_name = [tree_name + str(i) for i in range(Ndic)]
    with uproot.recreate(file_name + ".root") as f:
        for d, t in zip(dic, tree_name):
            branch_type = {}
            branch_data = {}
            for i in d:
                # sanitize branch names: ROOT does not like special characters
                j = (i.replace("(", "_").replace(")", "_").replace(" ", "_")
                      .replace("*", "star").replace("+", "p").replace("-", "m"))
                branch_data[j] = np.array(d[i])
                branch_type[j] = branch_data[j].dtype.name
            f[t] = uproot.newtree(branch_type)
            f[t].extend(branch_data)
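# Usage sketch for save_dict_to_root (names invented): the ".root" suffix is
# appended automatically and special characters in keys are sanitized.
import numpy as np

save_dict_to_root({"m(K+ pi-)": np.random.uniform(0.6, 1.0, 1000)}, "masses")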
def merge_root(rootfiles, outputfile):
    """
    Merge root files into one output file.
    """
    out = uproot.recreate(outputfile)

    # create the dict reading all input root files
    trees = {}
    pbar = tqdm.tqdm(total=len(rootfiles))
    for file in rootfiles:
        root = uproot.open(file)
        for tree in root.keys():
            if hasattr(root[tree], 'keys'):
                if tree not in trees:
                    trees[tree] = {}
                    trees[tree]["rootDictType"] = {}
                    trees[tree]["rootDictValue"] = {}
                for branch in root[tree].keys():
                    array = root[tree].array(branch)
                    if len(array) > 0:
                        if type(array[0]) is type(b'c'):
                            array = np.array([0 for xi in array])
                        if branch not in trees[tree]["rootDictType"]:
                            trees[tree]["rootDictType"][branch] = type(array[0])
                            trees[tree]["rootDictValue"][branch] = np.array([])
                        trees[tree]["rootDictValue"][branch] = np.append(
                            trees[tree]["rootDictValue"][branch], array)
        pbar.update(1)
    pbar.close()

    # set the dict in the output root file
    for tree in trees:
        if trees[tree]["rootDictValue"] != {} or trees[tree]["rootDictType"] != {}:
            out[tree] = uproot.newtree(trees[tree]["rootDictType"])
            out[tree].extend(trees[tree]["rootDictValue"])
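# Hypothetical call of merge_root above (file names invented): concatenates
# the branches of every TTree found in the inputs into one output file.
merge_root(["run1.root", "run2.root"], "merged.root")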
        templates[f'{sname}_{pf}_{syst}'][{'genflavor': s[:1:sum]}]
        for sname in samples
    ])
    matched_name = f"catp2_{pf}_{syst}"
    unmatched_name = f"catp1_{pf}_{syst}"
    if syst == 'nominal':
        data_name = f"data_obs_{pf}_{syst}"
        merged_dict[data_name] = data
    merged_dict[matched_name] = matched
    merged_dict[unmatched_name] = unmatched

print(f'Will save templates to {template_file}')
fout = uproot3.recreate(template_file)
for name, h_obj in tqdm(merged_dict.items(), desc='Writing templates'):
    if np.sum(h_obj.values()) <= 0.:
        print(f'Template {name} is empty')
    if args.clip:
        h_obj = h_obj[40j:145j]
    fout[name] = export1d(h_obj)
fout.close()

if args.plot:
    hep.style.use("CMS")
    for clip in [True, False]:
        for pf in ["Pass", "Fail"]:
            # Make template plots
            fig, ax = plt.subplots()
def get_bkg_templates(tmp_rname):
    """
    Function that writes linearized mtt vs costheta distributions to a root file.
    """
    ## variables that only need to be defined/evaluated once
    hdict = plt_tools.add_coffea_files(bkg_fnames) if len(bkg_fnames) > 1 else load(bkg_fnames[0])

    # get correct hist and rebin
    hname_to_use = "mtt_vs_tlep_ctstar_abs"
    if hname_to_use not in hdict.keys():
        raise ValueError("%s not found in file" % hname_to_use)
    xrebinning, yrebinning = linearize_binning
    histo = hdict[hname_to_use][Plotter.nonsignal_samples]  # process, sys, jmult, leptype, btag, lepcat

    xaxis_name = histo.dense_axes()[0].name
    yaxis_name = histo.dense_axes()[1].name
    ## rebin x axis
    if isinstance(xrebinning, np.ndarray):
        new_xbins = hist.Bin(xaxis_name, xaxis_name, xrebinning)
    elif isinstance(xrebinning, float) or isinstance(xrebinning, int):
        new_xbins = xrebinning
    histo = histo.rebin(xaxis_name, new_xbins)
    ## rebin y axis
    if isinstance(yrebinning, np.ndarray):
        new_ybins = hist.Bin(yaxis_name, yaxis_name, yrebinning)
    elif isinstance(yrebinning, float) or isinstance(yrebinning, int):
        new_ybins = yrebinning
    rebin_histo = histo.rebin(yaxis_name, new_ybins)

    ## scale ttJets events, split by reconstruction type, by normal ttJets lumi correction
    ttJets_permcats = ["*right", "*matchable", "*unmatchable", "*sl_tau", "*other"]
    names = [dataset for dataset in sorted(set([key[0] for key in histo.values().keys()]))]  # get dataset names in hists
    ttJets_cats = [name for name in names
                   if any([fnmatch.fnmatch(name, cat) for cat in ttJets_permcats])]  # gets ttJets(_PS)_other, ...

    ## make groups based on process
    process = hist.Cat("process", "Process", sorting="placement")
    process_cat = "dataset"

    # need to save coffea hist objects to file so they can be opened by uproot in the proper format
    upfout = uproot3.recreate(tmp_rname, compression=uproot3.ZLIB(4)) if os.path.isfile(tmp_rname) else uproot3.create(tmp_rname)

    if "3Jets" in njets_to_run:
        histo_dict_3j = processor.dict_accumulator({"Muon": {}, "Electron": {}})
    if "4PJets" in njets_to_run:
        histo_dict_4pj = processor.dict_accumulator({"Muon": {}, "Electron": {}})

    for lep in ["Muon", "Electron"]:
        orig_lepdir = "muNJETS" if lep == "Muon" else "eNJETS"

        #set_trace()
        ## make groups based on process
        process_groups = plt_tools.make_dataset_groups(lep, args.year, samples=names, gdict="templates")
        #process_groups = plt_tools.make_dataset_groups(lep, args.year, samples=names, gdict="dataset")

        lumi_correction = lumi_corr_dict[args.year]["%ss" % lep]
        # scale ttJets events, split by reconstruction type, by normal ttJets lumi correction
        if len(ttJets_cats) > 0:
            for tt_cat in ttJets_cats:
                # gets ttJets[SL, Had, DiLep] or ttJets_PS
                ttJets_lumi_topo = "_".join(tt_cat.split("_")[:-2]) if "sl_tau" in tt_cat else "_".join(tt_cat.split("_")[:-1])
                ttJets_eff_lumi = lumi_correction[ttJets_lumi_topo]
                lumi_correction.update({tt_cat: ttJets_eff_lumi})

        histo = rebin_histo.copy()
        histo.scale(lumi_correction, axis="dataset")
        histo = histo.group(process_cat, process, process_groups)[:, :, :, lep, :, :].integrate("leptype")

        #set_trace()
        systs = sorted(set([key[1] for key in histo.values().keys()]))
        systs.insert(0, systs.pop(systs.index("nosys")))  # move "nosys" to the front

        # loop over each jet multiplicity
        for jmult in njets_to_run:
            lepdir = orig_lepdir.replace("NJETS", jmult.lower())

            # get sideband and signal region hists
            cen_sb_histo = Plotter.linearize_hist(
                histo[:, "nosys", jmult, btag_reg_names_dict["Central"]["reg"]]
                .integrate("jmult").integrate("btag").integrate("sys"))
            #up_sb_histo = histo[:, "nosys", jmult, btag_reg_names_dict["Up"]["reg"]].integrate("jmult").integrate("btag")
            #dw_sb_histo = histo[:, "nosys", jmult, btag_reg_names_dict["Down"]["reg"]].integrate("jmult").integrate("btag")
            sig_histo = Plotter.linearize_hist(
                histo[:, :, jmult, btag_reg_names_dict["Signal"]["reg"]]
                .integrate("jmult").integrate("btag"))

            # loop over each systematic
            for sys in systs:
                if sys not in systematics.template_sys_to_name[args.year].keys():
                    continue

                sys_histo = (sig_histo[:, sys].integrate("sys")
                             if sys in systematics.ttJets_sys.values()
                             else Plotter.BKG_Est(sig_reg=sig_histo[:, sys].integrate("sys"),
                                                  sb_reg=cen_sb_histo,
                                                  norm_type="SigMC", sys=sys,
                                                  ignore_uncs=True))

                ## write nominal and systematic variations for each topology to file
                #for proc in sorted(set([key[0] for key in sig_histo.values().keys()])):
                for proc in sorted(set([key[0] for key in sys_histo.values().keys()])):
                    if ("tt" not in proc) and (sys in systematics.ttJets_sys.values()):
                        continue
                    #if (proc != "tt") and (sys in systematics.ttJets_sys.values()): continue
                    if (proc == "data_obs") and not (sys == "nosys"):
                        continue
                    if not sys_histo[proc].values().keys():
                        #if not sig_histo[proc, sys].values().keys():
                        print(f"Systematic {sys} for {lep} {jmult} {proc} not found, skipping")
                        continue

                    print(args.year, lep, jmult, sys, proc)
                    #set_trace()
                    outhname = "_".join(list(filter(None, [
                        proc,
                        systematics.template_sys_to_name[args.year][sys][0],
                        lepdir,
                        (args.year)[-2:]
                    ])))
                    if "LEP" in outhname:
                        outhname = (outhname.replace("LEP", "muon") if lep == "Muon"
                                    else outhname.replace("LEP", "electron"))

                    template_histo = sys_histo[proc].integrate("process")
                    #template_histo = sig_histo[proc, sys].integrate("process").integrate("sys")

                    #set_trace()
                    ## save template histos to coffea dict
                    if jmult == "3Jets":
                        histo_dict_3j[lep][f"{proc}_{sys}"] = template_histo.copy()
                    if jmult == "4PJets":
                        histo_dict_4pj[lep][f"{proc}_{sys}"] = template_histo.copy()

                    ## save template histo to root file
                    upfout[outhname] = hist.export1d(template_histo)

    if "3Jets" in njets_to_run:
        coffea_out_3j = os.path.join(
            outdir, f"test_raw_templates_lj_3Jets_bkg_{args.year}_{jobid}.coffea")
        save(histo_dict_3j, coffea_out_3j)
        print(f"{coffea_out_3j} written")
    if "4PJets" in njets_to_run:
        coffea_out_4pj = os.path.join(
            outdir, f"test_raw_templates_lj_4PJets_bkg_{args.year}_{jobid}.coffea")
        save(histo_dict_4pj, coffea_out_4pj)
        print(f"{coffea_out_4pj} written")

    upfout.close()
    print(f"{tmp_rname} written")
def read_file(path, sample, branches=branches):
    print("=====")
    print("Processing {0} file".format(sample))
    mem = psutil.virtual_memory()
    mem_at_start = mem.available / (1024 ** 2)
    print(f'Available Memory: {mem_at_start:.0f} MB')
    count = 0
    hists = {}
    start = time.time()
    batch_num = 0
    with uproot.open(path) as file:
        tree = file['mini']
        numevents = tree.num_entries
        print(f'Total number of events in file: {numevents}')
        for batch in tree.iterate(branches, step_size='30 MB', library='np'):
            print('==============')
            df = pandas.DataFrame.from_dict(batch)
            del batch
            num_before_cuts = len(df.index)
            print("Events before cuts: {0}".format(num_before_cuts))
            count += num_before_cuts
            if 'Data' not in sample:
                df['totalWeight'] = np.vectorize(calc_weight)(
                    df.mcWeight, df.scaleFactor_ELE, df.scaleFactor_MUON,
                    df.scaleFactor_PILEUP, df.scaleFactor_TRIGGER,
                    df.scaleFactor_ZVERTEX)
                df["totalWeight"] = np.vectorize(get_xsec_weight)(df.totalWeight, sample)
            else:
                df['totalWeight'] = [1 for item in range(len(df.index))]
            df.drop(["mcWeight", "scaleFactor_ELE", "scaleFactor_MUON",
                     "scaleFactor_PILEUP", "scaleFactor_TRIGGER",
                     'scaleFactor_ZVERTEX'], axis=1, inplace=True)

            # Standard selection cuts
            df = df.query("trigE or trigM")
            df = df.query('passGRL')
            df = df.query('hasGoodVertex')
            df.drop(["trigE", "trigM", "passGRL", "hasGoodVertex"], axis=1, inplace=True)

            # Lepton requirements
            df['good_lepton'] = np.vectorize(WCuts.cut_GoodLepton)(
                df.lep_flag, df.lep_pt, df.lep_ptcone30, df.lep_etcone20,
                df.lep_n, df.lep_type)
            df = df.query('good_lepton > -1')
            for column in df.columns:
                if 'lep' in column and column not in ['lep_n', 'good_lepton']:
                    df[column] = np.vectorize(extract_good_lepton)(df[column], df['good_lepton'])

            # W transverse mass
            df['mtw'] = np.vectorize(calc_mtw)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)
            df = df.query('mtw > 30000.')
            df = df.query('met_et > 30000.')

            # Convert MeV to GeV
            df['lep_pt'] = df['lep_pt'] / 1000
            df['lep_E'] = df['lep_E'] / 1000
            df['met_et'] = df['met_et'] / 1000
            df['mtw'] = df['mtw'] / 1000
            df['mtw_enu'] = df.query('lep_type == 11')['mtw']
            df['mtw_munu'] = df.query('lep_type == 13')['mtw']
            df['WT_phi'] = np.vectorize(calc_W_phi)(df.lep_pt, df.met_et, df.lep_phi, df.met_phi)
            df['jet_n'] = df['alljet_n']
            df.drop(['alljet_n'], axis=1, inplace=True)

            # Asymmetry related histograms
            df['pos_ele_eta'] = df.query('lep_type == 11 and lep_charge == 1')['lep_eta']
            df['pos_ele_eta'] = np.vectorize(abs_value)(df.pos_ele_eta)
            df['neg_ele_eta'] = df.query('lep_type == 11 and lep_charge == -1')['lep_eta']
            df['neg_ele_eta'] = np.vectorize(abs_value)(df.neg_ele_eta)
            df['pos_mu_eta'] = df.query('lep_type == 13 and lep_charge == 1')['lep_eta']
            df['pos_mu_eta'] = np.vectorize(abs_value)(df.pos_mu_eta)
            df['neg_mu_eta'] = df.query('lep_type == 13 and lep_charge == -1')['lep_eta']
            df['neg_mu_eta'] = np.vectorize(abs_value)(df.neg_mu_eta)
            df['lep_pt_j0'] = df.query('jet_n == 0')['lep_pt']
            df['lep_pt_j1'] = df.query('jet_n == 1')['lep_pt']
            df['lep_pt_j2'] = df.query('jet_n > 1')['lep_pt']
            df['mtw_j0'] = df.query('jet_n == 0')['mtw']
            df['mtw_j1'] = df.query('jet_n == 1')['mtw']
            df['mtw_j2'] = df.query('jet_n > 1')['mtw']
            df['met_et_j0'] = df.query('jet_n == 0')['met_et']
            df['met_et_j1'] = df.query('jet_n == 1')['met_et']
            df['met_et_j2'] = df.query('jet_n > 1')['met_et']
            df['lep_eta_j0'] = df.query('jet_n == 0')['lep_eta']
            df['lep_eta_j1'] = df.query('jet_n == 1')['lep_eta']
            df['lep_eta_j2'] = df.query('jet_n > 1')['lep_eta']

            if len(df.loc[df['jet_n'] > 0].index) > 0:
                temp_df = pandas.DataFrame()
                temp_df['eventNumber'] = df.loc[df['jet_n'] > 0]['eventNumber']
                for column in df.columns:
                    if 'jet' in column and column != 'jet_n':
                        temp_df[f'lead_{column}'] = np.vectorize(find_lead_jet)(
                            df.loc[df['jet_n'] > 0]['jet_pt'],
                            df.loc[df['jet_n'] > 0][column])
                temp_df['lead_jet_pt'] = temp_df['lead_jet_pt'] / 1000.
                temp_df['lj_phi_diff'] = np.vectorize(calc_delta_phi)(
                    df.loc[df['jet_n'] > 0]['lep_phi'], temp_df['lead_jet_phi'])
                temp_df['abs_lj_phi_diff'] = np.vectorize(abs_value)(temp_df.lj_phi_diff)
                temp_df['Wj_phi_diff'] = np.vectorize(calc_delta_phi)(
                    df.loc[df['jet_n'] > 0]['WT_phi'], temp_df['lead_jet_phi'])
                temp_df['abs_Wj_phi_diff'] = np.vectorize(abs_value)(temp_df.Wj_phi_diff)
                df = pandas.merge(left=df, right=temp_df, left_on='eventNumber',
                                  right_on='eventNumber', how='left')

            num_after_cuts = len(df.index)
            print("Number of events after cuts: {0}".format(num_after_cuts))
            print(f'Currently at {(count * 100 / numevents):.0f}% of events ({count}/{numevents})')

            for key, hist in hist_dicts.items():
                h_bin_width = hist["bin_width"]
                h_num_bins = hist["numbins"]
                h_xmin = hist["xmin"]
                x_var = hist["xvariable"]
                bins = [h_xmin + x * h_bin_width for x in range(h_num_bins + 1)]
                data_x, binning = np.histogram(df[x_var].values, bins=bins,
                                               weights=df.totalWeight.values)
                data_x = data_x.astype('float64')
                histo = uproot3_methods.classes.TH1.from_numpy((data_x, binning))
                if key not in hists.keys():
                    hists[key] = histo
                else:
                    for i in range(len(hists[key])):
                        hists[key][i] += histo[i]

            if not os.path.exists(f'../DataForFit_8TeV/{sample}/'):
                os.mkdir(f'../DataForFit_8TeV/{sample}')
            f = uproot3.recreate(f'../DataForFit_8TeV/{sample}/{sample}_{batch_num}.root')
            f['FitTree'] = uproot3.newtree({'mtw': uproot3.newbranch(np.float64, 'mtw'),
                                            'jet_n': uproot3.newbranch(np.int32, 'jet_n'),
                                            'totalWeight': uproot3.newbranch(np.float64, 'totalWeight')})
            f['FitTree'].extend({'mtw': df['mtw'].to_numpy(dtype=np.float64),
                                 'jet_n': df['jet_n'].to_numpy(dtype=np.int32),
                                 'totalWeight': df['totalWeight'].to_numpy(dtype=np.float64)})
            f.close()
            batch_num += 1
            del df
            gc.collect()

            # diagnostics
            mem = psutil.virtual_memory()
            actual_mem = mem.available / (1024 ** 2)
            print(f'Current available memory {actual_mem:.0f} MB '
                  f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')

    file = uproot3.recreate(f'../Output_8TeV/{sample}.root', uproot3.ZLIB(4))
    for key, hist in hists.items():
        file[key] = hist
        print(f'{key} histogram')
        file[key].show()
    file.close()
    mem = psutil.virtual_memory()
    actual_mem = mem.available / (1024 ** 2)
    print(f'Current available memory {actual_mem:.0f} MB '
          f'({100 * actual_mem / mem_at_start:.0f}% of what we started with)')
    print('Finished!')
    print(f'Time elapsed: {time.time() - start} seconds')
    return None
        SUEP_NTuple(
            options.isMC,
            str(options.era),
            do_syst=1,
            syst_var=sys + var,
            weight_syst=True,
            sample=options.dataset  #,
            # haddFileName=f"tree_{options.jobNum}_{sys}{var}.root",
        ))

for i in modules_era:
    print("modules : ", i)

print("Selection : ", pre_selection)
tstart = time.time()

f = uproot.recreate("tree_%s_WS.root" % str(options.jobNum))
for instance in modules_era:
    output = run_uproot_job(
        {instance.sample: [options.infile]},
        treename='Events',
        processor_instance=instance,
        executor=futures_executor,
        executor_args={'workers': 10},
        chunksize=500000)
    for h, hist in output.items():
        f[h] = export1d(hist)
        #print(f'wrote {h} to tree_{options.jobNum}_WS.root')

modules_gensum = []

if options.isMC:
    modules_gensum.append(
def AddFFcorr(infname, intreename, outfname, outtreename, Lcstate, leptname,
              q2True_branchname, costhlTrue_branchname,
              nentries_to_read=1000000000, chunksize=10000):
    TH1.AddDirectory(kFALSE)

    perfname = None
    q2factor = None
    if Lcstate == 'Lc':
        perfname = './CorrectionTables/LcFFratios.root'
        q2factor = 1.
    elif Lcstate == 'Lc2595' or Lcstate == 'Lc2625':
        perfname = './CorrectionTables/LcstFFratios.root'
        q2factor = 1e-6
    else:
        raise Exception('Lc state not recognised', Lcstate)

    if leptname != 'mu' and leptname != 'tau':
        raise Exception('Lepton name not recognised', leptname)

    print('Using the histname', Lcstate + leptname + "_ratio")

    # variables to get from the file
    varsdf = ['runNumber', 'eventNumber']
    varsdf += ['Lb_TRUEP_X', 'Lb_TRUEP_Y', 'Lb_TRUEP_Z', 'Lb_TRUEP_E']
    varsdf += ['Lc_TRUEP_X', 'Lc_TRUEP_Y', 'Lc_TRUEP_Z', 'Lc_TRUEP_E']
    varsdf += [
        'Lb_True' + leptname.capitalize() + '_PX',
        'Lb_True' + leptname.capitalize() + '_PY',
        'Lb_True' + leptname.capitalize() + '_PZ',
        'Lb_True' + leptname.capitalize() + '_PE'
    ]
    varsdf += [
        'Lb_TrueNeutrino_PX', 'Lb_TrueNeutrino_PY',
        'Lb_TrueNeutrino_PZ', 'Lb_TrueNeutrino_PE'
    ]

    File = TFile.Open(perfname, "read")
    Histg = File.Get(Lcstate + leptname + "_ratio")
    perfHist = Histg.Clone(Lcstate + leptname + "_rationew")
    File.Close()
    Xmin = perfHist.GetXaxis().GetXmin()
    Xmax = perfHist.GetXaxis().GetXmax()
    Ymin = perfHist.GetYaxis().GetXmin()
    Ymax = perfHist.GetYaxis().GetXmax()
    Limits = (Xmin, Xmax, Ymin, Ymax)
    print(Limits, perfHist.Integral())

    # variables to store in the new ttree (the deprecated np.int alias has
    # been removed from recent numpy releases, so np.int64 is used here)
    varstoStore = {
        'runNumber': np.int64,
        'eventNumber': np.int64,
        'Event_FFcorr': np.float64,
        costhlTrue_branchname: np.float64,
        q2True_branchname: np.float64
    }
    aliases = {}

    # create a new rootfile
    with uproot3.recreate(outfname) as f:
        f[outtreename] = uproot3.newtree(varstoStore)

        # loop over the old rootfile chunkwise
        events_read = 0
        if chunksize >= nentries_to_read:
            chunksize = nentries_to_read
        for df_data in uproot4.iterate(infname + ':' + intreename, varsdf,
                                       aliases=aliases, cut=None, library="pd",
                                       step_size=chunksize):
            if events_read >= nentries_to_read:
                break

            # compute q2 and cosThetaL
            pxl = df_data['Lb_True' + leptname.capitalize() + '_PX']
            pxnu = df_data['Lb_TrueNeutrino_PX']
            pyl = df_data['Lb_True' + leptname.capitalize() + '_PY']
            pynu = df_data['Lb_TrueNeutrino_PY']
            pzl = df_data['Lb_True' + leptname.capitalize() + '_PZ']
            pznu = df_data['Lb_TrueNeutrino_PZ']
            pel = df_data['Lb_True' + leptname.capitalize() + '_PE']
            penu = df_data['Lb_TrueNeutrino_PE']
            if Lcstate == 'Lc2595' or Lcstate == 'Lc2625':
                # this should be the Lcstar momentum: Lb minus lepton minus
                # neutrino, component by component (the original used
                # Lb_TRUEP_X for all four components, which looks like a
                # copy-paste slip; fixed here)
                pxlc = df_data['Lb_TRUEP_X'] - pxl - pxnu
                pylc = df_data['Lb_TRUEP_Y'] - pyl - pynu
                pzlc = df_data['Lb_TRUEP_Z'] - pzl - pznu
                pelc = df_data['Lb_TRUEP_E'] - pel - penu
            elif Lcstate == 'Lc':
                pxlc = df_data['Lc_TRUEP_X']
                pylc = df_data['Lc_TRUEP_Y']
                pzlc = df_data['Lc_TRUEP_Z']
                pelc = df_data['Lc_TRUEP_E']

            PLc_lab = LorentzVector(Vector(pxlc, pylc, pzlc), pelc)  # format: LorentzVector(Vector(X, Y, Z), E)
            Pl_lab = LorentzVector(Vector(pxl, pyl, pzl), pel)
            PNu_lab = LorentzVector(Vector(pxnu, pynu, pznu), penu)
            PLb_lab = PLc_lab + Pl_lab + PNu_lab
            qsq, cthl = return_phasespace(PLb_lab, PLc_lab, Pl_lab)
            #print(qsq, cthl)
            df_data[q2True_branchname] = qsq
            df_data[costhlTrue_branchname] = cthl

            # get the corrections
            applyvars = [q2True_branchname, costhlTrue_branchname]  # has to be in the correct order, like in the histogram
            df_data['Event_FFcorr'] = df_data[applyvars].apply(
                storeeff2D, args=[perfHist, Limits, q2factor], axis=1)

            # get only the things that need to be stored and write them to the file
            branch_dict = {
                vartostore: df_data[vartostore].to_numpy()
                for vartostore in list(varstoStore.keys())
            }
            f[outtreename].extend(branch_dict)
            events_read += df_data.shape[0]
            print('Events read', events_read)
# Create DataFrames to be cut and written to files
columns = ['MET', "METPhi", "j1PT", "mjj", "mjj_13", "mjj_23", "mjjoptimized",
           "j1Eta", "j2Eta", "j3Eta", "j1Phi", "j2Phi", "j3Phi",
           "j2PT", "j3PT", "weight"]  # shared column list, factored out of the three repeated literals
SD = dict((k, A[k]) for k in columns if k in A)
SDEvents = pd.DataFrame.from_dict(SD)
EWKBD = dict((k, B[k]) for k in columns if k in B)
EWKBDEvents = pd.DataFrame.from_dict(EWKBD)
QCDBD = dict((k, C[k]) for k in columns if k in C)
QCDBDEvents = pd.DataFrame.from_dict(QCDBD)

# In[6]:

# Create files of events with no cuts applied
file1 = uproot.recreate("Combined Signal Ntuples.root")
file1["Signal"] = SDEvents
file2 = uproot.recreate("Combined EWKBackground Ntuples.root")
file2["EWKBackground"] = EWKBDEvents
file3 = uproot.recreate("Combined QCDBackground Ntuples.root")
file3["QCDBackground"] = QCDBDEvents

# In[7]:

# Create files of events with mjj > 1000, MET > 200, and 3 jets.
file1 = uproot.recreate("Combined Signal Ntuples, mjj>1000, MET>200, 3 jets.root")
# The j3Eta cut removes j3Eta values in the -1000 range, which indicate an error; most are around 1.
file1["Signal"] = SDEvents.loc[(SDEvents['mjjoptimized'] > 1000)
                               & (SDEvents['MET'] > 200)
                               & (SDEvents['j3Eta'] > -500)]
file2 = uproot.recreate("Combined EWKBackground Ntuples, mjj>1000, MET>200, 3 jets.root")
file2["EWKBackground"] = EWKBDEvents.loc[(EWKBDEvents['mjjoptimized'] > 1000)
                                         & (EWKBDEvents['MET'] > 200)
                                         & (EWKBDEvents['j3Eta'] > -500)]
file3 = uproot.recreate("Combined QCDBackground Ntuples, mjj>1000, MET>200, 3 jets.root")
def get_bkg_templates(tmp_rname):
    """
    Function that writes linearized mtt vs costheta distributions to a root file.
    """
    # define variables to get histogram for background
    bkg_fnmatch = "%s.coffea" % base_template_name.replace("NJETS", njets_regex).replace("SIG", "bkg")
    bkg_fnames = fnmatch.filter(os.listdir(inputdir), bkg_fnmatch)

    if "3Jets" in njets_to_run:
        histo_dict_3j = processor.dict_accumulator({"Muon": {}, "Electron": {}})
    if "4PJets" in njets_to_run:
        histo_dict_4pj = processor.dict_accumulator({"Muon": {}, "Electron": {}})

    # need to save coffea hist objects to file so they can be opened by uproot3 in the proper format
    upfout = uproot3.recreate(tmp_rname, compression=uproot3.ZLIB(4)) if os.path.isfile(tmp_rname) else uproot3.create(tmp_rname)

    for bkg_file in bkg_fnames:
        hdict = load(os.path.join(inputdir, bkg_file))
        jmult = "3Jets" if "3Jets" in bkg_file else "4PJets"
        for lep in hdict.keys():
            lepdir = "mujets" if lep == "Muon" else "ejets"
            for tname in hdict[lep].keys():
                #set_trace()
                template_histo = hdict[lep][tname]
                proc = tname.split("_")[0] if "data_obs" not in tname else "data_obs"
                sys = "_".join(tname.split("_")[1:]) if "data_obs" not in tname else "nosys"
                if sys not in sys_to_use.keys():
                    continue
                #if "RENORM" in sys: set_trace()
                sysname, onlyTT = sys_to_use[sys]
                name = proc + lepdir if proc == "QCD" else proc
                print(lep, jmult, sys, name)
                outhname = ("_".join([jmult, lepdir, name]) if sys == "nosys"
                            else "_".join([jmult, lepdir, name, sysname]))

                if (sys != "nosys") and (args.smooth) and (templates_to_smooth[proc]):
                    template_histo = smoothing(
                        nominal=histo_dict_3j[lep][proc] if jmult == "3Jets" else histo_dict_4pj[lep][proc],
                        template=template_histo,
                        nbinsx=len(linearize_binning[0]) - 1,
                        nbinsy=len(linearize_binning[1]) - 1)
                    #set_trace()

                ## save template histos to coffea dict
                if jmult == "3Jets":
                    histo_dict_3j[lep][proc if sys == "nosys" else "%s_%s" % (proc, sysname)] = template_histo
                if jmult == "4PJets":
                    histo_dict_4pj[lep][proc if sys == "nosys" else "%s_%s" % (proc, sysname)] = template_histo

                ## save template histo to root file
                upfout[outhname] = hist.export1d(template_histo)

    #set_trace()
    if "3Jets" in njets_to_run:
        coffea_out_3j = os.path.join(
            outdir,
            f"templates_lj_3Jets_bkg_smoothed_{jobid}_{args.year}.coffea" if args.smooth
            else f"templates_lj_3Jets_bkg_{jobid}_{args.year}.coffea")
        save(histo_dict_3j, coffea_out_3j)
        print(f"{coffea_out_3j} written")
    if "4PJets" in njets_to_run:
        coffea_out_4pj = os.path.join(
            outdir,
            f"templates_lj_4PJets_bkg_smoothed_{jobid}_{args.year}.coffea" if args.smooth
            else f"templates_lj_4PJets_bkg_{jobid}_{args.year}.coffea")
        save(histo_dict_4pj, coffea_out_4pj)
        print(f"{coffea_out_4pj} written")

    upfout.close()
    print(f"{tmp_rname} written")
def dump_generated_events(arr: ak.Array):
    fn = f"wd/{conf.tag}/output.root"
    with uproot.recreate(fn) as file:
        file["tree1"] = uproot.newtree(dict(arr.type.type))
        file["tree1"].extend({branch: arr[branch] for branch in arr.fields})
def to_root(df, filename, treename):
    with uproot3.recreate(filename) as f:
        f[treename] = uproot3.newtree({col: df[col].dtype for col in df.columns})
        f[treename].extend(dict(df))
default="converted_trees") args = parser.parse_args() base_dir = args.output os.makedirs(base_dir, exist_ok=True) infile = args.infile name = infile.split("/")[-1].split(".")[0] print("Processing Datafile: {}".format(name)) if args.offline is True: online_tree = u3.open(infile)["deepntuplizer"]["tree"] else: online_tree = u3.open(infile)["btagana"]["ttree"] out_file = u3.recreate(os.path.join(base_dir, "{}.root".format(name)), compression=u3.ZLIB(4)) kt = "PuppiJet.TagVar_trackPtRel" track_key_indicator = "PuppiJet.TagVar_" N_jets = -1 print("running on {} events!".format(N_jets)) branch_dict = {} branch_registration = {} tracking_index_low = online_tree['PuppiJet.Jet_nFirstTrkTagVar'].array() tracking_index_high = online_tree['PuppiJet.Jet_nLastTrkTagVar'].array() track_eta_index_low = online_tree[ 'PuppiJet.Jet_nFirstTrkEtaRelTagVarCSV'].array()
                branch_dicts[i_split]["branch_dict"]["{}_counts".format(online_key)] = counts
                # branch_dict["{}_counts".format(online_key)] = counts
        else:
            arr = online_tree[online_key].array()[on_mask].flatten()[:N_jets]
            dtype = np.dtype("f4")
            for i_split, (n_start, n_end) in enumerate(zip(n_starts, n_ends)):
                branch_dicts[i_split]["branch_registration"][online_key] = dtype
                branch_dicts[i_split]["branch_dict"][online_key] = arr[n_start:n_end]
            # branch_registration[online_key] = dtype
            # branch_dict[online_key] = arr

for i_split in range(SPLITS):
    out_path = os.path.join(base_dir, "{}_{}_{}.root".format(name, branch_key, i_split))
    print("Creating new tree: {}".format(out_path))
    with u3.recreate(out_path, compression=u3.ZLIB(6)) as out_file:
        print("Creating new tree for {}".format(branch_key))
        out_file["ttree"] = u3.newtree(branch_dicts[i_split]["branch_registration"])
        print("Creating branches")
        # s_time = time.time()
        out_file["ttree"].extend(branch_dicts[i_split]["branch_dict"])
        # from IPython import embed; embed()
        # e_time = time.time()
        # print("Total time needed for {0} events:\n{1:1.1f}".format(N_jets, e_time - s_time))
def main(args):
    processed_nano = "tmva_xgboost_reproducers/lead_processed_nano_uncorr.root"

    df = pd.read_parquet(args.input_dataframe)
    print("Read input dataframe:\n{}".format(df))

    inputs = {
        "lead_energyRaw": "SCRawE",
        "lead_r9": "r9",
        "lead_sieie": "sigmaIetaIeta",
        "lead_etaWidth": "etaWidth",
        "lead_phiWidth": "phiWidth",
        "lead_sieip": "covIEtaIPhi",
        "lead_s4": "s4",
        "lead_pfPhoIso03": "phoIso03",
        "lead_pfChargedIsoPFPV": "chgIsoWrtChosenVtx",
        "lead_pfChargedIsoWorstVtx": "chgIsoWrtWorstVtx",
        "lead_eta": "scEta",
        "lead_fixedGridRhoAll": "rho"
    }

    # This is needed just to not hardcode the branch type later
    arr_dict = {}
    for name in inputs.keys():
        name_orig = name
        # Since I don't remember which ones have the suffix _nano
        if name_orig not in list(df.columns):
            name += "_nano"
        arr_dict[name_orig] = df[name]
    ak_arr = ak.Array(arr_dict)
    print(ak_arr.type)

    # Explicitly recompute also the XGBoost one, just because
    print("Recomputing MVA with XGBoost")
    mva = xgboost.Booster()
    mva.load_model(args.xgboost_model)
    var_order = list(arr_dict.keys())
    bdt_inputs = np.column_stack([ak.to_numpy(ak_arr[name]) for name in var_order])
    tempmatrix = xgboost.DMatrix(bdt_inputs, feature_names=var_order)
    lead_idmva_xgboost = mva.predict(tempmatrix)
    # Thomas workflow
    lead_idmva_xgboost = -np.log(1. / lead_idmva_xgboost - 1.)
    lead_idmva_xgboost = 2. / (1. + np.exp(-2. * lead_idmva_xgboost)) - 1.

    # Dump nanoaod inputs to a TTree
    with uproot3.recreate(processed_nano) as f:
        branchdict = {}
        arraydict = {}
        for nano_name, model_name in inputs.items():
            #branchdict[model_name] = str(ak_arr[nano_name].type.type).replace('?', '')
            branchdict[model_name] = "float32"
            arraydict[model_name] = ak_arr[nano_name]
        f["Events"] = uproot3.newtree(branchdict)
        f["Events"].extend(arraydict)

    # TMVA with RDataFrame
    ROOT.gInterpreter.ProcessLine('''
    TMVA::Experimental::RReader model("{}");
    computeModel = TMVA::Experimental::Compute<{}, float>(model);
    '''.format(args.tmva_model, len(ak_arr.fields)))
    rdf = ROOT.RDataFrame("Events", processed_nano)
    rdf = rdf.Define("lead_idmva_tmva", ROOT.computeModel, ROOT.model.GetVariableNames())
    print("Running RDF event loop")
    dct = rdf.AsNumpy(columns=["lead_idmva_tmva"])
    lead_idmva_tmva = np.array([v[0] for v in dct["lead_idmva_tmva"]])

    # Plot
    print("Plotting to {}".format(args.output_dir))
    bins = 100
    rng = (-1, 1)

    fig, (up, down) = plt.subplots(nrows=2, ncols=1, gridspec_kw={"height_ratios": (1, 1)})

    up.hist(lead_idmva_xgboost, bins=bins, range=rng, histtype="step", label="XGBoost", linewidth=2)
    up.hist(lead_idmva_tmva, bins=bins, range=rng, histtype="step", label="TMVA", linewidth=2)
    up.set_xlabel("lead PhoIDMVA after corrections")
    up.legend(fontsize=18, loc="upper left")

    down.hist(100 * (lead_idmva_xgboost - lead_idmva_tmva) / lead_idmva_tmva,
              bins=500, range=(-100, 100), histtype="step", density=True,
              color="black", linewidth=2)
    down.set_xlabel("$(XGB - TMVA)/TMVA$ [%]")
    down.set_yscale("log")

    fig.tight_layout()
    fig.savefig("{}/lead_xgb_tmva.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/lead_xgb_tmva.pdf".format(args.output_dir), bbox_inches='tight')

    fig, ax = plt.subplots()
    ax.scatter(lead_idmva_xgboost, lead_idmva_tmva)
    ax.set_xlabel("XGBoost")
    ax.set_ylabel("TMVA")
    fig.savefig("{}/xgb_tmva_scatter.png".format(args.output_dir), bbox_inches='tight')
    fig.savefig("{}/xgb_tmva_scatter.pdf".format(args.output_dir), bbox_inches='tight')
def makeCardFromHist(out_cache, hist_name, nonprompt_scale=1, signal_scale=1,
                     bkg_scale=1, overflow='all', ext='', systematics=True):
    print("Writing cards using histogram:", hist_name)
    card_dir = os.path.expandvars('$TWHOME/data/cards/')
    if not os.path.isdir(card_dir):
        os.makedirs(card_dir)

    data_card = card_dir + hist_name + ext + '_card.txt'
    shape_file = card_dir + hist_name + ext + '_shapes.root'

    histogram = out_cache[hist_name].copy()
    #histogram = histogram.rebin('mass', bins[hist_name]['bins'])

    # scale some processes
    scales = {
        'ttbar': nonprompt_scale,
        'topW_v2': signal_scale,
        'TTW': bkg_scale,  # only scale the most important backgrounds
        'TTZ': bkg_scale,
        'TTH': bkg_scale,
    }
    histogram.scale(scales, axis='dataset')

    ## make a histogram for pseudo observation; this hurts, but right now it seems to be the best option
    data_counts = np.asarray(
        np.round(histogram[notdata].integrate('dataset').values(overflow=overflow)[()], 0), int)
    data_hist = histogram['topW_v2']
    data_hist.clear()
    data_hist_bins = data_hist.axes()[1]
    for i, edge in enumerate(data_hist_bins.edges(overflow=overflow)):
        if i >= len(data_counts):
            break
        for y in range(data_counts[i]):
            data_hist.fill(**{'dataset': 'data', data_hist_bins.name: edge + 0.0001})

    other_sel = re.compile('(TTTT|diboson|DY|rare)')

    ##observation = hist.export1d(histogram['pseudodata'].integrate('dataset'), overflow=overflow)
    #observation = hist.export1d(data_hist['data'].integrate('dataset'), overflow=overflow)
    observation = hist.export1d(histogram[notdata].integrate('dataset'), overflow=overflow)
    tw = hist.export1d(histogram['topW_v2'].integrate('dataset'), overflow=overflow)
    ttw = hist.export1d(histogram['TTW'].integrate('dataset'), overflow=overflow)
    ttz = hist.export1d(histogram['TTZ'].integrate('dataset'), overflow=overflow)
    tth = hist.export1d(histogram['TTH'].integrate('dataset'), overflow=overflow)
    rare = hist.export1d(histogram[other_sel].integrate('dataset'), overflow=overflow)
    nonprompt = hist.export1d(histogram['ttbar'].integrate('dataset'), overflow=overflow)

    fout = uproot3.recreate(shape_file)
    fout["signal"] = tw
    fout["nonprompt"] = nonprompt
    fout["ttw"] = ttw
    fout["ttz"] = ttz
    fout["tth"] = tth
    fout["rare"] = rare
    fout["data_obs"] = observation
    fout.close()

    # Get the total yields to write into a data card
    totals = {}
    totals['signal'] = histogram['topW_v2'].integrate('dataset').values(overflow=overflow)[()].sum()
    totals['ttw'] = histogram['TTW'].integrate('dataset').values(overflow=overflow)[()].sum()
    totals['ttz'] = histogram['TTZ'].integrate('dataset').values(overflow=overflow)[()].sum()
    totals['tth'] = histogram['TTH'].integrate('dataset').values(overflow=overflow)[()].sum()
    totals['rare'] = histogram['rare'].integrate('dataset').values(overflow=overflow)[()].sum()
    totals['nonprompt'] = histogram['ttbar'].integrate('dataset').values(overflow=overflow)[()].sum()
    ##totals['observation'] = histogram['pseudodata'].integrate('dataset').values(overflow=overflow)[()].sum()
    #totals['observation'] = int(sum(data_hist['data'].sum('dataset').values(overflow=overflow)[()]))
    totals['observation'] = histogram[notdata].integrate('dataset').values(overflow=overflow)[()].sum()

    print("{:30}{:.2f}".format("Signal expectation:", totals['signal']))
    print("{:30}{:.2f}".format("Non-prompt background:", totals['nonprompt']))
    print("{:30}{:.2f}".format("t(t)X(X)/rare background:",
                               totals['ttw'] + totals['ttz'] + totals['tth'] + totals['rare']))
    print("{:30}{:.2f}".format("Observation:", totals['observation']))

    # set up the card
    card = dataCard()
    card.reset()
    card.setPrecision(3)

    # add the uncertainties (just flat ones for now)
    card.addUncertainty('lumi', 'lnN')
    card.addUncertainty('ttw_norm', 'lnN')
    card.addUncertainty('ttz_norm', 'lnN')
    card.addUncertainty('tth_norm', 'lnN')
    card.addUncertainty('rare_norm', 'lnN')
    card.addUncertainty('fake', 'lnN')

    # add the single bin
    card.addBin('Bin0', ['ttw', 'ttz', 'tth', 'rare', 'nonprompt'], 'Bin0')
    card.specifyExpectation('Bin0', 'signal', totals['signal'])
    card.specifyExpectation('Bin0', 'ttw', totals['ttw'])
    card.specifyExpectation('Bin0', 'ttz', totals['ttz'])
    card.specifyExpectation('Bin0', 'tth', totals['tth'])
    card.specifyExpectation('Bin0', 'rare', totals['rare'])
    card.specifyExpectation('Bin0', 'nonprompt', totals['nonprompt'])

    # set uncertainties
    if systematics:
        card.specifyUncertainty('ttw_norm', 'Bin0', 'ttw', 1.15)
        card.specifyUncertainty('ttz_norm', 'Bin0', 'ttz', 1.10)
        card.specifyUncertainty('tth_norm', 'Bin0', 'tth', 1.20)
        card.specifyUncertainty('rare_norm', 'Bin0', 'rare', 1.20)
        card.specifyUncertainty('fake', 'Bin0', 'nonprompt', 1.25)
        card.specifyFlatUncertainty('lumi', 1.03)

    ## observation
    #card.specifyObservation('Bin0', int(round(totals['observation'], 0)))
    card.specifyObservation('Bin0', totals['observation'])

    print("Done.\n")

    return card.writeToFile(data_card, shapeFile=shape_file)
        fdf = btdf
        region = "totalr"
    else:
        fdf = sbdf
        region = "sideband"
else:
    fdf = sbdf

print(" number of passing events ", len(fdf))
#print("number of btag passing events ", len(btdf))

# lets make some histograms
rootfilename = go.makeOutFile(samp, 'upout_' + region + '_' + btaggr, '.root',
                              str(zptcut), str(hptcut), str(metcut), str(btagwp))  # need to update for btagger
npfilename = go.makeOutFile(samp, 'totalevents_' + region + '_' + btaggr, '.npy',
                            str(zptcut), str(hptcut), str(metcut), str(btagwp))
pklfilename = go.makeOutFile(samp, 'selected_errors_' + region + '_' + btaggr, '.pkl',
                             str(zptcut), str(hptcut), str(metcut), str(btagwp))
rootOutFile = up3.recreate(rootfilename, compression=None)
npOutFile = open(npfilename, 'wb')

rootOutFile["h_z_pt"] = np.histogram(fdf['ZCandidate_pt'], bins=80, range=(0, 800), weights=fdf['event_weight'])
#rootOutFile["h_z_phi"] = np.histogram(fdf['ZCandidate_phi'], bins=100, range=(0, 3.14159), weights=fdf['event_weight'])  # needs to fit range
rootOutFile["h_z_eta"] = np.histogram(fdf['ZCandidate_eta'], bins=100, range=(-5, 5), weights=fdf['event_weight'])
rootOutFile["h_z_m"] = np.histogram(fdf['ZCandidate_m'], bins=100, range=(40, 140), weights=fdf['event_weight'])
rootOutFile["h_h_pt"] = np.histogram(fdf['hCandidate_pt'], bins=40, range=(200, 1200), weights=fdf['event_weight'])
#rootOutFile["h_h_phi"] = np.histogram(fdf['hCandidate_phi'], bins=100, range=(0, 3.14159))  # needs to fit range
rootOutFile["h_h_eta"] = np.histogram(fdf['hCandidate_eta'], bins=100, range=(-5, 5), weights=fdf['event_weight'])
rootOutFile["h_h_m"] = np.histogram(fdf['hCandidate_m'], bins=80, range=(0, 400), weights=fdf['event_weight'])
rootOutFile["h_h_sd"] = np.histogram(fdf['hCandidate_sd'], bins=80, range=(0, 400), weights=fdf['event_weight'])
rootOutFile["h_met"] = np.histogram(fdf['METclean'], bins=78, range=(50, 2000), weights=fdf['event_weight'])
#rootOutFile["h_met_phi"] = np.histogram(fdf['METPhiclean'], bins=100, range=(0, 3.14159))  # needs to fit range
rootOutFile["h_zp_jigm"] = np.histogram(fdf['ZPrime_mass_est'], bins=100, range=(500, 5000), weights=fdf['event_weight'])
rootOutFile["h_nd_jigm"] = np.histogram(fdf['ND_mass_est'], bins=70, range=(100, 800), weights=fdf['event_weight'])