def misc_soverb(datadir, selections, use_tptrw):
    """Print signal over background for every selection in a JSON file.

    The JSON file maps a region name to a numexpr selection string, i.e.
    each entry has the form ``"region": "numexpr selection"``.

    The signal is the ``tW_DR`` sample; the background is the sum of the
    ``ttbar``, ``Diboson``, ``Zjets`` and ``MCNP`` samples. For each
    region the ratio of the summed ``weight_nominal`` columns (signal
    over background) is printed next to the region name.

    Parameters
    ----------
    datadir : str or os.PathLike
        Directory handed to ``quick_files`` to locate the samples.
    selections : str or os.PathLike
        Path of the JSON file holding the selections.
    use_tptrw : bool
        If true, load the ``weight_tptrw_tool`` branch and pass the
        background frame through ``apply_weight_tptrw`` before the
        ratios are computed; if false the weights are used as stored.
    """
    from tdub.frames import raw_dataframe, apply_weight_tptrw, satisfying_selection
    from tdub.data import quick_files
    from tdub.data import selection_branches

    with open(selections) as f:
        regions = json.load(f)

    # Only the branches referenced by the selections are read from disk.
    necessary_branches = set()
    for query in regions.values():
        necessary_branches |= selection_branches(query)
    necessary_branches = list(necessary_branches)
    if use_tptrw:
        # The reweighting branch is only needed when it is actually applied.
        necessary_branches.append("weight_tptrw_tool")

    qf = quick_files(datadir)
    bkg = qf["ttbar"] + qf["Diboson"] + qf["Zjets"] + qf["MCNP"]
    sig = qf["tW_DR"]
    sig_df = raw_dataframe(sig, branches=necessary_branches)
    bkg_df = raw_dataframe(bkg, branches=necessary_branches, entrysteps="1GB")

    if use_tptrw:
        # NOTE(review): the reweighting touches the merged background frame,
        # not only its ttbar rows -- confirm that is the intended behaviour.
        apply_weight_tptrw(bkg_df)

    for sel, query in regions.items():
        s_df, b_df = satisfying_selection(sig_df, bkg_df, selection=query)
        print(sel, s_df["weight_nominal"].sum() / b_df["weight_nominal"].sum())
def check(datadir: str):
    """Print the observed range of every configured variable in data.

    For each ``(region, variable)`` pair listed under ``META["regions"]``
    the ``Data`` sample is restricted to the matching region selection
    and the first and last edge of a 35-bin ``np.histogram`` of the
    variable are printed.

    Parameters
    ----------
    datadir : str
        Directory handed to ``quick_files`` to locate the samples.

    Raises
    ------
    ValueError
        If ``META["regions"]`` contains a region key other than
        ``r1j1b``, ``r2j1b`` or ``r2j2b``. It is raised before any file
        is read.
    """
    # Region key used in META -> name understood by get_selection.
    region_selections = {"r1j1b": "1j1b", "r2j1b": "2j1b", "r2j2b": "2j2b"}

    pairs = []
    # Branches the region selections need on top of the plotted variables.
    allbranches = {"reg1j1b", "reg2j1b", "reg2j2b", "OS", "elmu"}
    for reg, entries in META["regions"].items():
        if reg not in region_selections:
            # Without this check an unknown region would either crash with
            # a NameError or silently reuse the previous region's values.
            raise ValueError(
                f"unknown region {reg!r} in META['regions']; "
                f"expected one of {sorted(region_selections)}"
            )
        for entry in entries:
            pairs.append((reg, entry["var"]))
            allbranches.add(entry["var"])

    qf = quick_files(datadir)
    df = raw_dataframe(qf["Data"], branches=sorted(allbranches, key=str.lower))
    region_frames = {
        reg: df.query(get_selection(name))
        for reg, name in region_selections.items()
    }

    for r, v in pairs:
        x = region_frames[r][v]
        # Only the bin edges matter here; the counts are discarded.
        _, bins = np.histogram(x, bins=35)
        print(r, v, bins[0], bins[-1])
def region_frames_from_qf(
    qf_result: Dict[str, List[str]], apply_tptrw: bool = False
) -> Tuple[Dict[str, pd.DataFrame], Dict[str, pd.DataFrame], Dict[str, pd.DataFrame]]:
    """Get per-region dataframes for all of our desired samples.

    Every sample in ``ALL_SAMPLES`` is read with the branch list taken
    from the first ``Data`` file. The ``weight_nominal`` column of every
    sample except ``Data`` is multiplied by ``LUMI`` and each frame is
    then split with the 1j1b, 2j1b and 2j2b selections.

    Parameters
    ----------
    qf_result : dict(str, list(str))
        the dictionary from calling quick_files
    apply_tptrw : bool
        if True apply tptrw to the ttbar frames

    Returns
    -------
    frames1j1b : dict(str, pd.DataFrame)
        the 1j1b dataframes, keyed by sample name
    frames2j1b : dict(str, pd.DataFrame)
        the 2j1b dataframes, keyed by sample name
    frames2j2b : dict(str, pd.DataFrame)
        the 2j2b dataframes, keyed by sample name
    """
    branches = get_branches(qf_result["Data"][0])

    log.info("reading data from disk")
    frames = {
        name: raw_dataframe(qf_result[name], branches=branches, drop_weight_sys=True)
        for name in ALL_SAMPLES
    }

    log.info("determing selections")
    frames1j1b, frames2j1b, frames2j2b = {}, {}, {}
    for samp in ALL_SAMPLES:
        frame = frames[samp]
        if samp != "Data":
            # Data carries no luminosity scaling; everything else does.
            frame["weight_nominal"] *= LUMI
        if apply_tptrw and samp == "ttbar":
            log.info("applying top pt reweighting")
            frame.apply_weight_tptrw()
        # The weights must be final before the frame is sliced, because
        # the slices are new frames that no longer follow later updates.
        frames1j1b[samp] = frame[frame.eval(get_selection("1j1b"))]
        frames2j1b[samp] = frame[frame.eval(get_selection("2j1b"))]
        frames2j2b[samp] = frame[frame.eval(get_selection("2j2b"))]

    return frames1j1b, frames2j1b, frames2j2b
def apply_single(infile, arrname, outdir, fold_results=None, single_results=None):
    """Generate BDT response array for INFILE and save to .npy file.

    We generate the .npy files using either single training results
    (-s flag) or folded training results (-f flag). The array is
    written to ``<outdir>/<stem of infile>.<arrname>.npy``; ``outdir``
    is created if it does not exist.

    Parameters
    ----------
    infile : str or os.PathLike
        File to read; its stem is used to build a ``SampleInfo`` whose
        ``tree`` attribute selects the ``WtLoop_<tree>`` tree.
    arrname : str
        Name inserted into the output file name.
    outdir : str or os.PathLike
        Directory receiving the .npy file.
    fold_results : sequence, optional
        Inputs for ``FoldedTrainSummary`` (one summary per entry).
    single_results : sequence, optional
        Inputs for ``SingleTrainSummary`` (one summary per entry).

    Raises
    ------
    ValueError
        If both or neither of ``fold_results`` and ``single_results``
        are given. Raised before anything is imported or written.
    """
    # Both arguments default to None, so normalise them before testing;
    # calling len() on the defaults used to fail with a TypeError.
    fold_results = fold_results or ()
    single_results = single_results or ()
    if single_results and fold_results:
        raise ValueError("Cannot use -f and -s together with apply-single")
    if not single_results and not fold_results:
        raise ValueError("-f or -s required")

    from tdub.ml_apply import build_array, FoldedTrainSummary, SingleTrainSummary
    from tdub.data import SampleInfo
    from tdub.data import selection_branches
    from tdub.frames import raw_dataframe
    import numpy as np

    outdir = PosixPath(outdir).resolve()
    outdir.mkdir(parents=True, exist_ok=True)

    if fold_results:
        trs = [FoldedTrainSummary(p) for p in fold_results]
    else:
        trs = [SingleTrainSummary(p) for p in single_results]

    # Region flags plus whatever the trainings need as features or in
    # the selection they were trained with.
    necessary_branches = {"OS", "elmu", "reg2j1b", "reg2j2b", "reg1j1b"}
    for res in trs:
        necessary_branches.update(res.features)
        necessary_branches.update(selection_branches(res.selection_used))
    necessary_branches = sorted(necessary_branches, key=str.lower)

    log.info("Loading necessary branches:")
    for nb in necessary_branches:
        log.info(f" - {nb}")

    stem = PosixPath(infile).stem
    sampinfo = SampleInfo(stem)
    tree = f"WtLoop_{sampinfo.tree}"
    log.info(f"Using tree {tree}")

    df = raw_dataframe(infile, tree=tree, branches=necessary_branches)
    npyfilename = outdir / f"{stem}.{arrname}.npy"
    result_arr = build_array(trs, df)
    np.save(npyfilename, result_arr)
def bdt_cut_plots(
    source: PosixPath,
    branch: str = "bdtres03",
    lumi: float = 139.0,
    lo_1j1b: float = 0.35,
    hi_2j1b: float = 0.70,
    lo_2j2b: float = 0.45,
    hi_2j2b: float = 0.775,
    bins_1j1b: tuple[int, float, float] = (18, 0.2, 0.75),
    bins_2j1b: tuple[int, float, float] = (18, 0.2, 0.85),
    bins_2j2b: tuple[int, float, float] = (18, 0.2, 0.90),
    thesis: bool = False,
) -> None:
    """Generate plots showing the BDT cuts.

    The ``tW_DR`` and ``tW_DS`` samples found under ``source`` are
    histogrammed in ``branch`` for the 1j1b, 2j1b and 2j2b selections
    and compared with ``one_sided_comparison_plot``. Gray bands labelled
    "Excluded" are drawn over the cut-away ranges and the figures are
    saved as ``drds_1j1b.pdf``, ``drds_2j1b.pdf`` and ``drds_2j2b.pdf``
    (relative paths).

    Parameters
    ----------
    source : PosixPath
        Location handed to ``quick_files``.
    branch : str
        Branch to histogram.
    lumi : float
        Factor multiplied onto ``weight_nominal`` when histogramming.
    lo_1j1b : float
        Right edge of the gray band drawn on the 1j1b plot.
    hi_2j1b : float
        Left edge of the gray band drawn on the 2j1b plot.
    lo_2j2b : float
        Right edge of the lower gray band drawn on the 2j2b plot.
    hi_2j2b : float
        Left edge of the upper gray band drawn on the 2j2b plot.
    bins_1j1b, bins_2j1b, bins_2j2b : tuple(int, float, float)
        Number of bins, lower edge and upper edge per region.
    thesis : bool
        Forwarded to ``one_sided_comparison_plot``.
    """
    setup_tdub_style()
    source = PosixPath(source)
    samples = quick_files(source)

    columns = [branch, "weight_nominal", "reg1j1b", "reg2j1b", "reg2j2b", "OS"]
    dr_frame = raw_dataframe(samples["tW_DR"], branches=columns)
    ds_frame = raw_dataframe(samples["tW_DS"], branches=columns)

    def compare(selection, binning):
        # Histogram DR and DS under one selection and open the comparison plot.
        nbins, xmin, xmax = binning[0], binning[1], binning[2]
        counts = []
        for frame in (dr_frame.query(selection), ds_frame.query(selection)):
            hist, _ = fix1d(
                frame[branch].to_numpy(),
                bins=nbins,
                range=(xmin, xmax),
                weights=frame["weight_nominal"].to_numpy() * lumi,
                flow=True,
            )
            counts.append(hist)
        edges = np.linspace(xmin, xmax, nbins + 1)
        return one_sided_comparison_plot(counts[0], counts[1], edges, thesis=thesis)

    def shade(main_ax, ratio_ax, left, right):
        # Gray out [left, right] on the main panel and on the ratio panel.
        main_ax.fill_betweenx([-1, 1.0e5], left, right, color="gray", alpha=0.55)
        ratio_ax.fill_betweenx([-200, 200], left, right, color="gray", alpha=0.55)

    def label_excluded(main_ax, x, y):
        # Centered annotation placed inside a shaded band.
        main_ax.text(x, y, "Excluded", ha="center", va="center", color="gray", size=9)

    # The order of the axis queries relative to the shading is kept
    # exactly as before within each region, because drawing the bands may
    # influence the limits that the label positions are derived from.

    # --- 1j1b: low side excluded; label placed before the shading.
    fig, ax, axr = compare(tdub.config.SELECTION_1j1b, bins_1j1b)
    ymid = ax.get_ylim()[1] * 0.69
    left_edge = ax.get_xlim()[0]
    label_excluded(ax, (lo_1j1b - left_edge) * 0.5 + left_edge, ymid)
    shade(ax, axr, -1.0, lo_1j1b)
    fig.savefig("drds_1j1b.pdf")
    plt.close(fig)

    # --- 2j1b: high side excluded; shading drawn before the label.
    fig, ax, axr = compare(tdub.config.SELECTION_2j1b, bins_2j1b)
    shade(ax, axr, hi_2j1b, 1.0)
    ymid = ax.get_ylim()[1] * 0.69
    right_edge = ax.get_xlim()[1]
    label_excluded(ax, (right_edge - hi_2j1b) * 0.5 + hi_2j1b, ymid)
    fig.savefig("drds_2j1b.pdf")
    plt.close(fig)

    # --- 2j2b: both sides excluded; both bands drawn before the labels.
    fig, ax, axr = compare(tdub.config.SELECTION_2j2b, bins_2j2b)
    shade(ax, axr, -1.0, lo_2j2b)
    shade(ax, axr, hi_2j2b, 1.0)
    ymid = ax.get_ylim()[1] * 0.69
    left_edge = ax.get_xlim()[0]
    label_excluded(ax, (lo_2j2b - left_edge) * 0.5 + left_edge, ymid)
    right_edge = ax.get_xlim()[1]
    label_excluded(ax, (right_edge - hi_2j2b) * 0.5 + hi_2j2b, ymid)
    fig.savefig("drds_2j2b.pdf")
    plt.close(fig)
"MCNP", #"tW_DR_AFII", #"tW_DS", #"ttbar_AFII", #"ttbar_PS", #"ttbar_hdamp", ] all_histograms = {} for samp in samples: print(f"working on {samp}") files = qf[samp] df = raw_dataframe(files, branches=[ "met", "weight_nominal", "reg1j1b", "reg2j1b", "reg2j2b", "elmu", "OS" ]) for region in ("reg1j1b", "reg2j1b", "reg2j2b"): sel = get_selection(region) dfc, dfe = generate_from_df(df.query(sel), "met", bins=15, range=(0.0, 200.0), systematic_weights=False) hists = df2th1(dfc, dfe, weight_col="weight_nominal") for hname, hobj in hists.items(): if hname == "weight_nominal": finalkey = f"{region}_met_{samp}" else: finalkey = f"{region}_met_{samp}_{hname}"