Esempio n. 1
0
def misc_soverb(datadir, selections, use_tptrw):
    """Print signal over background using data in DATADIR and a SELECTIONS file.

    The SELECTIONS file is JSON; each entry has the form
    "region": "numexpr selection". For every entry the ratio of the
    summed ``weight_nominal`` of the signal sample (``tW_DR``) to that
    of the combined background samples (``ttbar``, ``Diboson``,
    ``Zjets``, ``MCNP``) passing the selection is printed.

    Parameters
    ----------
    datadir
        Location handed to ``quick_files`` to find the sample files.
    selections
        Path to the JSON file holding the selections.
    use_tptrw : bool
        If True, ``apply_weight_tptrw`` is called on the background
        frame before the ratios are computed; if False the background
        weights are used as loaded.

    """
    from tdub.frames import raw_dataframe, apply_weight_tptrw, satisfying_selection
    from tdub.data import quick_files
    from tdub.data import selection_branches

    with open(selections) as f:
        queries = json.load(f)

    # Collect every branch referenced by any selection string.
    necessary_branches = set()
    for query in queries.values():
        necessary_branches |= selection_branches(query)
    # Always requested so the loaded columns do not depend on the flag;
    # presumably this is the column apply_weight_tptrw reads -- confirm.
    necessary_branches = list(necessary_branches) + ["weight_tptrw_tool"]

    qf = quick_files(datadir)
    bkg = qf["ttbar"] + qf["Diboson"] + qf["Zjets"] + qf["MCNP"]
    sig = qf["tW_DR"]

    sig_df = raw_dataframe(sig, branches=necessary_branches)
    bkg_df = raw_dataframe(bkg, branches=necessary_branches, entrysteps="1GB")
    # Honor the caller's choice instead of always reweighting.
    if use_tptrw:
        apply_weight_tptrw(bkg_df)

    for sel, query in queries.items():
        s_df, b_df = satisfying_selection(sig_df, bkg_df, selection=query)
        print(sel, s_df["weight_nominal"].sum() / b_df["weight_nominal"].sum())
Esempio n. 2
0
def check(datadir: str):
    """Print the value range of every configured variable in its region.

    The (region, variable) pairs come from ``META["regions"]``. The
    ``Data`` sample found via ``quick_files(datadir)`` is loaded once,
    split with the 1j1b / 2j1b / 2j2b selections, and for each pair the
    first and last edges of a 35-bin ``np.histogram`` of the variable
    are printed.

    Parameters
    ----------
    datadir : str
        Location handed to ``quick_files`` to find the sample files.

    Raises
    ------
    ValueError
        If ``META["regions"]`` contains a region name other than
        ``r1j1b``, ``r2j1b`` or ``r2j2b``.
    """
    pairs = [
        (reg, entry["var"])
        for reg, entries in META["regions"].items()
        for entry in entries
    ]
    allbranches = {var for _, var in pairs}
    # Columns requested in addition to the plotted variables.
    allbranches.update(("reg1j1b", "reg2j1b", "reg2j2b", "OS", "elmu"))

    qf = quick_files(datadir)
    df = raw_dataframe(qf["Data"], branches=sorted(allbranches, key=str.lower))
    region_frames = {
        "r1j1b": df.query(get_selection("1j1b")),
        "r2j1b": df.query(get_selection("2j1b")),
        "r2j2b": df.query(get_selection("2j2b")),
    }

    for r, v in pairs:
        # An unrecognized region used to fall through every ``if`` and
        # silently reuse the previous iteration's data; fail loudly.
        if r not in region_frames:
            raise ValueError(f"unknown region {r!r} in META['regions']")
        _, bins = np.histogram(region_frames[r][v], bins=35)
        print(r, v, bins[0], bins[-1])
Esempio n. 3
0
def region_frames_from_qf(
    qf_result: Dict[str, List[str]],
    apply_tptrw: bool = False
) -> Tuple[
    Dict[str, pd.DataFrame],
    Dict[str, pd.DataFrame],
    Dict[str, pd.DataFrame],
]:
    """Get per-region dataframes for every sample in ``ALL_SAMPLES``.

    The branch list is taken from the first ``Data`` file. Every
    sample is loaded with that list, then ``weight_nominal`` of every
    sample except ``Data`` is multiplied in place by ``LUMI``, and the
    rows passing each region selection are extracted.

    Parameters
    ----------
    qf_result : dict(str, list(str))
        the dictionary from calling quick_files
    apply_tptrw : bool
        if True apply tptrw to the ttbar frames

    Returns
    -------
    frames1j1b : dict(str, pd.DataFrame)
        the 1j1b dataframes, keyed by sample name
    frames2j1b : dict(str, pd.DataFrame)
        the 2j1b dataframes, keyed by sample name
    frames2j2b : dict(str, pd.DataFrame)
        the 2j2b dataframes, keyed by sample name
    """
    branches = get_branches(qf_result["Data"][0])
    log.info("reading data from disk")
    frames = {
        name: raw_dataframe(qf_result[name],
                            branches=branches,
                            drop_weight_sys=True)
        for name in ALL_SAMPLES
    }
    log.info("determing selections")
    # The selection strings do not depend on the sample; build them once.
    sel1j1b = get_selection("1j1b")
    sel2j1b = get_selection("2j1b")
    sel2j2b = get_selection("2j2b")
    frames1j1b, frames2j1b, frames2j2b = {}, {}, {}
    for samp in ALL_SAMPLES:
        frame = frames[samp]
        if samp != "Data":
            frame["weight_nominal"] *= LUMI
        if apply_tptrw and samp == "ttbar":
            log.info("applying top pt reweighting")
            frame.apply_weight_tptrw()
        # Evaluate all masks on the (re)weighted frame, then slice.
        mask1j1b = frame.eval(sel1j1b)
        mask2j1b = frame.eval(sel2j1b)
        mask2j2b = frame.eval(sel2j2b)
        frames1j1b[samp] = frame[mask1j1b]
        frames2j1b[samp] = frame[mask2j1b]
        frames2j2b[samp] = frame[mask2j2b]

    return frames1j1b, frames2j1b, frames2j2b
Esempio n. 4
0
def apply_single(infile, arrname, outdir, fold_results=None, single_results=None):
    """Generate BDT response array for INFILE and save to .npy file.

    We generate the .npy files using either single training results
    (-s flag) or folded training results (-f flag). The array is
    written to ``<outdir>/<stem of infile>.<arrname>.npy``.

    Parameters
    ----------
    infile
        Path of the input file; its stem is used to build the
        ``SampleInfo`` and the output file name.
    arrname : str
        Name inserted into the output file name.
    outdir
        Output directory; created (with parents) if it is missing.
    fold_results : sequence, optional
        Entries passed one by one to ``FoldedTrainSummary``.
    single_results : sequence, optional
        Entries passed one by one to ``SingleTrainSummary``.

    Raises
    ------
    ValueError
        If both or neither of ``fold_results`` / ``single_results``
        are given.

    """
    # The defaults are None; treat an omitted option as an empty
    # sequence so the checks below do not fail with a TypeError.
    fold_results = [] if fold_results is None else fold_results
    single_results = [] if single_results is None else single_results

    use_folds = len(fold_results) > 0
    use_singles = len(single_results) > 0
    # Validate before importing anything heavy or touching the disk.
    if use_folds and use_singles:
        raise ValueError("Cannot use -f and -s together with apply-single")
    if not use_folds and not use_singles:
        raise ValueError("-f or -s required")

    from tdub.ml_apply import build_array, FoldedTrainSummary, SingleTrainSummary
    from tdub.data import SampleInfo
    from tdub.data import selection_branches
    from tdub.frames import raw_dataframe
    import numpy as np

    outdir = PosixPath(outdir).resolve()
    outdir.mkdir(parents=True, exist_ok=True)

    if use_folds:
        trs = [FoldedTrainSummary(p) for p in fold_results]
    else:
        trs = [SingleTrainSummary(p) for p in single_results]

    # Region/charge/flavor flags plus whatever each training needs.
    necessary_branches = ["OS", "elmu", "reg2j1b", "reg2j2b", "reg1j1b"]
    for res in trs:
        necessary_branches += res.features
        necessary_branches += selection_branches(res.selection_used)
    necessary_branches = sorted(set(necessary_branches), key=str.lower)

    log.info("Loading necessary branches:")
    for nb in necessary_branches:
        log.info(f" - {nb}")

    stem = PosixPath(infile).stem
    sampinfo = SampleInfo(stem)
    tree = f"WtLoop_{sampinfo.tree}"
    log.info(f"Using tree {tree}")
    df = raw_dataframe(infile, tree=tree, branches=necessary_branches)
    npyfilename = outdir / f"{stem}.{arrname}.npy"
    result_arr = build_array(trs, df)
    np.save(npyfilename, result_arr)
Esempio n. 5
0
def bdt_cut_plots(
    source: PosixPath,
    branch: str = "bdtres03",
    lumi: float = 139.0,
    lo_1j1b: float = 0.35,
    hi_2j1b: float = 0.70,
    lo_2j2b: float = 0.45,
    hi_2j2b: float = 0.775,
    bins_1j1b: tuple[int, float, float] = (18, 0.2, 0.75),
    bins_2j1b: tuple[int, float, float] = (18, 0.2, 0.85),
    bins_2j2b: tuple[int, float, float] = (18, 0.2, 0.90),
    thesis: bool = False,
) -> None:
    """Generate plots showing BDT cuts.

    For each of the 1j1b, 2j1b and 2j2b selections, histograms of
    ``branch`` are built from the ``tW_DR`` and ``tW_DS`` samples,
    drawn with ``one_sided_comparison_plot``, overlaid with gray bands
    and an "Excluded" label over the cut-away range(s), and saved as
    ``drds_1j1b.pdf``, ``drds_2j1b.pdf`` and ``drds_2j2b.pdf`` (relative
    file names, so they land in the current working directory).

    Parameters
    ----------
    source : PosixPath
        Location handed to ``quick_files`` to find the sample files.
    branch : str
        Name of the column to histogram.
    lumi : float
        Factor multiplying the ``weight_nominal`` values.
    lo_1j1b : float
        The 1j1b plot is shaded from -1.0 up to this value.
    hi_2j1b : float
        The 2j1b plot is shaded from this value up to 1.0.
    lo_2j2b : float
        The 2j2b plot is shaded from -1.0 up to this value.
    hi_2j2b : float
        The 2j2b plot is also shaded from this value up to 1.0.
    bins_1j1b, bins_2j1b, bins_2j2b : tuple(int, float, float)
        (number of bins, axis minimum, axis maximum) for each region.
    thesis : bool
        Forwarded unchanged to ``one_sided_comparison_plot``.
    """
    setup_tdub_style()
    source = PosixPath(source)
    qf = quick_files(source)

    def drds_histograms(
        dr_df,
        ds_df,
        region,
        branch="bdtres03",
        weight_branch="weight_nominal",
        nbins=12,
        xmin=0.2,
        xmax=0.9,
    ):
        """Histogram ``branch`` for the DR and DS frames with lumi-scaled weights.

        ``region`` is accepted but not used in the body; ``lumi`` is
        taken from the enclosing function's arguments.
        """
        # The second value returned by fix1d is bound to ``err`` and never
        # read. NOTE(review): flow=True presumably handles under/overflow
        # -- confirm against the definition of fix1d.
        dr_hist, err = fix1d(
            dr_df[branch].to_numpy(),
            bins=nbins,
            range=(xmin, xmax),
            weights=dr_df[weight_branch].to_numpy() * lumi,
            flow=True,
        )
        ds_hist, err = fix1d(
            ds_df[branch].to_numpy(),
            bins=nbins,
            range=(xmin, xmax),
            weights=ds_df[weight_branch].to_numpy() * lumi,
            flow=True,
        )
        return dr_hist, ds_hist

    # Columns loaded for both samples: the plotted branch, the weight,
    # and the flags listed here (presumably what the selections need).
    branches = [
        branch, "weight_nominal", "reg1j1b", "reg2j1b", "reg2j2b", "OS"
    ]
    dr_df = raw_dataframe(qf["tW_DR"], branches=branches)
    ds_df = raw_dataframe(qf["tW_DS"], branches=branches)

    # ---- 1j1b: low side excluded ----

    dr, ds = drds_histograms(
        dr_df.query(tdub.config.SELECTION_1j1b),
        ds_df.query(tdub.config.SELECTION_1j1b),
        "1j1b",
        branch,
        nbins=bins_1j1b[0],
        xmin=bins_1j1b[1],
        xmax=bins_1j1b[2],
    )
    fig, ax, axr = one_sided_comparison_plot(
        dr,
        ds,
        np.linspace(bins_1j1b[1], bins_1j1b[2], bins_1j1b[0] + 1),
        thesis=thesis,
    )
    # In this region the label position is read from the axis limits
    # BEFORE the bands are drawn; the other two regions read the limits
    # after drawing. The label sits midway between the left axis limit
    # and the cut, at 69% of the upper y limit.
    ymid = ax.get_ylim()[1] * 0.69
    xmid = (lo_1j1b - ax.get_xlim()[0]) * 0.5 + ax.get_xlim()[0]
    ax.text(xmid,
            ymid,
            "Excluded",
            ha="center",
            va="center",
            color="gray",
            size=9)
    # Same x range is shaded on the main axis and on ``axr``.
    ax.fill_betweenx([-1, 1.0e5], -1.0, lo_1j1b, color="gray", alpha=0.55)
    axr.fill_betweenx([-200, 200], -1.0, lo_1j1b, color="gray", alpha=0.55)
    fig.savefig("drds_1j1b.pdf")
    plt.close(fig)

    # ---- 2j1b: high side excluded ----

    dr, ds = drds_histograms(
        dr_df.query(tdub.config.SELECTION_2j1b),
        ds_df.query(tdub.config.SELECTION_2j1b),
        "2j1b",
        branch,
        nbins=bins_2j1b[0],
        xmin=bins_2j1b[1],
        xmax=bins_2j1b[2],
    )
    fig, ax, axr = one_sided_comparison_plot(
        dr,
        ds,
        np.linspace(bins_2j1b[1], bins_2j1b[2], bins_2j1b[0] + 1),
        thesis=thesis,
    )
    ax.fill_betweenx([-1, 1.0e5], hi_2j1b, 1.0, color="gray", alpha=0.55)
    axr.fill_betweenx([-200, 200], hi_2j1b, 1.0, color="gray", alpha=0.55)
    # Label midway between the cut and the right axis limit.
    ymid = ax.get_ylim()[1] * 0.69
    xmid = (ax.get_xlim()[1] - hi_2j1b) * 0.5 + hi_2j1b
    ax.text(xmid,
            ymid,
            "Excluded",
            ha="center",
            va="center",
            color="gray",
            size=9)
    fig.savefig("drds_2j1b.pdf")
    plt.close(fig)

    # ---- 2j2b: both sides excluded ----

    dr, ds = drds_histograms(
        dr_df.query(tdub.config.SELECTION_2j2b),
        ds_df.query(tdub.config.SELECTION_2j2b),
        "2j2b",
        branch,
        nbins=bins_2j2b[0],
        xmin=bins_2j2b[1],
        xmax=bins_2j2b[2],
    )
    fig, ax, axr = one_sided_comparison_plot(
        dr,
        ds,
        np.linspace(bins_2j2b[1], bins_2j2b[2], bins_2j2b[0] + 1),
        thesis=thesis,
    )
    ax.fill_betweenx([-1, 1.0e5], -1.0, lo_2j2b, color="gray", alpha=0.55)
    axr.fill_betweenx([-200, 200], -1.0, lo_2j2b, color="gray", alpha=0.55)
    ax.fill_betweenx([-1, 1.0e5], hi_2j2b, 1.0, color="gray", alpha=0.55)
    axr.fill_betweenx([-200, 200], hi_2j2b, 1.0, color="gray", alpha=0.55)
    # One label per band, both at the same height: first the low side...
    ymid = ax.get_ylim()[1] * 0.69
    xmid = (lo_2j2b - ax.get_xlim()[0]) * 0.5 + ax.get_xlim()[0]
    ax.text(xmid,
            ymid,
            "Excluded",
            ha="center",
            va="center",
            color="gray",
            size=9)
    # ...then the high side.
    xmid = (ax.get_xlim()[1] - hi_2j2b) * 0.5 + hi_2j2b
    ax.text(xmid,
            ymid,
            "Excluded",
            ha="center",
            va="center",
            color="gray",
            size=9)
    fig.savefig("drds_2j2b.pdf")
    plt.close(fig)
Esempio n. 6
0
    "MCNP",
    #"tW_DR_AFII",
    #"tW_DS",
    #"ttbar_AFII",
    #"ttbar_PS",
    #"ttbar_hdamp",
]

all_histograms = {}

for samp in samples:
    print(f"working on {samp}")
    files = qf[samp]
    df = raw_dataframe(files,
                       branches=[
                           "met", "weight_nominal", "reg1j1b", "reg2j1b",
                           "reg2j2b", "elmu", "OS"
                       ])
    for region in ("reg1j1b", "reg2j1b", "reg2j2b"):
        sel = get_selection(region)
        dfc, dfe = generate_from_df(df.query(sel),
                                    "met",
                                    bins=15,
                                    range=(0.0, 200.0),
                                    systematic_weights=False)
        hists = df2th1(dfc, dfe, weight_col="weight_nominal")
        for hname, hobj in hists.items():
            if hname == "weight_nominal":
                finalkey = f"{region}_met_{samp}"
            else:
                finalkey = f"{region}_met_{samp}_{hname}"