Example 1
import numpy as np

from tdub.frames import raw_dataframe
from tdub.utils import quick_files, get_selection


def check(datadir: str):
    # META is the module-level region metadata mapping defined elsewhere
    # in this script
    pairs = []
    allbranches = set()
    for reg, entries in META["regions"].items():
        for entry in entries:
            pairs.append((reg, entry["var"]))
            allbranches.add(entry["var"])
    allbranches.add("reg1j1b")
    allbranches.add("reg2j1b")
    allbranches.add("reg2j2b")
    allbranches.add("OS")
    allbranches.add("elmu")

    qf = quick_files(datadir)
    df = raw_dataframe(qf["Data"], branches=sorted(allbranches, key=str.lower))
    df1j1b = df.query(get_selection("1j1b"))
    df2j1b = df.query(get_selection("2j1b"))
    df2j2b = df.query(get_selection("2j2b"))

    for r, v in pairs:
        if r == "r1j1b":
            x = df1j1b[v]
            w = df1j1b["weight_nominal"]
        elif r == "r2j1b":
            x = df2j1b[v]
            w = df2j1b["weight_nominal"]
        elif r == "r2j2b":
            x = df2j2b[v]
            w = df2j2b["weight_nominal"]
        else:
            continue
        # weight the entries with the nominal event weight
        n, bins = np.histogram(x, bins=35, weights=w)
        print(r, v, bins[0], bins[-1])
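
A minimal usage sketch for check; the data directory path below is a
hypothetical placeholder:

check("/path/to/wtloop/data")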
Example 2
from pathlib import PosixPath

# tdub module paths for these helpers are assumed
from tdub.frames import drop_cols
from tdub.train import prepare_from_root, single_training
from tdub.utils import quick_files, get_avoids


def single(args):
    qf = quick_files(args.data_dir)
    extra_sel = args.extra_selection
    if extra_sel:
        # read an optional extra selection string from file
        extra_sel = PosixPath(extra_sel).read_text().strip()
    df, y, w = prepare_from_root(
        qf[f"tW_{args.nlo_method}"],
        qf["ttbar"],
        args.region,
        weight_mean=1.0,
    )
    drop_cols(df, *get_avoids(args.region))
    params = dict(
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        n_estimators=args.n_estimators,
    )
    extra_sum = {"region": args.region, "nlo_method": args.nlo_method}
    sr = single_training(
        df,
        y,
        w,
        params,
        args.out_dir,
        early_stopping_rounds=args.early_stopping_rounds,
        extra_summary_entries=extra_sum,
        use_catboost=True,
    )
    return sr
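
A sketch of driving single outside its command line interface; every
argument value here is a hypothetical placeholder:

import argparse

args = argparse.Namespace(
    data_dir="/path/to/wtloop/data",
    extra_selection=None,
    nlo_method="DR",
    region="2j2b",
    learning_rate=0.1,
    max_depth=5,
    n_estimators=500,
    early_stopping_rounds=10,
    out_dir="training_output",
)
summary = single(args)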
Example 3
from tdub.train import prepare_from_root  # module path assumed
from tdub.utils import get_selection, get_features, quick_files, setup_logging

import lightgbm as lgbm
from sklearn.model_selection import train_test_split

from dask_jobqueue import HTCondorCluster
from dask.distributed import Client
from dask_ml.model_selection import GridSearchCV

cluster = HTCondorCluster(cores=2, disk="4GB", memory="8GB")
client = Client(cluster)
cluster.adapt(maximum_jobs=200)

setup_logging()

qf = quick_files("/atlasgpfs01/usatlas/data/ddavis/wtloop/v29_20191111")

df, y, w = prepare_from_root(qf["tW_DR"], qf["ttbar"], "1j1b")

X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    df, y, w, train_size=0.8, random_state=414, shuffle=True)

# scale_pos_weight for the classifier: background to signal ratio in the
# training set
n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig
print(spw)
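
A sketch of where spw and the dask cluster plug in: a LightGBM classifier
weighted by scale_pos_weight, tuned with the dask-ml grid search already
imported above. The parameter grid values are hypothetical:

model = lgbm.LGBMClassifier(boosting_type="gbdt", scale_pos_weight=spw)
param_grid = {
    "learning_rate": [0.05, 0.1],
    "n_estimators": [250, 500],
    "max_depth": [4, 6],
}
search = GridSearchCV(model, param_grid, cv=3)
search.fit(X_train, y_train, sample_weight=w_train)
print(search.best_params_)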
Example 4
import os
import pathlib

import pandas as pd

# ALL_SAMPLES, META, log, and the plotting helpers used below are
# module-level names defined elsewhere in this script


def plot(data_dir, output_dir, apply_tptrw, from_parquet, prep_parquet,
         regions, skip_absent_features):
    curdir = pathlib.PosixPath(__file__).parent.resolve()
    datadir = curdir / "data"
    datadir.mkdir(exist_ok=True)

    if from_parquet:
        log.info("reading parquet files")
        dfs_1j1b, dfs_2j1b, dfs_2j2b = {}, {}, {}
        for samp in ALL_SAMPLES:
            dfs_1j1b[samp] = pd.read_parquet(datadir / f"{samp}_1j1b.parquet")
            dfs_2j1b[samp] = pd.read_parquet(datadir / f"{samp}_2j1b.parquet")
            dfs_2j2b[samp] = pd.read_parquet(datadir / f"{samp}_2j2b.parquet")
        log.info("done reading parquet files")

    else:
        qf = quick_files(data_dir)
        dfs_1j1b, dfs_2j1b, dfs_2j2b = region_frames_from_qf(qf)
        if prep_parquet:
            log.info("prepping parquet files")
            for region, frames in (("1j1b", dfs_1j1b), ("2j1b", dfs_2j1b),
                                   ("2j2b", dfs_2j2b)):
                for k, v in frames.items():
                    v.to_parquet(datadir / f"{k}_{region}.parquet")
            log.info("done prepping parquet files")
            return

    if apply_tptrw:
        log.info("applying top pt reweighting")
        apply_weight_tptrw(dfs_1j1b["ttbar"])
        apply_weight_tptrw(dfs_2j1b["ttbar"])
        apply_weight_tptrw(dfs_2j2b["ttbar"])

    plotdir = pathlib.PosixPath(output_dir)
    plotdir.mkdir(exist_ok=True)
    os.chdir(plotdir)

    if "1j1b" in regions:
        for entry in META["regions"]["r1j1b"]:
            if skip_absent_features:
                if entry["var"] not in FEATURESET_1j1b:
                    continue
            binning = (entry["nbins"], entry["xmin"], entry["xmax"])
            fig, ax, axr = plot_from_region_frames(dfs_1j1b, entry["var"],
                                                   binning, "1j1b",
                                                   entry["log"])
            if fig is not None:
                save_and_close(fig, "r{}_{}.pdf".format("1j1b", entry["var"]))
    if "2j1b" in regions:
        for entry in META["regions"]["r2j1b"]:
            if skip_absent_features:
                if entry["var"] not in FEATURESET_2j1b:
                    continue
            binning = (entry["nbins"], entry["xmin"], entry["xmax"])
            fig, ax, axr = plot_from_region_frames(dfs_2j1b, entry["var"],
                                                   binning, "2j1b",
                                                   entry["log"])
            if fig is not None:
                save_and_close(fig, "r{}_{}.pdf".format("2j1b", entry["var"]))
    if "2j2b" in regions:
        for entry in META["regions"]["r2j2b"]:
            if skip_absent_features:
                if entry["var"] not in FEATURESET_2j2b:
                    continue
            binning = (entry["nbins"], entry["xmin"], entry["xmax"])
            fig, ax, axr = plot_from_region_frames(dfs_2j2b, entry["var"],
                                                   binning, "2j2b",
                                                   entry["log"])
            if fig is not None:
                save_and_close(fig, "r{}_{}.pdf".format("2j2b", entry["var"]))

    os.chdir(curdir)
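
A hypothetical invocation; in the original script these arguments arrive
via a command line interface:

plot(
    data_dir="/path/to/wtloop/data",
    output_dir="plots",
    apply_tptrw=False,
    from_parquet=False,
    prep_parquet=False,
    regions=["1j1b", "2j1b", "2j2b"],
    skip_absent_features=True,
)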
Example 5
import numpy as np
import uproot

from tdub.frames import raw_dataframe
from tdub.hist import generate_from_df, df2th1
from tdub.utils import quick_files, files_for_tree, get_selection

qf = quick_files("/Users/ddavis/ATLAS/data/wtloop/v29_20200201")

samples = [
    "Data",
    "ttbar",
    "tW_DR",
    "Zjets",
    "Diboson",
    "MCNP",
    #"tW_DR_AFII",
    #"tW_DS",
    #"ttbar_AFII",
    #"ttbar_PS",
    #"ttbar_hdamp",
]

all_histograms = {}

for samp in samples:
    print(f"working on {samp}")
    files = qf[samp]
    df = raw_dataframe(files,
                       branches=[
                           "met", "weight_nominal", "reg1j1b", "reg2j1b",
                           "reg2j2b", "elmu", "OS"
                       ])
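    # a hedged sketch of a likely continuation (binning and key naming are
    # assumptions): select each region with get_selection, histogram met
    # with the nominal weights, and stash the result
    for region in ("1j1b", "2j1b", "2j2b"):
        dfr = df.query(get_selection(region))
        counts, edges = np.histogram(
            dfr["met"], bins=35, weights=dfr["weight_nominal"]
        )
        all_histograms[f"{samp}_met_{region}"] = (counts, edges)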