import numpy as np

from tdub.frames import raw_dataframe
from tdub.utils import quick_files, get_selection


def check(datadir: str):
    # Collect (region, variable) pairs from the metadata, along with every
    # branch needed to build the region selections.
    pairs = []
    allbranches = set()
    for reg, entries in META["regions"].items():
        for entry in entries:
            pairs.append((reg, entry["var"]))
            allbranches.add(entry["var"])
    allbranches.add("reg1j1b")
    allbranches.add("reg2j1b")
    allbranches.add("reg2j2b")
    allbranches.add("OS")
    allbranches.add("elmu")
    allbranches.add("weight_nominal")  # required for the weight columns read below

    qf = quick_files(datadir)
    df = raw_dataframe(qf["Data"], branches=sorted(allbranches, key=str.lower))

    # One dataframe per analysis region.
    df1j1b = df.query(get_selection("1j1b"))
    df2j1b = df.query(get_selection("2j1b"))
    df2j2b = df.query(get_selection("2j2b"))

    # Print the automatic binning range for each (region, variable) pair.
    for r, v in pairs:
        if r == "r1j1b":
            x = df1j1b[v]
            w = df1j1b["weight_nominal"]
        elif r == "r2j1b":
            x = df2j1b[v]
            w = df2j1b["weight_nominal"]
        elif r == "r2j2b":
            x = df2j2b[v]
            w = df2j2b["weight_nominal"]
        n, bins = np.histogram(x, bins=35)
        print(r, v, bins[0], bins[-1])
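# For reference: `check` (and the plotting code below) index a module-level
# META mapping. A minimal illustrative shape, inferred from the keys the code
# accesses -- the concrete variable and binning values here are hypothetical:
META = {
    "regions": {
        "r1j1b": [
            {"var": "met", "nbins": 35, "xmin": 0.0, "xmax": 200.0, "log": False},
        ],
        "r2j1b": [],
        "r2j2b": [],
    },
}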
from pathlib import PosixPath

# Module paths below are assumed from tdub's layout at the time.
from tdub.frames import drop_cols
from tdub.train import prepare_from_root, single_training
from tdub.utils import quick_files, get_avoids


def single(args):
    qf = quick_files(args.data_dir)

    # Optional extra selection string, read from a file on disk.
    extra_sel = args.extra_selection
    if extra_sel:
        extra_sel = PosixPath(extra_sel).read_text().strip()

    df, y, w = prepare_from_root(
        qf[f"tW_{args.nlo_method}"],
        qf["ttbar"],
        args.region,
        weight_mean=1.0,
    )
    drop_cols(df, *get_avoids(args.region))

    params = dict(
        learning_rate=args.learning_rate,
        max_depth=args.max_depth,
        n_estimators=args.n_estimators,
    )
    extra_sum = {"region": args.region, "nlo_method": args.nlo_method}
    sr = single_training(
        df,
        y,
        w,
        params,
        args.out_dir,
        early_stopping_rounds=args.early_stopping_rounds,
        extra_summary_entries=extra_sum,
        use_catboost=True,
    )
    return sr
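# Hypothetical CLI wiring for single(); the attribute names match exactly what
# the function reads from `args`, but the flags and defaults are assumptions.
import argparse

parser = argparse.ArgumentParser(description="run a single BDT training")
parser.add_argument("data_dir", help="directory of ROOT files")
parser.add_argument("out_dir", help="directory for training output")
parser.add_argument("--region", default="1j1b")
parser.add_argument("--nlo-method", default="DR")
parser.add_argument("--extra-selection", default=None)
parser.add_argument("--learning-rate", type=float, default=0.1)
parser.add_argument("--max-depth", type=int, default=5)
parser.add_argument("--n-estimators", type=int, default=500)
parser.add_argument("--early-stopping-rounds", type=int, default=10)

if __name__ == "__main__":
    single(parser.parse_args())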
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from dask_jobqueue import HTCondorCluster
from dask.distributed import Client
from dask_ml.model_selection import GridSearchCV

from tdub import setup_logging  # import locations assumed from tdub's layout
from tdub.train import prepare_from_root
from tdub.utils import get_selection, get_features, quick_files

# Adaptive HTCondor-backed dask cluster to distribute the training work.
cluster = HTCondorCluster(cores=2, disk="4GB", memory="8GB")
client = Client(cluster)
cluster.adapt(maximum_jobs=200)

setup_logging()

qf = quick_files("/atlasgpfs01/usatlas/data/ddavis/wtloop/v29_20191111")
df, y, w = prepare_from_root(qf["tW_DR"], qf["ttbar"], "1j1b")
X_train, X_test, y_train, y_test, w_train, w_test = train_test_split(
    df, y, w, train_size=0.8, random_state=414, shuffle=True
)

# Background/signal ratio for LightGBM's scale_pos_weight; the full-dataset
# calculation below overrides the train-only one.
n_sig = y_train[y_train == 1].shape[0]
n_bkg = y_train[y_train == 0].shape[0]
spw = n_bkg / n_sig

n_sig = y[y == 1].shape[0]
n_bkg = y[y == 0].shape[0]
spw = n_bkg / n_sig
print(spw)
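# The snippet ends after computing spw; presumably it feeds a grid search over
# LightGBM parameters on the cluster (hence the otherwise-unused lgbm and
# GridSearchCV imports). A hypothetical continuation -- the parameter grid and
# scorer are illustrative, not taken from the source:
model = lgbm.LGBMClassifier(boosting_type="gbdt", scale_pos_weight=spw)
param_grid = {
    "learning_rate": [0.05, 0.1],
    "num_leaves": [31, 63],
    "n_estimators": [250, 500],
}
search = GridSearchCV(model, param_grid, cv=3, scoring="roc_auc")
search.fit(X_train, y_train)  # dask-ml fans the fits out to the HTCondor workers
print(search.best_params_, search.best_score_)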
def plot(data_dir, output_dir, apply_tptrw, from_parquet, prep_parquet, regions, skip_absent_features):
    curdir = pathlib.PosixPath(__file__).parent.resolve()
    datadir = curdir / "data"
    datadir.mkdir(exist_ok=True)

    if from_parquet:
        log.info("reading parquet files")
        dfs_1j1b, dfs_2j1b, dfs_2j2b = {}, {}, {}
        for samp in ALL_SAMPLES:
            dfs_1j1b[samp] = pd.read_parquet(datadir / f"{samp}_1j1b.parquet")
            dfs_2j1b[samp] = pd.read_parquet(datadir / f"{samp}_2j1b.parquet")
            dfs_2j2b[samp] = pd.read_parquet(datadir / f"{samp}_2j2b.parquet")
        log.info("done reading parquet files")
    else:
        qf = quick_files(data_dir)
        dfs_1j1b, dfs_2j1b, dfs_2j2b = region_frames_from_qf(qf)

    if prep_parquet:
        log.info("prepping parquet files")
        for k, v in dfs_1j1b.items():
            v.to_parquet(datadir / f"{k}_1j1b.parquet")
        for k, v in dfs_2j1b.items():
            v.to_parquet(datadir / f"{k}_2j1b.parquet")
        for k, v in dfs_2j2b.items():
            v.to_parquet(datadir / f"{k}_2j2b.parquet")
        log.info("done prepping parquet")
        exit(0)

    if apply_tptrw:
        log.info("applying top pt reweighting")
        apply_weight_tptrw(dfs_1j1b["ttbar"])
        apply_weight_tptrw(dfs_2j1b["ttbar"])
        apply_weight_tptrw(dfs_2j2b["ttbar"])

    plotdir = pathlib.PosixPath(output_dir)
    plotdir.mkdir(exist_ok=True)
    os.chdir(plotdir)

    # The three regions share identical plotting logic, so loop over them.
    region_frames = {
        "1j1b": (dfs_1j1b, FEATURESET_1j1b),
        "2j1b": (dfs_2j1b, FEATURESET_2j1b),
        "2j2b": (dfs_2j2b, FEATURESET_2j2b),
    }
    for region, (dfs, featureset) in region_frames.items():
        if region not in regions:
            continue
        for entry in META["regions"][f"r{region}"]:
            if skip_absent_features and entry["var"] not in featureset:
                continue
            binning = (entry["nbins"], entry["xmin"], entry["xmax"])
            fig, ax, axr = plot_from_region_frames(dfs, entry["var"], binning, region, entry["log"])
            if fig is not None:
                save_and_close(fig, f"r{region}_{entry['var']}.pdf")

    os.chdir(curdir)
from tdub.frames import raw_dataframe
from tdub.utils import quick_files, files_for_tree, get_selection
from tdub.hist import generate_from_df, df2th1
import uproot

qf = quick_files("/Users/ddavis/ATLAS/data/wtloop/v29_20200201")

samples = [
    "Data",
    "ttbar",
    "tW_DR",
    "Zjets",
    "Diboson",
    "MCNP",
    # "tW_DR_AFII",
    # "tW_DS",
    # "ttbar_AFII",
    # "ttbar_PS",
    # "ttbar_hdamp",
]

all_histograms = {}
for samp in samples:
    print(f"working on {samp}")
    files = qf[samp]
    df = raw_dataframe(
        files,
        branches=["met", "weight_nominal", "reg1j1b", "reg2j1b", "reg2j2b", "elmu", "OS"],
    )
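    # Hypothetical continuation (the snippet ends above): fill a met histogram
    # per region for this sample. generate_from_df/df2th1 are imported for this
    # purpose, but their signatures are not shown here, so this sketch uses
    # plain numpy with an assumed binning.
    import numpy as np  # would normally sit with the imports at the top

    for region in ("reg1j1b", "reg2j1b", "reg2j2b"):
        mask = df[region] & df["OS"] & df["elmu"]  # opposite-sign e-mu events
        counts, edges = np.histogram(
            df["met"][mask],
            bins=35,
            range=(0.0, 200.0),  # assumed range
            weights=df["weight_nominal"][mask],
        )
        all_histograms[(samp, region)] = (counts, edges)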