@classmethod
def fromTable(cls, table, is_signal=False):
    from DatasetExtractor import TrainNuisAuxSplit
    import numpy as np

    # split the table into classifier inputs, nuisances and event weights
    cur_data, cur_nuis, cur_weights = TrainNuisAuxSplit(table)

    # attach the truth labels: 1 for signal, 0 for background
    if is_signal:
        cur_labels = np.ones(len(cur_data))
    else:
        cur_labels = np.zeros(len(cur_data))

    return cls(cur_data, cur_nuis, cur_weights, cur_labels)
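# Hedged usage sketch (added for illustration): assuming fromTable is a
# classmethod of a dataset container class, here called "Dataset", labeled
# signal and background datasets can be built directly from the per-sample
# tables of the input HDF5 file ("events.h5" is a placeholder name).
def _fromTable_usage_demo():
    import pandas as pd
    sig = Dataset.fromTable(pd.read_hdf("events.h5", key="Hbb"), is_signal=True)     # labels set to 1
    bkg = Dataset.fromTable(pd.read_hdf("events.h5", key="ttbar"), is_signal=False)  # labels set to 0
    return sig, bkg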
def ShowEventContent(infile_path):
    import os
    import numpy as np
    import pandas as pd
    from DatasetExtractor import TrainNuisAuxSplit

    with pd.HDFStore(infile_path) as hdf:
        keys = hdf.keys()
        available_tables = [os.path.basename(key) for key in keys]

    for name in available_tables:
        data = pd.read_hdf(infile_path, key=name)
        testdata, nuisdata, weights = TrainNuisAuxSplit(data)

        total_events = np.sum(weights)    # luminosity-weighted yield
        unweighted_events = len(weights)  # raw number of MC entries
        print("{}: total events = {} ({} unweighted)".format(name, total_events, unweighted_events))
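# Minimal illustration (added) of the weighted / unweighted distinction
# printed by ShowEventContent: the sum of the per-event weights gives the
# expected (luminosity-weighted) yield, while the array length counts the
# raw number of MC entries.
def _event_yield_demo():
    import numpy as np
    weights = np.array([0.5, 0.5, 2.0])  # hypothetical per-event weights
    print("total events = {} ({} unweighted)".format(np.sum(weights), len(weights)))  # 3.0 (3 unweighted)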
def OptimizeCBASensitivity(infile_path, outdir, do_plots = True):
    # assumes module-level imports: os, pickle, numpy as np, pandas as pd,
    # BayesianOptimization (bayes_opt) and Matern (sklearn.gaussian_process.kernels),
    # plus the analysis helpers TrainingConfig, TrainNuisAuxSplit and
    # EvaluateAsimovSignificance

    data_slice = TrainingConfig.training_slice
    slice_size = data_slice[1] - data_slice[0]

    # read the test dataset, which will be used to get the expected sensitivity of the analysis
    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples

    print("loading data ...")
    sig_data = [pd.read_hdf(infile_path, key = sig_sample) for sig_sample in sig_samples]
    bkg_data = [pd.read_hdf(infile_path, key = bkg_sample) for bkg_sample in bkg_samples]

    sig_data_train = []
    sig_mBB_train = []
    sig_weights_train = []
    sig_aux_data_train = []
    for sample in sig_data:
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_train = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_traindata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_train) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_train[TrainingConfig.other_branches].values
        sig_data_train.append(cur_traindata)
        sig_mBB_train.append(cur_nuisdata)
        sig_weights_train.append(cur_weights / slice_size) # rescale the slice to the full expected yield
        sig_aux_data_train.append(cur_aux_data)

    bkg_data_train = []
    bkg_mBB_train = []
    bkg_weights_train = []
    bkg_aux_data_train = []
    for sample in bkg_data:
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_train = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_traindata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_train) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_train[TrainingConfig.other_branches].values
        bkg_data_train.append(cur_traindata)
        bkg_mBB_train.append(cur_nuisdata)
        bkg_weights_train.append(cur_weights / slice_size)
        bkg_aux_data_train.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_train = sig_data_train + bkg_data_train
    aux_train = sig_aux_data_train + bkg_aux_data_train
    weights_train = sig_weights_train + bkg_weights_train
    samples = sig_samples + bkg_samples

    # define the SR binning for mBB
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_mBB_binning = np.linspace(SR_low, SR_up, num = 1 + int((SR_up - SR_low) / SR_binwidth), endpoint = True)
    print("mBB binning: {}".format(SR_mBB_binning))

    original_cuts = {"MET_cut": 200, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 1.8}

    # the objective function that needs to be minimized
    costfunc = lambda cuts: -EvaluateAsimovSignificance(process_events = data_train, process_aux_events = aux_train,
                                                        process_weights = weights_train, process_names = samples,
                                                        signal_process_names = sig_samples, background_process_names = bkg_samples,
                                                        binning = SR_mBB_binning, cuts = cuts, fit_dir = outdir)["combined"]

    # the Bayesian optimizer maximizes its objective, so flip the sign back
    costfunc_bayes = lambda MET_cut, dRBB_highMET_cut, dRBB_lowMET_cut: -costfunc({"MET_cut": MET_cut,
                                                                                   "dRBB_highMET_cut": dRBB_highMET_cut,
                                                                                   "dRBB_lowMET_cut": dRBB_lowMET_cut})

    # then, try a global search strategy
    ranges_bayes = {"MET_cut": (150, 250), "dRBB_highMET_cut": (0.5, 5.0), "dRBB_lowMET_cut": (0.5, 5.0)}
    gp_params = {'kernel': 1.0 * Matern(length_scale = 0.05, length_scale_bounds = (1e-1, 1e2), nu = 1.5)}
    optimizer = BayesianOptimization(
        f = costfunc_bayes,
        pbounds = ranges_bayes,
        random_state = None
    )
    optimizer.maximize(init_points = 20, n_iter = 1, acq = 'poi', kappa = 3, **gp_params)

    # let the exploration parameter xi decay over the refinement iterations
    xi_scheduler = lambda iteration: 0.01 + 0.19 * np.exp(-0.004 * iteration)
    for it in range(400):
        cur_xi = xi_scheduler(it)
        print("using xi = {}".format(cur_xi))
        optimizer.maximize(init_points = 0, n_iter = 1, acq = 'poi', kappa = 3, xi = cur_xi, **gp_params)

    # print the results
    print("==============================================")
    print("initial cuts:")
    print("==============================================")
    print("MET_cut = {}".format(original_cuts["MET_cut"]))
    print("dRBB_highMET_cut = {}".format(original_cuts["dRBB_highMET_cut"]))
    print("dRBB_lowMET_cut = {}".format(original_cuts["dRBB_lowMET_cut"]))
    print("significance = {} sigma".format(costfunc_bayes(**original_cuts)))
    print("==============================================")

    print("==============================================")
    print("optimized cuts (global optimization):")
    print("==============================================")
    print("MET_cut = {}".format(optimizer.max["params"]["MET_cut"]))
    print("dRBB_highMET_cut = {}".format(optimizer.max["params"]["dRBB_highMET_cut"]))
    print("dRBB_lowMET_cut = {}".format(optimizer.max["params"]["dRBB_lowMET_cut"]))
    print("significance = {} sigma".format(optimizer.max["target"]))
    print("==============================================")

    # save the results
    with open(os.path.join(outdir, "opt_results.pkl"), "wb") as opt_outfile:
        pickle.dump(optimizer.max, opt_outfile)
def main():
    # assumes module-level imports: os, numpy as np, pandas as pd and
    # ArgumentParser (argparse), plus the analysis classes TrainingConfig,
    # TrainNuisAuxSplit, AdversarialEnvironment and AdversarialTrainer

    parser = ArgumentParser(description="train adversarial networks")
    parser.add_argument("--data", action="store", dest="infile_path")
    parser.add_argument("--outdir", action="store", dest="outdir")
    parser.add_argument("--statistics", action="store_const", const=True, default=False, dest="verbose_statistics")
    args = vars(parser.parse_args())

    infile_path = args["infile_path"]
    outdir = args["outdir"]

    print("using infile_path = " + infile_path)
    print("using outdir = " + outdir)

    tconf = TrainingConfig.from_file(outdir)
    data_branches = tconf.training_branches
    print("using data_branches = " + ", ".join(data_branches))

    # read the training data
    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples
    training_slice = TrainingConfig.training_slice

    print("loading data ...")
    sig_data = [pd.read_hdf(infile_path, key=sig_sample) for sig_sample in sig_samples]
    bkg_data = [pd.read_hdf(infile_path, key=bkg_sample) for bkg_sample in bkg_samples]

    auxdat_sig = []
    auxdat_bkg = []

    # extract the training dataset
    sig_data_train = []
    for sample, sample_name in zip(sig_data, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(drop=True)  # shuffle the sample
        cur_train = sample[int(training_slice[0] * cur_length):int(training_slice[1] * cur_length)]
        auxdat_sig.append(cur_train[TrainingConfig.auxiliary_branches].values)
        sig_data_train.append(cur_train)

    bkg_data_train = []
    for sample, sample_name in zip(bkg_data, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(drop=True)  # shuffle the sample
        cur_train = sample[int(training_slice[0] * cur_length):int(training_slice[1] * cur_length)]
        auxdat_bkg.append(cur_train[TrainingConfig.auxiliary_branches].values)
        bkg_data_train.append(cur_train)

    print("got " + str(len(sig_data)) + " signal datasets")
    print("got " + str(len(bkg_data)) + " background datasets")

    # split the dataset into training branches, nuisances and event weights
    traindat_sig = []
    nuisdat_sig = []
    weightdat_sig = []
    traindat_bkg = []
    nuisdat_bkg = []
    weightdat_bkg = []

    for cur_sig_data_train, sample_name in zip(sig_data_train, sig_samples):
        cur_traindat_sig, cur_nuisdat_sig, cur_weightdat_sig = TrainNuisAuxSplit(cur_sig_data_train)
        traindat_sig.append(cur_traindat_sig)
        nuisdat_sig.append(cur_nuisdat_sig)
        weightdat_sig.append(cur_weightdat_sig * TrainingConfig.sample_reweighting[sample_name])
        print("'{}' with {} entries representing {} events".format(sample_name, len(cur_weightdat_sig), np.sum(cur_weightdat_sig)))

    for cur_bkg_data_train, sample_name in zip(bkg_data_train, bkg_samples):
        cur_traindat_bkg, cur_nuisdat_bkg, cur_weightdat_bkg = TrainNuisAuxSplit(cur_bkg_data_train)
        traindat_bkg.append(cur_traindat_bkg)
        nuisdat_bkg.append(cur_nuisdat_bkg)
        weightdat_bkg.append(cur_weightdat_bkg * TrainingConfig.sample_reweighting[sample_name])
        print("'{}' with {} entries representing {} events".format(sample_name, len(cur_weightdat_bkg), np.sum(cur_weightdat_bkg)))

    print("starting up")
    mce = AdversarialEnvironment.from_file(outdir)

    training_pars = tconf.training_pars
    print("using the following training parameters:")
    for key, val in training_pars.items():
        print(key + " = " + str(val))

    # set up the training
    train = AdversarialTrainer(training_pars=training_pars, verbose_statistics=args["verbose_statistics"])

    # give the full list of signal / background components to the trainer
    train.train(mce, number_batches=training_pars["training_batches"],
                traindat_sig=traindat_sig, traindat_bkg=traindat_bkg,
                nuisances_sig=nuisdat_sig, nuisances_bkg=nuisdat_bkg,
                weights_sig=weightdat_sig, weights_bkg=weightdat_bkg,
                auxdat_sig=auxdat_sig, auxdat_bkg=auxdat_bkg,
                sig_sampling_pars={"sampling_lengths": TrainingConfig.sig_sampling_lengths},
                bkg_sampling_pars={"sampling_lengths": TrainingConfig.bkg_sampling_lengths})

    # save all the necessary information
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    mce.save(outdir)
    train.save_training_statistics(os.path.join(outdir, "training_evolution.pkl"))
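# Hypothetical invocation of this training entry point (added for
# illustration; the script and file names are assumptions): --data points to
# the HDF5 file holding one pandas table per sample, --outdir receives the
# trained model and its training statistics, and --statistics enables
# verbose monitoring during training:
#
#     python train_adversarial.py --data events.h5 --outdir runs/model_0 --statistics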
def GetCBASignalEfficiencies(outdir):
    # assumes module-level imports: os, numpy as np, pandas as pd, plus the
    # analysis helpers TrainingConfig, TrainNuisAuxSplit,
    # CutBasedCategoryFiller and CategoryPlotter

    if not os.path.exists(outdir):
        os.makedirs(outdir)

    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples
    infile_path = TrainingConfig.data_path

    data_slice = TrainingConfig.validation_slice
    slice_size = data_slice[1] - data_slice[0]

    data_sig = [pd.read_hdf(infile_path, key=sample) for sample in sig_samples]
    data_bkg = [pd.read_hdf(infile_path, key=sample) for sample in bkg_samples]

    # load all signal processes
    sig_data_test = []      # this holds all the branches used as inputs to the classifier
    sig_weights_test = []
    sig_aux_data_test = []  # this holds some other branches that may be important
    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(drop=True)  # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length):int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test)  # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test.append(cur_testdata)
        sig_weights_test.append(cur_weights / slice_size * TrainingConfig.sample_reweighting[sample_name])
        sig_aux_data_test.append(cur_aux_data)

    # load all background processes
    bkg_data_test = []      # this holds all the branches used as inputs to the classifier
    bkg_weights_test = []
    bkg_aux_data_test = []  # this holds some other branches that may be important
    for sample, sample_name in zip(data_bkg, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(drop=True)  # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length):int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test)  # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        bkg_data_test.append(cur_testdata)
        bkg_weights_test.append(cur_weights / slice_size * TrainingConfig.sample_reweighting[sample_name])
        bkg_aux_data_test.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_test = sig_data_test + bkg_data_test
    aux_test = sig_aux_data_test + bkg_aux_data_test
    weights_test = sig_weights_test + bkg_weights_test
    samples = sig_samples + bkg_samples

    # prepare the common mBB binning for all signal regions
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_binning = np.linspace(SR_low, SR_up, num=1 + int((SR_up - SR_low) / SR_binwidth), endpoint=True)

    effdict = {}

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(process_events=data_test,
                                                             process_aux_events=aux_test,
                                                             process_weights=weights_test,
                                                             process_names=samples,
                                                             nJ=2)
    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(process_events=data_test,
                                                             process_aux_events=aux_test,
                                                             process_weights=weights_test,
                                                             process_names=samples,
                                                             nJ=3)

    # compute the total number of available signal events
    sig_events_total_2j = inclusive_2J.get_number_events("Hbb")
    sig_events_total_3j = inclusive_3J.get_number_events("Hbb")
    print("total 2J signal events: {}".format(sig_events_total_2j))
    print("total 3J signal events: {}".format(sig_events_total_3j))

    # fill the cut-based categories
    for cur_nJ, sig_events_total in zip([2, 3], [sig_events_total_2j, sig_events_total_3j]):
        # first, export the categories of the cut-based analysis: high / low MET
        low_MET_cat = CutBasedCategoryFiller.create_low_MET_category(process_events=data_test,
                                                                     process_aux_events=aux_test,
                                                                     process_weights=weights_test,
                                                                     process_names=samples,
                                                                     nJ=cur_nJ)
        low_MET_cat.export_ROOT_histogram(binning=SR_binning,
                                          processes=sig_samples + bkg_samples,
                                          var_names="mBB",
                                          outfile_path=os.path.join(outdir, "{}jet_low_MET.root".format(cur_nJ)),
                                          clipping=True, density=False)
        CategoryPlotter.plot_category_composition(low_MET_cat, binning=SR_binning,
                                                  outpath=os.path.join(outdir, "{}jet_low_MET.pdf".format(cur_nJ)),
                                                  var="mBB", xlabel=r'$m_{bb}$ [GeV]',
                                                  plotlabel=["MC16d", r'150 GeV < MET < 200 GeV', "dRBB < 1.8", "nJ = {}".format(cur_nJ)],
                                                  args={})

        # get the signal efficiency for this category
        sigeff = low_MET_cat.get_number_events("Hbb") / sig_events_total
        effdict["low_MET_{}J".format(cur_nJ)] = sigeff

        high_MET_cat = CutBasedCategoryFiller.create_high_MET_category(process_events=data_test,
                                                                       process_aux_events=aux_test,
                                                                       process_weights=weights_test,
                                                                       process_names=samples,
                                                                       nJ=cur_nJ)
        high_MET_cat.export_ROOT_histogram(binning=SR_binning,
                                           processes=sig_samples + bkg_samples,
                                           var_names="mBB",
                                           outfile_path=os.path.join(outdir, "{}jet_high_MET.root".format(cur_nJ)),
                                           clipping=True, density=False)
        CategoryPlotter.plot_category_composition(high_MET_cat, binning=SR_binning,
                                                  outpath=os.path.join(outdir, "{}jet_high_MET.pdf".format(cur_nJ)),
                                                  var="mBB", xlabel=r'$m_{bb}$ [GeV]',
                                                  plotlabel=["MC16d", "MET > 200 GeV", "dRBB < 1.2", "nJ = {}".format(cur_nJ)],
                                                  args={})

        # get the signal efficiency for this category
        sigeff = high_MET_cat.get_number_events("Hbb") / sig_events_total
        effdict["high_MET_{}J".format(cur_nJ)] = sigeff

    return effdict
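# Hedged usage sketch (added for illustration; the output directory is an
# assumption): GetCBASignalEfficiencies returns a dictionary keyed by
# category name, holding the "Hbb" signal efficiency of each cut-based
# region relative to the corresponding inclusive nJ category.
def _cba_efficiency_demo():
    effdict = GetCBASignalEfficiencies("plots/CBA")
    for cat in ["low_MET_2J", "high_MET_2J", "low_MET_3J", "high_MET_3J"]:
        print("{}: signal efficiency = {:.3f}".format(cat, effdict[cat]))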
def main():
    # assumes module-level imports: os, pickle, numpy as np, pandas as pd and
    # ArgumentParser (argparse), plus the analysis classes TrainingConfig,
    # TrainNuisAuxSplit, AdversarialEnvironment, TrainingStatisticsPlotter,
    # CutBasedCategoryFiller, ClassifierBasedCategoryFiller, CategoryPlotter
    # and ModelEvaluator

    parser = ArgumentParser(description = "populate analysis signal regions and export them to be used with HistFitter")
    parser.add_argument("--data", action = "store", dest = "infile_path")
    parser.add_argument("--model_dir", action = "store", dest = "model_dir")
    parser.add_argument("--out_dir", action = "store", dest = "out_dir")
    parser.add_argument("--use_test", action = "store_const", const = True, default = False)
    args = vars(parser.parse_args())

    adv_model = _load_metadata(os.path.join(args["model_dir"], "meta.conf"), "AdversarialEnvironment")["adversary_model"]
    adversary_label_library = {"MINEAdversary": "MIND", "DisCoAdversary": "DisCo", "GMMAdversary": "EMAX", "PtEstAdversary": "REG"}
    adversary_label = adversary_label_library[adv_model]

    # extract the validation or test dataset
    if args["use_test"]:
        print("using test dataset")
        data_slice = TrainingConfig.test_slice
    else:
        print("using validation dataset")
        data_slice = TrainingConfig.validation_slice
    slice_size = data_slice[1] - data_slice[0]

    infile_path = args["infile_path"]
    model_dir = args["model_dir"]
    outdir = args["out_dir"]

    # make plots showing the progress of the training
    training_dir = os.path.dirname(model_dir)
    training_plotter = TrainingStatisticsPlotter(model_dir)
    training_plotter.plot(model_dir)

    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples

    data_sig = [pd.read_hdf(infile_path, key = sample) for sample in sig_samples]
    data_bkg = [pd.read_hdf(infile_path, key = sample) for sample in bkg_samples]

    # load all signal processes
    sig_data_test = []      # this holds all the branches used as inputs to the classifier
    sig_weights_test = []
    sig_aux_data_test = []  # this holds some other branches that may be important
    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test.append(cur_testdata)
        sig_weights_test.append(cur_weights / slice_size)
        sig_aux_data_test.append(cur_aux_data)

    # also need to keep separate all signal events with 2 jets / 3 jets
    sig_data_test_2j = []
    sig_weights_test_2j = []
    sig_aux_data_test_2j = []
    sig_data_test_3j = []
    sig_weights_test_3j = []
    sig_aux_data_test_3j = []

    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_test = cur_test[cur_test["nJ"] == 2]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test_2j.append(cur_testdata)
        sig_weights_test_2j.append(cur_weights / slice_size)
        sig_aux_data_test_2j.append(cur_aux_data)

    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_test = cur_test[cur_test["nJ"] == 3]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test_3j.append(cur_testdata)
        sig_weights_test_3j.append(cur_weights / slice_size)
        sig_aux_data_test_3j.append(cur_aux_data)

    # load all background processes
    bkg_data_test = []      # this holds all the branches used as inputs to the classifier
    bkg_weights_test = []
    bkg_aux_data_test = []  # this holds some other branches that may be important
    for sample, sample_name in zip(data_bkg, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        bkg_data_test.append(cur_testdata)
        bkg_weights_test.append(cur_weights / slice_size)
        bkg_aux_data_test.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_test = sig_data_test + bkg_data_test
    aux_test = sig_aux_data_test + bkg_aux_data_test
    weights_test = sig_weights_test + bkg_weights_test
    samples = sig_samples + bkg_samples

    # load the AdversarialEnvironment
    env = AdversarialEnvironment.from_file(model_dir)

    # prepare the common mBB binning for all signal regions
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_binning = np.linspace(SR_low, SR_up, num = 1 + int((SR_up - SR_low) / SR_binwidth), endpoint = True)

    # also prepare the binning along the MVA dimension
    sigeff_binning = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.70, 0.75,
                      0.80, 0.85, 0.9, 0.92, 0.94, 0.96, 0.98, 0.99, 1.0]
    print("signal efficiency binning: {}".format(sigeff_binning))
    print("mBB binning: {}".format(SR_binning))

    # for MadGraph ATLAS MC (with optimized CBA)
    cuts = {2: [0.0, 0.3936688696975736, 0.9162186612913272],
            3: [0.0, 0.35975037002858584, 0.861855992060236]}
    cut_labels = ["tight", "loose"]

    CBA_original = {"MET_cut": 200, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 1.8}
    CBA_optimized = {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}

    print("using the following cuts:")
    print(cuts)

    # fill the inclusive categories with 2j / 3j events
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(process_events = data_test,
                                                             process_aux_events = aux_test,
                                                             process_weights = weights_test,
                                                             process_names = samples,
                                                             nJ = 2)
    for cur_process in samples:
        inclusive_2J.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB",
                                      outfile = os.path.join(outdir, "dist_mBB_{}_2jet.pkl".format(cur_process)), density = True)
    inclusive_2J.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB",
                                  outfile = os.path.join(outdir, "dist_mBB_bkg_2jet.pkl"), density = True)

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(process_events = data_test,
                                                             process_aux_events = aux_test,
                                                             process_weights = weights_test,
                                                             process_names = samples,
                                                             nJ = 3)
    for cur_process in samples:
        inclusive_3J.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB",
                                      outfile = os.path.join(outdir, "dist_mBB_{}_3jet.pkl".format(cur_process)), density = True)
    inclusive_3J.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB",
                                  outfile = os.path.join(outdir, "dist_mBB_bkg_3jet.pkl"), density = True)

    total_events = inclusive_2J.get_total_events() + inclusive_3J.get_total_events()
    CBA_used_events = 0
    PCA_used_events = 0

    anadict = {}
    for cur_nJ, cur_inclusive_cat, cur_signal_events, cur_signal_weights, cur_signal_aux_events in zip(
            [2, 3], [inclusive_2J, inclusive_3J],
            [sig_data_test_2j, sig_data_test_3j],
            [sig_weights_test_2j, sig_weights_test_3j],
            [sig_aux_data_test_2j, sig_aux_data_test_3j]):

        for cur_cuts, prefix in zip([CBA_original, CBA_optimized], ["original_", "optimized_"]):
            # first, export the categories of the cut-based analysis: high / low MET, using the current set of cuts
            print("filling {} jet low_MET category with cut prefix = {}".format(cur_nJ, prefix))
            low_MET_cat = CutBasedCategoryFiller.create_low_MET_category(process_events = data_test,
                                                                         process_aux_events = aux_test,
                                                                         process_weights = weights_test,
                                                                         process_names = samples,
                                                                         nJ = cur_nJ, cuts = cur_cuts)
            print("filled {} signal events".format(low_MET_cat.get_number_events("Hbb")))

            low_MET_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                              outfile_path = os.path.join(outdir, prefix + "{}jet_low_MET.root".format(cur_nJ)),
                                              clipping = True, density = False)

            anadict[prefix + "low_MET_{}jet_sig_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(low_MET_cat, cur_inclusive_cat, sig_samples)
            anadict[prefix + "low_MET_{}jet_bkg_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(low_MET_cat, cur_inclusive_cat, bkg_samples)
            anadict[prefix + "low_MET_{}jet_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(low_MET_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict[prefix + "low_MET_{}jet_binned_sig".format(cur_nJ)] = low_MET_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")

            CBA_used_events += low_MET_cat.get_total_events()

            for cur_process in samples:
                low_MET_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB",
                                             outfile = os.path.join(outdir, prefix + "dist_mBB_{}_{}jet_low_MET.pkl".format(cur_process, cur_nJ)), density = True)
            low_MET_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB",
                                         outfile = os.path.join(outdir, prefix + "dist_mBB_bkg_{}jet_low_MET.pkl".format(cur_nJ)), density = True)

            CategoryPlotter.plot_category_composition(low_MET_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, prefix + "{}jet_low_MET.pdf".format(cur_nJ)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]',
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   r'150 GeV < $E_{\mathrm{T}}^{\mathrm{miss}}$' + '< {MET_cut} GeV'.format(**cur_cuts),
                                                                   r'$\Delta R_{{bb}} < {dRBB_lowMET_cut}$'.format(**cur_cuts),
                                                                   r'{} jet'.format(cur_nJ)],
                                                      args = {})
            CategoryPlotter.plot_category_composition(low_MET_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, prefix + "{}jet_low_MET_nostack.pdf".format(cur_nJ)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.",
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   r'150 GeV < $E_{\mathrm{T}}^{\mathrm{miss}}$' + '< {MET_cut} GeV'.format(**cur_cuts),
                                                                   r'$\Delta R_{{bb}} < {dRBB_lowMET_cut}$'.format(**cur_cuts),
                                                                   r'{} jet'.format(cur_nJ)],
                                                      args = {}, stacked = False, histtype = 'step', density = True)

            print("filling {} jet high_MET category".format(cur_nJ))
            high_MET_cat = CutBasedCategoryFiller.create_high_MET_category(process_events = data_test,
                                                                           process_aux_events = aux_test,
                                                                           process_weights = weights_test,
                                                                           process_names = samples,
                                                                           nJ = cur_nJ, cuts = cur_cuts)
            print("filled {} signal events".format(high_MET_cat.get_number_events("Hbb")))

            high_MET_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                               outfile_path = os.path.join(outdir, prefix + "{}jet_high_MET.root".format(cur_nJ)),
                                               clipping = True, density = False)

            anadict[prefix + "high_MET_{}jet_sig_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(high_MET_cat, cur_inclusive_cat, sig_samples)
            anadict[prefix + "high_MET_{}jet_bkg_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(high_MET_cat, cur_inclusive_cat, bkg_samples)
            anadict[prefix + "high_MET_{}jet_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(high_MET_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict[prefix + "high_MET_{}jet_binned_sig".format(cur_nJ)] = high_MET_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")

            # compute JSD between the high-MET and low-MET categories
            anadict[prefix + "{}jet_high_low_MET_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(high_MET_cat, low_MET_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)

            # combine the two MET categories in quadrature
            anadict[prefix + "{}jet_binned_sig_CBA".format(cur_nJ)] = (anadict[prefix + "low_MET_{}jet_binned_sig".format(cur_nJ)]**2 +
                                                                       anadict[prefix + "high_MET_{}jet_binned_sig".format(cur_nJ)]**2)**0.5

            CBA_used_events += high_MET_cat.get_total_events()

            for cur_process in samples:
                high_MET_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB",
                                              outfile = os.path.join(outdir, prefix + "dist_mBB_{}_{}jet_high_MET.pkl".format(cur_process, cur_nJ)), density = True)
            high_MET_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB",
                                          outfile = os.path.join(outdir, prefix + "dist_mBB_bkg_{}jet_high_MET.pkl".format(cur_nJ)), density = True)

            CategoryPlotter.plot_category_composition(high_MET_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, prefix + "{}jet_high_MET.pdf".format(cur_nJ)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]',
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   r'$E_{\mathrm{T}}^{\mathrm{miss}}$ >' + ' {MET_cut} GeV'.format(**cur_cuts),
                                                                   r'$\Delta R_{{bb}} < {dRBB_highMET_cut}$'.format(**cur_cuts),
                                                                   r'{} jet'.format(cur_nJ)],
                                                      args = {})
            CategoryPlotter.plot_category_composition(high_MET_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, prefix + "{}jet_high_MET_nostack.pdf".format(cur_nJ)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.",
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   r'$E_{\mathrm{T}}^{\mathrm{miss}}$ >' + ' {MET_cut} GeV'.format(**cur_cuts),
                                                                   r'$\Delta R_{{bb}} < {dRBB_highMET_cut}$'.format(**cur_cuts),
                                                                   r'{} jet'.format(cur_nJ)],
                                                      args = {}, stacked = False, histtype = 'step', density = True)

        # keep track of the tight and loose categories for later
        classifier_categories = {}

        # prepare N categories along the classifier output dimension
        for cut_end, cut_start, cut_label in zip(cuts[cur_nJ][0:-1], cuts[cur_nJ][1:], cut_labels):
            print("exporting {}J region with sigeff range {} - {}".format(cur_nJ, cut_start, cut_end))

            cur_cat = ClassifierBasedCategoryFiller.create_classifier_category(env,
                                                                               process_events = data_test,
                                                                               process_aux_events = aux_test,
                                                                               process_weights = weights_test,
                                                                               process_names = samples,
                                                                               signal_events = cur_signal_events,
                                                                               signal_weights = cur_signal_weights,
                                                                               signal_aux_events = cur_signal_aux_events,
                                                                               classifier_sigeff_range = (cut_start, cut_end),
                                                                               nJ = cur_nJ)
            cur_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                          outfile_path = os.path.join(outdir, "region_{}jet_{}_{}.root".format(cur_nJ, cut_start, cut_end)),
                                          clipping = True, density = False)

            PCA_used_events += cur_cat.get_total_events()

            anadict["{}_{}jet_sig_eff".format(cut_label, cur_nJ)] = ModelEvaluator.get_efficiency(cur_cat, cur_inclusive_cat, sig_samples)
            anadict["{}_{}jet_bkg_eff".format(cut_label, cur_nJ)] = ModelEvaluator.get_efficiency(cur_cat, cur_inclusive_cat, bkg_samples)
            anadict["{}_{}jet_inv_JS_bkg".format(cut_label, cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(cur_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict["{}_{}jet_binned_sig".format(cut_label, cur_nJ)] = cur_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")

            classifier_categories[cut_label] = cur_cat

            for cur_process in samples:
                cur_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB",
                                         outfile = os.path.join(outdir, "dist_mBB_{}_{}jet_{}.pkl".format(cur_process, cur_nJ, cut_label)), density = True)
            cur_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB",
                                     outfile = os.path.join(outdir, "dist_mBB_bkg_{}jet_{}.pkl".format(cur_nJ, cut_label)), density = True)

            CategoryPlotter.plot_category_composition(cur_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, "dist_mBB_region_{}jet_{}_{}.pdf".format(cur_nJ, cut_start, cut_end)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]',
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   cut_label + r', {} jet'.format(cur_nJ), adversary_label])
            CategoryPlotter.plot_category_composition(cur_cat, binning = SR_binning,
                                                      outpath = os.path.join(outdir, "dist_mBB_region_{}jet_{}_{}_nostack.pdf".format(cur_nJ, cut_start, cut_end)),
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.",
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$',
                                                                   cut_label + r', {} jet'.format(cur_nJ), adversary_label],
                                                      stacked = False, histtype = 'step', density = True)

            print("filled {} signal events".format(cur_cat.get_number_events("Hbb")))

        # compute JSD between the tight and loose categories
        anadict["{}jet_tight_loose_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(classifier_categories["tight"], classifier_categories["loose"], binning = SR_binning, var = "mBB", processes = bkg_samples)

        # combine the tight and loose categories in quadrature
        anadict["{}jet_binned_sig_PCA".format(cur_nJ)] = (anadict["tight_{}jet_binned_sig".format(cur_nJ)]**2 +
                                                          anadict["loose_{}jet_binned_sig".format(cur_nJ)]**2)**0.5

    print("event statistics:")
    print("have a total of {} events, CBA used {} events ({}%)".format(total_events, CBA_used_events, 100.0 * CBA_used_events / total_events))
    print("have a total of {} events, PCA used {} events ({}%)".format(total_events, PCA_used_events, 100.0 * PCA_used_events / total_events))

    anadict.update(env.create_paramdict())
    print("got the following anadict: {}".format(anadict))

    with open(os.path.join(outdir, "anadict.pkl"), "wb") as outfile:
        pickle.dump(anadict, outfile)
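# Illustration (added) of how the combined significances above are formed:
# the per-category binned significances are added in quadrature, which is
# the expected combination for statistically independent categories.
def _combined_significance_demo():
    import math
    sig_low_MET, sig_high_MET = 1.3, 2.1  # hypothetical per-category values, in sigma
    combined = math.hypot(sig_low_MET, sig_high_MET)  # == (1.3**2 + 2.1**2)**0.5
    print("combined significance = {:.2f} sigma".format(combined))  # 2.47 sigma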
def MakeDistributionControlPlots(infile, outdir, test_size=0.999):
    # assumes module-level imports: os, numpy as np, pandas as pd and
    # train_test_split (sklearn.model_selection), plus the analysis helpers
    # TrainingConfig, TrainNuisAuxSplit, Category, CutBasedCategoryFiller,
    # CategoryPlotter and get_binning

    # sig_samples = ["Hbb"]
    # bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson", "singletop"]

    # for MadGraph
    sig_samples = ["Hbb"]
    bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson"]
    samples = sig_samples + bkg_samples

    # set up proper binnings for different variables
    binnings = {}
    binnings["mBB"] = get_binning(30, 600, 10)
    binnings["dRBB"] = get_binning(0.0, 3.0, 0.1)
    binnings["pTB1"] = get_binning(0, 300, 10)
    binnings["pTB2"] = get_binning(0, 300, 10)
    binnings["MET"] = get_binning(0, 300, 10)
    binnings["dEtaBB"] = get_binning(0, 5, 0.1)
    binnings["dPhiMETdijet"] = get_binning(0, np.pi, 0.1)
    binnings["SumPtJet"] = get_binning(0, 500, 10)

    print("loading data ...")
    data = [pd.read_hdf(infile, key=sample) for sample in samples]
    for cur_df, sample in zip(data, samples):
        print("have {} events available for '{}'".format(len(cur_df), sample))

    data_test = []
    mBB_test = []
    weights_test = []
    aux_data_test = []
    for sample in data:
        _, cur_test = train_test_split(sample, test_size=test_size, shuffle=True, random_state=12345)
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test)  # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.other_branches].values
        data_test.append(cur_testdata)
        mBB_test.append(cur_nuisdata)
        weights_test.append(cur_weights / test_size)
        aux_data_test.append(cur_aux_data)

    # first, plot the total event content (i.e. corresponding to an "inclusive" event category)
    inclusive = Category("inclusive")
    for events, weights, aux_content, process in zip(data_test, weights_test, aux_data_test, samples):
        inclusive.add_events(events=events, weights=weights, process=process,
                             event_variables=TrainingConfig.training_branches,
                             aux_content=aux_content,  # the auxiliary branches of this sample
                             aux_variables=TrainingConfig.other_branches)

    # print total event numbers for all processes
    print("============================")
    print(" inclusive expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process, inclusive.get_number_events(process)))
    print("============================")

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(process_events=data_test,
                                                             process_aux_events=aux_data_test,
                                                             process_weights=weights_test,
                                                             process_names=samples,
                                                             nJ=2)
    print("============================")
    print(" inclusive 2j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process, inclusive_2J.get_number_events(process)))
    print("============================")

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(process_events=data_test,
                                                             process_aux_events=aux_data_test,
                                                             process_weights=weights_test,
                                                             process_names=samples,
                                                             nJ=3)
    print("============================")
    print(" inclusive 3j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process, inclusive_3J.get_number_events(process)))
    print("============================")

    # now, create separate histograms for each process and each event variable
    for cur_var in TrainingConfig.training_branches:
        if cur_var == "nJ":  # no plots for number of jets
            continue
        for cur_process in samples:
            CategoryPlotter.plot_category_composition(inclusive, binning=binnings[cur_var],
                                                      outpath=os.path.join(outdir, "dist_{}_{}_inclusive.pdf".format(cur_var, cur_process)),
                                                      var=cur_var, process_order=[cur_process], xlabel=cur_var,
                                                      plotlabel=["inclusive"], args={})
            inclusive.export_histogram(binning=binnings[cur_var], processes=[cur_process], var_name=cur_var,
                                       outfile=os.path.join(outdir, "dist_{}_{}_inclusive.pkl".format(cur_var, cur_process)),
                                       clipping=True, density=True)

            CategoryPlotter.plot_category_composition(inclusive_2J, binning=binnings[cur_var],
                                                      outpath=os.path.join(outdir, "dist_{}_{}_inclusive_2J.pdf".format(cur_var, cur_process)),
                                                      var=cur_var, process_order=[cur_process], xlabel=cur_var,
                                                      plotlabel=["inclusive, nJ = 2"], args={})
            inclusive_2J.export_histogram(binning=binnings[cur_var], processes=[cur_process], var_name=cur_var,
                                          outfile=os.path.join(outdir, "dist_{}_{}_inclusive_2J.pkl".format(cur_var, cur_process)),
                                          clipping=True, density=True)

            CategoryPlotter.plot_category_composition(inclusive_3J, binning=binnings[cur_var],
                                                      outpath=os.path.join(outdir, "dist_{}_{}_inclusive_3J.pdf".format(cur_var, cur_process)),
                                                      var=cur_var, process_order=[cur_process], xlabel=cur_var,
                                                      plotlabel=["inclusive, nJ = 3"], args={})
            inclusive_3J.export_histogram(binning=binnings[cur_var], processes=[cur_process], var_name=cur_var,
                                          outfile=os.path.join(outdir, "dist_{}_{}_inclusive_3J.pkl".format(cur_var, cur_process)),
                                          clipping=True, density=True)
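# get_binning is defined elsewhere in the repository; it is assumed to return
# uniform bin edges spanning (low, high) in steps of the given width. A
# minimal equivalent, consistent with the explicit np.linspace binnings used
# in the other scripts above, would be:
#
#     def get_binning(low, high, width):
#         return np.linspace(low, high, num=1 + int((high - low) / width), endpoint=True)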