Example #1
    @classmethod
    def fromTable(cls, table, is_signal=False):
        # split the table into classifier inputs, nuisances and event weights,
        # and attach binary labels marking signal (1) or background (0)
        from DatasetExtractor import TrainNuisAuxSplit
        import numpy as np
        cur_data, cur_nuis, cur_weights = TrainNuisAuxSplit(table)

        if is_signal:
            cur_labels = np.ones(len(cur_data))
        else:
            cur_labels = np.zeros(len(cur_data))

        return cls(cur_data, cur_nuis, cur_weights, cur_labels)
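
A minimal usage sketch for this constructor, assuming it lives on a dataset container class (TrainingSample is a hypothetical name) and that the input HDF5 file stores one table per process, as in the later examples:

import pandas as pd

# TrainingSample is a hypothetical name for the class that defines fromTable
sig = TrainingSample.fromTable(pd.read_hdf("events.h5", key="Hbb"), is_signal=True)
bkg = TrainingSample.fromTable(pd.read_hdf("events.h5", key="ttbar"), is_signal=False)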
Example #2
import os

import numpy as np
import pandas as pd

from DatasetExtractor import TrainNuisAuxSplit

def ShowEventContent(infile_path):
    with pd.HDFStore(infile_path) as hdf:
        keys = hdf.keys()
        available_tables = [os.path.basename(key) for key in keys]

    for name in available_tables:
        data = pd.read_hdf(infile_path, key=name)

        testdata, nuisdata, weights = TrainNuisAuxSplit(data)

        total_events = np.sum(weights)
        unweighted_events = len(weights)
        print("{}: total events = {} ({} unweighted)".format(
            name, total_events, unweighted_events))
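
Called on one of the HDF5 event files used throughout these examples (the path is a placeholder), ShowEventContent prints the weighted and unweighted yield of every table in the store:

# prints one line per stored table, e.g.
#   Hbb: total events = <weighted sum> (<number of entries> unweighted)
ShowEventContent("/path/to/events.h5")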
Example #3
# assumes the usual module-level imports: os, pickle, numpy as np, pandas as pd,
# BayesianOptimization from bayes_opt, Matern from sklearn.gaussian_process.kernels,
# plus TrainingConfig and EvaluateAsimovSignificance from this repository
def OptimizeCBASensitivity(infile_path, outdir, do_plots = True):
    data_slice = TrainingConfig.training_slice
    slice_size = data_slice[1] - data_slice[0]

    # read the test dataset, which will be used to get the expected sensitivity of the analysis
    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples

    print("loading data ...")
    sig_data = [pd.read_hdf(infile_path, key = sig_sample) for sig_sample in sig_samples]
    bkg_data = [pd.read_hdf(infile_path, key = bkg_sample) for bkg_sample in bkg_samples]

    sig_data_train = []
    sig_mBB_train = []
    sig_weights_train = []
    sig_aux_data_train = []
    for sample in sig_data:
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_train = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_traindata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_train) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_train[TrainingConfig.other_branches].values
        sig_data_train.append(cur_traindata)
        sig_mBB_train.append(cur_nuisdata)
        sig_weights_train.append(cur_weights / slice_size)
        sig_aux_data_train.append(cur_aux_data)

    bkg_data_train = []
    bkg_mBB_train = []
    bkg_weights_train = []
    bkg_aux_data_train = []
    for sample in bkg_data:
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_train = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_traindata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_train) # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_train[TrainingConfig.other_branches].values
        bkg_data_train.append(cur_traindata)
        bkg_mBB_train.append(cur_nuisdata)
        bkg_weights_train.append(cur_weights / slice_size)
        bkg_aux_data_train.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_train = sig_data_train + bkg_data_train
    aux_train = sig_aux_data_train + bkg_aux_data_train
    weights_train = sig_weights_train + bkg_weights_train
    samples = sig_samples + bkg_samples

    # define the SR binning for mBB
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_mBB_binning = np.linspace(SR_low, SR_up, num = 1 + int((SR_up - SR_low) / SR_binwidth), endpoint = True)

    print("mBB binning: {}".format(SR_mBB_binning))

    original_cuts = {"MET_cut": 200, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 1.8}

    # the objective function that needs to be minimized
    costfunc = lambda cuts: -EvaluateAsimovSignificance(process_events = data_train, process_aux_events = aux_train, 
                                                        process_weights = weights_train, process_names = samples, 
                                                        signal_process_names = sig_samples, background_process_names = bkg_samples, 
                                                        binning = SR_mBB_binning, cuts = cuts, fit_dir = outdir)["combined"]
    
    costfunc_bayes = lambda MET_cut, dRBB_highMET_cut, dRBB_lowMET_cut: -costfunc({"MET_cut": MET_cut, "dRBB_highMET_cut": dRBB_highMET_cut, "dRBB_lowMET_cut": dRBB_lowMET_cut})
    
    # then, try a global search strategy
    ranges_bayes = {"MET_cut": (150, 250), "dRBB_highMET_cut": (0.5, 5.0), "dRBB_lowMET_cut": (0.5, 5.0)}
    # note: the initial length_scale (0.05) lies below the stated lower bound (1e-1)
    gp_params = {'kernel': 1.0 * Matern(length_scale = 0.05, length_scale_bounds = (1e-1, 1e2), nu = 1.5)}
    optimizer = BayesianOptimization(
        f = costfunc_bayes,
        pbounds = ranges_bayes,
        random_state = None
    )
    # bayes_opt 1.x API: acquisition settings and GP parameters go straight to maximize()
    optimizer.maximize(init_points = 20, n_iter = 1, acq = 'poi', kappa = 3, **gp_params)

    # anneal the exploration parameter xi from 0.2 down towards 0.01,
    # shifting gradually from exploration to exploitation
    xi_scheduler = lambda iteration: 0.01 + 0.19 * np.exp(-0.004 * iteration)
    for it in range(400):
        cur_xi = xi_scheduler(it)
        print("using xi = {}".format(cur_xi))
        optimizer.maximize(init_points = 0, n_iter = 1, acq = 'poi', kappa = 3, xi = cur_xi, **gp_params)
    
    # print the results
    print("==============================================")
    print("initial cuts:")
    print("==============================================")
    print("MET_cut = {}".format(original_cuts["MET_cut"]))
    print("dRBB_highMET_cut = {}".format(original_cuts["dRBB_highMET_cut"]))
    print("dRBB_lowMET_cut = {}".format(original_cuts["dRBB_lowMET_cut"]))
    print("significance = {} sigma".format(costfunc_bayes(**original_cuts)))
    print("==============================================")

    print("==============================================")
    print("optimized cuts (global optimization):")
    print("==============================================")
    print("MET_cut = {}".format(optimizer.max["params"]["MET_cut"]))
    print("dRBB_highMET_cut = {}".format(optimizer.max["params"]["dRBB_highMET_cut"]))
    print("dRBB_lowMET_cut = {}".format(optimizer.max["params"]["dRBB_lowMET_cut"]))
    print("significance = {} sigma".format(optimizer.max["target"]))
    print("==============================================")

    # save the results:
    with open(os.path.join(outdir, "opt_results.pkl"), "wb") as opt_outfile:
        pickle.dump(optimizer.max, opt_outfile)
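
The same probe-then-refine pattern, reduced to a self-contained sketch on a toy objective (only the bayes_opt package is assumed; the cut names and bounds mirror those above):

from bayes_opt import BayesianOptimization

def toy_significance(MET_cut, dRBB_lowMET_cut):
    # smooth stand-in for the expensive Asimov significance evaluation
    return -((MET_cut - 191.0) / 50.0) ** 2 - (dRBB_lowMET_cut - 1.8) ** 2

optimizer = BayesianOptimization(
    f = toy_significance,
    pbounds = {"MET_cut": (150, 250), "dRBB_lowMET_cut": (0.5, 5.0)},
    random_state = 1
)

optimizer.maximize(init_points = 10, n_iter = 30) # random probes first, then GP-guided steps
print(optimizer.max) # best {'params': ..., 'target': ...} found so far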
Example #4
# assumes: ArgumentParser from argparse, os, numpy as np, pandas as pd,
# TrainNuisAuxSplit from DatasetExtractor, plus this repository's
# TrainingConfig, AdversarialEnvironment and AdversarialTrainer
def main():
    parser = ArgumentParser(description="train adversarial networks")
    parser.add_argument("--data", action="store", dest="infile_path")
    parser.add_argument("--outdir", action="store", dest="outdir")
    parser.add_argument("--statistics",
                        action="store_const",
                        const=True,
                        default=False,
                        dest="verbose_statistics")
    args = vars(parser.parse_args())

    infile_path = args["infile_path"]
    outdir = args["outdir"]

    print("using infile_path = " + infile_path)
    print("using outdir = " + outdir)

    tconf = TrainingConfig.from_file(outdir)
    data_branches = tconf.training_branches
    print("using data_branches = " + ", ".join(data_branches))

    # read the training data
    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples
    training_slice = TrainingConfig.training_slice

    print("loading data ...")
    sig_data = [
        pd.read_hdf(infile_path, key=sig_sample) for sig_sample in sig_samples
    ]
    bkg_data = [
        pd.read_hdf(infile_path, key=bkg_sample) for bkg_sample in bkg_samples
    ]

    auxdat_sig = []
    auxdat_bkg = []

    # extract the training dataset
    sig_data_train = []
    for sample, sample_name in zip(sig_data, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(
            drop=True)  # shuffle the sample
        cur_train = sample[int(training_slice[0] *
                               cur_length):int(training_slice[1] * cur_length)]
        auxdat_sig.append(cur_train[TrainingConfig.auxiliary_branches].values)
        sig_data_train.append(cur_train)

    bkg_data_train = []
    for sample, sample_name in zip(bkg_data, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(
            drop=True)  # shuffle the sample
        cur_train = sample[int(training_slice[0] *
                               cur_length):int(training_slice[1] * cur_length)]
        auxdat_bkg.append(cur_train[TrainingConfig.auxiliary_branches].values)
        bkg_data_train.append(cur_train)

    print("got " + str(len(sig_data)) + " signal datasets")
    print("got " + str(len(bkg_data)) + " background datasets")

    # split the dataset into training branches, nuisances and event weights
    traindat_sig = []
    nuisdat_sig = []
    weightdat_sig = []

    traindat_bkg = []
    nuisdat_bkg = []
    weightdat_bkg = []

    for cur_sig_data_train, sample_name in zip(sig_data_train, sig_samples):
        cur_traindat_sig, cur_nuisdat_sig, cur_weightdat_sig = TrainNuisAuxSplit(
            cur_sig_data_train)
        traindat_sig.append(cur_traindat_sig)
        nuisdat_sig.append(cur_nuisdat_sig)
        weightdat_sig.append(cur_weightdat_sig *
                             TrainingConfig.sample_reweighting[sample_name])
        print("'{}' with {} entries representing {} events".format(
            sample_name, len(cur_weightdat_sig), np.sum(cur_weightdat_sig)))

    for cur_bkg_data_train, sample_name in zip(bkg_data_train, bkg_samples):
        cur_traindat_bkg, cur_nuisdat_bkg, cur_weightdat_bkg = TrainNuisAuxSplit(
            cur_bkg_data_train)
        traindat_bkg.append(cur_traindat_bkg)
        nuisdat_bkg.append(cur_nuisdat_bkg)
        weightdat_bkg.append(cur_weightdat_bkg *
                             TrainingConfig.sample_reweighting[sample_name])
        print("'{}' with {} entries representing {} events".format(
            sample_name, len(cur_weightdat_bkg), np.sum(cur_weightdat_bkg)))

    print("starting up")
    mce = AdversarialEnvironment.from_file(outdir)

    training_pars = tconf.training_pars
    print("using the following training parameters:")
    for key, val in training_pars.items():
        print(key + " = " + str(val))

    # set up the training
    train = AdversarialTrainer(training_pars=training_pars,
                               verbose_statistics=args["verbose_statistics"])

    # give the full list of signal / background components to the trainer
    train.train(mce,
                number_batches=training_pars["training_batches"],
                traindat_sig=traindat_sig,
                traindat_bkg=traindat_bkg,
                nuisances_sig=nuisdat_sig,
                nuisances_bkg=nuisdat_bkg,
                weights_sig=weightdat_sig,
                weights_bkg=weightdat_bkg,
                auxdat_sig=auxdat_sig,
                auxdat_bkg=auxdat_bkg,
                sig_sampling_pars={
                    "sampling_lengths": TrainingConfig.sig_sampling_lengths
                },
                bkg_sampling_pars={
                    "sampling_lengths": TrainingConfig.bkg_sampling_lengths
                })

    # save all the necessary information
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    mce.save(outdir)
    train.save_training_statistics(
        os.path.join(outdir, "training_evolution.pkl"))
Example #5
# assumes: os, numpy as np, pandas as pd, TrainNuisAuxSplit from DatasetExtractor,
# plus this repository's TrainingConfig, CutBasedCategoryFiller and CategoryPlotter
def GetCBASignalEfficiencies(outdir):
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples
    infile_path = TrainingConfig.data_path

    data_slice = TrainingConfig.validation_slice
    slice_size = data_slice[1] - data_slice[0]

    data_sig = [pd.read_hdf(infile_path, key=sample) for sample in sig_samples]
    data_bkg = [pd.read_hdf(infile_path, key=sample) for sample in bkg_samples]

    # load all signal processes
    sig_data_test = [
    ]  # this holds all the branches used as inputs to the classifier
    sig_weights_test = []
    sig_aux_data_test = [
    ]  # this holds some other branches that may be important
    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(
            drop=True)  # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length):int(data_slice[1] *
                                                              cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(
            cur_test
        )  # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test.append(cur_testdata)
        sig_weights_test.append(cur_weights / slice_size *
                                TrainingConfig.sample_reweighting[sample_name])
        sig_aux_data_test.append(cur_aux_data)

    # load all background processes
    bkg_data_test = [
    ]  # this holds all the branches used as inputs to the classifier
    bkg_weights_test = []
    bkg_aux_data_test = [
    ]  # this holds some other branches that may be important
    for sample, sample_name in zip(data_bkg, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac=1, random_state=12345).reset_index(
            drop=True)  # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length):int(data_slice[1] *
                                                              cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(
            cur_test
        )  # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        bkg_data_test.append(cur_testdata)
        bkg_weights_test.append(cur_weights / slice_size *
                                TrainingConfig.sample_reweighting[sample_name])
        bkg_aux_data_test.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_test = sig_data_test + bkg_data_test
    aux_test = sig_aux_data_test + bkg_aux_data_test
    weights_test = sig_weights_test + bkg_weights_test
    samples = sig_samples + bkg_samples

    # prepare the common mBB binning for all signal regions
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_binning = np.linspace(SR_low,
                             SR_up,
                             num=1 + int((SR_up - SR_low) / SR_binwidth),
                             endpoint=True)

    effdict = {}

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=2)

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=3)

    # compute the total number of available signal events
    sig_events_total_2j = inclusive_2J.get_number_events("Hbb")
    sig_events_total_3j = inclusive_3J.get_number_events("Hbb")

    print("total 2J signal events: {}".format(sig_events_total_2j))
    print("total 3J signal events: {}".format(sig_events_total_3j))

    # fill the cut-based categories
    for cur_nJ, sig_events_total in zip(
        [2, 3], [sig_events_total_2j, sig_events_total_3j]):
        # first, export the categories of the cut-based analysis: high / low MET
        low_MET_cat = CutBasedCategoryFiller.create_low_MET_category(
            process_events=data_test,
            process_aux_events=aux_test,
            process_weights=weights_test,
            process_names=samples,
            nJ=cur_nJ)

        low_MET_cat.export_ROOT_histogram(
            binning=SR_binning,
            processes=sig_samples + bkg_samples,
            var_names="mBB",
            outfile_path=os.path.join(outdir,
                                      "{}jet_low_MET.root".format(cur_nJ)),
            clipping=True,
            density=False)

        CategoryPlotter.plot_category_composition(
            low_MET_cat,
            binning=SR_binning,
            outpath=os.path.join(outdir, "{}jet_low_MET.pdf".format(cur_nJ)),
            var="mBB",
            xlabel=r'$m_{bb}$ [GeV]',
            plotlabel=[
                "MC16d", r'150 GeV < MET < 200 GeV', "dRBB < 1.8",
                "nJ = {}".format(cur_nJ)
            ],
            args={})

        # get the signal efficiency for this category
        sigeff = low_MET_cat.get_number_events("Hbb") / sig_events_total
        effdict["low_MET_{}J".format(cur_nJ)] = sigeff

        high_MET_cat = CutBasedCategoryFiller.create_high_MET_category(
            process_events=data_test,
            process_aux_events=aux_test,
            process_weights=weights_test,
            process_names=samples,
            nJ=cur_nJ)

        high_MET_cat.export_ROOT_histogram(
            binning=SR_binning,
            processes=sig_samples + bkg_samples,
            var_names="mBB",
            outfile_path=os.path.join(outdir,
                                      "{}jet_high_MET.root".format(cur_nJ)),
            clipping=True,
            density=False)

        CategoryPlotter.plot_category_composition(
            high_MET_cat,
            binning=SR_binning,
            outpath=os.path.join(outdir, "{}jet_high_MET.pdf".format(cur_nJ)),
            var="mBB",
            xlabel=r'$m_{bb}$ [GeV]',
            plotlabel=[
                "MC16d", "MET > 200 GeV", "dRBB < 1.2",
                "nJ = {}".format(cur_nJ)
            ],
            args={})

        # get the signal efficiency for this category
        sigeff = high_MET_cat.get_number_events("Hbb") / sig_events_total
        effdict["high_MET_{}J".format(cur_nJ)] = sigeff

    return effdict
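
A short usage sketch for the returned dictionary (the output directory is a placeholder); the keys follow the low_MET_{nJ}J / high_MET_{nJ}J pattern built above:

effdict = GetCBASignalEfficiencies("cba_control_plots")
for category in ["low_MET_2J", "high_MET_2J", "low_MET_3J", "high_MET_3J"]:
    print("{}: signal efficiency = {:.3f}".format(category, effdict[category]))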
Example #6
# assumes, beyond the imports noted for the previous examples: pickle, plus
# this repository's ClassifierBasedCategoryFiller, ModelEvaluator,
# TrainingStatisticsPlotter and its _load_metadata helper
def main():
    parser = ArgumentParser(description = "populate analysis signal regions and export them to be used with HistFitter")
    parser.add_argument("--data", action = "store", dest = "infile_path")
    parser.add_argument("--model_dir", action = "store", dest = "model_dir")
    parser.add_argument("--out_dir", action = "store", dest = "out_dir")
    parser.add_argument("--use_test", action = "store_const", const = True, default = False)
    args = vars(parser.parse_args())

    adv_model = _load_metadata(os.path.join(args["model_dir"], "meta.conf"), "AdversarialEnvironment")["adversary_model"]
    adversary_label_library = {"MINEAdversary": "MIND", "DisCoAdversary": "DisCo", "GMMAdversary": "EMAX", "PtEstAdversary": "REG"}
    adversary_label = adversary_label_library[adv_model]

    # extract the validation or test dataset
    if args["use_test"]:
        print("using test dataset")
        data_slice = TrainingConfig.test_slice
    else:
        print("using validation dataset")
        data_slice = TrainingConfig.validation_slice

    slice_size = data_slice[1] - data_slice[0]

    infile_path = args["infile_path"]
    model_dir = args["model_dir"]
    outdir = args["out_dir"]

    # make plots showing the progress of the training
    training_dir = os.path.dirname(model_dir)
    training_plotter = TrainingStatisticsPlotter(model_dir)
    training_plotter.plot(model_dir)

    sig_samples = TrainingConfig.sig_samples
    bkg_samples = TrainingConfig.bkg_samples

    data_sig = [pd.read_hdf(infile_path, key = sample) for sample in sig_samples]
    data_bkg = [pd.read_hdf(infile_path, key = sample) for sample in bkg_samples]

    # load all signal processes
    sig_data_test = [] # this holds all the branches used as inputs to the classifier
    sig_weights_test = []
    sig_aux_data_test = [] # this holds some other branches that may be important

    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test.append(cur_testdata)
        sig_weights_test.append(cur_weights / slice_size)
        sig_aux_data_test.append(cur_aux_data)

    # also need to keep separate all signal events with 2 jets / 3 jets
    sig_data_test_2j = []
    sig_weights_test_2j = []
    sig_aux_data_test_2j = []

    sig_data_test_3j = []
    sig_weights_test_3j = []
    sig_aux_data_test_3j = []

    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_test = cur_test[cur_test["nJ"] == 2]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test_2j.append(cur_testdata)
        sig_weights_test_2j.append(cur_weights / slice_size)
        sig_aux_data_test_2j.append(cur_aux_data)

    for sample, sample_name in zip(data_sig, sig_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_test = cur_test[cur_test["nJ"] == 3]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        sig_data_test_3j.append(cur_testdata)
        sig_weights_test_3j.append(cur_weights / slice_size)
        sig_aux_data_test_3j.append(cur_aux_data)

    # load all background processes
    bkg_data_test = [] # this holds all the branches used as inputs to the classifier
    bkg_weights_test = []
    bkg_aux_data_test = [] # this holds some other branches that may be important
    for sample, sample_name in zip(data_bkg, bkg_samples):
        cur_length = len(sample)
        sample = sample.sample(frac = 1, random_state = 12345).reset_index(drop = True) # shuffle the sample
        cur_test = sample[int(data_slice[0] * cur_length) : int(data_slice[1] * cur_length)]
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test) # load the standard classifier input, nuisances and weights

        cur_aux_data = cur_test[TrainingConfig.auxiliary_branches].values
        bkg_data_test.append(cur_testdata)
        bkg_weights_test.append(cur_weights / slice_size)
        bkg_aux_data_test.append(cur_aux_data)

    # also prepare the total, concatenated versions
    data_test = sig_data_test + bkg_data_test
    aux_test = sig_aux_data_test + bkg_aux_data_test
    weights_test = sig_weights_test + bkg_weights_test
    samples = sig_samples + bkg_samples

    # load the AdversarialEnvironment
    env = AdversarialEnvironment.from_file(model_dir)

    # prepare the common mBB binning for all signal regions
    SR_low = 30
    SR_up = 210
    SR_binwidth = 10
    SR_binning = np.linspace(SR_low, SR_up, num = 1 + int((SR_up - SR_low) / SR_binwidth), endpoint = True)

    # also prepare the binning along the MVA dimension
    sigeff_binning = [0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.70, 0.75, 0.80, 0.85, 0.9, 0.92, 0.94, 0.96, 0.98, 0.99, 1.0]

    print("signal efficiency binning: {}".format(sigeff_binning))
    print("mBB binning: {}".format(SR_binning))

    # for MadGraph ATLAS MC (with optimized CBA)
    cuts = {2: [0.0, 0.3936688696975736, 0.9162186612913272],
            3: [0.0, 0.35975037002858584, 0.861855992060236]}

    cut_labels = ["tight", "loose"]

    CBA_original = {"MET_cut": 200, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 1.8}
    CBA_optimized = {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}
    
    print("using the following cuts:")
    print(cuts)

    # fill the inclusive categories with 2j / 3j events
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(process_events = data_test,
                                                             process_aux_events = aux_test,
                                                             process_weights = weights_test,
                                                             process_names = samples,
                                                             nJ = 2)
    for cur_process in samples:
        inclusive_2J.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_{}_2jet.pkl".format(cur_process)), density = True)

    inclusive_2J.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_bkg_2jet.pkl"), density = True)

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(process_events = data_test,
                                                             process_aux_events = aux_test,
                                                             process_weights = weights_test,
                                                             process_names = samples,
                                                             nJ = 3)
    for cur_process in samples:
        inclusive_3J.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_{}_3jet.pkl".format(cur_process)), density = True)

    inclusive_3J.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_bkg_3jet.pkl"), density = True)

    total_events = inclusive_2J.get_total_events() + inclusive_3J.get_total_events()
    CBA_used_events = 0
    PCA_used_events = 0

    anadict = {}
    
    for cur_nJ, cur_inclusive_cat, cur_signal_events, cur_signal_weights, cur_signal_aux_events in zip(
            [2, 3], [inclusive_2J, inclusive_3J],
            [sig_data_test_2j, sig_data_test_3j],
            [sig_weights_test_2j, sig_weights_test_3j],
            [sig_aux_data_test_2j, sig_aux_data_test_3j]):
        for cur_cuts, prefix in zip([CBA_original, CBA_optimized], ["original_", "optimized_"]):
            # first, export the categories of the cut-based analysis: high / low MET, using the optimized cuts
            print("filling {} jet low_MET category with cut prefix = {}".format(cur_nJ, prefix))
            low_MET_cat = CutBasedCategoryFiller.create_low_MET_category(process_events = data_test,
                                                                         process_aux_events = aux_test,
                                                                         process_weights = weights_test,
                                                                         process_names = samples,
                                                                         nJ = cur_nJ,
                                                                         cuts = cur_cuts)
            print("filled {} signal events".format(low_MET_cat.get_number_events("Hbb")))
            
            low_MET_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                              outfile_path = os.path.join(outdir, prefix + "{}jet_low_MET.root".format(cur_nJ)), clipping = True, density = False)
            
            anadict[prefix + "low_MET_{}jet_sig_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(low_MET_cat, cur_inclusive_cat, sig_samples)
            anadict[prefix + "low_MET_{}jet_bkg_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(low_MET_cat, cur_inclusive_cat, bkg_samples)
            
            anadict[prefix + "low_MET_{}jet_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(low_MET_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict[prefix + "low_MET_{}jet_binned_sig".format(cur_nJ)] = low_MET_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")
            
            CBA_used_events += low_MET_cat.get_total_events()
            
            for cur_process in samples:
                low_MET_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB", outfile = os.path.join(outdir, prefix + "dist_mBB_{}_{}jet_low_MET.pkl".format(cur_process, cur_nJ)), density = True)

            low_MET_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB", outfile = os.path.join(outdir, prefix + "dist_mBB_bkg_{}jet_low_MET.pkl".format(cur_nJ)), density = True)

            CategoryPlotter.plot_category_composition(low_MET_cat, binning = SR_binning, outpath = os.path.join(outdir, prefix + "{}jet_low_MET.pdf".format(cur_nJ)), var = "mBB", xlabel = r'$m_{bb}$ [GeV]', 
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', r'150 GeV < $E_{\mathrm{T}}^{\mathrm{miss}}$' + '< {MET_cut} GeV'.format(**cur_cuts), r'$\Delta R_{{bb}} < {dRBB_lowMET_cut}$'.format(**cur_cuts), r'{} jet'.format(cur_nJ)], args = {})
            
            CategoryPlotter.plot_category_composition(low_MET_cat, binning = SR_binning, outpath = os.path.join(outdir, prefix + "{}jet_low_MET_nostack.pdf".format(cur_nJ)), var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.",
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', r'150 GeV < $E_{\mathrm{T}}^{\mathrm{miss}}$' +  '< {MET_cut} GeV'.format(**cur_cuts), r'$\Delta R_{{bb}} < {dRBB_lowMET_cut}$'.format(**cur_cuts), r'{} jet'.format(cur_nJ)], args = {}, stacked = False, histtype = 'step', density = True)
            
            print("filling {} jet high_MET category".format(cur_nJ))
            high_MET_cat = CutBasedCategoryFiller.create_high_MET_category(process_events = data_test,
                                                                           process_aux_events = aux_test,
                                                                           process_weights = weights_test,
                                                                           process_names = samples,
                                                                           nJ = cur_nJ,
                                                                           cuts = cur_cuts)
            print("filled {} signal events".format(high_MET_cat.get_number_events("Hbb")))
            
            high_MET_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                               outfile_path = os.path.join(outdir, prefix + "{}jet_high_MET.root".format(cur_nJ)), clipping = True, density = False)
            
            anadict[prefix + "high_MET_{}jet_sig_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(high_MET_cat, cur_inclusive_cat, sig_samples)
            anadict[prefix + "high_MET_{}jet_bkg_eff".format(cur_nJ)] = ModelEvaluator.get_efficiency(high_MET_cat, cur_inclusive_cat, bkg_samples)
            
            anadict[prefix + "high_MET_{}jet_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(high_MET_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict[prefix + "high_MET_{}jet_binned_sig".format(cur_nJ)] = high_MET_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")
            
            # compute JSD between the high-MET and low-MET categories
            anadict[prefix + "{}jet_high_low_MET_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(high_MET_cat, low_MET_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict[prefix + "{}jet_binned_sig_CBA".format(cur_nJ)] = (anadict[prefix + "low_MET_{}jet_binned_sig".format(cur_nJ)]**2 + anadict[prefix + "high_MET_{}jet_binned_sig".format(cur_nJ)]**2)**0.5
            
            CBA_used_events += high_MET_cat.get_total_events()
            
            for cur_process in samples:
                high_MET_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB", outfile = os.path.join(outdir, prefix + "dist_mBB_{}_{}jet_high_MET.pkl".format(cur_process, cur_nJ)), density = True)

            high_MET_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB", outfile = os.path.join(outdir, prefix + "dist_mBB_bkg_{}jet_high_MET.pkl".format(cur_nJ)), density = True)

            CategoryPlotter.plot_category_composition(high_MET_cat, binning = SR_binning, outpath = os.path.join(outdir, prefix + "{}jet_high_MET.pdf".format(cur_nJ)), var = "mBB", xlabel = r'$m_{bb}$ [GeV]', 
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', r'$E_{\mathrm{T}}^{\mathrm{miss}}$ >' + ' {MET_cut} GeV'.format(**cur_cuts), r'$\Delta R_{{bb}} < {dRBB_highMET_cut}$'.format(**cur_cuts), r'{} jet'.format(cur_nJ)], args = {})
            
            CategoryPlotter.plot_category_composition(high_MET_cat, binning = SR_binning, outpath = os.path.join(outdir, prefix + "{}jet_high_MET_nostack.pdf".format(cur_nJ)), var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.",
                                                      plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', r'$E_{\mathrm{T}}^{\mathrm{miss}}$ >' + ' {MET_cut} GeV'.format(**cur_cuts), r'$\Delta R_{{bb}} < {dRBB_highMET_cut}$'.format(**cur_cuts), r'{} jet'.format(cur_nJ)], args = {}, stacked = False, histtype = 'step', density = True)
            
        # keep track of the tight and loose categories for later
        classifier_categories = {}
            
        # prepare N categories along the classifier output dimension
        for cut_end, cut_start, cut_label in zip(cuts[cur_nJ][0:-1], cuts[cur_nJ][1:], cut_labels):
            print("exporting {}J region with sigeff range {} - {}".format(cur_nJ, cut_start, cut_end))

            cur_cat = ClassifierBasedCategoryFiller.create_classifier_category(env,
                                                                               process_events = data_test,
                                                                               process_aux_events = aux_test,
                                                                               process_weights = weights_test,
                                                                               process_names = samples,
                                                                               signal_events = cur_signal_events,
                                                                               signal_weights = cur_signal_weights,
                                                                               signal_aux_events = cur_signal_aux_events,
                                                                               classifier_sigeff_range = (cut_start, cut_end),
                                                                               nJ = cur_nJ)
            cur_cat.export_ROOT_histogram(binning = SR_binning, processes = sig_samples + bkg_samples, var_names = "mBB",
                                           outfile_path = os.path.join(outdir, "region_{}jet_{}_{}.root".format(cur_nJ, cut_start, cut_end)), clipping = True, density = False)

            PCA_used_events += cur_cat.get_total_events()

            anadict["{}_{}jet_sig_eff".format(cut_label, cur_nJ)] = ModelEvaluator.get_efficiency(cur_cat, cur_inclusive_cat, sig_samples)
            anadict["{}_{}jet_bkg_eff".format(cut_label, cur_nJ)] = ModelEvaluator.get_efficiency(cur_cat, cur_inclusive_cat, bkg_samples)

            anadict["{}_{}jet_inv_JS_bkg".format(cut_label, cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(cur_cat, cur_inclusive_cat, binning = SR_binning, var = "mBB", processes = bkg_samples)
            anadict["{}_{}jet_binned_sig".format(cut_label, cur_nJ)] = cur_cat.get_binned_significance(binning = SR_binning, signal_processes = sig_samples, background_processes = bkg_samples, var_name = "mBB")

            classifier_categories[cut_label] = cur_cat

            for cur_process in samples:
                cur_cat.export_histogram(binning = SR_binning, processes = [cur_process], var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_{}_{}jet_{}.pkl".format(cur_process, cur_nJ, cut_label)), density = True)

            cur_cat.export_histogram(binning = SR_binning, processes = bkg_samples, var_name = "mBB", outfile = os.path.join(outdir, "dist_mBB_bkg_{}jet_{}.pkl".format(cur_nJ, cut_label)), density = True)

            CategoryPlotter.plot_category_composition(cur_cat, binning = SR_binning, outpath = os.path.join(outdir, "dist_mBB_region_{}jet_{}_{}.pdf".format(cur_nJ, cut_start, cut_end)), 
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]', plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', cut_label + r', {} jet'.format(cur_nJ), adversary_label])

            CategoryPlotter.plot_category_composition(cur_cat, binning = SR_binning, outpath = os.path.join(outdir, "dist_mBB_region_{}jet_{}_{}_nostack.pdf".format(cur_nJ, cut_start, cut_end)), 
                                                      var = "mBB", xlabel = r'$m_{bb}$ [GeV]', ylabel = "a.u.", plotlabel = ["MadGraph + Pythia8", r'$\sqrt{s} = 13$ TeV, 140 fb$^{-1}$', cut_label + r', {} jet'.format(cur_nJ), adversary_label], stacked = False, histtype = 'step', density = True)

            print("filled {} signal events".format(cur_cat.get_number_events("Hbb")))

        # compute JSD between the tight and loose categories
        anadict["{}jet_tight_loose_inv_JS_bkg".format(cur_nJ)] = 1.0 / ModelEvaluator.get_JS_categories(classifier_categories["tight"], classifier_categories["loose"], binning = SR_binning, var = "mBB", processes = bkg_samples)
        anadict["{}jet_binned_sig_PCA".format(cur_nJ)] = (anadict["tight_{}jet_binned_sig".format(cur_nJ)]**2 + anadict["loose_{}jet_binned_sig".format(cur_nJ)]**2)**0.5

    print("event statistics:")
    print("have a total of {} events, CBA used {} events, ({}%)".format(total_events, CBA_used_events, CBA_used_events / total_events))
    print("have a total of {} events, PCA used {} events, ({}%)".format(total_events, PCA_used_events, PCA_used_events / total_events))

    anadict.update(env.create_paramdict())
    print("got the following anadict: {}".format(anadict))
    with open(os.path.join(outdir, "anadict.pkl"), "wb") as outfile:
        pickle.dump(anadict, outfile)
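
The combined per-jet significances stored in anadict are quadrature sums of the per-category binned significances; the rule written out on its own:

def combined_significance(per_category_significances):
    # statistically independent categories combine in quadrature
    return sum(s ** 2 for s in per_category_significances) ** 0.5

# e.g. combining a tight and a loose classifier category
print(combined_significance([1.9, 1.1])) # ~2.2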
Example #7
# assumes: os, numpy as np, pandas as pd, train_test_split from sklearn.model_selection,
# TrainNuisAuxSplit from DatasetExtractor, plus this repository's TrainingConfig,
# Category, CutBasedCategoryFiller, CategoryPlotter and get_binning
def MakeDistributionControlPlots(infile_path, outdir, test_size=0.999):
    # sig_samples = ["Hbb"]
    # bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson", "singletop"]

    # for MadGraph
    sig_samples = ["Hbb"]
    bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson"]

    samples = sig_samples + bkg_samples

    # set up proper binnings for different variables
    binnings = {}
    binnings["mBB"] = get_binning(30, 600, 10)
    binnings["dRBB"] = get_binning(0.0, 3.0, 0.1)
    binnings["pTB1"] = get_binning(0, 300, 10)
    binnings["pTB2"] = get_binning(0, 300, 10)
    binnings["MET"] = get_binning(0, 300, 10)
    binnings["dEtaBB"] = get_binning(0, 5, 0.1)
    binnings["dPhiMETdijet"] = get_binning(0, np.pi, 0.1)
    binnings["SumPtJet"] = get_binning(0, 500, 10)

    print("loading data ...")
    data = [pd.read_hdf(infile_path, key=sample) for sample in samples]

    for cur_df, sample in zip(data, samples):
        print("have {} events available for '{}'".format(len(cur_df), sample))

    data_test = []
    mBB_test = []
    weights_test = []
    aux_data_test = []
    for sample in data:
        _, cur_test = train_test_split(sample,
                                       test_size=test_size,
                                       shuffle=True,
                                       random_state=12345)
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(
            cur_test
        )  # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.other_branches].values
        data_test.append(cur_testdata)
        mBB_test.append(cur_nuisdata)
        weights_test.append(cur_weights / test_size)
        aux_data_test.append(cur_aux_data)

    # first, plot the total event content (i.e. corresponding to an "inclusive" event category)
    inclusive = Category("inclusive")
    for events, weights, aux, process in zip(data_test, weights_test,
                                             aux_data_test, samples):
        inclusive.add_events(events=events,
                             weights=weights,
                             process=process,
                             event_variables=TrainingConfig.training_branches,
                             aux_content=aux,  # per-process auxiliary branches
                             aux_variables=TrainingConfig.other_branches)

    # print total event numbers for all processes
    print("============================")
    print(" inclusive expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive.get_number_events(process)))
    print("============================")

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=2)

    print("============================")
    print(" inclusive 2j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive_2J.get_number_events(process)))
    print("============================")

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=3)

    print("============================")
    print(" inclusive 3j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive_3J.get_number_events(process)))
    print("============================")

    # now, create separate histograms for each process and each event variable
    for cur_var in TrainingConfig.training_branches:
        if cur_var == "nJ":  # no plots for number of jets
            continue

        for cur_process in samples:
            CategoryPlotter.plot_category_composition(
                inclusive,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive.pdf".format(cur_var, cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive"],
                args={})
            inclusive.export_histogram(binning=binnings[cur_var],
                                       processes=[cur_process],
                                       var_name=cur_var,
                                       outfile=os.path.join(
                                           outdir,
                                           "dist_{}_{}_inclusive.pkl".format(
                                               cur_var, cur_process)),
                                       clipping=True,
                                       density=True)

            CategoryPlotter.plot_category_composition(
                inclusive_2J,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_2J.pdf".format(cur_var,
                                                         cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive, nJ = 2"],
                args={})
            inclusive_2J.export_histogram(
                binning=binnings[cur_var],
                processes=[cur_process],
                var_name=cur_var,
                outfile=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_2J.pkl".format(cur_var,
                                                         cur_process)),
                clipping=True,
                density=True)

            CategoryPlotter.plot_category_composition(
                inclusive_3J,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_3J.pdf".format(cur_var,
                                                         cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive, nJ = 3"],
                args={})
            inclusive_3J.export_histogram(
                binning=binnings[cur_var],
                processes=[cur_process],
                var_name=cur_var,
                outfile=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_3J.pkl".format(cur_var,
                                                         cur_process)),
                clipping=True,
                density=True)
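
The get_binning helper is not part of this excerpt; a minimal sketch consistent with how it is called here (lower edge, upper edge, bin width) and with the SR_binning construction in the earlier examples:

import numpy as np

def get_binning(low, high, binwidth):
    # uniform bin edges from low to high in steps of binwidth
    return np.linspace(low, high, num=1 + int((high - low) / binwidth), endpoint=True)

print(get_binning(30, 600, 10)) # 58 edges, i.e. 57 bins of 10 GeV for mBB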