Example #1
    def create_nJ_category(process_events,
                           process_aux_events,
                           process_weights,
                           process_names,
                           nJ=2):
        retcat = Category("inclusive_{}J".format(nJ))

        for cur_events, cur_aux_events, cur_weights, process_name in zip(
                process_events, process_aux_events, process_weights,
                process_names):
            # extract the branches that are needed for the cut
            cur_nJ = cur_aux_events[:,
                                    TrainingConfig.auxiliary_branches.
                                    index("nJ")]
            cut = (cur_nJ == nJ)

            passed_events = cur_events[cut]
            passed_weights = cur_weights[cut]
            passed_aux = cur_aux_events[cut]

            print("XXXXXX")
            print("adding aux with shape: {}".format(np.shape(passed_aux)))
            print("adding weights with shape: {}".format(
                np.shape(passed_weights)))

            retcat.add_events(events=passed_events,
                              weights=passed_weights,
                              process=process_name,
                              event_variables=TrainingConfig.training_branches,
                              aux_content=passed_aux,
                              aux_variables=TrainingConfig.auxiliary_branches)

        return retcat
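
A minimal, self-contained sketch of the column-lookup pattern used above (the branch names and values below are made up for illustration): the auxiliary data is a plain 2D numpy array whose columns follow the order of a branch-name list, so a single branch is selected via `list.index`.

    import numpy as np

    auxiliary_branches = ["nJ", "dRBB"]      # assumed column order
    aux_events = np.array([[2, 0.8],
                           [3, 1.5],
                           [2, 2.1]])

    nJ_column = aux_events[:, auxiliary_branches.index("nJ")]
    cut = (nJ_column == 2)                   # one boolean per event
    print(aux_events[cut])                   # keeps only the 2-jet rows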
    def create_nJ_category(process_data, process_names, nJ=2):
        retcat = Category("inclusive_{}J".format(nJ))
        formatter = only_nJ(nJ=nJ)

        for cur_process_data, cur_process_name in zip(process_data,
                                                      process_names):
            passed = formatter.format_as_TrainingSample(cur_process_data)
            retcat.add_events(events=passed.data,
                              weights=passed.weights,
                              process=cur_process_name,
                              event_variables=TrainingConfig.training_branches)

        return retcat
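
Hypothetical usage of this DataFrame-based variant (the toy DataFrames, column names, and process labels are assumptions; each DataFrame is expected to carry an "nJ" column alongside the training branches):

    import pandas as pd

    # toy inputs standing in for real ntuples
    sig_df = pd.DataFrame({"MET": [210.0, 95.0], "dRBB": [0.7, 2.3], "nJ": [2, 3]})
    bkg_df = pd.DataFrame({"MET": [180.0, 60.0], "dRBB": [1.1, 2.9], "nJ": [2, 2]})

    cat_2j = create_nJ_category(process_data=[sig_df, bkg_df],
                                process_names=["Hbb", "ttbar"],
                                nJ=2)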
    def create_high_MET_category(process_data,
                                 process_names,
                                 nJ=2,
                                 cuts={
                                     "MET_cut": 191,
                                     "dRBB_highMET_cut": 1.2,
                                     "dRBB_lowMET_cut": 5.0
                                 }):
        retcat = Category("high_MET")

        for cur_process_data, cur_process_name in zip(process_data,
                                                      process_names):
            # apply the cuts
            passed = cur_process_data.loc[
                (cur_process_data["MET"] > cuts["MET_cut"])
                & (cur_process_data["dRBB"] < cuts["dRBB_highMET_cut"])
                & (cur_process_data["nJ"] == nJ)]
            passed = TrainingSample.fromTable(passed)

            # fill the category
            retcat.add_events(events=passed.data,
                              weights=passed.weights,
                              process=cur_process_name,
                              event_variables=TrainingConfig.training_branches)

            print("filled {} events from process '{}'".format(
                sum(passed.weights), cur_process_name))

        return retcat
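
One code-level caveat, relevant here and to the low-MET variant in Example #4: the `cuts` dict is a mutable default argument, which Python evaluates once at definition time and then shares across calls. A tiny standalone illustration of the pitfall (not taken from the original code):

    def f(cuts={"MET_cut": 191}):
        cuts["MET_cut"] -= 1      # mutates the shared default dict
        return cuts["MET_cut"]

    print(f(), f())               # 190 189 -- the default "remembers" the change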
Example #4
    def create_low_MET_category(process_events,
                                process_aux_events,
                                process_weights,
                                process_names,
                                nJ=2,
                                cuts={
                                    "MET_cut": 191,
                                    "dRBB_highMET_cut": 1.2,
                                    "dRBB_lowMET_cut": 5.0
                                }):
        retcat = Category("low_MET")

        for cur_events, cur_aux_events, cur_weights, process_name in zip(
                process_events, process_aux_events, process_weights,
                process_names):
            # extract the branches that are needed for the cut
            cur_MET = cur_events[:,
                                 TrainingConfig.training_branches.index("MET")]
            cur_dRBB = cur_aux_events[:,
                                      TrainingConfig.auxiliary_branches.
                                      index("dRBB")]

            cur_nJ = cur_aux_events[:,
                                    TrainingConfig.auxiliary_branches.
                                    index("nJ")]

            cut = np.logical_and.reduce(
                (cur_MET > 150, cur_MET < cuts["MET_cut"],
                 cur_dRBB < cuts["dRBB_lowMET_cut"], cur_nJ == nJ))

            passed_events = cur_events[cut]
            passed_weights = cur_weights[cut]
            passed_aux = cur_aux_events[cut]

            retcat.add_events(events=passed_events,
                              weights=passed_weights,
                              process=process_name,
                              event_variables=TrainingConfig.training_branches,
                              aux_content=passed_aux,
                              aux_variables=TrainingConfig.auxiliary_branches)

            print("filled {} events from sample '{}'".format(
                len(passed_events), process_name))

        return retcat
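
For reference, a small numpy-only sketch of the `np.logical_and.reduce` pattern used above to combine the MET, dRBB, and nJ requirements (the toy values are made up):

    import numpy as np

    MET = np.array([120.0, 160.0, 180.0, 250.0])
    dRBB = np.array([0.9, 1.1, 4.0, 0.5])
    nJ = np.array([2, 2, 3, 2])

    cut = np.logical_and.reduce((MET > 150, MET < 191, dRBB < 5.0, nJ == 2))
    print(cut)                    # [False  True False False]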
    def create_classifier_category(env,
                                   process_events,
                                   process_aux_events,
                                   process_weights,
                                   process_names,
                                   signal_events,
                                   signal_aux_events,
                                   signal_weights,
                                   classifier_sigeff_range=(1, 0),
                                   nJ=2,
                                   interpret_as_sigeff=True,
                                   process_preds=None):

        if not process_preds:
            process_preds = [None for cur_events in process_events]

        if interpret_as_sigeff:
            if (classifier_sigeff_range[0] < classifier_sigeff_range[1]):
                raise Exception(
                    "Warning: are you sure you understand what these cuts are doing? Lower signal efficiencies correspond to _harsher_ cuts, so expect (higher number, lower number)!"
                )

            # first, compute the cut values that correspond to the given signal efficiency values
            # classifier_range = (ClassifierBasedCategoryFiller._sigeff_to_score(env, signal_events, signal_weights, signal_aux_events, sigeff = classifier_sigeff_range[0]),
            #                     ClassifierBasedCategoryFiller._sigeff_to_score(env, signal_events, signal_weights, signal_aux_events, sigeff = classifier_sigeff_range[1]))

            classifier_range = ClassifierBasedCategoryFiller._sigeff_range_to_score_range(
                env,
                signal_events,
                signal_weights,
                signal_aux_events,
                sigeff_range=classifier_sigeff_range)

            print(
                "translated signal efficiency range ({}, {}) to classifier output range ({}, {})"
                .format(classifier_sigeff_range[0], classifier_sigeff_range[1],
                        classifier_range[0], classifier_range[1]))
        else:
            classifier_range = classifier_sigeff_range

        retcat = Category("clf_{:.2f}_{:.2f}".format(
            classifier_sigeff_range[0], classifier_sigeff_range[1]))

        for cur_events, cur_aux_events, cur_weights, process_name, cur_pred in zip(
                process_events, process_aux_events, process_weights,
                process_names, process_preds):
            # get the classifier predictions
            if cur_pred is None:
                cur_pred = env.predict(data=cur_events,
                                       auxdat=cur_aux_events)[:, 1]

            cur_nJ = cur_aux_events[:,
                                    TrainingConfig.auxiliary_branches.
                                    index("nJ")]

            if nJ:
                # a cut on the number of jets was requested
                cut = np.logical_and.reduce(
                    (cur_pred > classifier_range[0],
                     cur_pred < classifier_range[1], cur_nJ == nJ))
            else:
                # fill this category inclusively in the number of jets
                cut = np.logical_and.reduce((cur_pred > classifier_range[0],
                                             cur_pred < classifier_range[1]))

            passed_events = cur_events[cut]
            passed_weights = cur_weights[cut]
            passed_aux = cur_aux_events[cut]
            passed_pred = np.expand_dims(cur_pred[cut], axis=1)

            # also store some auxiliary information in this category
            aux_content = np.concatenate([passed_pred, passed_aux], axis=1)
            aux_variables = ["clf"] + TrainingConfig.auxiliary_branches

            #aux_content = cur_aux_events[cut]
            #aux_variables = TrainingConfig.auxiliary_branches

            retcat.add_events(events=passed_events,
                              weights=passed_weights,
                              process=process_name,
                              event_variables=TrainingConfig.training_branches,
                              aux_content=aux_content,
                              aux_variables=aux_variables)

        return retcat
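
Hypothetical usage (the class name is inferred from the internal calls to ClassifierBasedCategoryFiller, and `sig_data`, `sig_aux`, `sig_weights` are assumed to be prepared elsewhere): a signal-efficiency range of (0.3, 0.0) keeps the most signal-like slice of events, i.e. everything above the classifier cut that retains 30% of the weighted signal.

    tight_cat = ClassifierBasedCategoryFiller.create_classifier_category(
        env,
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        signal_events=sig_data,
        signal_aux_events=sig_aux,
        signal_weights=sig_weights,
        classifier_sigeff_range=(0.3, 0.0),
        nJ=2)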
    def create_classifier_category(mcoll,
                                   sig_process_data,
                                   sig_process_names,
                                   bkg_process_data,
                                   bkg_process_names,
                                   classifier_sigeff_range=(1.0, 0.0),
                                   nJ=2):

        # make sure to base all selections only on signal events with the correct number of jets
        sig_process_data = [
            cur_data.loc[cur_data["nJ"] == nJ] for cur_data in sig_process_data
        ]
        bkg_process_data = [
            cur_data.loc[cur_data["nJ"] == nJ] for cur_data in bkg_process_data
        ]

        # convert them to TrainingSamples as well
        sig_process_TrainingSamples = [
            TrainingSample.fromTable(cur_data) for cur_data in sig_process_data
        ]
        bkg_process_TrainingSamples = [
            TrainingSample.fromTable(cur_data) for cur_data in bkg_process_data
        ]
        all_signal_TrainingSample = TrainingSample.fromTable(
            pd.concat(sig_process_data))

        # obtain the classifier predictions on all samples
        sig_process_preds = [
            mcoll.predict(cur_data)[:, 1] for cur_data in sig_process_data
        ]
        bkg_process_preds = [
            mcoll.predict(cur_data)[:, 1] for cur_data in bkg_process_data
        ]
        all_signal_pred = np.concatenate(sig_process_preds, axis=0)

        # first, determine the cuts on the classifier based on the asked-for signal efficiency
        classifier_range = ClassifierBasedCategoryFiller._sigeff_range_to_score_range(
            all_signal_pred,
            all_signal_weights=all_signal_TrainingSample.weights,
            sigeff_range=classifier_sigeff_range)
        print(
            "translated signal efficiency range ({}, {}) to classifier output range ({}, {})"
            .format(classifier_sigeff_range[0], classifier_sigeff_range[1],
                    classifier_range[0], classifier_range[1]))

        retcat = Category("clf_{:.2f}_{:.2f}".format(
            classifier_sigeff_range[0], classifier_sigeff_range[1]))

        # then fill all events from all signal + background processes
        process_data = sig_process_data + bkg_process_data
        process_names = sig_process_names + bkg_process_names
        process_preds = sig_process_preds + bkg_process_preds

        for cur_process_data, cur_process_name, cur_pred in zip(
                process_data, process_names, process_preds):

            print("predicting on sample {} with length {}".format(
                cur_process_name, len(cur_process_data)))

            cut = np.logical_and.reduce((cur_pred > classifier_range[0],
                                         cur_pred < classifier_range[1]))

            assert len(cut) == len(cur_process_data)
            passed = cur_process_data[cut]
            passed = TrainingSample.fromTable(passed)

            # fill the category
            retcat.add_events(events=passed.data,
                              weights=passed.weights,
                              process=cur_process_name,
                              event_variables=TrainingConfig.training_branches)

            print("filled {} events from process '{}'".format(
                sum(passed.weights), cur_process_name))

        return retcat
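
The implementation of `_sigeff_range_to_score_range` is not shown in this excerpt. A minimal sketch of the usual idea (an assumption about the implementation, not the project's actual code): sort the signal predictions from most to least signal-like, accumulate their weights, and read off the score at which the requested weighted signal fraction is reached.

    import numpy as np

    def sigeff_to_score(pred, weights, sigeff):
        order = np.argsort(pred)[::-1]                 # most signal-like first
        frac = np.cumsum(weights[order]) / np.sum(weights)
        idx = min(np.searchsorted(frac, sigeff), len(pred) - 1)
        return pred[order][idx]

    pred = np.array([0.9, 0.7, 0.4, 0.2])
    weights = np.ones(4)
    print(sigeff_to_score(pred, weights, 0.5))         # 0.7: scores >= 0.7 carry 50% of the signal weight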
Example #7
def MakeDistributionControlPlots(infile, outdir, test_size=0.999):
    # sig_samples = ["Hbb"]
    # bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson", "singletop"]

    # for MadGraph
    sig_samples = ["Hbb"]
    bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson"]

    samples = sig_samples + bkg_samples

    # set up proper binnings for different variables
    binnings = {}
    binnings["mBB"] = get_binning(30, 600, 10)
    binnings["dRBB"] = get_binning(0.0, 3.0, 0.1)
    binnings["pTB1"] = get_binning(0, 300, 10)
    binnings["pTB2"] = get_binning(0, 300, 10)
    binnings["MET"] = get_binning(0, 300, 10)
    binnings["dEtaBB"] = get_binning(0, 5, 0.1)
    binnings["dPhiMETdijet"] = get_binning(0, np.pi, 0.1)
    binnings["SumPtJet"] = get_binning(0, 500, 10)

    print("loading data ...")
    data = [pd.read_hdf(infile, key=sample) for sample in samples]

    for cur_df, sample in zip(data, samples):
        print("have {} events available for '{}'".format(len(cur_df), sample))

    data_test = []
    mBB_test = []
    weights_test = []
    aux_data_test = []
    for sample in data:
        _, cur_test = train_test_split(sample,
                                       test_size=test_size,
                                       shuffle=True,
                                       random_state=12345)
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(
            cur_test
        )  # load the standard classifier input, nuisances and weights
        cur_aux_data = cur_test[TrainingConfig.other_branches].values
        data_test.append(cur_testdata)
        mBB_test.append(cur_nuisdata)
        weights_test.append(cur_weights / test_size)
        aux_data_test.append(cur_aux_data)

    # first, plot the total event content (i.e. corresponding to an "inclusive" event category)
    inclusive = Category("inclusive")
    for events, weights, aux_data, process in zip(data_test, weights_test,
                                                  aux_data_test, samples):
        inclusive.add_events(events=events,
                             weights=weights,
                             process=process,
                             event_variables=TrainingConfig.training_branches,
                             aux_content=aux_data,
                             aux_variables=TrainingConfig.other_branches)

    # print total event numbers for all processes
    print("============================")
    print(" inclusive expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive.get_number_events(process)))
    print("============================")

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=2)

    print("============================")
    print(" inclusive 2j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive_2J.get_number_events(process)))
    print("============================")

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=3)

    print("============================")
    print(" inclusive 3j expected event yield ")
    print("============================")
    for process in samples:
        print("{}: {} events".format(process,
                                     inclusive_3J.get_number_events(process)))
    print("============================")

    # now, create separate histograms for each process and each event variable
    for cur_var in TrainingConfig.training_branches:
        if cur_var == "nJ":  # no plots for number of jets
            continue

        for cur_process in samples:
            CategoryPlotter.plot_category_composition(
                inclusive,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive.pdf".format(cur_var, cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive"],
                args={})
            inclusive.export_histogram(binning=binnings[cur_var],
                                       processes=[cur_process],
                                       var_name=cur_var,
                                       outfile=os.path.join(
                                           outdir,
                                           "dist_{}_{}_inclusive.pkl".format(
                                               cur_var, cur_process)),
                                       clipping=True,
                                       density=True)

            CategoryPlotter.plot_category_composition(
                inclusive_2J,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_2J.pdf".format(cur_var,
                                                         cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive, nJ = 2"],
                args={})
            inclusive_2J.export_histogram(
                binning=binnings[cur_var],
                processes=[cur_process],
                var_name=cur_var,
                outfile=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_2J.pkl".format(cur_var,
                                                         cur_process)),
                clipping=True,
                density=True)

            CategoryPlotter.plot_category_composition(
                inclusive_3J,
                binning=binnings[cur_var],
                outpath=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_3J.pdf".format(cur_var,
                                                         cur_process)),
                var=cur_var,
                process_order=[cur_process],
                xlabel=cur_var,
                plotlabel=["inclusive, nJ = 3"],
                args={})
            inclusive_3J.export_histogram(
                binning=binnings[cur_var],
                processes=[cur_process],
                var_name=cur_var,
                outfile=os.path.join(
                    outdir,
                    "dist_{}_{}_inclusive_3J.pkl".format(cur_var,
                                                         cur_process)),
                clipping=True,
                density=True)
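
`get_binning` is not defined in this excerpt; a plausible stand-in consistent with its (low, high, step) usage above would be (an assumption, and floating-point step sizes may need care at the upper edge):

    import numpy as np

    def get_binning(low, high, step):
        # bin edges from 'low' up to and including 'high', in steps of 'step'
        return np.arange(low, high + step, step)

    print(get_binning(0.0, 3.0, 0.1)[:4])    # [0.  0.1 0.2 0.3]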