def create_nJ_category(process_events, process_aux_events, process_weights, process_names, nJ=2):
    """Build a Category containing only events with exactly `nJ` jets.

    Arguments:
        process_events: list of 2D arrays (one per process) holding the training branches
        process_aux_events: list of 2D arrays with the auxiliary branches (must contain "nJ")
        process_weights: list of per-event weight arrays, aligned with `process_events`
        process_names: list of process name strings
        nJ: jet multiplicity to select (default 2)

    Returns:
        Category named "inclusive_{nJ}J" filled with the passing events of every process.
    """
    retcat = Category("inclusive_{}J".format(nJ))

    # the column index is the same for every process; look it up once
    nJ_col = TrainingConfig.auxiliary_branches.index("nJ")

    for cur_events, cur_aux_events, cur_weights, process_name in zip(
            process_events, process_aux_events, process_weights, process_names):
        # keep only events with the requested jet multiplicity
        cut = (cur_aux_events[:, nJ_col] == nJ)

        passed_events = cur_events[cut]
        passed_weights = cur_weights[cut]
        passed_aux = cur_aux_events[cut]

        # NOTE: removed leftover debug prints ("XXXXXX" + shape dumps) that
        # cluttered the output of every call
        retcat.add_events(events=passed_events,
                          weights=passed_weights,
                          process=process_name,
                          event_variables=TrainingConfig.training_branches,
                          aux_content=passed_aux,
                          aux_variables=TrainingConfig.auxiliary_branches)

    return retcat
def create_nJ_category(process_data, process_names, nJ = 2):
    """Build a Category holding only events with exactly `nJ` jets.

    Each entry of `process_data` is filtered through an `only_nJ` formatter
    and the surviving events are added to the returned Category under the
    matching name from `process_names`.
    """
    retcat = Category("inclusive_{}J".format(nJ))

    # a single formatter instance applies the nJ selection to every process
    selector = only_nJ(nJ = nJ)

    for cur_data, cur_name in zip(process_data, process_names):
        selected = selector.format_as_TrainingSample(cur_data)
        retcat.add_events(events = selected.data,
                          weights = selected.weights,
                          process = cur_name,
                          event_variables = TrainingConfig.training_branches)

    return retcat
def create_high_MET_category(process_data, process_names, nJ = 2, cuts = None):
    """Build the high-MET Category from per-process DataFrames.

    An event passes if MET > cuts["MET_cut"], dRBB < cuts["dRBB_highMET_cut"]
    and it has exactly `nJ` jets.

    Arguments:
        process_data: list of pandas DataFrames, one per process
        process_names: list of process name strings, aligned with `process_data`
        nJ: required jet multiplicity (default 2)
        cuts: dict with keys "MET_cut", "dRBB_highMET_cut", "dRBB_lowMET_cut";
            defaults to {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}

    Returns:
        Category named "high_MET" filled with the passing events.
    """
    # BUG FIX: the default cut dictionary was a mutable default argument,
    # shared across all calls; build it per-call instead
    if cuts is None:
        cuts = {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}

    retcat = Category("high_MET")

    for cur_process_data, cur_process_name in zip(process_data, process_names):
        # apply the cuts
        passed = cur_process_data.loc[(cur_process_data["MET"] > cuts["MET_cut"]) &
                                      (cur_process_data["dRBB"] < cuts["dRBB_highMET_cut"]) &
                                      (cur_process_data["nJ"] == nJ)]
        passed = TrainingSample.fromTable(passed)

        # fill the category
        retcat.add_events(events = passed.data,
                          weights = passed.weights,
                          process = cur_process_name,
                          event_variables = TrainingConfig.training_branches)
        print("filled {} events from process '{}'".format(sum(passed.weights), cur_process_name))

    return retcat
def create_low_MET_category(process_events, process_aux_events, process_weights, process_names, nJ=2, cuts=None):
    """Build the low-MET Category from per-process numpy arrays.

    An event passes if 150 < MET < cuts["MET_cut"], dRBB < cuts["dRBB_lowMET_cut"]
    and it has exactly `nJ` jets.

    Arguments:
        process_events: list of 2D arrays with the training branches
        process_aux_events: list of 2D arrays with the auxiliary branches
        process_weights: list of per-event weight arrays
        process_names: list of process name strings
        nJ: required jet multiplicity (default 2)
        cuts: dict with keys "MET_cut", "dRBB_highMET_cut", "dRBB_lowMET_cut";
            defaults to {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}

    Returns:
        Category named "low_MET" filled with the passing events.
    """
    # BUG FIX: the default cut dictionary was a mutable default argument,
    # shared across all calls; build it per-call instead
    if cuts is None:
        cuts = {"MET_cut": 191, "dRBB_highMET_cut": 1.2, "dRBB_lowMET_cut": 5.0}

    retcat = Category("low_MET")

    # column indices are identical for every process; look them up once
    MET_col = TrainingConfig.training_branches.index("MET")
    dRBB_col = TrainingConfig.auxiliary_branches.index("dRBB")
    nJ_col = TrainingConfig.auxiliary_branches.index("nJ")

    for cur_events, cur_aux_events, cur_weights, process_name in zip(
            process_events, process_aux_events, process_weights, process_names):
        # extract the branches that are needed for the cut
        cur_MET = cur_events[:, MET_col]
        cur_dRBB = cur_aux_events[:, dRBB_col]
        cur_nJ = cur_aux_events[:, nJ_col]

        # NOTE(review): the 150 lower MET bound is hard-coded — presumably the
        # analysis-wide low-MET region boundary; confirm against the caller
        cut = np.logical_and.reduce(
            (cur_MET > 150, cur_MET < cuts["MET_cut"],
             cur_dRBB < cuts["dRBB_lowMET_cut"], cur_nJ == nJ))

        passed_events = cur_events[cut]
        passed_weights = cur_weights[cut]
        passed_aux = cur_aux_events[cut]

        retcat.add_events(events=passed_events,
                          weights=passed_weights,
                          process=process_name,
                          event_variables=TrainingConfig.training_branches,
                          aux_content=passed_aux,
                          aux_variables=TrainingConfig.auxiliary_branches)
        print("filled {} events from sample '{}'".format(
            len(passed_events), process_name))

    return retcat
def create_classifier_category(env, process_events, process_aux_events, process_weights, process_names, signal_events, signal_aux_events, signal_weights, classifier_sigeff_range=(1, 0), nJ=2, interpret_as_sigeff=True, process_preds=None):
    """Fill a Category defined by a window on the classifier output.

    Arguments:
        env: model environment providing `predict(data=..., auxdat=...)`
        process_events / process_aux_events / process_weights / process_names:
            per-process arrays of training branches, auxiliary branches,
            weights and names
        signal_events / signal_aux_events / signal_weights: signal sample used
            to translate signal efficiencies into classifier-score cuts
        classifier_sigeff_range: (high, low) signal efficiencies — or raw
            classifier-score bounds when `interpret_as_sigeff` is False
        nJ: jet multiplicity cut; falsy value fills inclusively in nJ
        interpret_as_sigeff: whether `classifier_sigeff_range` holds
            efficiencies (True) or raw score bounds (False)
        process_preds: optional per-process classifier predictions; computed
            on demand when not supplied

    Returns:
        Category named "clf_{lo:.2f}_{hi:.2f}", with the classifier output
        stored as additional auxiliary variable "clf".

    Raises:
        Exception: if an efficiency range is given in ascending order.
    """
    # BUG FIX: use an explicit None check — 'if not process_preds' also fires
    # on an empty list and is ambiguous for numpy-array inputs
    if process_preds is None:
        process_preds = [None for _ in process_events]

    if interpret_as_sigeff:
        if classifier_sigeff_range[0] < classifier_sigeff_range[1]:
            raise Exception(
                "Warning: are you sure you understand what these cuts are doing? Lower signal efficiencies correspond to _harsher_ cuts, so expect (higher number, lower number)!"
            )

        # translate the signal-efficiency endpoints into cut values on the
        # classifier output, using the provided signal sample
        classifier_range = ClassifierBasedCategoryFiller._sigeff_range_to_score_range(
            env,
            signal_events,
            signal_weights,
            signal_aux_events,
            sigeff_range=classifier_sigeff_range)
        print(
            "translated signal efficiency range ({}, {}) to classifier output range ({}, {})"
            .format(classifier_sigeff_range[0], classifier_sigeff_range[1],
                    classifier_range[0], classifier_range[1]))
    else:
        # the given range is already expressed in raw classifier output
        classifier_range = classifier_sigeff_range

    retcat = Category("clf_{:.2f}_{:.2f}".format(
        classifier_sigeff_range[0], classifier_sigeff_range[1]))

    nJ_col = TrainingConfig.auxiliary_branches.index("nJ")

    for cur_events, cur_aux_events, cur_weights, process_name, cur_pred in zip(
            process_events, process_aux_events, process_weights,
            process_names, process_preds):
        # get the classifier predictions (unless supplied by the caller)
        if cur_pred is None:
            cur_pred = env.predict(data=cur_events, auxdat=cur_aux_events)[:, 1]

        cur_nJ = cur_aux_events[:, nJ_col]
        if nJ:
            # a cut on the number of jets was requested
            cut = np.logical_and.reduce(
                (cur_pred > classifier_range[0],
                 cur_pred < classifier_range[1], cur_nJ == nJ))
        else:
            # fill this category inclusively in the number of jets
            cut = np.logical_and.reduce((cur_pred > classifier_range[0],
                                         cur_pred < classifier_range[1]))

        passed_events = cur_events[cut]
        passed_weights = cur_weights[cut]
        passed_aux = cur_aux_events[cut]
        passed_pred = np.expand_dims(cur_pred[cut], axis=1)

        # also store the classifier output itself as auxiliary information
        aux_content = np.concatenate([passed_pred, passed_aux], axis=1)
        aux_variables = ["clf"] + TrainingConfig.auxiliary_branches

        retcat.add_events(events=passed_events,
                          weights=passed_weights,
                          process=process_name,
                          event_variables=TrainingConfig.training_branches,
                          aux_content=aux_content,
                          aux_variables=aux_variables)

    return retcat
def create_classifier_category(mcoll, sig_process_data, sig_process_names, bkg_process_data, bkg_process_names, classifier_sigeff_range=(1.0, 0.0), nJ=2):
    """Fill a Category defined by a window on the classifier output.

    The efficiency endpoints in `classifier_sigeff_range` are translated into
    classifier-score cuts using the combined signal sample (restricted to
    `nJ` jets), then all signal and background processes are filtered with
    the resulting score window and added to the returned Category.

    Arguments:
        mcoll: model collection providing `predict(data)` (column 1 is used)
        sig_process_data / sig_process_names: signal DataFrames and names
        bkg_process_data / bkg_process_names: background DataFrames and names
        classifier_sigeff_range: (high, low) signal-efficiency endpoints
        nJ: required jet multiplicity (default 2)

    Returns:
        Category named "clf_{lo:.2f}_{hi:.2f}".
    """
    # make sure to base all selections only on events with the correct number of jets
    sig_process_data = [
        cur_data.loc[cur_data["nJ"] == nJ] for cur_data in sig_process_data
    ]
    bkg_process_data = [
        cur_data.loc[cur_data["nJ"] == nJ] for cur_data in bkg_process_data
    ]

    # NOTE: removed two dead lists of per-process TrainingSamples that were
    # built here but never used; only the combined signal sample is needed
    all_signal_TrainingSample = TrainingSample.fromTable(
        pd.concat(sig_process_data))

    # obtain the classifier predictions on all samples
    sig_process_preds = [
        mcoll.predict(cur_data)[:, 1] for cur_data in sig_process_data
    ]
    bkg_process_preds = [
        mcoll.predict(cur_data)[:, 1] for cur_data in bkg_process_data
    ]
    all_signal_pred = np.concatenate(sig_process_preds, axis=0)

    # first, determine the cuts on the classifier based on the asked-for signal efficiency
    classifier_range = ClassifierBasedCategoryFiller._sigeff_range_to_score_range(
        all_signal_pred,
        all_signal_weights=all_signal_TrainingSample.weights,
        sigeff_range=classifier_sigeff_range)
    print(
        "translated signal efficiency range ({}, {}) to classifier output range ({}, {})"
        .format(classifier_sigeff_range[0], classifier_sigeff_range[1],
                classifier_range[0], classifier_range[1]))

    retcat = Category("clf_{:.2f}_{:.2f}".format(
        classifier_sigeff_range[0], classifier_sigeff_range[1]))

    # then fill all events from all signal + background processes
    process_data = sig_process_data + bkg_process_data
    process_names = sig_process_names + bkg_process_names
    process_preds = sig_process_preds + bkg_process_preds

    for cur_process_data, cur_process_name, cur_pred in zip(
            process_data, process_names, process_preds):
        print("predicting on sample {} with length {}".format(
            cur_process_name, len(cur_process_data)))

        cut = np.logical_and.reduce((cur_pred > classifier_range[0],
                                     cur_pred < classifier_range[1]))
        # predictions and table rows must stay aligned
        assert len(cut) == len(cur_process_data)

        passed = TrainingSample.fromTable(cur_process_data[cut])

        # fill the category
        retcat.add_events(events=passed.data,
                          weights=passed.weights,
                          process=cur_process_name,
                          event_variables=TrainingConfig.training_branches)
        print("filled {} events from process '{}'".format(
            sum(passed.weights), cur_process_name))

    return retcat
def MakeDistributionControlPlots(infile, outdir, test_size=0.999):
    """Produce per-process, per-variable distribution control plots.

    Fills an inclusive category plus inclusive 2-jet and 3-jet categories
    from the test split of every sample in `infile`, prints the expected
    event yields, and writes one plot (.pdf) and one exported histogram
    (.pkl) per (variable, process, category) combination into `outdir`.

    Arguments:
        infile: path to the HDF file holding one table per sample name
        outdir: directory receiving the generated .pdf / .pkl files
        test_size: fraction of each sample used for plotting; event weights
            are scaled by 1 / test_size to restore the full expected yield
    """
    # sig_samples = ["Hbb"]
    # bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson", "singletop"]  # for MadGraph
    sig_samples = ["Hbb"]
    bkg_samples = ["ttbar", "Zjets", "Wjets", "diboson"]
    samples = sig_samples + bkg_samples

    # set up proper binnings for different variables
    binnings = {}
    binnings["mBB"] = get_binning(30, 600, 10)
    binnings["dRBB"] = get_binning(0.0, 3.0, 0.1)
    binnings["pTB1"] = get_binning(0, 300, 10)
    binnings["pTB2"] = get_binning(0, 300, 10)
    binnings["MET"] = get_binning(0, 300, 10)
    binnings["dEtaBB"] = get_binning(0, 5, 0.1)
    binnings["dPhiMETdijet"] = get_binning(0, np.pi, 0.1)
    binnings["SumPtJet"] = get_binning(0, 500, 10)

    print("loading data ...")
    # BUG FIX: the parameter is called 'infile'; the original referenced the
    # undefined name 'infile_path' here, raising a NameError at runtime
    data = [pd.read_hdf(infile, key=sample) for sample in samples]
    for cur_df, sample in zip(data, samples):
        print("have {} events available for '{}'".format(len(cur_df), sample))

    data_test = []
    mBB_test = []  # collected for parity with similar plotting code; unused below
    weights_test = []
    aux_data_test = []
    for cur_sample in data:
        _, cur_test = train_test_split(cur_sample,
                                       test_size=test_size,
                                       shuffle=True,
                                       random_state=12345)
        # load the standard classifier input, nuisances and weights
        cur_testdata, cur_nuisdata, cur_weights = TrainNuisAuxSplit(cur_test)
        cur_aux_data = cur_test[TrainingConfig.other_branches].values
        data_test.append(cur_testdata)
        mBB_test.append(cur_nuisdata)
        weights_test.append(cur_weights / test_size)  # undo the test-split thinning
        aux_data_test.append(cur_aux_data)

    def _print_yields(title, category):
        # print a per-process summary of the expected (weighted) event yields
        print("============================")
        print(title)
        print("============================")
        for process in samples:
            print("{}: {} events".format(process,
                                         category.get_number_events(process)))
        print("============================")

    # first, plot the total event content (i.e. corresponding to an "inclusive" event category)
    inclusive = Category("inclusive")
    # BUG FIX: the original passed the stale loop variable 'cur_aux_data'
    # (the last sample's aux data) for every process; pass each process'
    # own aux data instead
    for events, weights, aux, process in zip(data_test, weights_test,
                                             aux_data_test, samples):
        inclusive.add_events(events=events,
                             weights=weights,
                             process=process,
                             event_variables=TrainingConfig.training_branches,
                             aux_content=aux,
                             aux_variables=TrainingConfig.other_branches)
    _print_yields(" inclusive expected event yield ", inclusive)

    # also fill inclusive 2- and 3-jet categories to get a baseline for the shapes
    inclusive_2J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=2)
    _print_yields(" inclusive 2j expected event yield ", inclusive_2J)

    inclusive_3J = CutBasedCategoryFiller.create_nJ_category(
        process_events=data_test,
        process_aux_events=aux_data_test,
        process_weights=weights_test,
        process_names=samples,
        nJ=3)
    _print_yields(" inclusive 3j expected event yield ", inclusive_3J)

    # now, create separate histograms for each process and each event variable;
    # the three categories differ only in the output-file tag and plot label
    categories = [(inclusive, "inclusive", "inclusive"),
                  (inclusive_2J, "inclusive_2J", "inclusive, nJ = 2"),
                  (inclusive_3J, "inclusive_3J", "inclusive, nJ = 3")]
    for cur_var in TrainingConfig.training_branches:
        if cur_var == "nJ":
            continue  # no plots for number of jets
        for cur_process in samples:
            for cur_cat, file_tag, label in categories:
                CategoryPlotter.plot_category_composition(
                    cur_cat,
                    binning=binnings[cur_var],
                    outpath=os.path.join(
                        outdir,
                        "dist_{}_{}_{}.pdf".format(cur_var, cur_process,
                                                   file_tag)),
                    var=cur_var,
                    process_order=[cur_process],
                    xlabel=cur_var,
                    plotlabel=[label],
                    args={})
                cur_cat.export_histogram(
                    binning=binnings[cur_var],
                    processes=[cur_process],
                    var_name=cur_var,
                    outfile=os.path.join(
                        outdir,
                        "dist_{}_{}_{}.pkl".format(cur_var, cur_process,
                                                   file_tag)),
                    clipping=True,
                    density=True)