def skim(self, file_index):
    try:
        dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
    except Exception as e:  # pylint: disable=broad-except
        print('failed to open file', self.l_reco[file_index], str(e))
        sys.exit()
    # To (hopefully) fix double signd0 issue with database when unpacking
    dfreco = dfreco.loc[:, ~dfreco.columns.duplicated()]
    for ipt in range(self.p_nptbins):
        dfrecosk = seldf_singlevar(dfreco, self.v_var_binning,
                                   self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
        dfrecosk = selectdfquery(dfrecosk, self.s_reco_skim[ipt])
        dfrecosk = dfrecosk.reset_index(drop=True)
        f = openfile(self.mptfiles_recosk[ipt][file_index], "wb")
        pickle.dump(dfrecosk, f, protocol=4)
        f.close()
        if self.mcordata == "mc":
            try:
                dfgen = pickle.load(openfile(self.l_gen[file_index], "rb"))
            except Exception as e:  # pylint: disable=broad-except
                print('failed to open MC file', self.l_gen[file_index], str(e))
            dfgensk = seldf_singlevar(dfgen, self.v_var_binning,
                                      self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
            dfgensk = selectdfquery(dfgensk, self.s_gen_skim[ipt])
            dfgensk = dfgensk.reset_index(drop=True)
            pickle.dump(dfgensk, openfile(self.mptfiles_gensk[ipt][file_index], "wb"),
                        protocol=4)
def applymodel(self, file_index):
    for ipt in range(self.p_nptbins):
        if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
            if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
                continue
        dfrecosk = pickle.load(openfile(self.mptfiles_recosk_forapply[ipt][file_index], "rb"))
        if self.doml is True:
            if os.path.isfile(self.lpt_model[ipt]) is False:
                print("Model file not present in bin %d" % ipt)
            if self.b_maskmissing:
                dfrecosk = dfrecosk.replace(self.v_varstomask, value=np.nan)
            mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
            dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                               dfrecosk, self.v_train[ipt], None)
            probvar = "y_test_prob" + self.p_modelname
            dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
        else:
            dfrecoskml = dfrecosk.query("isstd == 1")
        pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                    protocol=4)
        if self.do_mlprefilter is True and self.mcordata == "mc":
            dfgensk = pickle.load(openfile(self.mptfiles_gensk[ipt][file_index], "rb"))
            pickle.dump(dfgensk, openfile(self.mptfiles_genskmldec[ipt][file_index], "wb"),
                        protocol=4)
def process_histomass_single(self, index):
    myfile = TFile.Open(self.l_histomass[index], "recreate")
    dfevtorig = pickle.load(openfile(self.l_evtorig[index], "rb"))
    if self.s_trigger is not None:
        dfevtorig = dfevtorig.query(self.s_trigger)
    if self.runlistrigger is not None:
        dfevtorig = selectdfrunlist(dfevtorig, self.run_param[self.runlistrigger], "run_number")
    hNorm = TH1F("hEvForNorm", "hEvForNorm", 2, 0.5, 2.5)
    hNorm.GetXaxis().SetBinLabel(1, "normalisation factor")
    hNorm.GetXaxis().SetBinLabel(2, "selected events")
    nselevt = 0
    norm = 0
    if not dfevtorig.empty:
        nselevt = len(dfevtorig.query("is_ev_rej==0"))
        norm = getnormforselevt(dfevtorig)
    hNorm.SetBinContent(1, norm)
    hNorm.SetBinContent(2, nselevt)
    hNorm.Write()
    dfevtorig = dfevtorig.query("is_ev_rej==0")
    for ipt in range(self.p_nptfinbins):
        bin_id = self.bin_matching[ipt]
        df = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
        if self.doml is True:
            df = df.query(self.l_selml[bin_id])
        if self.s_evtsel is not None:
            df = df.query(self.s_evtsel)
        if self.s_trigger is not None:
            df = df.query(self.s_trigger)
        df = seldf_singlevar(df, self.v_var_binning,
                             self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        suffix = "%s%d_%d" % (self.v_var_binning,
                              self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                         self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        if self.runlistrigger is not None:
            df = selectdfrunlist(df, self.run_param[self.runlistrigger], "run_number")
        fill_hist(h_invmass, df.inv_mass)
        myfile.cd()
        h_invmass.Write()
        if self.mcordata == "mc":
            df[self.v_ismcrefl] = np.array(tag_bit_df(df, self.v_bitvar, self.b_mcrefl),
                                           dtype=int)
            df_sig = df[df[self.v_ismcsignal] == 1]
            df_refl = df[df[self.v_ismcrefl] == 1]
            h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            h_invmass_refl = TH1F("hmass_refl" + suffix, "", self.p_num_bins,
                                  self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            fill_hist(h_invmass_sig, df_sig.inv_mass)
            fill_hist(h_invmass_refl, df_refl.inv_mass)
            myfile.cd()
            h_invmass_sig.Write()
            h_invmass_refl.Write()
    print("FINISHED")
def process_valevents(self, file_index):
    dfevt = pickle.load(openfile(self.l_evt[file_index], "rb"))
    grouped = dfevt.groupby(self.v_evtmatch)
    for _, group in grouped:
        if len(group) > 1:
            print(len(group))
            print(group)
            print("WARNING: EVENT DUPLICATION")
    dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
    fileevtroot = TFile.Open(self.l_evtvalroot[file_index], "recreate")
    dfreco = dfreco.query("is_ev_rej == 0")
    h_n_tracklets = TH1F("h_n_tracklets", "h_n_tracklets", 100, -0.5, 99.5)
    h_n_tracklets_corr = TH1F("h_n_tracklets_corr", "h_n_tracklets_corr", 100, -0.5, 99.5)
    h_run = TH1F("h_run", "h_run", 100000, 200000, 300000)
    h_trigg = TH1F("h_trigg", "h_trigg", 2, -0.5, 1.5)
    fill_hist(h_n_tracklets_corr, dfreco["n_tracklets_corr"])
    fill_hist(h_n_tracklets, dfreco["n_tracklets"])
    fill_hist(h_run, dfreco["run_number"])
    hmultvsrun = scatterplotroot(dfreco, "n_tracklets_corr", "run_number",
                                 100, -0.5, 99.5, 100000, 200000.5, 300000.5)
    hmultvsrun.SetName("hmultvsrun")
    fill_hist(h_trigg, dfreco["is_ev_rej_INT7"])
    hmultvsrun.Write()
    h_n_tracklets_corr.Write()
    h_n_tracklets.Write()
    h_trigg.Write()
    h_run.Write()
    prof = hmultvsrun.ProfileY()
    prof.SetName("prof")
    prof.Write()
    fileevtroot.Close()
def skim(self, file_index):
    try:
        dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
    except Exception as e:  # pylint: disable=broad-except
        print('failed to open file', self.l_reco[file_index], str(e))
    for ipt in range(self.p_nptbins):
        dfrecosk = seldf_singlevar(dfreco, self.v_var_binning,
                                   self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
        dfrecosk = selectdfquery(dfrecosk, self.s_reco_skim[ipt])
        dfrecosk = dfrecosk.reset_index(drop=True)
        f = openfile(self.mptfiles_recosk[ipt][file_index], "wb")
        pickle.dump(dfrecosk, f, protocol=4)
        f.close()
        if self.mcordata == "mc":
            try:
                dfgen = pickle.load(openfile(self.l_gen[file_index], "rb"))
            except Exception as e:  # pylint: disable=broad-except
                print('failed to open MC file', self.l_gen[file_index], str(e))
            dfgensk = seldf_singlevar(dfgen, self.v_var_binning,
                                      self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
            dfgensk = selectdfquery(dfgensk, self.s_gen_skim[ipt])
            dfgensk = dfgensk.reset_index(drop=True)
            pickle.dump(dfgensk, openfile(self.mptfiles_gensk[ipt][file_index], "wb"),
                        protocol=4)
def do_apply(self):
    df_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                    self.df_data, self.v_train)
    df_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                  self.df_mc, self.v_train)
    pickle.dump(df_data, openfile(self.f_reco_applieddata, "wb"), protocol=4)
    pickle.dump(df_mc, openfile(self.f_reco_appliedmc, "wb"), protocol=4)
def prepare_data_mc_mcgen(self):
    self.logger.info("Prepare data reco as well as MC reco and gen")
    if os.path.exists(self.f_reco_applieddata) \
            and os.path.exists(self.f_reco_appliedmc) \
            and self.step_done("preparemlsamples_data_mc_mcgen"):
        self.df_data = pickle.load(openfile(self.f_reco_applieddata, "rb"))
        self.df_mc = pickle.load(openfile(self.f_reco_appliedmc, "rb"))
    else:
        self.df_data = pickle.load(openfile(self.f_reco_data, "rb"))
        self.df_mc = pickle.load(openfile(self.f_reco_mc, "rb"))
        self.df_data = selectdfquery(self.df_data, self.p_evtsel)
        self.df_mc = selectdfquery(self.df_mc, self.p_evtsel)
        self.df_data = selectdfquery(self.df_data, self.p_triggersel_data)
        self.df_mc = selectdfquery(self.df_mc, self.p_triggersel_mc)
    self.df_mcgen = pickle.load(openfile(self.f_gen_mc, "rb"))
    self.df_mcgen = selectdfquery(self.df_mcgen, self.p_evtsel)
    self.df_mcgen = selectdfquery(self.df_mcgen, self.p_triggersel_mc)
    self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)
    self.arraydf = [self.df_data, self.df_mc]
    self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax)
def do_scancuts(self):
    self.logger.info("Scanning cuts")
    prob_array = [0.0, 0.2, 0.6, 0.9]
    dfdata = pickle.load(openfile(self.f_reco_applieddata, "rb"))
    dfmc = pickle.load(openfile(self.f_reco_appliedmc, "rb"))
    vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier", prob_array,
                         self.dirmlplot, "scancutsmc", 0, self.p_plot_options)
    vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier", prob_array,
                         self.dirmlplot, "scancutsmc", 1, self.p_plot_options)
    vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier", prob_array,
                         self.dirmlplot, "scancutsdata", 0, self.p_plot_options)
    vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier", prob_array,
                         self.dirmlplot, "scancutsdata", 1, self.p_plot_options)
    if not self.v_cuts:
        self.logger.warning("No variables for cut efficiency scan. Will be skipped")
        return
    efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.5,
                       self.dirmlplot, "mc", self.p_plot_options)
    efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.9,
                       self.dirmlplot, "mc", self.p_plot_options)
    efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.5,
                       self.dirmlplot, "data", self.p_plot_options)
    efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.9,
                       self.dirmlplot, "data", self.p_plot_options)
def applymodel(self, file_index):
    for ipt in range(self.p_nptbins):
        if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
            if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
                continue
        dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
        if self.p_mask_values:
            mask_df(dfrecosk, self.p_mask_values)
        if self.doml is True:
            if os.path.isfile(self.lpt_model[ipt]) is False:
                print("Model file not present in bin %d" % ipt)
            mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
            if self.mltype == "MultiClassification":
                dfrecoskml = apply(self.mltype, [self.p_modelname], [mod],
                                   dfrecosk, self.v_train[ipt], self.multiclass_labels)
                prob0 = "y_test_prob" + self.p_modelname + self.multiclass_labels[0]
                prob1 = "y_test_prob" + self.p_modelname + self.multiclass_labels[1]
                dfrecoskml = dfrecoskml.loc[(dfrecoskml[prob0] <= self.lpt_probcutpre[ipt][0]) &
                                            (dfrecoskml[prob1] >= self.lpt_probcutpre[ipt][1])]
            else:
                dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                                   dfrecosk, self.v_train[ipt])
                probvar = "y_test_prob" + self.p_modelname
                dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
        else:
            dfrecoskml = dfrecosk.query("isstd == 1")
        pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                    protocol=4)
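# Minimal standalone sketch (not part of the framework) of the multiclass pre-selection
# applied above: keep candidates whose background score stays below one threshold while the
# signal score exceeds another. Column names and cut values here are illustrative assumptions.
import pandas as pd

def preselect_multiclass(df, bkg_col, sig_col, bkg_cut_max, sig_cut_min):
    """Return the rows passing an upper cut on bkg_col and a lower cut on sig_col."""
    return df.loc[(df[bkg_col] <= bkg_cut_max) & (df[sig_col] >= sig_cut_min)]

toy = pd.DataFrame({"y_test_probModelBkg": [0.1, 0.8, 0.2],
                    "y_test_probModelSig": [0.7, 0.1, 0.9]})
print(preselect_multiclass(toy, "y_test_probModelBkg", "y_test_probModelSig", 0.3, 0.5))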
def preparesample(self):
    self.logger.info("Prepare Sample")
    self.df_data = pickle.load(openfile(self.f_reco_data, "rb"))
    self.df_mc = pickle.load(openfile(self.f_reco_mc, "rb"))
    self.df_mcgen = pickle.load(openfile(self.f_gen_mc, "rb"))
    self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)
    arraydf = [self.df_data, self.df_mc]
    self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[self.p_tagbkg]
    self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_sig = self.df_sig.query(self.s_selsigml)
    self.df_bkg = self.df_bkg.query(self.s_selbkgml)
    self.df_bkg["ismcsignal"] = 0
    self.df_bkg["ismcprompt"] = 0
    self.df_bkg["ismcfd"] = 0
    self.df_bkg["ismcbkg"] = 0
    if self.p_nsig > len(self.df_sig):
        self.logger.warning("There are not enough signal events")
    if self.p_nbkg > len(self.df_bkg):
        self.logger.warning("There are not enough background events")
    self.p_nsig = min(len(self.df_sig), self.p_nsig)
    self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)
    self.logger.info("Used number of signal events is %d", self.p_nsig)
    self.logger.info("Used number of background events is %d", self.p_nbkg)
    self.df_ml = pd.DataFrame()
    self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
    self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
    self.df_sig = self.df_sig[:self.p_nsig]
    self.df_bkg = self.df_bkg[:self.p_nbkg]
    self.df_sig[self.v_sig] = 1
    self.df_bkg[self.v_sig] = 0
    self.df_ml = pd.concat([self.df_sig, self.df_bkg])
    self.df_mltrain, self.df_mltest = train_test_split(self.df_ml,
                                                       test_size=self.test_frac,
                                                       random_state=self.rnd_splt)
    self.df_mltrain = self.df_mltrain.reset_index(drop=True)
    self.df_mltest = self.df_mltest.reset_index(drop=True)
    self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain, self.v_sig)
    self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest, self.v_sig)
    self.logger.info("Total number of candidates: train %d and test %d",
                     len(self.df_mltrain), len(self.df_mltest))
    self.logger.info("Number of signal candidates: train %d and test %d",
                     len(self.df_sigtrain), len(self.df_sigtest))
    self.logger.info("Number of bkg candidates: train %d and test %d",
                     len(self.df_bkgtrain), len(self.df_bkgtest))
    self.df_xtrain = self.df_mltrain[self.v_train]
    self.df_ytrain = self.df_mltrain[self.v_sig]
    self.df_xtest = self.df_mltest[self.v_train]
    self.df_ytest = self.df_mltest[self.v_sig]
def applymodel_hipe4ml(self, file_index):
    for ipt in range(self.p_nptbins):
        if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
            if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
                continue
        print(self.mptfiles_recosk[ipt][file_index])
        dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
        dfrecosk = seldf_singlevar(dfrecosk, self.v_var_binning,
                                   self.lpt_anbinmintr[ipt], self.lpt_anbinmaxtr[ipt])
        if self.doml is True:
            if os.path.isfile(self.lpt_modhandler_hipe4ml[ipt]) is False:
                print("hipe4ml model file not present in bin %d" % ipt)
            if self.b_maskmissing:
                dfrecosk = dfrecosk.replace(self.v_varstomask, value=np.nan)
            modhandler = pickle.load(openfile(self.lpt_modhandler_hipe4ml[ipt], 'rb'))
            mod = modhandler.get_original_model()
            #njobsgrid = {'n_jobs': -1}
            #mod.set_params(**njobsgrid)
            if self.mltype == "MultiClassification":
                dfrecoskml = apply("MultiClassification", [self.p_modelname], [mod],
                                   dfrecosk, self.v_train[ipt], self.multiclass_labels)
                probvar0 = 'y_test_prob' + self.p_modelname + self.multiclass_labels[0]
                probvar1 = 'y_test_prob' + self.p_modelname + self.multiclass_labels[1]
                dfrecoskml = dfrecoskml.loc[
                    (dfrecoskml[probvar0] <= self.lpt_probcutpre[ipt][0]) &
                    (dfrecoskml[probvar1] >= self.lpt_probcutpre[ipt][1])]
            else:
                dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                                   dfrecosk, self.v_train[ipt], None)
                probvar = "y_test_prob" + self.p_modelname
                dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
        else:
            dfrecoskml = dfrecosk.query("isstd == 1")
        pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                    protocol=4)
        if (self.do_mlprefilter is True or self.apply_w_pkl_layout is True) \
                and self.mcordata == "mc":
            dfgensk = pickle.load(openfile(self.mptfiles_gensk[ipt][file_index], "rb"))
            pickle.dump(dfgensk, openfile(self.mptfiles_genskmldec[ipt][file_index], "wb"),
                        protocol=4)
def applymodel(self, file_index):
    for ipt in range(self.p_nptbins):
        dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
        if os.path.isfile(self.lpt_model[ipt]) is False:
            print("Model file not present in bin %d" % ipt)
        mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
        dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                           dfrecosk, self.v_train)
        probvar = "y_test_prob" + self.p_modelname
        dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
        pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                    protocol=4)
def unpack(self, file_index):
    treeevtorig = uproot.open(self.l_root[file_index])[self.n_treeevt]
    dfevtorig = treeevtorig.pandas.df(branches=self.v_evt)
    dfevtorig = selectdfrunlist(dfevtorig, self.runlist, "run_number")
    dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp)
    dfevtorig = dfevtorig.reset_index(drop=True)
    pickle.dump(dfevtorig, openfile(self.l_evtorig[file_index], "wb"), protocol=4)
    dfevt = selectdfquery(dfevtorig, self.s_good_evt_unp)
    dfevt = dfevt.reset_index(drop=True)
    pickle.dump(dfevt, openfile(self.l_evt[file_index], "wb"), protocol=4)
    treereco = uproot.open(self.l_root[file_index])[self.n_treereco]
    dfreco = treereco.pandas.df(branches=self.v_all)
    dfreco = selectdfrunlist(dfreco, self.runlist, "run_number")
    dfreco = selectdfquery(dfreco, self.s_reco_unp)
    dfreco = pd.merge(dfreco, dfevt, on=self.v_evtmatch)
    isselacc = selectfidacc(dfreco.pt_cand.values, dfreco.y_cand.values)
    dfreco = dfreco[np.array(isselacc, dtype=bool)]
    if self.b_trackcuts is not None:
        dfreco = filter_bit_df(dfreco, self.v_bitvar, self.b_trackcuts)
    dfreco[self.v_isstd] = np.array(tag_bit_df(dfreco, self.v_bitvar, self.b_std), dtype=int)
    dfreco = dfreco.reset_index(drop=True)
    if self.mcordata == "mc":
        dfreco[self.v_ismcsignal] = np.array(tag_bit_df(dfreco, self.v_bitvar, self.b_mcsig),
                                             dtype=int)
        dfreco[self.v_ismcprompt] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                        self.b_mcsigprompt), dtype=int)
        dfreco[self.v_ismcfd] = np.array(tag_bit_df(dfreco, self.v_bitvar, self.b_mcsigfd),
                                         dtype=int)
        dfreco[self.v_ismcbkg] = np.array(tag_bit_df(dfreco, self.v_bitvar, self.b_mcbkg),
                                          dtype=int)
    pickle.dump(dfreco, openfile(self.l_reco[file_index], "wb"), protocol=4)
    if self.mcordata == "mc":
        treegen = uproot.open(self.l_root[file_index])[self.n_treegen]
        dfgen = treegen.pandas.df(branches=self.v_gen)
        dfgen = selectdfrunlist(dfgen, self.runlist, "run_number")
        dfgen = pd.merge(dfgen, dfevtorig, on=self.v_evtmatch)
        dfgen = selectdfquery(dfgen, self.s_gen_unp)
        dfgen[self.v_isstd] = np.array(tag_bit_df(dfgen, self.v_bitvar, self.b_std), dtype=int)
        dfgen[self.v_ismcsignal] = np.array(tag_bit_df(dfgen, self.v_bitvar, self.b_mcsig),
                                            dtype=int)
        dfgen[self.v_ismcprompt] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                       self.b_mcsigprompt), dtype=int)
        dfgen[self.v_ismcfd] = np.array(tag_bit_df(dfgen, self.v_bitvar, self.b_mcsigfd),
                                        dtype=int)
        dfgen[self.v_ismcbkg] = np.array(tag_bit_df(dfgen, self.v_bitvar, self.b_mcbkg),
                                         dtype=int)
        dfgen = dfgen.reset_index(drop=True)
        pickle.dump(dfgen, openfile(self.l_gen[file_index], "wb"), protocol=4)
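# Illustrative sketch only: tag_bit_df / filter_bit_df are framework helpers whose bodies are
# not shown in this excerpt. The sketch below shows the generic idea of tagging candidates by
# testing bits of an integer status column with numpy bitwise operations; the column name and
# bit positions used here are assumptions.
import numpy as np
import pandas as pd

def tag_bits(df, bitcolumn, bits_to_require):
    """Return a boolean array that is True where all requested bits are set."""
    mask = 0
    for bit in bits_to_require:
        mask |= (1 << bit)
    return (df[bitcolumn].to_numpy().astype(np.int64) & mask) == mask

toy = pd.DataFrame({"cand_type": [0b011, 0b100, 0b111]})
toy["isstd"] = np.array(tag_bits(toy, "cand_type", [0, 1]), dtype=int)
print(toy)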
def process_response(self):
    list_df_mc_reco = []
    list_df_mc_gen = []
    for iptskim, _ in enumerate(self.lpt_anbinmin):
        df_mc_reco = pickle.load(openfile(self.lpt_recodecmerged[iptskim], "rb"))
        if "pt_jet" not in df_mc_reco.columns:
            print("Jet variables not found in the dataframe. Skipping process_response.")
            return
        if self.s_evtsel is not None:
            df_mc_reco = df_mc_reco.query(self.s_evtsel)
        if self.s_trigger is not None:
            df_mc_reco = df_mc_reco.query(self.s_trigger)
        df_mc_reco = df_mc_reco.query(self.l_selml[iptskim])
        list_df_mc_reco.append(df_mc_reco)
        df_mc_gen = pickle.load(openfile(self.lpt_gendecmerged[iptskim], "rb"))
        df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff)
        list_df_mc_gen.append(df_mc_gen)
    df_mc_reco_merged = pd.concat(list_df_mc_reco)
    df_mc_gen_merged = pd.concat(list_df_mc_gen)
    df_mc_reco_merged_fd = df_mc_reco_merged[df_mc_reco_merged.ismcfd == 1]  # reconstructed & selected non-prompt jets
    df_mc_gen_merged_fd = df_mc_gen_merged[df_mc_gen_merged.ismcfd == 1]  # generated & selected non-prompt jets
    out_file = TFile.Open(self.n_fileeff, "update")
    # Detector response matrix of pt_jet of non-prompt jets
    df_resp_jet_fd = df_mc_reco_merged_fd.loc[:, ["pt_gen_jet", "pt_jet"]]
    his_resp_jet_fd = TH2F("his_resp_jet_fd", "Response matrix of #it{p}_{T}^{jet, ch} of non-prompt jets;#it{p}_{T}^{jet, ch, gen.} (GeV/#it{c});#it{p}_{T}^{jet, ch, rec.} (GeV/#it{c})",
                           100, 0, 100, 100, 0, 100)
    fill_hist(his_resp_jet_fd, df_resp_jet_fd)
    # Simulated pt_cand vs. pt_jet of non-prompt jets
    df_ptc_ptjet_fd = df_mc_gen_merged_fd.loc[:, ["pt_cand", "pt_jet"]]
    n_bins = len(self.lpt_finbinmin)
    analysis_bin_lims_temp = self.lpt_finbinmin.copy()
    analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1])
    analysis_bin_lims = array.array('d', analysis_bin_lims_temp)
    his_ptc_ptjet_fd = TH2F("his_ptc_ptjet_fd", "Simulated #it{p}_{T}^{cand.} vs. #it{p}_{T}^{jet} of non-prompt jets;#it{p}_{T}^{cand., gen.} (GeV/#it{c});#it{p}_{T}^{jet, ch, gen.} (GeV/#it{c})",
                            n_bins, analysis_bin_lims, 100, 0, 100)
    fill_hist(his_ptc_ptjet_fd, df_ptc_ptjet_fd)
    out_file.cd()
    his_resp_jet_fd.Write()
    his_ptc_ptjet_fd.Write()
    out_file.Close()
def process_valevents(self, file_index):
    dfevt = pickle.load(openfile(self.l_evtorig[file_index], "rb"))
    dfevt = dfevt.query("is_ev_rej==0")
    triggerlist = ["HighMultV0", "HighMultSPD", "INT7"]
    varlist = ["v0m_corr", "n_tracklets_corr", "perc_v0m"]
    # NB: the last entry of each binning list below is an assumed placeholder for perc_v0m
    # (a 0-100% percentile), added so that the lists have one entry per variable in varlist.
    nbinsvar = [100, 200, 100]
    minrvar = [0, 0, 0]
    maxrvar = [1500, 200, 100]
    fileevtroot = TFile.Open(self.l_evtvalroot[file_index], "recreate")
    hv0mvsperc = scatterplot(dfevt, "perc_v0m", "v0m_corr", 50000, 0, 100, 200, 0., 2000.)
    hv0mvsperc.SetName("hv0mvsperc")
    hv0mvsperc.Write()
    for ivar, var in enumerate(varlist):
        label = "hbitINT7vs%s" % (var)
        histoMB = TH1F(label, label, nbinsvar[ivar], minrvar[ivar], maxrvar[ivar])
        fill_hist(histoMB, dfevt.query("trigger_hasbit_INT7==1")[var])
        histoMB.Sumw2()
        histoMB.Write()
        for trigger in triggerlist:
            triggerbit = "trigger_hasbit_%s==1" % trigger
            labeltriggerANDMB = "hbit%sANDINT7vs%s" % (trigger, var)
            labeltrigger = "hbit%svs%s" % (trigger, var)
            histotrigANDMB = TH1F(labeltriggerANDMB, labeltriggerANDMB,
                                  nbinsvar[ivar], minrvar[ivar], maxrvar[ivar])
            histotrig = TH1F(labeltrigger, labeltrigger,
                             nbinsvar[ivar], minrvar[ivar], maxrvar[ivar])
            fill_hist(histotrigANDMB,
                      dfevt.query(triggerbit + " and trigger_hasbit_INT7==1")[var])
            fill_hist(histotrig, dfevt.query(triggerbit)[var])
            histotrigANDMB.Sumw2()
            histotrig.Sumw2()
            histotrigANDMB.Write()
            histotrig.Write()
    dfevtnorm = pickle.load(openfile(self.l_evtorig[file_index], "rb"))
    hNorm = TH1F("hEvForNorm", ";;Normalisation", 2, 0.5, 2.5)
    hNorm.GetXaxis().SetBinLabel(1, "normalisation factor")
    hNorm.GetXaxis().SetBinLabel(2, "selected events")
    nselevt = 0
    norm = 0
    if not dfevtnorm.empty:
        nselevt = len(dfevtnorm.query("is_ev_rej==0"))
        norm = getnormforselevt(dfevtnorm)
    hNorm.SetBinContent(1, norm)
    hNorm.SetBinContent(2, nselevt)
    hNorm.Write()
    fileevtroot.Close()
def process_histomass(self):
    myfile = TFile.Open(self.n_filemass, "recreate")
    for ipt in range(self.p_nptfinbins):
        bin_id = self.bin_matching[ipt]
        df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
        df = df.query(self.l_selml[bin_id])
        if self.s_evtsel is not None:
            df = df.query(self.s_evtsel)
        if self.s_trigger is not None:
            df = df.query(self.s_trigger)
        df = seldf_singlevar(df, self.v_var_binning,
                             self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        for ibin2 in range(len(self.lvar2_binmin)):
            suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                     (self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt],
                      self.lpt_probcutfin[bin_id], self.v_var2_binning,
                      self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                             self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            df_bin = seldf_singlevar(df, self.v_var2_binning,
                                     self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            fill_hist(h_invmass, df_bin.inv_mass)
            myfile.cd()
            h_invmass.Write()
            if "pt_jet" in df_bin.columns:
                zarray = z_calc(df_bin.pt_jet, df_bin.phi_jet, df_bin.eta_jet,
                                df_bin.pt_cand, df_bin.phi_cand, df_bin.eta_cand)
                h_zvsinvmass = TH2F("hzvsmass" + suffix, "",
                                    5000, 1.00, 6.00, 2000, -0.5, 1.5)
                zvsinvmass = np.vstack((df_bin.inv_mass, zarray)).T
                fill_hist(h_zvsinvmass, zvsinvmass)
                h_zvsinvmass.Write()
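# z_calc is a framework helper whose body is not shown in this excerpt. It presumably returns
# the longitudinal momentum fraction z = (p_cand . p_jet) / |p_jet|^2 built from the (pt, phi,
# eta) of jet and candidate. Minimal numpy sketch under that assumption:
import numpy as np

def z_frac(pt_jet, phi_jet, eta_jet, pt_cand, phi_cand, eta_cand):
    """Longitudinal momentum fraction of the candidate with respect to the jet axis."""
    px_j, py_j, pz_j = pt_jet * np.cos(phi_jet), pt_jet * np.sin(phi_jet), pt_jet * np.sinh(eta_jet)
    px_c, py_c, pz_c = pt_cand * np.cos(phi_cand), pt_cand * np.sin(phi_cand), pt_cand * np.sinh(eta_cand)
    return (px_c * px_j + py_c * py_j + pz_c * pz_j) / (px_j**2 + py_j**2 + pz_j**2)

print(z_frac(np.array([10.]), np.array([0.1]), np.array([0.0]),
             np.array([4.]), np.array([0.1]), np.array([0.0])))  # -> [0.4]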
def do_test(self):
    df_ml_test = test(self.p_mltype, self.p_classname, self.p_trainedmod,
                      self.df_mltest, self.v_train, self.v_sig)
    df_ml_test_to_df = self.dirmlout + "/testsample_%s_mldecision.pkl" % (self.s_suffix)
    df_ml_test_to_root = self.dirmlout + "/testsample_%s_mldecision.root" % (self.s_suffix)
    pickle.dump(df_ml_test, openfile(df_ml_test_to_df, "wb"), protocol=4)
    write_tree(df_ml_test_to_root, self.n_treetest, df_ml_test)
def do_test(self):
    self.do_train()
    if self.step_done("test"):
        self.df_mltest_applied = pickle.load(openfile(self.f_mltest_applied, "rb"))
        return
    self.logger.info("Testing")
    self.df_mltest_applied = test(self.p_mltype, self.p_classname, self.p_trainedmod,
                                  self.df_mltest, self.v_train, self.v_sig)
    df_ml_test_to_root = self.dirmlout + "/testsample_%s_mldecision.root" % (self.s_suffix)
    pickle.dump(self.df_mltest_applied, openfile(self.f_mltest_applied, "wb"), protocol=4)
    write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied)
def do_apply(self):
    self.prepare_data_mc_mcgen()
    if self.step_done("application"):
        return
    self.do_train()
    self.logger.info("Application")
    df_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                    self.df_data, self.v_train)
    df_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                  self.df_mc, self.v_train)
    pickle.dump(df_data, openfile(self.f_reco_applieddata, "wb"), protocol=4)
    pickle.dump(df_mc, openfile(self.f_reco_appliedmc, "wb"), protocol=4)
def check_duplicates(file_path, cols):
    """Open a pickled dataframe and check the given columns for duplicated rows."""
    df = pickle.load(openfile(file_path, "rb"))[cols]
    len_orig = len(df)
    df_dupl = df[df.duplicated(keep=False)]
    len_dupl = len(df_dupl)
    return len_orig, len_dupl, df_dupl
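# Usage sketch for check_duplicates; the pickle path and column names below are hypothetical,
# only meant to illustrate the call and the three returned values.
n_all, n_dupl, df_dupl = check_duplicates("/some/path/AnalysisResultsReco.pkl",
                                          ["run_number", "ev_id", "pt_cand"])
print("candidates: %d, duplicated: %d (%.2f%%)" % (n_all, n_dupl, 100. * n_dupl / max(n_all, 1)))
if n_dupl:
    print(df_dupl.head())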
def create_bkg_df(size):
    dfreco = pickle.load(openfile("./results_test/filtrated_df.pkl", "rb"))
    print("data sample loaded", dfreco.shape)
    d_bkg = dfreco[dfreco["is_d"] == 1]
    nd_bkg = dfreco[dfreco["is_d"] == 0]
    frames = [d_bkg, nd_bkg]
    bkg = pd.concat(frames)
    bkg["ismcsignal"] = 0
    bkg = bkg.sample(n=size)
    print(bkg)
    return bkg
def process_histomass(self):
    for ipt in range(self.p_nptbins):
        myfile = TFile.Open(self.lpt_filemass[ipt], "recreate")
        df = pickle.load(openfile(self.lpt_recodecmerged[ipt], "rb"))
        df = df.query(self.l_selml[ipt])
        h_invmass = TH1F("hmass", "", self.p_num_bins,
                         self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        fill_hist(h_invmass, df.inv_mass)
        myfile.cd()
        h_invmass.Write()
        canv_mass = TCanvas("c%d" % (ipt), "canvas", 500, 500)
        h_invmass.Draw()
        canv_mass.SaveAs("%s/chisto_bin%d.pdf" % (self.d_results, ipt))
def process_histomass(self):
    myfile = TFile.Open(self.n_filemass, "recreate")
    for ipt in range(self.p_nptfinbins):
        bin_id = self.bin_matching[ipt]
        df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
        df = df.query(self.l_selml[bin_id])
        if self.s_evtsel is not None:
            df = df.query(self.s_evtsel)
        if self.s_trigger is not None:
            df = df.query(self.s_trigger)
        df = seldf_singlevar(df, self.v_var_binning,
                             self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        for ibin2 in range(len(self.lvar2_binmin)):
            suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                     (self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt],
                      self.lpt_probcutfin[bin_id], self.v_var2_binning,
                      self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                             self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            df_bin = seldf_singlevar(df, self.v_var2_binning,
                                     self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            weights = None
            #apply_weights = self.datap["analysis"][self.typean]["triggersel"]["weights"]
            #if apply_weights is not None:
            #    filenorm = TFile.Open("norm.root", "read")
            #    hnorm = filenorm.Get("hnorm_" + apply_weights[0] + "_" + apply_weights[1])
            #    weights = [hnorm.GetBinContent(hnorm.FindBin(_bin)) \
            #               for _bin in df_bin[apply_weights[0]]]
            fill_hist(h_invmass, df_bin.inv_mass, weights=weights)
            myfile.cd()
            h_invmass.Write()
            if "pt_jet" in df_bin.columns:
                zarray = z_calc(df_bin.pt_jet, df_bin.phi_jet, df_bin.eta_jet,
                                df_bin.pt_cand, df_bin.phi_cand, df_bin.eta_cand)
                h_zvsinvmass = TH2F("hzvsmass" + suffix, "",
                                    5000, 1.00, 6.00, 2000, -0.5, 1.5)
                zvsinvmass = np.vstack((df_bin.inv_mass, zarray)).T
                fill_hist(h_zvsinvmass, zvsinvmass)
                h_zvsinvmass.Write()
def process_histomass(self):
    for ipt in range(self.p_nptbins):
        myfile = TFile.Open(self.lpt_filemass[ipt], "recreate")
        df = pickle.load(openfile(self.lpt_recodecmerged[ipt], "rb"))
        df = df.query(self.l_selml[ipt])
        h_invmass = TH1F("hmass", "", self.p_num_bins,
                         self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        fill_hist(h_invmass, df.inv_mass)
        myfile.cd()
        if self.usefit is True:
            fitter(h_invmass, self.p_casefit, self.p_sgnfunc[ipt], self.p_bkgfunc[ipt],
                   self.p_masspeak, self.p_rebin[ipt], self.p_dolike, self.p_fixingausmean,
                   self.p_fixingaussigma, self.p_sigmaarray[ipt], self.p_massmin[ipt],
                   self.p_massmax[ipt], self.p_fixedmean, self.p_fixedsigma, self.d_results,
                   self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
        h_invmass.Write()
        canv_mass = TCanvas("c%d" % (ipt), "canvas", 500, 500)
        h_invmass.Draw()
        canv_mass.SaveAs("%s/chisto_bin%d.pdf" % (self.d_results, ipt))
def process_histomass(self):
    myfile = TFile.Open(self.n_filemass, "recreate")
    for ipt in range(self.p_nptfinbins):
        bin_id = self.bin_matching[ipt]
        df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
        df = df.query(self.l_selml[bin_id])
        df = seldf_singlevar(df, self.v_var_binning,
                             self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        for ibin2 in range(len(self.lvar2_binmin)):
            suffix = "%s%d_%d_%.2f%s_%d_%d" % \
                     (self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt],
                      self.lpt_probcutfin[bin_id], self.v_var2_binning,
                      self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                             self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            df_bin = seldf_singlevar(df, self.v_var2_binning,
                                     self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
            fill_hist(h_invmass, df_bin.inv_mass)
            myfile.cd()
            h_invmass.Write()
def fill_from_pickles(file_paths, histo_params, cols=None, query=None, skim_func=None,
                      skim_func_args=None, queries=None, merge_on=None):
    print(f"Process files {file_paths}")
    dfs = [pickle.load(openfile(f, "rb")) for f in file_paths]
    df = dfs[0]
    if len(dfs) > 1:
        if merge_on and len(merge_on) != len(dfs) - 1:
            print(f"ERROR: merge_on must be {len(dfs) - 1} however found to be {len(merge_on)}")
            sys.exit(1)
        for df_, on in zip(dfs[1:], merge_on):
            # Recursively merge dataframes
            df = pd.merge(df, df_, on=on)
    if query:
        # Apply common query
        df = df.query(query)
    if cols:
        # Select only the columns needed in the following
        df = df[cols]
    if skim_func:
        # Skim the dataframe according to user function
        df = skim_func(df, skim_func_args)
    histos = []
    if not queries:
        queries = [None] * len(histo_params)
    if len(queries) != len(histo_params):
        print("ERROR: Need as many queries as histogram parameters")
        sys.exit(1)
    for hp, qu in zip(histo_params, queries):
        n_cols = len(hp[0])
        if n_cols > 2:
            print("ERROR: Cannot handle plots with dimension > 2")
            sys.exit(1)
        histo_func = TH1F if n_cols == 1 else TH2F
        df_fill = df
        if qu:
            # If there is an additional query for this histogram apply it to the dataframe
            df_fill = df.query(qu)
        # Arrange for 1D or 2D plotting
        fill_with = df_fill[hp[0][0]] if n_cols == 1 else df_fill[hp[0]].to_numpy()
        histo_name = "_".join(hp[0])
        histo = histo_func(histo_name, histo_name, *hp[1])
        weights = df_fill[hp[2]] if len(hp) == 3 else None
        fill_hist(histo, fill_with, weights=weights)
        histo.SetDirectory(0)
        histos.append(histo)
    return histos
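# Usage sketch for fill_from_pickles: build a 1D and a 2D histogram from one pickled dataframe.
# The file path, column names and binning below are hypothetical.
histo_params = [(["inv_mass"], (100, 1.7, 2.1)),                          # 1D: columns, (nbins, low, high)
                (["pt_cand", "inv_mass"], (50, 0., 50., 100, 1.7, 2.1))]  # 2D: columns, (nx, xlo, xhi, ny, ylo, yhi)
histos = fill_from_pickles(["/some/path/AnalysisResultsReco.pkl"], histo_params,
                           cols=["inv_mass", "pt_cand"], query="pt_cand > 2")
for h in histos:
    print(h.GetName(), h.GetEntries())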
def do_significance(self):
    self.logger.info("Doing significance optimization")
    # first extract the number of data events in the ml sample
    self.df_evt_data = pickle.load(openfile(self.f_evt_data, 'rb'))
    if self.p_dofullevtmerge is True:
        self.df_evttotsample_data = pickle.load(openfile(self.f_evttotsample_data, 'rb'))
    else:
        self.logger.info("The total merged event dataframe was not merged for space limits")
        self.df_evttotsample_data = pickle.load(openfile(self.f_evt_data, 'rb'))
    # and the total number of events
    self.p_nevttot = len(self.df_evttotsample_data)
    self.p_nevtml = len(self.df_evt_data)
    self.logger.info("Number of data events used for ML: %d", self.p_nevtml)
    self.logger.info("Total number of data events: %d", self.p_nevttot)
    # calculate acceptance correction. we use in this case all the signal from the mc sample,
    # without limiting to the n. signal events used for training
    denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
    numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
    acc, acc_err = self.calceff(numacc, denacc)
    self.logger.info("Acceptance: %.3e +/- %.3e", acc, acc_err)
    # calculation of the expected fonll signals
    df_fonll = pd.read_csv(self.f_fonll)
    ptmin = self.p_binmin
    ptmax = self.p_binmax
    df_fonll_in_pt = df_fonll.query('(pt >= @ptmin) and (pt < @ptmax)')[self.p_fonllband]
    prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / len(df_fonll_in_pt)
    delta_pt = ptmax - ptmin
    signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                   / (self.p_sigmamb * self.p_fprompt)
    self.logger.info("Expected signal yield: %.3e", signal_yield)
    signal_yield = self.p_raahp * signal_yield
    self.logger.info("Expected signal yield x RAA hp: %.3e", signal_yield)
    # now we plot the fonll expectation
    plt.figure(figsize=(20, 15))
    plt.subplot(111)
    plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
    plt.xlabel('P_t [GeV/c]', fontsize=20)
    plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
    plt.title("FONLL cross section " + self.p_case, fontsize=20)
    plt.semilogy()
    plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')
    df_data_sideband = self.df_data.query(self.s_selbkgml)
    df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
    df_data_sideband = df_data_sideband.tail(round(len(df_data_sideband) * self.p_bkgfracopt))
    hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
    mass_array = df_mc_signal['inv_mass'].values
    for mass_value in np.nditer(mass_array):
        hmass.Fill(mass_value)
    gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
    gaus_fit.SetParameters(0, hmass.Integral())
    gaus_fit.SetParameters(1, self.p_mass)
    gaus_fit.SetParameters(2, 0.02)
    self.logger.info("To fit the signal a gaussian function is used")
    fitsucc = hmass.Fit("gaus_fit", "RQ")
    if int(fitsucc) != 0:
        self.logger.warning("Problem in signal peak fit")
        sigma = 0.
    sigma = gaus_fit.GetParameter(2)
    self.logger.info("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1))
    self.logger.info("Sigma of the gaussian: %.3e", sigma)
    sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]
    fig_signif_pevt = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
    plt.title("Significance Per Event vs Threshold", fontsize=20)
    fig_signif = plt.figure(figsize=(20, 15))
    plt.xlabel('Threshold', fontsize=20)
    plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
    plt.title("Significance vs Threshold", fontsize=20)
    for name in self.p_classname:
        df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]
        eff_array, eff_err_array, x_axis = self.calc_sigeff_steps(self.p_nstepsign,
                                                                  df_sig, name)
        bkg_array, bkg_err_array, _ = calc_bkg(df_data_sideband, name, self.p_nstepsign,
                                               self.p_mass_fit_lim, self.p_bin_width,
                                               sig_region, self.p_savefit, self.dirmlplot)
        sig_array = [eff * signal_yield for eff in eff_array]
        sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
        bkg_array = [bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array]
        bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml)
                         for bkg_err in bkg_err_array]
        signif_array, signif_err_array = calc_signif(sig_array, sig_err_array,
                                                     bkg_array, bkg_err_array)
        plt.figure(fig_signif_pevt.number)
        plt.errorbar(x_axis, signif_array, yerr=signif_err_array, alpha=0.3,
                     label=f'{name}', elinewidth=2.5, linewidth=4.0)
        signif_array_ml = [sig * sqrt(self.p_nevtml) for sig in signif_array]
        signif_err_array_ml = [sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml, alpha=0.3,
                     label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=4.0)
        signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
        signif_err_array_tot = [sig_err * sqrt(self.p_nevttot)
                                for sig_err in signif_err_array]
        plt.figure(fig_signif.number)
        plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, alpha=0.3,
                     label=f'{name}_Tot', elinewidth=2.5, linewidth=4.0)
        plt.figure(fig_signif_pevt.number)
        plt.legend(loc="lower left", prop={'size': 18})
        plt.savefig(f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
        plt.figure(fig_signif.number)
        plt.legend(loc="lower left", prop={'size': 18})
        plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
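# calc_signif is a framework helper not shown in this excerpt; presumably it computes the
# expected significance s / sqrt(s + b) per working point with simple error propagation.
# Minimal sketch under that assumption:
from math import sqrt

def signif_and_error(sig, sig_err, bkg, bkg_err):
    """Return s/sqrt(s+b) and its propagated uncertainty."""
    if sig + bkg <= 0:
        return 0., 0.
    signif = sig / sqrt(sig + bkg)
    d_ds = (sig + 2 * bkg) / (2 * (sig + bkg) ** 1.5)   # d(signif)/d(s)
    d_db = -sig / (2 * (sig + bkg) ** 1.5)              # d(signif)/d(b)
    err = sqrt((d_ds * sig_err) ** 2 + (d_db * bkg_err) ** 2)
    return signif, err

print(signif_and_error(100., 10., 400., 20.))  # ~ (4.47, 0.43)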
def process_response(self): """ First of all, we load all the mc gen and reco files that are skimmed in bins of HF candidate ptand we apply the standard selection to all of them. After this, we merged them all to create a single file of gen and reco monte carlo sample with all the HF candidate pt. In particular gen jets are selected according to run trigger, runlist, and gen jet zbin_recoand pseudorapidity. Reco candidates according to evt selection, eta jets, trigger and ml probability of the HF hadron """ zbin_reco = [] nzbin_reco = self.p_nbinshape_reco zbin_reco = self.varshaperanges_reco zbinarray_reco = array.array('d', zbin_reco) zbin_gen = [] nzbin_gen = self.p_nbinshape_gen zbin_gen = self.varshaperanges_gen zbinarray_gen = array.array('d', zbin_gen) jetptbin_reco = [] njetptbin_reco = self.p_nbin2_reco jetptbin_reco = self.var2ranges_reco jetptbinarray_reco = array.array('d', jetptbin_reco) jetptbin_gen = [] njetptbin_gen = self.p_nbin2_gen jetptbin_gen = self.var2ranges_gen jetptbinarray_gen = array.array('d', jetptbin_gen) candptbin = [] candptbin = self.lpt_finbinmin.copy() candptbin.append(self.lpt_finbinmax[-1]) candptbinarray = array.array('d', candptbin) out_file = TFile.Open(self.n_fileeff, "update") list_df_mc_reco = [] list_df_mc_gen = [] for iptskim, _ in enumerate(self.lpt_anbinmin): df_mc_gen = pickle.load(openfile(self.lpt_gendecmerged[iptskim], "rb")) df_mc_gen = selectdfrunlist(df_mc_gen, \ self.run_param[self.runlistrigger[self.triggerbit]], "run_number") df_mc_gen = df_mc_gen.query(self.s_jetsel_gen) list_df_mc_gen.append(df_mc_gen) df_mc_reco = pickle.load(openfile(self.lpt_recodecmerged[iptskim], "rb")) if self.s_evtsel is not None: df_mc_reco = df_mc_reco.query(self.s_evtsel) if self.s_jetsel_reco is not None: df_mc_reco = df_mc_reco.query(self.s_jetsel_reco) if self.s_trigger is not None: df_mc_reco = df_mc_reco.query(self.s_trigger) if self.doml is True: df_mc_reco = df_mc_reco.query(self.l_selml[iptskim]) list_df_mc_reco.append(df_mc_reco) # Here we can merge the dataframes corresponding to different HF pt in a # single one. In addition we are here selecting only non prompt HF df_gen = pd.concat(list_df_mc_gen) df_mc_reco = pd.concat(list_df_mc_reco) # add the z columns df_gen["z"] = z_calc(df_gen.pt_jet, df_gen.phi_jet, df_gen.eta_jet, df_gen.pt_cand, df_gen.phi_cand, df_gen.eta_cand) df_mc_reco["z"] = z_calc(df_mc_reco.pt_jet, df_mc_reco.phi_jet, df_mc_reco.eta_jet, df_mc_reco.pt_cand, df_mc_reco.phi_cand, df_mc_reco.eta_cand) df_mc_reco["z_gen"] = z_gen_calc(df_mc_reco.pt_gen_jet, df_mc_reco.phi_gen_jet, df_mc_reco.eta_gen_jet, df_mc_reco.pt_gen_cand, df_mc_reco.delta_phi_gen_jet, df_mc_reco.delta_eta_gen_jet) df_gen_nonprompt = df_gen[df_gen.ismcfd == 1] df_gen_prompt = df_gen[df_gen.ismcprompt == 1] df_mc_reco_merged_nonprompt = df_mc_reco[df_mc_reco.ismcfd == 1] df_mc_reco_merged_prompt = df_mc_reco[df_mc_reco.ismcprompt == 1] # The following plots are 3d plots all at generated level of z, # pt_jet and pt_cand. 
This was used in the first version of the feeddown # subtraction, currently is obsolete hzvsjetpt_gen_unmatched = TH2F("hzvsjetpt_gen_unmatched", "hzvsjetpt_gen_unmatched", \ nzbin_gen, zbinarray_gen, njetptbin_gen, jetptbinarray_gen) df_zvsjetpt_gen_unmatched = df_gen_prompt.loc[:, [self.v_varshape_binning, "pt_jet"]] fill_hist(hzvsjetpt_gen_unmatched, df_zvsjetpt_gen_unmatched) hzvsjetpt_gen_unmatched.Write() titlehist = "hzvsjetptvscandpt_gen_nonprompt" hzvsjetptvscandpt_gen_nonprompt = makefill3dhist(df_gen_nonprompt, titlehist, \ zbinarray_gen, jetptbinarray_gen, candptbinarray, self.v_varshape_binning, "pt_jet", "pt_cand") hzvsjetptvscandpt_gen_nonprompt.Write() # hz_gen_nocuts is the distribution of generated z values in b in # bins of gen_jet pt before the reco z and jetpt selection. hz_gen_cuts # also includes cut on z reco and jet pt reco. These are used for overall # efficiency correction to estimate the fraction of candidates that are # in the reco range but outside the gen range and viceversa for ibin2 in range(self.p_nbin2_gen): suffix = "%s_%.2f_%.2f" % \ (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2]) hz_gen_nocuts = TH1F("hz_gen_nocuts_nonprompt" + suffix, \ "hz_gen_nocuts_nonprompt" + suffix, nzbin_gen, zbinarray_gen) hz_gen_nocuts.Sumw2() hz_gen_cuts = TH1F("hz_gen_cuts_nonprompt" + suffix, "hz_gen_cuts_nonprompt" + suffix, nzbin_gen, zbinarray_gen) hz_gen_cuts.Sumw2() df_tmp = seldf_singlevar(df_mc_reco_merged_nonprompt, "pt_gen_jet", \ self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2]) df_tmp = seldf_singlevar(df_tmp, self.v_varshape_binning_gen, \ self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1]) fill_hist(hz_gen_nocuts, df_tmp[self.v_varshape_binning_gen]) df_tmp = seldf_singlevar(df_tmp, "pt_jet", self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1]) df_tmp = seldf_singlevar(df_tmp, self.v_varshape_binning, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill_hist(hz_gen_cuts, df_tmp[self.v_varshape_binning_gen]) hz_gen_cuts.Write() hz_gen_nocuts.Write() # Addendum for unfolding hz_gen_nocuts_pr = TH1F("hz_gen_nocuts" + suffix, \ "hz_gen_nocuts" + suffix, nzbin_gen, zbinarray_gen) hz_gen_nocuts_pr.Sumw2() hz_gen_cuts_pr = TH1F("hz_gen_cuts" + suffix, "hz_gen_cuts" + suffix, nzbin_gen, zbinarray_gen) hz_gen_cuts_pr.Sumw2() df_tmp_pr = seldf_singlevar(df_mc_reco_merged_prompt, "pt_gen_jet", \ self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2]) df_tmp_pr = seldf_singlevar(df_tmp_pr, self.v_varshape_binning_gen, \ self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1]) fill_hist(hz_gen_nocuts_pr, df_tmp_pr[self.v_varshape_binning_gen]) df_tmp_pr = seldf_singlevar(df_tmp_pr, "pt_jet", self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1]) df_tmp_pr = seldf_singlevar(df_tmp_pr, self.v_varshape_binning, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill_hist(hz_gen_cuts_pr, df_tmp_pr[self.v_varshape_binning_gen]) hz_gen_cuts_pr.Write() hz_gen_nocuts_pr.Write() # End addendum for unfolding df_tmp_selgen, df_tmp_selreco, df_tmp_selrecogen = \ self.create_df_closure(df_mc_reco_merged_nonprompt) df_tmp_selgen_pr, df_tmp_selreco_pr, df_tmp_selrecogen_pr = \ self.create_df_closure(df_mc_reco_merged_prompt) # histograms for response of feeddown hzvsjetpt_reco_nocuts = \ build2dhisto("hzvsjetpt_reco_nocuts_nonprompt", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_reco_cuts = \ build2dhisto("hzvsjetpt_reco_cuts_nonprompt", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_gen_nocuts = \ 
build2dhisto("hzvsjetpt_gen_nocuts_nonprompt", zbinarray_gen, jetptbinarray_gen) hzvsjetpt_gen_cuts = \ build2dhisto("hzvsjetpt_gen_cuts_nonprompt", zbinarray_gen, jetptbinarray_gen) hzvsjetpt_reco = hzvsjetpt_reco_nocuts.Clone("hzvsjetpt_reco_nonprompt") hzvsjetpt_gen = hzvsjetpt_gen_nocuts.Clone("hzvsjetpt_genv") response_matrix = RooUnfoldResponse(hzvsjetpt_reco, hzvsjetpt_gen) fill2dhist(df_tmp_selreco, hzvsjetpt_reco_nocuts, self.v_varshape_binning, "pt_jet") fill2dhist(df_tmp_selgen, hzvsjetpt_gen_nocuts, self.v_varshape_binning_gen, "pt_gen_jet") fill2dhist(df_tmp_selrecogen, hzvsjetpt_reco_cuts, self.v_varshape_binning, "pt_jet") fill2dhist(df_tmp_selrecogen, hzvsjetpt_gen_cuts, self.v_varshape_binning_gen, "pt_gen_jet") hzvsjetpt_reco_nocuts.Write() hzvsjetpt_gen_nocuts.Write() hzvsjetpt_reco_cuts.Write() hzvsjetpt_gen_cuts.Write() # histograms for unfolding hzvsjetpt_reco_nocuts_pr = \ build2dhisto("hzvsjetpt_reco_nocuts", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_reco_cuts_pr = \ build2dhisto("hzvsjetpt_reco_cuts", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_gen_nocuts_pr = \ build2dhisto("hzvsjetpt_gen_nocuts", zbinarray_gen, jetptbinarray_gen) hzvsjetpt_gen_cuts_pr = \ build2dhisto("hzvsjetpt_gen_cuts", zbinarray_gen, jetptbinarray_gen) fill2dhist(df_tmp_selreco_pr, hzvsjetpt_reco_nocuts_pr, self.v_varshape_binning, "pt_jet") fill2dhist(df_tmp_selgen_pr, hzvsjetpt_gen_nocuts_pr, self.v_varshape_binning_gen, "pt_gen_jet") fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_reco_cuts_pr, self.v_varshape_binning, "pt_jet") fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_gen_cuts_pr, self.v_varshape_binning_gen, "pt_gen_jet") hzvsjetpt_reco_nocuts_pr.Write() hzvsjetpt_gen_nocuts_pr.Write() hzvsjetpt_reco_cuts_pr.Write() hzvsjetpt_gen_cuts_pr.Write() hzvsjetpt_reco_closure_pr = \ build2dhisto("hzvsjetpt_reco_closure", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_gen_closure_pr = \ build2dhisto("hzvsjetpt_gen_closure", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_reco_pr = \ build2dhisto("hzvsjetpt_reco", zbinarray_reco, jetptbinarray_reco) hzvsjetpt_gen_pr = \ build2dhisto("hzvsjetpt_gen", zbinarray_gen, jetptbinarray_gen) response_matrix_pr = RooUnfoldResponse(hzvsjetpt_reco_pr, hzvsjetpt_gen_pr) response_matrix_closure_pr = RooUnfoldResponse(hzvsjetpt_reco_pr, hzvsjetpt_gen_pr) fill2dhist(df_tmp_selreco_pr, hzvsjetpt_reco_pr, self.v_varshape_binning, "pt_jet") fill2dhist(df_tmp_selgen_pr, hzvsjetpt_gen_pr, self.v_varshape_binning_gen, "pt_gen_jet") hzvsjetpt_reco_pr.Write() hzvsjetpt_gen_pr.Write() hjetpt_gen_nocuts_pr = TH1F("hjetpt_gen_nocuts", \ "hjetpt_gen_nocuts", njetptbin_gen, jetptbinarray_gen) hjetpt_gen_cuts_pr = TH1F("hjetpt_gen_cuts", \ "hjetpt_gen_cuts", njetptbin_gen, jetptbinarray_gen) hjetpt_gen_nocuts_closure = TH1F("hjetpt_gen_nocuts_closure", \ "hjetpt_gen_nocuts_closure", njetptbin_gen, jetptbinarray_gen) hjetpt_gen_cuts_closure = TH1F("hjetpt_gen_cuts_closure", \ "hjetpt_gen_cuts_closure", njetptbin_gen, jetptbinarray_gen) hjetpt_gen_nocuts_pr.Sumw2() hjetpt_gen_cuts_pr.Sumw2() hjetpt_gen_nocuts_closure.Sumw2() hjetpt_gen_nocuts_closure.Sumw2() fill_hist(hjetpt_gen_nocuts_pr, df_tmp_selgen_pr["pt_gen_jet"]) fill_hist(hjetpt_gen_cuts_pr, df_tmp_selrecogen_pr["pt_gen_jet"]) hjetpt_gen_nocuts_pr.Write() hjetpt_gen_cuts_pr.Write() # end of histograms for unfolding hjetpt_genvsreco_full = \ TH2F("hjetpt_genvsreco_full_nonprompt", "hjetpt_genvsreco_full_nonprompt", \ njetptbin_gen * 100, self.lvar2_binmin_gen[0], self.lvar2_binmax_gen[-1], \ njetptbin_reco * 100, 
self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1]) hz_genvsreco_full = \ TH2F("hz_genvsreco_full_nonprompt", "hz_genvsreco_full_nonprompt", \ nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], nzbin_reco * 100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen, hjetpt_genvsreco_full, "pt_gen_jet", "pt_jet") hjetpt_genvsreco_full.Scale(1.0 / hjetpt_genvsreco_full.Integral(1, -1, 1, -1)) hjetpt_genvsreco_full.Write() fill2dhist(df_tmp_selrecogen, hz_genvsreco_full, self.v_varshape_binning_gen, self.v_varshape_binning) hz_genvsreco_full.Scale(1.0 / hz_genvsreco_full.Integral(1, -1, 1, -1)) hz_genvsreco_full.Write() for row in df_tmp_selrecogen.itertuples(): response_matrix.Fill(getattr(row, self.v_varshape_binning), row.pt_jet, getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet) response_matrix.Write("response_matrix_nonprompt") # histograms for unfolding hjetpt_genvsreco_full_pr = \ TH2F("hjetpt_genvsreco_full", "hjetpt_genvsreco_full", \ njetptbin_gen * 100, self.lvar2_binmin_gen[0], self.lvar2_binmax_gen[-1], \ njetptbin_reco * 100, self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1]) hz_genvsreco_full_pr = \ TH2F("hz_genvsreco_full", "hz_genvsreco_full", \ nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], nzbin_reco * 100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen_pr, hjetpt_genvsreco_full_pr, "pt_gen_jet", "pt_jet") hjetpt_genvsreco_full_pr.Scale(1.0 / hjetpt_genvsreco_full_pr.Integral(1, -1, 1, -1)) hjetpt_genvsreco_full_pr.Write() fill2dhist(df_tmp_selrecogen_pr, hz_genvsreco_full_pr, self.v_varshape_binning_gen, self.v_varshape_binning) hz_genvsreco_full_pr.Scale(1.0 / hz_genvsreco_full_pr.Integral(1, -1, 1, -1)) hz_genvsreco_full_pr.Write() hzvsjetpt_prior_weights = build2dhisto("hzvsjetpt_prior_weights", \ zbinarray_gen, jetptbinarray_gen) fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_prior_weights, self.v_varshape_binning_gen, "pt_gen_jet") # end of histograms for unfolding for ibin2 in range(self.p_nbin2_reco): df_tmp_selrecogen_jetbin = seldf_singlevar(df_tmp_selrecogen, "pt_jet", \ self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2]) suffix = "%s_%.2f_%.2f" % (self.v_var2_binning, \ self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2]) hz_genvsreco = TH2F("hz_genvsreco_nonprompt" + suffix, "hz_genvsreco_nonprompt" + suffix, \ nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], \ nzbin_reco*100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen_jetbin, hz_genvsreco, self.v_varshape_binning_gen, self.v_varshape_binning) norm = hz_genvsreco.Integral(1, -1, 1, -1) if norm > 0: hz_genvsreco.Scale(1.0/norm) hz_genvsreco.Write() df_tmp_selrecogen_pr_jetbin = seldf_singlevar(df_tmp_selrecogen_pr, "pt_jet", \ self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2]) suffix = "%s_%.2f_%.2f" % (self.v_var2_binning, \ self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2]) hz_genvsreco_pr = TH2F("hz_genvsreco" + suffix, "hz_genvsreco" + suffix, \ nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], \ nzbin_reco*100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen_pr_jetbin, hz_genvsreco_pr, self.v_varshape_binning_gen, self.v_varshape_binning) norm_pr = hz_genvsreco_pr.Integral(1, -1, 1, -1) if norm_pr > 0: hz_genvsreco_pr.Scale(1.0/norm_pr) hz_genvsreco_pr.Write() for 
ibinshape in range(len(self.lvarshape_binmin_reco)): df_tmp_selrecogen_zbin = seldf_singlevar(df_tmp_selrecogen, self.v_varshape_binning, \ self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape]) suffix = "%s_%.2f_%.2f" % \ (self.v_varshape_binning, self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape]) hjetpt_genvsreco = TH2F("hjetpt_genvsreco_nonprompt" + suffix, \ "hjetpt_genvsreco_nonprompt" + suffix, njetptbin_gen * 100, self.lvar2_binmin_gen[0], \ self.lvar2_binmax_gen[-1], njetptbin_reco * 100, self.lvar2_binmin_reco[0], \ self.lvar2_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen_zbin, hjetpt_genvsreco, "pt_gen_jet", "pt_jet") norm = hjetpt_genvsreco.Integral(1, -1, 1, -1) if norm > 0: hjetpt_genvsreco.Scale(1.0/norm) hjetpt_genvsreco.Write() df_tmp_selrecogen_pr_zbin = seldf_singlevar(df_tmp_selrecogen_pr, self.v_varshape_binning, \ self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape]) suffix = "%s_%.2f_%.2f" % \ (self.v_varshape_binning, self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape]) hjetpt_genvsreco_pr = TH2F("hjetpt_genvsreco" + suffix, \ "hjetpt_genvsreco" + suffix, njetptbin_gen * 100, self.lvar2_binmin_gen[0], \ self.lvar2_binmax_gen[-1], njetptbin_reco * 100, self.lvar2_binmin_reco[0], \ self.lvar2_binmax_reco[-1]) fill2dhist(df_tmp_selrecogen_pr_zbin, hjetpt_genvsreco_pr, "pt_gen_jet", "pt_jet") norm_pr = hjetpt_genvsreco_pr.Integral(1, -1, 1, -1) if norm_pr > 0: hjetpt_genvsreco_pr.Scale(1.0/norm_pr) hjetpt_genvsreco_pr.Write() for ibinshape in range(len(self.lvarshape_binmin_gen)): dtmp_nonprompt_zgen = seldf_singlevar(df_mc_reco_merged_nonprompt, \ self.v_varshape_binning_gen, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape]) suffix = "%s_%.2f_%.2f" % \ (self.v_varshape_binning, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape]) hz_fracdiff = TH1F("hz_fracdiff_nonprompt" + suffix, "hz_fracdiff_nonprompt" + suffix, 100, -2, 2) fill_hist(hz_fracdiff, (dtmp_nonprompt_zgen[self.v_varshape_binning] - \ dtmp_nonprompt_zgen[self.v_varshape_binning_gen])/dtmp_nonprompt_zgen[self.v_varshape_binning_gen]) norm = hz_fracdiff.Integral(1, -1) if norm: hz_fracdiff.Scale(1.0 / norm) hz_fracdiff.Write() dtmp_prompt_zgen = seldf_singlevar(df_mc_reco_merged_prompt, \ self.v_varshape_binning_gen, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape]) suffix = "%s_%.2f_%.2f" % \ (self.v_varshape_binning, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape]) hz_fracdiff_pr = TH1F("hz_fracdiff_prompt" + suffix, "hz_fracdiff_prompt" + suffix, 100, -2, 2) fill_hist(hz_fracdiff_pr, (dtmp_prompt_zgen[self.v_varshape_binning] - \ dtmp_prompt_zgen[self.v_varshape_binning_gen])/dtmp_prompt_zgen[self.v_varshape_binning_gen]) norm_pr = hz_fracdiff_pr.Integral(1, -1) if norm_pr: hz_fracdiff_pr.Scale(1.0 / norm_pr) hz_fracdiff_pr.Write() for ibin2 in range(self.p_nbin2_gen): dtmp_nonprompt_jetptgen = seldf_singlevar(df_mc_reco_merged_nonprompt, \ "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2]) suffix = "%s_%.2f_%.2f" % (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2]) hjetpt_fracdiff = TH1F("hjetpt_fracdiff_nonprompt" + suffix, "hjetpt_fracdiff_nonprompt" + suffix, 100, -2, 2) fill_hist(hjetpt_fracdiff, (dtmp_nonprompt_jetptgen["pt_jet"] - \ dtmp_nonprompt_jetptgen["pt_gen_jet"])/dtmp_nonprompt_jetptgen["pt_gen_jet"]) norm = hjetpt_fracdiff.Integral(1, -1) 
            if norm:
                hjetpt_fracdiff.Scale(1.0 / norm)
            hjetpt_fracdiff.Write()
            dtmp_prompt_jetptgen = seldf_singlevar(df_mc_reco_merged_prompt, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            suffix = "%s_%.2f_%.2f" % \
                (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hjetpt_fracdiff_pr = TH1F("hjetpt_fracdiff_prompt" + suffix,
                                      "hjetpt_fracdiff_prompt" + suffix, 100, -2, 2)
            fill_hist(hjetpt_fracdiff_pr, (dtmp_prompt_jetptgen["pt_jet"] - \
                dtmp_prompt_jetptgen["pt_gen_jet"])/dtmp_prompt_jetptgen["pt_gen_jet"])
            norm_pr = hjetpt_fracdiff_pr.Integral(1, -1)
            if norm_pr:
                hjetpt_fracdiff_pr.Scale(1.0 / norm_pr)
            hjetpt_fracdiff_pr.Write()
        df_mc_reco_merged_prompt_train, df_mc_reco_merged_prompt_test = \
            train_test_split(df_mc_reco_merged_prompt, test_size=self.closure_frac)
        df_tmp_selgen_pr_test, df_tmp_selreco_pr_test, df_tmp_selrecogen_pr_test = \
            self.create_df_closure(df_mc_reco_merged_prompt_test)
        _, _, df_tmp_selrecogen_pr_train = \
            self.create_df_closure(df_mc_reco_merged_prompt_train)
        fill2dhist(df_tmp_selreco_pr_test, hzvsjetpt_reco_closure_pr,
                   self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selgen_pr_test, hzvsjetpt_gen_closure_pr,
                   self.v_varshape_binning_gen, "pt_gen_jet")
        hzvsjetpt_reco_closure_pr.Write("input_closure_reco")
        hzvsjetpt_gen_closure_pr.Write("input_closure_gen")
        for ibin2 in range(self.p_nbin2_gen):
            suffix = "%s_%.2f_%.2f" % \
                (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hz_gen_nocuts_closure = TH1F("hz_gen_nocuts_closure" + suffix,
                                         "hz_gen_nocuts_closure" + suffix,
                                         nzbin_gen, zbinarray_gen)
            hz_gen_nocuts_closure.Sumw2()
            hz_gen_cuts_closure = TH1F("hz_gen_cuts_closure" + suffix,
                                       "hz_gen_cuts_closure" + suffix,
                                       nzbin_gen, zbinarray_gen)
            hz_gen_cuts_closure.Sumw2()
            df_tmp_selgen_pr_test_bin = seldf_singlevar(df_tmp_selgen_pr_test, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            df_tmp_selrecogen_pr_test_bin = seldf_singlevar(df_tmp_selrecogen_pr_test, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            fill_hist(hz_gen_nocuts_closure,
                      df_tmp_selgen_pr_test_bin[self.v_varshape_binning_gen])
            fill_hist(hz_gen_cuts_closure,
                      df_tmp_selrecogen_pr_test_bin[self.v_varshape_binning_gen])
            hz_gen_cuts_closure.Write()
            hz_gen_nocuts_closure.Write()
        fill_hist(hjetpt_gen_nocuts_closure, df_tmp_selgen_pr_test["pt_gen_jet"])
        fill_hist(hjetpt_gen_cuts_closure, df_tmp_selrecogen_pr_test["pt_gen_jet"])
        hjetpt_gen_nocuts_closure.Write()
        hjetpt_gen_cuts_closure.Write()
        hzvsjetpt_reco_nocuts_closure = TH2F("hzvsjetpt_reco_nocuts_closure",
                                             "hzvsjetpt_reco_nocuts_closure",
                                             nzbin_reco, zbinarray_reco,
                                             njetptbin_reco, jetptbinarray_reco)
        hzvsjetpt_reco_nocuts_closure.Sumw2()
        hzvsjetpt_reco_cuts_closure = TH2F("hzvsjetpt_reco_cuts_closure",
                                           "hzvsjetpt_reco_cuts_closure",
                                           nzbin_reco, zbinarray_reco,
                                           njetptbin_reco, jetptbinarray_reco)
        hzvsjetpt_reco_cuts_closure.Sumw2()
        fill2dhist(df_tmp_selreco_pr_test, hzvsjetpt_reco_nocuts_closure,
                   self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selrecogen_pr_test, hzvsjetpt_reco_cuts_closure,
                   self.v_varshape_binning, "pt_jet")
        hzvsjetpt_reco_nocuts_closure.Write()
        hzvsjetpt_reco_cuts_closure.Write()
        for row in df_tmp_selrecogen_pr.itertuples():
            response_matrix_weight = 1.0
            if self.doprior is True:
                binx = hzvsjetpt_prior_weights.GetXaxis().FindBin(
                    getattr(row, self.v_varshape_binning_gen))
                biny = hzvsjetpt_prior_weights.GetYaxis().FindBin(row.pt_gen_jet)
                weight = hzvsjetpt_prior_weights.GetBinContent(binx, biny)
                if weight > 0.0:
                    response_matrix_weight = 1.0 / weight
            response_matrix_pr.Fill(getattr(row, self.v_varshape_binning), row.pt_jet, \
                getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet,
                response_matrix_weight)
        for row in df_tmp_selrecogen_pr_train.itertuples():
            response_matrix_weight = 1.0
            if self.doprior is True:
                binx = hzvsjetpt_prior_weights.GetXaxis().FindBin(
                    getattr(row, self.v_varshape_binning_gen))
                biny = hzvsjetpt_prior_weights.GetYaxis().FindBin(row.pt_gen_jet)
                weight = hzvsjetpt_prior_weights.GetBinContent(binx, biny)
                if weight > 0.0:
                    response_matrix_weight = 1.0 / weight
            response_matrix_closure_pr.Fill(getattr(row, self.v_varshape_binning), row.pt_jet, \
                getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet,
                response_matrix_weight)
        response_matrix_pr.Write("response_matrix")
        response_matrix_closure_pr.Write("response_matrix_closure")
        out_file.Close()
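# --- Hypothetical usage sketch (not part of this class) -----------------------
# A minimal example of how the objects written above ("response_matrix_closure",
# "input_closure_reco", "input_closure_gen") might be read back for a closure
# test with RooUnfold. The file name, the number of Bayesian iterations and the
# availability of a local libRooUnfold build are assumptions, not part of the
# original code.
from ROOT import TFile, gSystem  # pylint: disable=no-name-in-module

gSystem.Load("libRooUnfold")  # assumes RooUnfold is built and on the library path
from ROOT import RooUnfoldBayes  # pylint: disable=no-name-in-module

fin = TFile.Open("resphisto.root")  # hypothetical output file of this step
response_closure = fin.Get("response_matrix_closure")  # built from the train sample
h_reco_closure = fin.Get("input_closure_reco")          # reco-level test sample
h_gen_closure = fin.Get("input_closure_gen")            # gen-level test sample

# Unfold the reco-level closure input and compare bin by bin with the
# generator-level truth; ratios close to unity indicate a successful closure.
unfolder = RooUnfoldBayes(response_closure, h_reco_closure, 4)
h_unfolded = unfolder.Hreco()
for ix in range(1, h_unfolded.GetNbinsX() + 1):
    for iy in range(1, h_unfolded.GetNbinsY() + 1):
        gen = h_gen_closure.GetBinContent(ix, iy)
        if gen > 0.0:
            print("shape bin", ix, "jet-pt bin", iy,
                  "unfolded/gen =", h_unfolded.GetBinContent(ix, iy) / gen)
fin.Close()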
def process_efficiency_single(self, index):
    out_file = TFile.Open(self.l_histoeff[index], "recreate")
    for ibin2 in range(self.p_nbin2_reco):
        stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, \
                                        self.lvar2_binmin_reco[ibin2], \
                                        self.lvar2_binmax_reco[ibin2])
        n_bins = self.p_nptfinbins
        analysis_bin_lims_temp = self.lpt_finbinmin.copy()
        analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins-1])
        analysis_bin_lims = array.array('f', analysis_bin_lims_temp)
        h_gen_pr = TH1F("h_gen_pr" + stringbin2,
                        "Prompt Generated in acceptance |y|<0.5", \
                        n_bins, analysis_bin_lims)
        h_presel_pr = TH1F("h_presel_pr" + stringbin2,
                           "Prompt Reco in acc |#eta|<0.8 and sel", \
                           n_bins, analysis_bin_lims)
        h_sel_pr = TH1F("h_sel_pr" + stringbin2,
                        "Prompt Reco and sel in acc |#eta|<0.8 and sel", \
                        n_bins, analysis_bin_lims)
        h_gen_fd = TH1F("h_gen_fd" + stringbin2,
                        "FD Generated in acceptance |y|<0.5", \
                        n_bins, analysis_bin_lims)
        h_presel_fd = TH1F("h_presel_fd" + stringbin2,
                           "FD Reco in acc |#eta|<0.8 and sel", \
                           n_bins, analysis_bin_lims)
        h_sel_fd = TH1F("h_sel_fd" + stringbin2,
                        "FD Reco and sel in acc |#eta|<0.8 and sel", \
                        n_bins, analysis_bin_lims)
        bincounter = 0
        for ipt in range(self.p_nptfinbins):
            bin_id = self.bin_matching[ipt]
            df_mc_reco = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
            if self.s_evtsel is not None:
                df_mc_reco = df_mc_reco.query(self.s_evtsel)
            if self.s_jetsel_reco is not None:
                df_mc_reco = df_mc_reco.query(self.s_jetsel_reco)
            if self.s_trigger is not None:
                df_mc_reco = df_mc_reco.query(self.s_trigger)
            df_mc_reco = selectdfrunlist(df_mc_reco, \
                self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
            df_mc_gen = pickle.load(openfile(self.mptfiles_gensk[bin_id][index], "rb"))
            df_mc_gen = df_mc_gen.query(self.s_jetsel_gen)
            df_mc_gen = selectdfrunlist(df_mc_gen, \
                self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
            df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var_binning, \
                self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, \
                self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var2_binning, \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var2_binning, \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            df_gen_sel_pr = df_mc_gen[df_mc_gen.ismcprompt == 1]
            df_reco_presel_pr = df_mc_reco[df_mc_reco.ismcprompt == 1]
            df_reco_sel_pr = None
            if self.doml is True:
                df_reco_sel_pr = df_reco_presel_pr.query(self.l_selml[bin_id])
            else:
                df_reco_sel_pr = df_reco_presel_pr.copy()
            df_gen_sel_fd = df_mc_gen[df_mc_gen.ismcfd == 1]
            df_reco_presel_fd = df_mc_reco[df_mc_reco.ismcfd == 1]
            df_reco_sel_fd = None
            if self.doml is True:
                df_reco_sel_fd = df_reco_presel_fd.query(self.l_selml[bin_id])
            else:
                df_reco_sel_fd = df_reco_presel_fd.copy()
            val = len(df_gen_sel_pr)
            err = math.sqrt(val)
            h_gen_pr.SetBinContent(bincounter + 1, val)
            h_gen_pr.SetBinError(bincounter + 1, err)
            val = len(df_reco_presel_pr)
            err = math.sqrt(val)
            h_presel_pr.SetBinContent(bincounter + 1, val)
            h_presel_pr.SetBinError(bincounter + 1, err)
            val = len(df_reco_sel_pr)
            err = math.sqrt(val)
            h_sel_pr.SetBinContent(bincounter + 1, val)
            h_sel_pr.SetBinError(bincounter + 1, err)
            val = len(df_gen_sel_fd)
            err = math.sqrt(val)
            h_gen_fd.SetBinContent(bincounter + 1, val)
            h_gen_fd.SetBinError(bincounter + 1, err)
            val = len(df_reco_presel_fd)
            err = math.sqrt(val)
            h_presel_fd.SetBinContent(bincounter + 1, val)
            h_presel_fd.SetBinError(bincounter + 1, err)
            val = len(df_reco_sel_fd)
            err = math.sqrt(val)
            h_sel_fd.SetBinContent(bincounter + 1, val)
            h_sel_fd.SetBinError(bincounter + 1, err)
            bincounter = bincounter + 1
        out_file.cd()
        h_gen_pr.Write()
        h_presel_pr.Write()
        h_sel_pr.Write()
        h_gen_fd.Write()
        h_presel_fd.Write()
        h_sel_fd.Write()
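# --- Hypothetical usage sketch (not part of this class) -----------------------
# How the counting histograms written by process_efficiency_single might be
# turned into prompt ("pr") and feed-down ("fd") efficiencies. The file name and
# the jet-pt bin suffix below are placeholders; the binomial "B" option of
# TH1::Divide is used because the selected candidates are a subset of the
# generated ones.
from ROOT import TFile  # pylint: disable=no-name-in-module

fin = TFile.Open("effhisto.root")   # hypothetical (merged) output file
stringbin2 = "_pt_jet_5.00_15.00"   # hypothetical jet-pt bin suffix

for origin in ("pr", "fd"):
    h_gen = fin.Get("h_gen_%s%s" % (origin, stringbin2))
    h_sel = fin.Get("h_sel_%s%s" % (origin, stringbin2))
    h_eff = h_sel.Clone("h_eff_%s%s" % (origin, stringbin2))
    h_eff.Divide(h_sel, h_gen, 1.0, 1.0, "B")
    for ipt in range(1, h_eff.GetNbinsX() + 1):
        print(origin, "pt bin", ipt, "eff = %.3f +- %.3f" %
              (h_eff.GetBinContent(ipt), h_eff.GetBinError(ipt)))
fin.Close()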
def process_histomass_single(self, index):
    myfile = TFile.Open(self.l_histomass[index], "recreate")
    for ipt in range(self.p_nptfinbins):
        bin_id = self.bin_matching[ipt]
        df = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
        if self.doml is True:
            df = df.query(self.l_selml[bin_id])
        if self.s_evtsel is not None:
            df = df.query(self.s_evtsel)
        if self.s_jetsel_reco is not None:
            df = df.query(self.s_jetsel_reco)
        if self.s_trigger is not None:
            df = df.query(self.s_trigger)
        h_invmass_all = TH1F("hmass_%d" % ipt, "", self.p_num_bins,
                             self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        fill_hist(h_invmass_all, df.inv_mass)
        myfile.cd()
        h_invmass_all.Write()
        df = seldf_singlevar(df, self.v_var_binning, \
            self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
        for ibin2 in range(self.p_nbin2_reco):
            suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                (self.v_var_binning, self.lpt_finbinmin[ipt],
                 self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
                 self.v_var2_binning, self.lvar2_binmin_reco[ibin2],
                 self.lvar2_binmax_reco[ibin2])
            df_bin = seldf_singlevar(df, self.v_var2_binning,
                                     self.lvar2_binmin_reco[ibin2],
                                     self.lvar2_binmax_reco[ibin2])
            df_bin = selectdfrunlist(df_bin, \
                self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
            # add the z column
            df_bin["z"] = z_calc(df_bin.pt_jet, df_bin.phi_jet, df_bin.eta_jet,
                                 df_bin.pt_cand, df_bin.phi_cand, df_bin.eta_cand)
            h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                             self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            fill_hist(h_invmass, df_bin.inv_mass)
            myfile.cd()
            h_invmass.Write()
            massarray = [1.0 + i * (5.0 / 5000.0) for i in range(5001)]  # 5000 bins in range 1.0-6.0
            massarray_reco = array.array('d', massarray)
            zarray_reco = array.array('d', self.varshaperanges_reco)
            h_zvsinvmass = TH2F("hzvsmass" + suffix, "", \
                5000, massarray_reco, self.p_nbinshape_reco, zarray_reco)
            h_zvsinvmass.Sumw2()
            fill2dhist(df_bin, h_zvsinvmass, "inv_mass", self.v_varshape_binning)
            h_zvsinvmass.Write()
            if self.mcordata == "mc":
                df_bin[self.v_ismcrefl] = np.array(tag_bit_df(df_bin, self.v_bitvar,
                                                              self.b_mcrefl), dtype=int)
                df_bin_sig = df_bin[df_bin[self.v_ismcsignal] == 1]
                df_bin_refl = df_bin[df_bin[self.v_ismcrefl] == 1]
                h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins,
                                     self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                h_invmass_refl = TH1F("hmass_refl" + suffix, "", self.p_num_bins,
                                      self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                fill_hist(h_invmass_sig, df_bin_sig.inv_mass)
                fill_hist(h_invmass_refl, df_bin_refl.inv_mass)
                myfile.cd()
                h_invmass_sig.Write()
                h_invmass_refl.Write()
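# --- Hypothetical usage sketch (not part of this class) -----------------------
# The "hzvsmass" histograms written above store the invariant mass on the x axis
# (5000 fine bins between 1.0 and 6.0) and the shape variable z on the y axis.
# A downstream fitter could slice them into one mass spectrum per z bin, as
# sketched below. The file name and the bin suffix are placeholders.
from ROOT import TFile  # pylint: disable=no-name-in-module

fin = TFile.Open("masshisto.root")            # hypothetical output file
suffix = "pt_cand4_6_0.60pt_jet_5.00_15.00"   # hypothetical bin suffix
h_zvsinvmass = fin.Get("hzvsmass" + suffix)

for izbin in range(1, h_zvsinvmass.GetNbinsY() + 1):
    # project the z slice onto the mass axis and coarsen the binning before fitting
    h_mass_z = h_zvsinvmass.ProjectionX("hmass_%s_z%d" % (suffix, izbin),
                                        izbin, izbin)
    h_mass_z.Rebin(10)
    print("z bin", izbin, "candidates:", h_mass_z.GetEntries())
fin.Close()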