Example no. 1
    def skim(self, file_index):
        try:
            dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
        except Exception as e:  # pylint: disable=broad-except
            print('failed to open file', self.l_reco[file_index], str(e))
            sys.exit()
        #To (hopefully) fix double signd0 issue with database when unpacking
        dfreco = dfreco.loc[:, ~dfreco.columns.duplicated()]

        for ipt in range(self.p_nptbins):
            dfrecosk = seldf_singlevar(dfreco, self.v_var_binning,
                                       self.lpt_anbinmin[ipt],
                                       self.lpt_anbinmax[ipt])
            dfrecosk = selectdfquery(dfrecosk, self.s_reco_skim[ipt])
            dfrecosk = dfrecosk.reset_index(drop=True)
            f = openfile(self.mptfiles_recosk[ipt][file_index], "wb")
            pickle.dump(dfrecosk, f, protocol=4)
            f.close()
            if self.mcordata == "mc":
                try:
                    dfgen = pickle.load(openfile(self.l_gen[file_index], "rb"))
                except Exception as e:  # pylint: disable=broad-except
                    print('failed to open MC file', self.l_gen[file_index],
                          str(e))
                    sys.exit()
                dfgensk = seldf_singlevar(dfgen, self.v_var_binning,
                                          self.lpt_anbinmin[ipt],
                                          self.lpt_anbinmax[ipt])
                dfgensk = selectdfquery(dfgensk, self.s_gen_skim[ipt])
                dfgensk = dfgensk.reset_index(drop=True)
                pickle.dump(dfgensk,
                            openfile(self.mptfiles_gensk[ipt][file_index],
                                     "wb"),
                            protocol=4)
Example no. 2
 def applymodel(self, file_index):
     for ipt in range(self.p_nptbins):
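         # Skip pt bins whose output file already exists and is non-empty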
         if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
             if os.stat(self.mptfiles_recoskmldec[ipt]
                        [file_index]).st_size != 0:
                 continue
         dfrecosk = pickle.load(
             openfile(self.mptfiles_recosk_forapply[ipt][file_index], "rb"))
         if self.doml is True:
             if os.path.isfile(self.lpt_model[ipt]) is False:
                 print("Model file not present in bin %d" % ipt)
             if self.b_maskmissing:
                 dfrecosk = dfrecosk.replace(self.v_varstomask,
                                             value=np.nan)
             mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
             dfrecoskml = apply("BinaryClassification", [self.p_modelname],
                                [mod], dfrecosk, self.v_train[ipt], None)
             probvar = "y_test_prob" + self.p_modelname
             dfrecoskml = dfrecoskml.loc[
                 dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         else:
             dfrecoskml = dfrecosk.query("isstd == 1")
         pickle.dump(dfrecoskml,
                     openfile(self.mptfiles_recoskmldec[ipt][file_index],
                              "wb"),
                     protocol=4)
         if self.do_mlprefilter is True and self.mcordata == "mc":
             dfgensk = pickle.load(
                 openfile(self.mptfiles_gensk[ipt][file_index], "rb"))
             pickle.dump(dfgensk,
                         openfile(self.mptfiles_genskmldec[ipt][file_index],
                                  "wb"),
                         protocol=4)
 def process_histomass_single(self, index):
     myfile = TFile.Open(self.l_histomass[index], "recreate")
     dfevtorig = pickle.load(openfile(self.l_evtorig[index], "rb"))
     if self.s_trigger is not None:
         dfevtorig = dfevtorig.query(self.s_trigger)
     if self.runlistrigger is not None:
         dfevtorig = selectdfrunlist(dfevtorig, \
                          self.run_param[self.runlistrigger], "run_number")
     hNorm = TH1F("hEvForNorm", "hEvForNorm", 2, 0.5, 2.5)
     hNorm.GetXaxis().SetBinLabel(1, "normalisation factor")
     hNorm.GetXaxis().SetBinLabel(2, "selected events")
     nselevt = 0
     norm = 0
     if not dfevtorig.empty:
         nselevt = len(dfevtorig.query("is_ev_rej==0"))
         norm = getnormforselevt(dfevtorig)
     hNorm.SetBinContent(1, norm)
     hNorm.SetBinContent(2, nselevt)
     hNorm.Write()
     dfevtorig = dfevtorig.query("is_ev_rej==0")
     for ipt in range(self.p_nptfinbins):
         bin_id = self.bin_matching[ipt]
         df = pickle.load(
             openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
         if self.doml is True:
             df = df.query(self.l_selml[bin_id])
         if self.s_evtsel is not None:
             df = df.query(self.s_evtsel)
         if self.s_trigger is not None:
             df = df.query(self.s_trigger)
         df = seldf_singlevar(df, self.v_var_binning, \
                              self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
         suffix = "%s%d_%d" % \
                  (self.v_var_binning, self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
         h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                          self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
         if self.runlistrigger is not None:
             df = selectdfrunlist(df, \
                      self.run_param[self.runlistrigger], "run_number")
         fill_hist(h_invmass, df.inv_mass)
         myfile.cd()
         h_invmass.Write()
         if self.mcordata == "mc":
             df[self.v_ismcrefl] = np.array(tag_bit_df(
                 df, self.v_bitvar, self.b_mcrefl),
                                            dtype=int)
             df_sig = df[df[self.v_ismcsignal] == 1]
             df_refl = df[df[self.v_ismcrefl] == 1]
             h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins,
                                  self.p_mass_fit_lim[0],
                                  self.p_mass_fit_lim[1])
             h_invmass_refl = TH1F("hmass_refl" + suffix, "",
                                   self.p_num_bins, self.p_mass_fit_lim[0],
                                   self.p_mass_fit_lim[1])
             fill_hist(h_invmass_sig, df_sig.inv_mass)
             fill_hist(h_invmass_refl, df_refl.inv_mass)
             myfile.cd()
             h_invmass_sig.Write()
             h_invmass_refl.Write()
         print("FINISHED")
Example no. 4
 def process_valevents(self, file_index):
     dfevt = pickle.load(openfile(self.l_evt[file_index], "rb"))
     grouped = dfevt.groupby(self.v_evtmatch)
     for _, group in grouped:
         if len(group) > 1:
             print(len(group))
             print(group)
             print("WARNING:EVENT DUPLICATION")
     dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
     fileevtroot = TFile.Open(self.l_evtvalroot[file_index], "recreate")
     dfreco = dfreco.query("is_ev_rej == 0")
     h_n_tracklets = TH1F("h_n_tracklets", "h_n_tracklets", 100, -0.5, 99.5)
     h_n_tracklets_corr = TH1F("h_n_tracklets_corr", "h_n_tracklets_corr", 100, -0.5, 99.5)
     h_run = TH1F("h_run", "h_run", 100000, 200000, 300000)
     h_trigg = TH1F("h_trigg", "h_trigg", 2, -0.5, 1.5)
     fill_hist(h_n_tracklets_corr, dfreco["n_tracklets_corr"])
     fill_hist(h_n_tracklets, dfreco["n_tracklets"])
     fill_hist(h_run, dfreco["run_number"])
     hmultvsrun = scatterplotroot(dfreco, "n_tracklets_corr",
                                  "run_number", 100, -0.5, 99.5, 100000,
                                  200000.5, 300000.5)
     hmultvsrun.SetName("hmultvsrun")
     fill_hist(h_trigg, dfreco["is_ev_rej_INT7"])
     hmultvsrun.Write()
     h_n_tracklets_corr.Write()
     h_n_tracklets.Write()
     h_trigg.Write()
     h_run.Write()
     prof = hmultvsrun.ProfileY()
     prof.SetName("prof")
     prof.Write()
     fileevtroot.Close()
 def skim(self, file_index):
     try:
         dfreco = pickle.load(openfile(self.l_reco[file_index], "rb"))
     except Exception as e:  # pylint: disable=broad-except
         print('failed to open file', self.l_reco[file_index], str(e))
         sys.exit()
     for ipt in range(self.p_nptbins):
         dfrecosk = seldf_singlevar(dfreco, self.v_var_binning,
                                    self.lpt_anbinmin[ipt],
                                    self.lpt_anbinmax[ipt])
         dfrecosk = selectdfquery(dfrecosk, self.s_reco_skim[ipt])
         dfrecosk = dfrecosk.reset_index(drop=True)
         f = openfile(self.mptfiles_recosk[ipt][file_index], "wb")
         pickle.dump(dfrecosk, f, protocol=4)
         f.close()
         if self.mcordata == "mc":
             try:
                 dfgen = pickle.load(openfile(self.l_gen[file_index], "rb"))
             except Exception as e:  # pylint: disable=broad-except
                 print('failed to open MC file', self.l_gen[file_index],
                       str(e))
                  sys.exit()
             dfgensk = seldf_singlevar(dfgen, self.v_var_binning,
                                       self.lpt_anbinmin[ipt],
                                       self.lpt_anbinmax[ipt])
             dfgensk = selectdfquery(dfgensk, self.s_gen_skim[ipt])
             dfgensk = dfgensk.reset_index(drop=True)
             pickle.dump(dfgensk,
                         openfile(self.mptfiles_gensk[ipt][file_index],
                                  "wb"),
                         protocol=4)
Example no. 6
 def do_apply(self):
     df_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                     self.df_data, self.v_train)
     df_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                   self.df_mc, self.v_train)
     pickle.dump(df_data, openfile(self.f_reco_applieddata, "wb"), protocol=4)
     pickle.dump(df_mc, openfile(self.f_reco_appliedmc, "wb"), protocol=4)
Example no. 7
    def prepare_data_mc_mcgen(self):

        self.logger.info("Prepare data reco as well as MC reco and gen")
        if os.path.exists(self.f_reco_applieddata) \
                and os.path.exists(self.f_reco_appliedmc) \
                and self.step_done("preparemlsamples_data_mc_mcgen"):
            self.df_data = pickle.load(openfile(self.f_reco_applieddata, "rb"))
            self.df_mc = pickle.load(openfile(self.f_reco_appliedmc, "rb"))
        else:
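            # Otherwise build the samples from the not-yet-applied reco dataframes and apply event and trigger selections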
            self.df_data = pickle.load(openfile(self.f_reco_data, "rb"))
            self.df_mc = pickle.load(openfile(self.f_reco_mc, "rb"))
            self.df_data = selectdfquery(self.df_data, self.p_evtsel)
            self.df_mc = selectdfquery(self.df_mc, self.p_evtsel)

            self.df_data = selectdfquery(self.df_data, self.p_triggersel_data)
            self.df_mc = selectdfquery(self.df_mc, self.p_triggersel_mc)

        self.df_mcgen = pickle.load(openfile(self.f_gen_mc, "rb"))
        self.df_mcgen = selectdfquery(self.df_mcgen, self.p_evtsel)
        self.df_mcgen = selectdfquery(self.df_mcgen, self.p_triggersel_mc)
        self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)

        self.arraydf = [self.df_data, self.df_mc]
        self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin,
                                     self.p_binmax)
        self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin,
                                        self.p_binmin, self.p_binmax)
        self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin,
                                       self.p_binmax)
Example no. 8
    def do_scancuts(self):
        self.logger.info("Scanning cuts")

        prob_array = [0.0, 0.2, 0.6, 0.9]
        dfdata = pickle.load(openfile(self.f_reco_applieddata, "rb"))
        dfmc = pickle.load(openfile(self.f_reco_appliedmc, "rb"))
        vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier",
                             prob_array, self.dirmlplot, "scancutsmc", 0, self.p_plot_options)
        vardistplot_probscan(dfmc, self.v_all, "xgboost_classifier",
                             prob_array, self.dirmlplot, "scancutsmc", 1, self.p_plot_options)
        vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier",
                             prob_array, self.dirmlplot, "scancutsdata", 0, self.p_plot_options)
        vardistplot_probscan(dfdata, self.v_all, "xgboost_classifier",
                             prob_array, self.dirmlplot, "scancutsdata", 1, self.p_plot_options)
        if not self.v_cuts:
            self.logger.warning("No variables for cut efficiency scan. Will be skipped")
            return
        efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.5,
                           self.dirmlplot, "mc", self.p_plot_options)
        efficiency_cutscan(dfmc, self.v_cuts, "xgboost_classifier", 0.9,
                           self.dirmlplot, "mc", self.p_plot_options)
        efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.5,
                           self.dirmlplot, "data", self.p_plot_options)
        efficiency_cutscan(dfdata, self.v_cuts, "xgboost_classifier", 0.9,
                           self.dirmlplot, "data", self.p_plot_options)
Example no. 9
 def applymodel(self, file_index):
     for ipt in range(self.p_nptbins):
         if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
             if os.stat(self.mptfiles_recoskmldec[ipt][file_index]).st_size != 0:
                 continue
         dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
         if self.p_mask_values:
             mask_df(dfrecosk, self.p_mask_values)
         if self.doml is True:
             if os.path.isfile(self.lpt_model[ipt]) is False:
                 print("Model file not present in bin %d" % ipt)
             mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
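             # Multiclass models are cut on two class probabilities, binary models on the single signal probability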
             if self.mltype == "MultiClassification":
                 dfrecoskml = apply(self.mltype, [self.p_modelname], [mod],
                                    dfrecosk, self.v_train[ipt], self.multiclass_labels)
                 prob0 = "y_test_prob" + self.p_modelname + self.multiclass_labels[0]
                 prob1 = "y_test_prob" + self.p_modelname + self.multiclass_labels[1]
                 dfrecoskml = dfrecoskml.loc[(dfrecoskml[prob0] <= self.lpt_probcutpre[ipt][0]) &
                                             (dfrecoskml[prob1] >= self.lpt_probcutpre[ipt][1])]
             else:
                 dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                                    dfrecosk, self.v_train[ipt])
                 probvar = "y_test_prob" + self.p_modelname
                 dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         else:
             dfrecoskml = dfrecosk.query("isstd == 1")
         pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                     protocol=4)
Example no. 10
    def preparesample(self):
        self.logger.info("Prepare Sample")
        self.df_data = pickle.load(openfile(self.f_reco_data, "rb"))
        self.df_mc = pickle.load(openfile(self.f_reco_mc, "rb"))
        self.df_mcgen = pickle.load(openfile(self.f_gen_mc, "rb"))
        self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)
        arraydf = [self.df_data, self.df_mc]
        self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax)


        self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[self.p_tagbkg]
        self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_sig = self.df_sig.query(self.s_selsigml)
        self.df_bkg = self.df_bkg.query(self.s_selbkgml)
        self.df_bkg["ismcsignal"] = 0
        self.df_bkg["ismcprompt"] = 0
        self.df_bkg["ismcfd"] = 0
        self.df_bkg["ismcbkg"] = 0


        if self.p_nsig > len(self.df_sig):
            self.logger.warning("There are not enough signal events")
        if self.p_nbkg > len(self.df_bkg):
            self.logger.warning("There are not enough background events")

        self.p_nsig = min(len(self.df_sig), self.p_nsig)
        self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)

        self.logger.info("Used number of signal events is %d", self.p_nsig)
        self.logger.info("Used number of background events is %d", self.p_nbkg)

        self.df_ml = pd.DataFrame()
        self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
        self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
        self.df_sig = self.df_sig[:self.p_nsig]
        self.df_bkg = self.df_bkg[:self.p_nbkg]
        self.df_sig[self.v_sig] = 1
        self.df_bkg[self.v_sig] = 0
        self.df_ml = pd.concat([self.df_sig, self.df_bkg])
        self.df_mltrain, self.df_mltest = train_test_split(self.df_ml, \
                                           test_size=self.test_frac, random_state=self.rnd_splt)
        self.df_mltrain = self.df_mltrain.reset_index(drop=True)
        self.df_mltest = self.df_mltest.reset_index(drop=True)
        self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain, self.v_sig)
        self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest, self.v_sig)
        self.logger.info("Total number of candidates: train %d and test %d", len(self.df_mltrain),
                         len(self.df_mltest))
        self.logger.info("Number of signal candidates: train %d and test %d",
                         len(self.df_sigtrain), len(self.df_sigtest))
        self.logger.info("Number of bkg candidates: %d and test %d", len(self.df_bkgtrain),
                         len(self.df_bkgtest))

        self.df_xtrain = self.df_mltrain[self.v_train]
        self.df_ytrain = self.df_mltrain[self.v_sig]
        self.df_xtest = self.df_mltest[self.v_train]
        self.df_ytest = self.df_mltest[self.v_sig]
Example no. 11
    def applymodel_hipe4ml(self, file_index):
        for ipt in range(self.p_nptbins):
            if os.path.exists(self.mptfiles_recoskmldec[ipt][file_index]):
                if os.stat(self.mptfiles_recoskmldec[ipt]
                           [file_index]).st_size != 0:
                    continue
            print(self.mptfiles_recosk[ipt][file_index])
            dfrecosk = pickle.load(
                openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
            dfrecosk = seldf_singlevar(dfrecosk, self.v_var_binning,
                                       self.lpt_anbinmintr[ipt],
                                       self.lpt_anbinmaxtr[ipt])
            if self.doml is True:
                if os.path.isfile(self.lpt_modhandler_hipe4ml[ipt]) is False:
                    print("hipe4ml model file not present in bin %d" % ipt)
                if self.b_maskmissing:
                    dfrecosk = dfrecosk.replace(self.v_varstomask,
                                                value=np.nan)
                modhandler = pickle.load(
                    openfile(self.lpt_modhandler_hipe4ml[ipt], 'rb'))
                mod = modhandler.get_original_model()

                #njobsgrid = {'n_jobs': -1}
                #mod.set_params(**njobsgrid)

                if self.mltype == "MultiClassification":
                    dfrecoskml = apply("MultiClassification",
                                       [self.p_modelname], [mod], dfrecosk,
                                       self.v_train[ipt],
                                       self.multiclass_labels)
                    probvar0 = 'y_test_prob' + self.p_modelname + self.multiclass_labels[0]
                    probvar1 = 'y_test_prob' + self.p_modelname + self.multiclass_labels[1]
                    dfrecoskml = dfrecoskml.loc[
                        (dfrecoskml[probvar0] <= self.lpt_probcutpre[ipt][0]) &
                        (dfrecoskml[probvar1] >= self.lpt_probcutpre[ipt][1])]
                else:
                    dfrecoskml = apply("BinaryClassification",
                                       [self.p_modelname], [mod], dfrecosk,
                                       self.v_train[ipt], None)
                    probvar = "y_test_prob" + self.p_modelname
                    dfrecoskml = dfrecoskml.loc[
                        dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
            else:
                dfrecoskml = dfrecosk.query("isstd == 1")
            pickle.dump(dfrecoskml,
                        openfile(self.mptfiles_recoskmldec[ipt][file_index],
                                 "wb"),
                        protocol=4)
            if (self.do_mlprefilter is True or
                    self.apply_w_pkl_layout is True) and self.mcordata == "mc":
                dfgensk = pickle.load(
                    openfile(self.mptfiles_gensk[ipt][file_index], "rb"))
                pickle.dump(dfgensk,
                            openfile(self.mptfiles_genskmldec[ipt][file_index],
                                     "wb"),
                            protocol=4)
Example no. 12
 def applymodel(self, file_index):
     for ipt in range(self.p_nptbins):
         dfrecosk = pickle.load(openfile(self.mptfiles_recosk[ipt][file_index], "rb"))
         if os.path.isfile(self.lpt_model[ipt]) is False:
             print("Model file not present in bin %d" % ipt)
         mod = pickle.load(openfile(self.lpt_model[ipt], 'rb'))
         dfrecoskml = apply("BinaryClassification", [self.p_modelname], [mod],
                            dfrecosk, self.v_train)
         probvar = "y_test_prob" + self.p_modelname
         dfrecoskml = dfrecoskml.loc[dfrecoskml[probvar] > self.lpt_probcutpre[ipt]]
         pickle.dump(dfrecoskml, openfile(self.mptfiles_recoskmldec[ipt][file_index], "wb"),
                     protocol=4)
Example no. 13
    def unpack(self, file_index):
        treeevtorig = uproot.open(self.l_root[file_index])[self.n_treeevt]
        dfevtorig = treeevtorig.pandas.df(branches=self.v_evt)
        dfevtorig = selectdfrunlist(dfevtorig, self.runlist, "run_number")
        dfevtorig = selectdfquery(dfevtorig, self.s_cen_unp)
        dfevtorig = dfevtorig.reset_index(drop=True)
        pickle.dump(dfevtorig, openfile(self.l_evtorig[file_index], "wb"), protocol=4)
        dfevt = selectdfquery(dfevtorig, self.s_good_evt_unp)
        dfevt = dfevt.reset_index(drop=True)
        pickle.dump(dfevt, openfile(self.l_evt[file_index], "wb"), protocol=4)


        treereco = uproot.open(self.l_root[file_index])[self.n_treereco]
        dfreco = treereco.pandas.df(branches=self.v_all)
        dfreco = selectdfrunlist(dfreco, self.runlist, "run_number")
        dfreco = selectdfquery(dfreco, self.s_reco_unp)
        dfreco = pd.merge(dfreco, dfevt, on=self.v_evtmatch)
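        # Keep only candidates inside the fiducial (pt, y) acceptance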
        isselacc = selectfidacc(dfreco.pt_cand.values, dfreco.y_cand.values)
        dfreco = dfreco[np.array(isselacc, dtype=bool)]
        if self.b_trackcuts is not None:
            dfreco = filter_bit_df(dfreco, self.v_bitvar, self.b_trackcuts)
        dfreco[self.v_isstd] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                   self.b_std), dtype=int)
        dfreco = dfreco.reset_index(drop=True)
        if self.mcordata == "mc":
            dfreco[self.v_ismcsignal] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                            self.b_mcsig), dtype=int)
            dfreco[self.v_ismcprompt] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                            self.b_mcsigprompt), dtype=int)
            dfreco[self.v_ismcfd] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                        self.b_mcsigfd), dtype=int)
            dfreco[self.v_ismcbkg] = np.array(tag_bit_df(dfreco, self.v_bitvar,
                                                         self.b_mcbkg), dtype=int)
        pickle.dump(dfreco, openfile(self.l_reco[file_index], "wb"), protocol=4)

        if self.mcordata == "mc":
            treegen = uproot.open(self.l_root[file_index])[self.n_treegen]
            dfgen = treegen.pandas.df(branches=self.v_gen)
            dfgen = selectdfrunlist(dfgen, self.runlist, "run_number")
            dfgen = pd.merge(dfgen, dfevtorig, on=self.v_evtmatch)
            dfgen = selectdfquery(dfgen, self.s_gen_unp)
            dfgen[self.v_isstd] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                      self.b_std), dtype=int)
            dfgen[self.v_ismcsignal] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                           self.b_mcsig), dtype=int)
            dfgen[self.v_ismcprompt] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                           self.b_mcsigprompt), dtype=int)
            dfgen[self.v_ismcfd] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                       self.b_mcsigfd), dtype=int)
            dfgen[self.v_ismcbkg] = np.array(tag_bit_df(dfgen, self.v_bitvar,
                                                        self.b_mcbkg), dtype=int)
            dfgen = dfgen.reset_index(drop=True)
            pickle.dump(dfgen, openfile(self.l_gen[file_index], "wb"), protocol=4)
Example no. 14
    def process_response(self):
        list_df_mc_reco = []
        list_df_mc_gen = []
        for iptskim, _ in enumerate(self.lpt_anbinmin):
            df_mc_reco = pickle.load(
                openfile(self.lpt_recodecmerged[iptskim], "rb"))
            if "pt_jet" not in df_mc_reco.columns:
                print(
                    "Jet variables not found in the dataframe. Skipping process_response."
                )
                return
            if self.s_evtsel is not None:
                df_mc_reco = df_mc_reco.query(self.s_evtsel)
            if self.s_trigger is not None:
                df_mc_reco = df_mc_reco.query(self.s_trigger)
            df_mc_reco = df_mc_reco.query(self.l_selml[iptskim])
            list_df_mc_reco.append(df_mc_reco)
            df_mc_gen = pickle.load(
                openfile(self.lpt_gendecmerged[iptskim], "rb"))
            df_mc_gen = df_mc_gen.query(self.s_presel_gen_eff)
            list_df_mc_gen.append(df_mc_gen)
        df_mc_reco_merged = pd.concat(list_df_mc_reco)
        df_mc_gen_merged = pd.concat(list_df_mc_gen)
        df_mc_reco_merged_fd = df_mc_reco_merged[
            df_mc_reco_merged.ismcfd == 1]  # reconstructed & selected non-prompt jets
        df_mc_gen_merged_fd = df_mc_gen_merged[
            df_mc_gen_merged.ismcfd == 1]  # generated & selected non-prompt jets
        out_file = TFile.Open(self.n_fileeff, "update")

        # Detector response matrix of pt_jet of non-prompt jets
        df_resp_jet_fd = df_mc_reco_merged_fd.loc[:, ["pt_gen_jet", "pt_jet"]]
        his_resp_jet_fd = TH2F("his_resp_jet_fd", \
            "Response matrix of #it{p}_{T}^{jet, ch} of non-prompt jets;#it{p}_{T}^{jet, ch, gen.} (GeV/#it{c});#it{p}_{T}^{jet, ch, rec.} (GeV/#it{c})", \
            100, 0, 100, 100, 0, 100)
        fill_hist(his_resp_jet_fd, df_resp_jet_fd)

        # Simulated pt_cand vs. pt_jet of non-prompt jets
        df_ptc_ptjet_fd = df_mc_gen_merged_fd.loc[:, ["pt_cand", "pt_jet"]]
        n_bins = len(self.lpt_finbinmin)
        analysis_bin_lims_temp = self.lpt_finbinmin.copy()
        analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins - 1])
        analysis_bin_lims = array.array('d', analysis_bin_lims_temp)
        his_ptc_ptjet_fd = TH2F("his_ptc_ptjet_fd", \
            "Simulated #it{p}_{T}^{cand.} vs. #it{p}_{T}^{jet} of non-prompt jets;#it{p}_{T}^{cand., gen.} (GeV/#it{c});#it{p}_{T}^{jet, ch, gen.} (GeV/#it{c})", \
            n_bins, analysis_bin_lims, 100, 0, 100)
        fill_hist(his_ptc_ptjet_fd, df_ptc_ptjet_fd)

        out_file.cd()
        his_resp_jet_fd.Write()
        his_ptc_ptjet_fd.Write()
        out_file.Close()
Example no. 15
 def process_valevents(self, file_index):
     dfevt = pickle.load(openfile(self.l_evtorig[file_index], "rb"))
     dfevt = dfevt.query("is_ev_rej==0")
     triggerlist = ["HighMultV0", "HighMultSPD", "INT7"]
     varlist = ["v0m_corr", "n_tracklets_corr", "perc_v0m"]
     nbinsvar = [100, 200, 100]  # third entry (perc_v0m binning) added to match varlist; values are an assumption
     minrvar = [0, 0, 0]
     maxrvar = [1500, 200, 100]
     fileevtroot = TFile.Open(self.l_evtvalroot[file_index], "recreate")
     hv0mvsperc = scatterplot(dfevt, "perc_v0m", "v0m_corr", 50000, 0, 100,
                              200, 0., 2000.)
     hv0mvsperc.SetName("hv0mvsperc")
     hv0mvsperc.Write()
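     # For each multiplicity estimator, compare the minimum-bias (INT7) sample with the high-multiplicity triggers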
     for ivar, var in enumerate(varlist):
         label = "hbitINT7vs%s" % (var)
         histoMB = TH1F(label, label, nbinsvar[ivar], minrvar[ivar],
                        maxrvar[ivar])
         fill_hist(histoMB, dfevt.query("trigger_hasbit_INT7==1")[var])
         histoMB.Sumw2()
         histoMB.Write()
         for trigger in triggerlist:
             triggerbit = "trigger_hasbit_%s==1" % trigger
             labeltriggerANDMB = "hbit%sANDINT7vs%s" % (trigger, var)
             labeltrigger = "hbit%svs%s" % (trigger, var)
             histotrigANDMB = TH1F(labeltriggerANDMB, labeltriggerANDMB,
                                   nbinsvar[ivar], minrvar[ivar],
                                   maxrvar[ivar])
             histotrig = TH1F(labeltrigger, labeltrigger, nbinsvar[ivar],
                              minrvar[ivar], maxrvar[ivar])
             fill_hist(
                 histotrigANDMB,
                 dfevt.query(triggerbit +
                             " and trigger_hasbit_INT7==1")[var])
             fill_hist(histotrig, dfevt.query(triggerbit)[var])
             histotrigANDMB.Sumw2()
             histotrig.Sumw2()
             histotrigANDMB.Write()
             histotrig.Write()
     dfevtnorm = pickle.load(openfile(self.l_evtorig[file_index], "rb"))
     hNorm = TH1F("hEvForNorm", ";;Normalisation", 2, 0.5, 2.5)
     hNorm.GetXaxis().SetBinLabel(1, "normalisation factor")
     hNorm.GetXaxis().SetBinLabel(2, "selected events")
     nselevt = 0
     norm = 0
     if not dfevtnorm.empty:
         nselevt = len(dfevtnorm.query("is_ev_rej==0"))
         norm = getnormforselevt(dfevtnorm)
     hNorm.SetBinContent(1, norm)
     hNorm.SetBinContent(2, nselevt)
     hNorm.Write()
     fileevtroot.Close()
Example no. 16
    def process_histomass(self):
        myfile = TFile.Open(self.n_filemass, "recreate")

        for ipt in range(self.p_nptfinbins):
            bin_id = self.bin_matching[ipt]
            df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
            df = df.query(self.l_selml[bin_id])
            if self.s_evtsel is not None:
                df = df.query(self.s_evtsel)
            if self.s_trigger is not None:
                df = df.query(self.s_trigger)
            df = seldf_singlevar(df, self.v_var_binning, \
                                 self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            for ibin2 in range(len(self.lvar2_binmin)):
                suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                         (self.v_var_binning, self.lpt_finbinmin[ipt],
                          self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
                          self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                df_bin = seldf_singlevar(df, self.v_var2_binning,
                                         self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                fill_hist(h_invmass, df_bin.inv_mass)
                myfile.cd()
                h_invmass.Write()

                if "pt_jet" in df_bin.columns:
                    zarray = z_calc(df_bin.pt_jet, df_bin.phi_jet, df_bin.eta_jet,
                                    df_bin.pt_cand, df_bin.phi_cand, df_bin.eta_cand)
                    h_zvsinvmass = TH2F("hzvsmass" + suffix, "", 5000, 1.00, 6.00, 2000, -0.5, 1.5)
                    zvsinvmass = np.vstack((df_bin.inv_mass, zarray)).T
                    fill_hist(h_zvsinvmass, zvsinvmass)
                    h_zvsinvmass.Write()
Example no. 17
 def do_test(self):
     df_ml_test = test(self.p_mltype, self.p_classname, self.p_trainedmod,
                       self.df_mltest, self.v_train, self.v_sig)
     df_ml_test_to_df = self.dirmlout+"/testsample_%s_mldecision.pkl" % (self.s_suffix)
     df_ml_test_to_root = self.dirmlout+"/testsample_%s_mldecision.root" % (self.s_suffix)
     pickle.dump(df_ml_test, openfile(df_ml_test_to_df, "wb"), protocol=4)
     write_tree(df_ml_test_to_root, self.n_treetest, df_ml_test)
Example no. 18
    def do_test(self):

        self.do_train()
        if self.step_done("test"):
            self.df_mltest_applied = pickle.load(
                openfile(self.f_mltest_applied, "rb"))
            return

        self.logger.info("Testing")
        self.df_mltest_applied = test(self.p_mltype, self.p_classname,
                                      self.p_trainedmod, self.df_mltest,
                                      self.v_train, self.v_sig)
        df_ml_test_to_root = self.dirmlout + "/testsample_%s_mldecision.root" % (
            self.s_suffix)
        pickle.dump(self.df_mltest_applied,
                    openfile(self.f_mltest_applied, "wb"),
                    protocol=4)
        write_tree(df_ml_test_to_root, self.n_treetest, self.df_mltest_applied)
Example no. 19
    def do_apply(self):

        self.prepare_data_mc_mcgen()

        if self.step_done("application"):
            return

        self.do_train()

        self.logger.info("Application")

        df_data = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                        self.df_data, self.v_train)
        df_mc = apply(self.p_mltype, self.p_classname, self.p_trainedmod,
                      self.df_mc, self.v_train)
        pickle.dump(df_data,
                    openfile(self.f_reco_applieddata, "wb"),
                    protocol=4)
        pickle.dump(df_mc, openfile(self.f_reco_appliedmc, "wb"), protocol=4)
Example no. 20
def check_duplicates(file_path, cols):
    """Open dataframe and check for duplicates
    """

    df = pickle.load(openfile(file_path, "rb"))[cols]
    len_orig = len(df)
    df_dupl = df[df.duplicated(keep=False)]
    len_dupl = len(df_dupl)

    return len_orig, len_dupl, df_dupl
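
A minimal usage sketch (the file path and column list below are placeholders, not taken from the repository):

    # Hypothetical call: count duplicated (run_number, ev_id) pairs in a pickled dataframe
    n_all, n_dupl, df_dupl = check_duplicates("AnalysisResultsReco.pkl", ["run_number", "ev_id"])
    print(n_dupl, "duplicated rows out of", n_all)
    if n_dupl:
        print(df_dupl)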
Example no. 21
def create_bkg_df(size):
    dfreco = pickle.load(openfile("./results_test/filtrated_df.pkl", "rb"))
    print("datasamlpe loaded", dfreco.shape)
    d_bkg = dfreco[dfreco["is_d"] == 1]
    nd_bkg = dfreco[dfreco["is_d"] == 0]

    frames = [d_bkg, nd_bkg]
    bkg = pd.concat(frames)
    bkg["ismcsignal"] = 0
    bkg = bkg.sample(n=size)
    print(bkg)
    return bkg
 def process_histomass(self):
     for ipt in range(self.p_nptbins):
         myfile = TFile.Open(self.lpt_filemass[ipt], "recreate")
         df = pickle.load(openfile(self.lpt_recodecmerged[ipt], "rb"))
         df = df.query(self.l_selml[ipt])
         h_invmass = TH1F("hmass", "", self.p_num_bins,
                          self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
         fill_hist(h_invmass, df.inv_mass)
         myfile.cd()
         h_invmass.Write()
         canv_mass = TCanvas("c%d" % (ipt), "canvas", 500, 500)
         h_invmass.Draw()
         canv_mass.SaveAs("%s/chisto_bin%d.pdf" % (self.d_results, ipt))
Example no. 23
    def process_histomass(self):
        myfile = TFile.Open(self.n_filemass, "recreate")

        for ipt in range(self.p_nptfinbins):
            bin_id = self.bin_matching[ipt]
            df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
            df = df.query(self.l_selml[bin_id])
            if self.s_evtsel is not None:
                df = df.query(self.s_evtsel)
            if self.s_trigger is not None:
                df = df.query(self.s_trigger)
            df = seldf_singlevar(df, self.v_var_binning, \
                                 self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            for ibin2 in range(len(self.lvar2_binmin)):
                suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                         (self.v_var_binning, self.lpt_finbinmin[ipt],
                          self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
                          self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0],
                                 self.p_mass_fit_lim[1])
                df_bin = seldf_singlevar(df, self.v_var2_binning,
                                         self.lvar2_binmin[ibin2],
                                         self.lvar2_binmax[ibin2])
                weights = None
                #apply_weights = self.datap["analysis"][self.typean]["triggersel"]["weights"]
                #if apply_weights is not None:
                #    filenorm = TFile.Open("norm.root", "read")
                #    hnorm = filenorm.Get("hnorm_" + apply_weights[0] + "_" + apply_weights[1])
                #    weights = [hnorm.GetBinContent(hnorm.FindBin(_bin)) \
                #               for _bin in df_bin[apply_weights[0]]]
                fill_hist(h_invmass, df_bin.inv_mass, weights=weights)
                myfile.cd()
                h_invmass.Write()

                if "pt_jet" in df_bin.columns:
                    zarray = z_calc(df_bin.pt_jet, df_bin.phi_jet,
                                    df_bin.eta_jet, df_bin.pt_cand,
                                    df_bin.phi_cand, df_bin.eta_cand)
                    h_zvsinvmass = TH2F("hzvsmass" + suffix, "", 5000, 1.00,
                                        6.00, 2000, -0.5, 1.5)
                    zvsinvmass = np.vstack((df_bin.inv_mass, zarray)).T
                    fill_hist(h_zvsinvmass, zvsinvmass)
                    h_zvsinvmass.Write()
Example no. 24
 def process_histomass(self):
     for ipt in range(self.p_nptbins):
         myfile = TFile.Open(self.lpt_filemass[ipt], "recreate")
         df = pickle.load(openfile(self.lpt_recodecmerged[ipt], "rb"))
         df = df.query(self.l_selml[ipt])
         h_invmass = TH1F("hmass", "", self.p_num_bins,
                          self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
         fill_hist(h_invmass, df.inv_mass)
         myfile.cd()
         if self.usefit is True:
             fitter(h_invmass, self.p_casefit, self.p_sgnfunc[ipt], self.p_bkgfunc[ipt], \
                 self.p_masspeak, self.p_rebin[ipt], self.p_dolike, self.p_fixingausmean, \
                 self.p_fixingaussigma, self.p_sigmaarray[ipt], self.p_massmin[ipt], \
                 self.p_massmax[ipt], self.p_fixedmean, self.p_fixedsigma, self.d_results, \
                 self.v_var_binning, self.lpt_anbinmin[ipt], self.lpt_anbinmax[ipt])
         h_invmass.Write()
         canv_mass = TCanvas("c%d" % (ipt), "canvas", 500, 500)
         h_invmass.Draw()
         canv_mass.SaveAs("%s/chisto_bin%d.pdf" % (self.d_results, ipt))
Example no. 25
    def process_histomass(self):
        myfile = TFile.Open(self.n_filemass, "recreate")

        for ipt in range(self.p_nptfinbins):
            bin_id = self.bin_matching[ipt]
            df = pickle.load(openfile(self.lpt_recodecmerged[bin_id], "rb"))
            df = df.query(self.l_selml[bin_id])
            df = seldf_singlevar(df, self.v_var_binning, \
                                 self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            for ibin2 in range(len(self.lvar2_binmin)):
                suffix = "%s%d_%d_%.2f%s_%d_%d" % \
                         (self.v_var_binning, self.lpt_finbinmin[ipt],
                          self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
                          self.v_var2_binning, self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                df_bin = seldf_singlevar(df, self.v_var2_binning,
                                         self.lvar2_binmin[ibin2], self.lvar2_binmax[ibin2])
                fill_hist(h_invmass, df_bin.inv_mass)
                myfile.cd()
                h_invmass.Write()
def fill_from_pickles(file_paths,
                      histo_params,
                      cols=None,
                      query=None,
                      skim_func=None,
                      skim_func_args=None,
                      queries=None,
                      merge_on=None):

    print(f"Process files {file_paths}")

    dfs = [pickle.load(openfile(f, "rb")) for f in file_paths]
    df = dfs[0]
    if len(dfs) > 1:
        if merge_on and len(merge_on) != len(dfs) - 1:
            print(
                f"ERROR: merge_on must have {len(dfs) - 1} entries but has {len(merge_on)}"
            )
            sys.exit(1)

        for df_, on in zip(dfs[1:], merge_on):
            # Merge the dataframes one after another
            df = pd.merge(df, df_, on=on)

    if query:
        # Apply common query
        df = df.query(query)
    if cols:
        # Keep only the columns needed in the following
        df = df[cols]

    if skim_func:
        # Skim the dataframe according to user function
        df = skim_func(df, skim_func_args)

    histos = []
    if not queries:
        queries = [None] * len(histo_params)

    if len(queries) != len(histo_params):
        print("ERROR: Need as many queries as histogram parameters")
        sys.exit(1)

    for hp, qu in zip(histo_params, queries):
        n_cols = len(hp[0])
        if n_cols > 2:
            print(f"ERROR: Cannot handle plots with dimension > 2")
            sys.exit(1)
        histo_func = TH1F if n_cols == 1 else TH2F

        df_fill = df
        if qu:
            # If there is an additional query for this histogram apply it to dataframe
            df_fill = df.query(qu)

        # Arrange for 1D or 2D plotting
        fill_with = df_fill[hp[0][0]] if n_cols == 1 else df_fill[
            hp[0]].to_numpy()

        histo_name = "_".join(hp[0])
        histo = histo_func(histo_name, histo_name, *hp[1])

        weights = df_fill[hp[2]] if len(hp) == 3 else None
        fill_hist(histo, fill_with, weights=weights)
        histo.SetDirectory(0)
        histos.append(histo)

    return histos
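
A sketch of how fill_from_pickles might be called; the file names, columns and binnings below are placeholders, not taken from the repository. Each entry of histo_params pairs a list of columns with the constructor binning (one column gives a TH1F, two give a TH2F), with an optional third element naming a weight column:

    # Hypothetical call: one 1D and one 2D histogram from two merged dataframes
    histos = fill_from_pickles(
        ["AnalysisResultsReco.pkl", "AnalysisResultsEvt.pkl"],
        [(["inv_mass"], (100, 1.7, 2.1)),
         (["pt_cand", "inv_mass"], (50, 0., 50., 100, 1.7, 2.1))],
        query="is_ev_rej == 0",
        merge_on=[["run_number", "ev_id"]])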
Example no. 27
    def do_significance(self):
        self.logger.info("Doing significance optimization")
        #first extract the number of data events in the ml sample
        self.df_evt_data = pickle.load(openfile(self.f_evt_data, 'rb'))
        if self.p_dofullevtmerge is True:
            self.df_evttotsample_data = pickle.load(openfile(self.f_evttotsample_data, 'rb'))
        else:
            self.logger.info("The total merged event dataframe was not merged \
                             for space limits")
            self.df_evttotsample_data = pickle.load(openfile(self.f_evt_data, 'rb'))
        #and the total number of events
        self.p_nevttot = len(self.df_evttotsample_data)
        self.p_nevtml = len(self.df_evt_data)
        self.logger.info("Number of data events used for ML: %d", self.p_nevtml)
        self.logger.info("Total number of data events: %d", self.p_nevttot)
        #calculate the acceptance correction: here we use all the signal
        #from the MC sample, without restricting to the number of signal
        #events used for training
        denacc = len(self.df_mcgen[self.df_mcgen["ismcprompt"] == 1])
        numacc = len(self.df_mc[self.df_mc["ismcprompt"] == 1])
        acc, acc_err = self.calceff(numacc, denacc)

        self.logger.info("Acceptance: %.3e +/- %.3e", acc, acc_err)
        #calculation of the expected fonll signals
        df_fonll = pd.read_csv(self.f_fonll)
        ptmin = self.p_binmin
        ptmax = self.p_binmax
        df_fonll_in_pt = df_fonll.query('(pt >= @ptmin) and (pt < @ptmax)')[self.p_fonllband]
        prod_cross = df_fonll_in_pt.sum() * self.p_fragf * 1e-12 / len(df_fonll_in_pt)
        delta_pt = ptmax - ptmin
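        # Expected yield: 2 x (average FONLL cross section x fragmentation fraction) x Delta pT x BR
        # x Acc x T_AA / (sigma_MB x f_prompt); the factor 2 presumably accounts for particle + antiparticle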
        signal_yield = 2. * prod_cross * delta_pt * self.p_br * acc * self.p_taa \
                       / (self.p_sigmamb * self.p_fprompt)
        self.logger.info("Expected signal yield: %.3e", signal_yield)
        signal_yield = self.p_raahp * signal_yield
        self.logger.info("Expected signal yield x RAA hp: %.3e", signal_yield)

        #now we plot the fonll expectation
        plt.figure(figsize=(20, 15))
        plt.subplot(111)
        plt.plot(df_fonll['pt'], df_fonll[self.p_fonllband] * self.p_fragf, linewidth=4.0)
        plt.xlabel('P_t [GeV/c]', fontsize=20)
        plt.ylabel('Cross Section [pb/GeV]', fontsize=20)
        plt.title("FONLL cross section " + self.p_case, fontsize=20)
        plt.semilogy()
        plt.savefig(f'{self.dirmlplot}/FONLL_curve_{self.s_suffix}.png')

        df_data_sideband = self.df_data.query(self.s_selbkgml)
        df_data_sideband = shuffle(df_data_sideband, random_state=self.rnd_shuffle)
        df_data_sideband = df_data_sideband.tail(round(len(df_data_sideband) * self.p_bkgfracopt))
        hmass = TH1F('hmass', '', self.p_num_bins, self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        df_mc_signal = self.df_mc[self.df_mc["ismcsignal"] == 1]
        mass_array = df_mc_signal['inv_mass'].values
        for mass_value in np.nditer(mass_array):
            hmass.Fill(mass_value)

        gaus_fit = TF1("gaus_fit", "gaus", self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
        gaus_fit.SetParameter(0, hmass.Integral())
        gaus_fit.SetParameter(1, self.p_mass)
        gaus_fit.SetParameter(2, 0.02)
        self.logger.info("To fit the signal a gaussian function is used")
        fitsucc = hmass.Fit("gaus_fit", "RQ")

        if int(fitsucc) != 0:
            self.logger.warning("Problem in signal peak fit")
            sigma = 0.

        sigma = gaus_fit.GetParameter(2)
        self.logger.info("Mean of the gaussian: %.3e", gaus_fit.GetParameter(1))
        self.logger.info("Sigma of the gaussian: %.3e", sigma)
        sig_region = [self.p_mass - 3 * sigma, self.p_mass + 3 * sigma]
        fig_signif_pevt = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance Per Event ($3 \sigma$)', fontsize=20)
        plt.title("Significance Per Event vs Threshold", fontsize=20)
        fig_signif = plt.figure(figsize=(20, 15))
        plt.xlabel('Threshold', fontsize=20)
        plt.ylabel(r'Significance ($3 \sigma$)', fontsize=20)
        plt.title("Significance vs Threshold", fontsize=20)

        for name in self.p_classname:
            df_sig = self.df_mltest[self.df_mltest["ismcprompt"] == 1]
            eff_array, eff_err_array, x_axis = self.calc_sigeff_steps(self.p_nstepsign,
                                                                      df_sig, name)
            bkg_array, bkg_err_array, _ = calc_bkg(df_data_sideband, name, self.p_nstepsign,
                                                   self.p_mass_fit_lim, self.p_bin_width,
                                                   sig_region, self.p_savefit, self.dirmlplot)
            sig_array = [eff * signal_yield for eff in eff_array]
            sig_err_array = [eff_err * signal_yield for eff_err in eff_err_array]
            bkg_array = [bkg / (self.p_bkgfracopt * self.p_nevtml) for bkg in bkg_array]
            bkg_err_array = [bkg_err / (self.p_bkgfracopt * self.p_nevtml) \
                             for bkg_err in bkg_err_array]
            signif_array, signif_err_array = calc_signif(sig_array, sig_err_array, bkg_array,
                                                         bkg_err_array)
            plt.figure(fig_signif_pevt.number)
            plt.errorbar(x_axis, signif_array, yerr=signif_err_array, alpha=0.3, label=f'{name}',
                         elinewidth=2.5, linewidth=4.0)
            signif_array_ml = [sig * sqrt(self.p_nevtml) for sig in signif_array]
            signif_err_array_ml = [sig_err * sqrt(self.p_nevtml) for sig_err in signif_err_array]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis, signif_array_ml, yerr=signif_err_array_ml, alpha=0.3,
                         label=f'{name}_ML_dataset', elinewidth=2.5, linewidth=4.0)
            signif_array_tot = [sig * sqrt(self.p_nevttot) for sig in signif_array]
            signif_err_array_tot = [sig_err * sqrt(self.p_nevttot) for sig_err in signif_err_array]
            plt.figure(fig_signif.number)
            plt.errorbar(x_axis, signif_array_tot, yerr=signif_err_array_tot, alpha=0.3,
                         label=f'{name}_Tot', elinewidth=2.5, linewidth=4.0)
            plt.figure(fig_signif_pevt.number)
            plt.legend(loc="lower left", prop={'size': 18})
            plt.savefig(f'{self.dirmlplot}/Significance_PerEvent_{self.s_suffix}.png')
            plt.figure(fig_signif.number)
            plt.legend(loc="lower left", prop={'size': 18})
            plt.savefig(f'{self.dirmlplot}/Significance_{self.s_suffix}.png')
Example no. 28
    def process_response(self):
        """
        First of all, we load all the mc gen and reco files that are skimmed
        in bins of HF candidate ptand we apply the standard selection to all
        of them. After this, we merged them all to create a single file of gen
        and reco monte carlo sample with all the HF candidate pt. In particular
        gen jets are selected according to run trigger, runlist, and gen jet
        zbin_recoand pseudorapidity. Reco candidates according to evt selection, eta
        jets, trigger and ml probability of the HF hadron
        """
        zbin_reco = []
        nzbin_reco = self.p_nbinshape_reco
        zbin_reco = self.varshaperanges_reco
        zbinarray_reco = array.array('d', zbin_reco)

        zbin_gen = []
        nzbin_gen = self.p_nbinshape_gen
        zbin_gen = self.varshaperanges_gen
        zbinarray_gen = array.array('d', zbin_gen)

        jetptbin_reco = []
        njetptbin_reco = self.p_nbin2_reco
        jetptbin_reco = self.var2ranges_reco
        jetptbinarray_reco = array.array('d', jetptbin_reco)

        jetptbin_gen = []
        njetptbin_gen = self.p_nbin2_gen
        jetptbin_gen = self.var2ranges_gen
        jetptbinarray_gen = array.array('d', jetptbin_gen)

        candptbin = []
        candptbin = self.lpt_finbinmin.copy()
        candptbin.append(self.lpt_finbinmax[-1])
        candptbinarray = array.array('d', candptbin)

        out_file = TFile.Open(self.n_fileeff, "update")
        list_df_mc_reco = []
        list_df_mc_gen = []

        for iptskim, _ in enumerate(self.lpt_anbinmin):

            df_mc_gen = pickle.load(openfile(self.lpt_gendecmerged[iptskim], "rb"))
            df_mc_gen = selectdfrunlist(df_mc_gen, \
                    self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
            df_mc_gen = df_mc_gen.query(self.s_jetsel_gen)
            list_df_mc_gen.append(df_mc_gen)

            df_mc_reco = pickle.load(openfile(self.lpt_recodecmerged[iptskim], "rb"))
            if self.s_evtsel is not None:
                df_mc_reco = df_mc_reco.query(self.s_evtsel)
            if self.s_jetsel_reco is not None:
                df_mc_reco = df_mc_reco.query(self.s_jetsel_reco)
            if self.s_trigger is not None:
                df_mc_reco = df_mc_reco.query(self.s_trigger)
            if self.doml is True:
                df_mc_reco = df_mc_reco.query(self.l_selml[iptskim])
            list_df_mc_reco.append(df_mc_reco)

        # Here we merge the dataframes corresponding to the different HF pt bins
        # into a single one. Below, prompt and non-prompt HF candidates are selected separately.

        df_gen = pd.concat(list_df_mc_gen)
        df_mc_reco = pd.concat(list_df_mc_reco)

        # add the z columns
        df_gen["z"] = z_calc(df_gen.pt_jet, df_gen.phi_jet, df_gen.eta_jet,
                             df_gen.pt_cand, df_gen.phi_cand, df_gen.eta_cand)

        df_mc_reco["z"] = z_calc(df_mc_reco.pt_jet, df_mc_reco.phi_jet, df_mc_reco.eta_jet,
                                 df_mc_reco.pt_cand, df_mc_reco.phi_cand, df_mc_reco.eta_cand)

        df_mc_reco["z_gen"] = z_gen_calc(df_mc_reco.pt_gen_jet, df_mc_reco.phi_gen_jet,
                                         df_mc_reco.eta_gen_jet, df_mc_reco.pt_gen_cand,
                                         df_mc_reco.delta_phi_gen_jet, df_mc_reco.delta_eta_gen_jet)

        df_gen_nonprompt = df_gen[df_gen.ismcfd == 1]
        df_gen_prompt = df_gen[df_gen.ismcprompt == 1]
        df_mc_reco_merged_nonprompt = df_mc_reco[df_mc_reco.ismcfd == 1]
        df_mc_reco_merged_prompt = df_mc_reco[df_mc_reco.ismcprompt == 1]

        # The following are 3D plots, all at generated level, of z, pt_jet and
        # pt_cand. They were used in the first version of the feed-down
        # subtraction and are now obsolete.

        hzvsjetpt_gen_unmatched = TH2F("hzvsjetpt_gen_unmatched", "hzvsjetpt_gen_unmatched", \
            nzbin_gen, zbinarray_gen, njetptbin_gen, jetptbinarray_gen)
        df_zvsjetpt_gen_unmatched = df_gen_prompt.loc[:, [self.v_varshape_binning, "pt_jet"]]
        fill_hist(hzvsjetpt_gen_unmatched, df_zvsjetpt_gen_unmatched)
        hzvsjetpt_gen_unmatched.Write()
        titlehist = "hzvsjetptvscandpt_gen_nonprompt"
        hzvsjetptvscandpt_gen_nonprompt = makefill3dhist(df_gen_nonprompt, titlehist, \
            zbinarray_gen, jetptbinarray_gen, candptbinarray, self.v_varshape_binning, "pt_jet", "pt_cand")
        hzvsjetptvscandpt_gen_nonprompt.Write()

        # hz_gen_nocuts is the distribution of generated z values in bins of
        # gen-jet pt, before the reco z and jet-pt selection. hz_gen_cuts also
        # includes the cuts on reco z and reco jet pt. These are used for the
        # overall efficiency correction, to estimate the fraction of candidates
        # that are in the reco range but outside the gen range and vice versa.

        for ibin2 in range(self.p_nbin2_gen):
            suffix = "%s_%.2f_%.2f" % \
                (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hz_gen_nocuts = TH1F("hz_gen_nocuts_nonprompt" + suffix, \
                "hz_gen_nocuts_nonprompt" + suffix, nzbin_gen, zbinarray_gen)
            hz_gen_nocuts.Sumw2()
            hz_gen_cuts = TH1F("hz_gen_cuts_nonprompt" + suffix,
                               "hz_gen_cuts_nonprompt" + suffix, nzbin_gen, zbinarray_gen)
            hz_gen_cuts.Sumw2()

            df_tmp = seldf_singlevar(df_mc_reco_merged_nonprompt, "pt_gen_jet", \
                                     self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            df_tmp = seldf_singlevar(df_tmp, self.v_varshape_binning_gen, \
                                     self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1])
            fill_hist(hz_gen_nocuts, df_tmp[self.v_varshape_binning_gen])
            df_tmp = seldf_singlevar(df_tmp, "pt_jet",
                                     self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1])
            df_tmp = seldf_singlevar(df_tmp, self.v_varshape_binning,
                                     self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])
            fill_hist(hz_gen_cuts, df_tmp[self.v_varshape_binning_gen])
            hz_gen_cuts.Write()
            hz_gen_nocuts.Write()

            # Addendum for unfolding
            hz_gen_nocuts_pr = TH1F("hz_gen_nocuts" + suffix, \
                "hz_gen_nocuts" + suffix, nzbin_gen, zbinarray_gen)
            hz_gen_nocuts_pr.Sumw2()
            hz_gen_cuts_pr = TH1F("hz_gen_cuts" + suffix,
                                  "hz_gen_cuts" + suffix, nzbin_gen, zbinarray_gen)
            hz_gen_cuts_pr.Sumw2()
            df_tmp_pr = seldf_singlevar(df_mc_reco_merged_prompt, "pt_gen_jet", \
                                     self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            df_tmp_pr = seldf_singlevar(df_tmp_pr, self.v_varshape_binning_gen, \
                                     self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1])
            fill_hist(hz_gen_nocuts_pr, df_tmp_pr[self.v_varshape_binning_gen])
            df_tmp_pr = seldf_singlevar(df_tmp_pr, "pt_jet",
                                        self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1])
            df_tmp_pr = seldf_singlevar(df_tmp_pr, self.v_varshape_binning,
                                        self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])
            fill_hist(hz_gen_cuts_pr, df_tmp_pr[self.v_varshape_binning_gen])
            hz_gen_cuts_pr.Write()
            hz_gen_nocuts_pr.Write()
            # End addendum for unfolding


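        # create_df_closure (a method of this class) returns three selections of the
        # matched reco-gen candidates: entries inside the gen-level (z, jet pt) ranges,
        # entries inside the reco-level ranges, and entries inside both; the naming
        # below follows how each dataframe is used to fill the histograms.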
        df_tmp_selgen, df_tmp_selreco, df_tmp_selrecogen = \
                self.create_df_closure(df_mc_reco_merged_nonprompt)

        df_tmp_selgen_pr, df_tmp_selreco_pr, df_tmp_selrecogen_pr = \
                self.create_df_closure(df_mc_reco_merged_prompt)

        # histograms for response of feeddown
        hzvsjetpt_reco_nocuts = \
            build2dhisto("hzvsjetpt_reco_nocuts_nonprompt", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_reco_cuts = \
            build2dhisto("hzvsjetpt_reco_cuts_nonprompt", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_gen_nocuts = \
            build2dhisto("hzvsjetpt_gen_nocuts_nonprompt", zbinarray_gen, jetptbinarray_gen)
        hzvsjetpt_gen_cuts = \
            build2dhisto("hzvsjetpt_gen_cuts_nonprompt", zbinarray_gen, jetptbinarray_gen)

        hzvsjetpt_reco = hzvsjetpt_reco_nocuts.Clone("hzvsjetpt_reco_nonprompt")
        hzvsjetpt_gen = hzvsjetpt_gen_nocuts.Clone("hzvsjetpt_gen_nonprompt")
        response_matrix = RooUnfoldResponse(hzvsjetpt_reco, hzvsjetpt_gen)

        fill2dhist(df_tmp_selreco, hzvsjetpt_reco_nocuts, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selgen, hzvsjetpt_gen_nocuts, self.v_varshape_binning_gen, "pt_gen_jet")
        fill2dhist(df_tmp_selrecogen, hzvsjetpt_reco_cuts, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selrecogen, hzvsjetpt_gen_cuts, self.v_varshape_binning_gen, "pt_gen_jet")

        hzvsjetpt_reco_nocuts.Write()
        hzvsjetpt_gen_nocuts.Write()
        hzvsjetpt_reco_cuts.Write()
        hzvsjetpt_gen_cuts.Write()

        # histograms for unfolding
        hzvsjetpt_reco_nocuts_pr = \
            build2dhisto("hzvsjetpt_reco_nocuts", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_reco_cuts_pr = \
            build2dhisto("hzvsjetpt_reco_cuts", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_gen_nocuts_pr = \
            build2dhisto("hzvsjetpt_gen_nocuts", zbinarray_gen, jetptbinarray_gen)
        hzvsjetpt_gen_cuts_pr = \
            build2dhisto("hzvsjetpt_gen_cuts", zbinarray_gen, jetptbinarray_gen)

        fill2dhist(df_tmp_selreco_pr, hzvsjetpt_reco_nocuts_pr, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selgen_pr, hzvsjetpt_gen_nocuts_pr, self.v_varshape_binning_gen, "pt_gen_jet")
        fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_reco_cuts_pr, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_gen_cuts_pr, self.v_varshape_binning_gen, "pt_gen_jet")
        hzvsjetpt_reco_nocuts_pr.Write()
        hzvsjetpt_gen_nocuts_pr.Write()
        hzvsjetpt_reco_cuts_pr.Write()
        hzvsjetpt_gen_cuts_pr.Write()

        hzvsjetpt_reco_closure_pr = \
            build2dhisto("hzvsjetpt_reco_closure", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_gen_closure_pr = \
            build2dhisto("hzvsjetpt_gen_closure", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_reco_pr = \
            build2dhisto("hzvsjetpt_reco", zbinarray_reco, jetptbinarray_reco)
        hzvsjetpt_gen_pr = \
            build2dhisto("hzvsjetpt_gen", zbinarray_gen, jetptbinarray_gen)
        response_matrix_pr = RooUnfoldResponse(hzvsjetpt_reco_pr, hzvsjetpt_gen_pr)
        response_matrix_closure_pr = RooUnfoldResponse(hzvsjetpt_reco_pr, hzvsjetpt_gen_pr)

        fill2dhist(df_tmp_selreco_pr, hzvsjetpt_reco_pr, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selgen_pr, hzvsjetpt_gen_pr, self.v_varshape_binning_gen, "pt_gen_jet")
        hzvsjetpt_reco_pr.Write()
        hzvsjetpt_gen_pr.Write()

        hjetpt_gen_nocuts_pr = TH1F("hjetpt_gen_nocuts", \
            "hjetpt_gen_nocuts", njetptbin_gen, jetptbinarray_gen)
        hjetpt_gen_cuts_pr = TH1F("hjetpt_gen_cuts", \
            "hjetpt_gen_cuts", njetptbin_gen, jetptbinarray_gen)
        hjetpt_gen_nocuts_closure = TH1F("hjetpt_gen_nocuts_closure", \
            "hjetpt_gen_nocuts_closure", njetptbin_gen, jetptbinarray_gen)
        hjetpt_gen_cuts_closure = TH1F("hjetpt_gen_cuts_closure", \
            "hjetpt_gen_cuts_closure", njetptbin_gen, jetptbinarray_gen)
        hjetpt_gen_nocuts_pr.Sumw2()
        hjetpt_gen_cuts_pr.Sumw2()
        hjetpt_gen_nocuts_closure.Sumw2()
        hjetpt_gen_cuts_closure.Sumw2()

        fill_hist(hjetpt_gen_nocuts_pr, df_tmp_selgen_pr["pt_gen_jet"])
        fill_hist(hjetpt_gen_cuts_pr, df_tmp_selrecogen_pr["pt_gen_jet"])
        hjetpt_gen_nocuts_pr.Write()
        hjetpt_gen_cuts_pr.Write()
        # end of histograms for unfolding

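        # Finely binned gen-vs-reco migration histograms for jet pt and z of the
        # non-prompt sample, normalised to unit integral below.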
        hjetpt_genvsreco_full = \
            TH2F("hjetpt_genvsreco_full_nonprompt", "hjetpt_genvsreco_full_nonprompt", \
            njetptbin_gen * 100, self.lvar2_binmin_gen[0], self.lvar2_binmax_gen[-1], \
            njetptbin_reco * 100, self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1])

        hz_genvsreco_full = \
            TH2F("hz_genvsreco_full_nonprompt", "hz_genvsreco_full_nonprompt", \
                 nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1],
                 nzbin_reco * 100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])

        fill2dhist(df_tmp_selrecogen, hjetpt_genvsreco_full, "pt_gen_jet", "pt_jet")
        norm = hjetpt_genvsreco_full.Integral(1, -1, 1, -1)
        if norm > 0:
            hjetpt_genvsreco_full.Scale(1.0 / norm)
        hjetpt_genvsreco_full.Write()
        fill2dhist(df_tmp_selrecogen, hz_genvsreco_full, self.v_varshape_binning_gen, self.v_varshape_binning)
        norm = hz_genvsreco_full.Integral(1, -1, 1, -1)
        if norm > 0:
            hz_genvsreco_full.Scale(1.0 / norm)
        hz_genvsreco_full.Write()
        for row in df_tmp_selrecogen.itertuples():
            response_matrix.Fill(getattr(row, self.v_varshape_binning), row.pt_jet, getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet)
        response_matrix.Write("response_matrix_nonprompt")

        # histograms for unfolding
        hjetpt_genvsreco_full_pr = \
            TH2F("hjetpt_genvsreco_full", "hjetpt_genvsreco_full", \
            njetptbin_gen * 100, self.lvar2_binmin_gen[0], self.lvar2_binmax_gen[-1], \
            njetptbin_reco * 100, self.lvar2_binmin_reco[0], self.lvar2_binmax_reco[-1])

        hz_genvsreco_full_pr = \
            TH2F("hz_genvsreco_full", "hz_genvsreco_full", \
                 nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1],
                 nzbin_reco * 100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])
        fill2dhist(df_tmp_selrecogen_pr, hjetpt_genvsreco_full_pr, "pt_gen_jet", "pt_jet")
        norm_pr = hjetpt_genvsreco_full_pr.Integral(1, -1, 1, -1)
        if norm_pr > 0:
            hjetpt_genvsreco_full_pr.Scale(1.0 / norm_pr)
        hjetpt_genvsreco_full_pr.Write()
        fill2dhist(df_tmp_selrecogen_pr, hz_genvsreco_full_pr, self.v_varshape_binning_gen, self.v_varshape_binning)
        norm_pr = hz_genvsreco_full_pr.Integral(1, -1, 1, -1)
        if norm_pr > 0:
            hz_genvsreco_full_pr.Scale(1.0 / norm_pr)
        hz_genvsreco_full_pr.Write()


        hzvsjetpt_prior_weights = build2dhisto("hzvsjetpt_prior_weights", \
            zbinarray_gen, jetptbinarray_gen)
        fill2dhist(df_tmp_selrecogen_pr, hzvsjetpt_prior_weights, self.v_varshape_binning_gen, "pt_gen_jet")
        # end of histograms for unfolding

        for ibin2 in range(self.p_nbin2_reco):
            df_tmp_selrecogen_jetbin = seldf_singlevar(df_tmp_selrecogen, "pt_jet", \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            suffix = "%s_%.2f_%.2f" % (self.v_var2_binning, \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            hz_genvsreco = TH2F("hz_genvsreco_nonprompt" + suffix, "hz_genvsreco_nonprompt" + suffix, \
                nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], \
                nzbin_reco*100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])
            fill2dhist(df_tmp_selrecogen_jetbin, hz_genvsreco, self.v_varshape_binning_gen, self.v_varshape_binning)
            norm = hz_genvsreco.Integral(1, -1, 1, -1)
            if norm > 0:
                hz_genvsreco.Scale(1.0/norm)
            hz_genvsreco.Write()

            df_tmp_selrecogen_pr_jetbin = seldf_singlevar(df_tmp_selrecogen_pr, "pt_jet", \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            suffix = "%s_%.2f_%.2f" % (self.v_var2_binning, \
                self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
            hz_genvsreco_pr = TH2F("hz_genvsreco" + suffix, "hz_genvsreco" + suffix, \
                nzbin_gen * 100, self.lvarshape_binmin_gen[0], self.lvarshape_binmax_gen[-1], \
                nzbin_reco*100, self.lvarshape_binmin_reco[0], self.lvarshape_binmax_reco[-1])
            fill2dhist(df_tmp_selrecogen_pr_jetbin, hz_genvsreco_pr, self.v_varshape_binning_gen, self.v_varshape_binning)
            norm_pr = hz_genvsreco_pr.Integral(1, -1, 1, -1)
            if norm_pr > 0:
                hz_genvsreco_pr.Scale(1.0/norm_pr)
            hz_genvsreco_pr.Write()

        for ibinshape in range(len(self.lvarshape_binmin_reco)):
            df_tmp_selrecogen_zbin = seldf_singlevar(df_tmp_selrecogen, self.v_varshape_binning, \
                self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape])
            suffix = "%s_%.2f_%.2f" % \
                (self.v_varshape_binning, self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape])
            hjetpt_genvsreco = TH2F("hjetpt_genvsreco_nonprompt" + suffix, \
                "hjetpt_genvsreco_nonprompt" + suffix, njetptbin_gen * 100, self.lvar2_binmin_gen[0], \
                self.lvar2_binmax_gen[-1], njetptbin_reco * 100, self.lvar2_binmin_reco[0], \
                self.lvar2_binmax_reco[-1])
            fill2dhist(df_tmp_selrecogen_zbin, hjetpt_genvsreco, "pt_gen_jet", "pt_jet")
            norm = hjetpt_genvsreco.Integral(1, -1, 1, -1)
            if norm > 0:
                hjetpt_genvsreco.Scale(1.0/norm)
            hjetpt_genvsreco.Write()

            df_tmp_selrecogen_pr_zbin = seldf_singlevar(df_tmp_selrecogen_pr, self.v_varshape_binning, \
                self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape])
            suffix = "%s_%.2f_%.2f" % \
                (self.v_varshape_binning, self.lvarshape_binmin_reco[ibinshape], self.lvarshape_binmax_reco[ibinshape])
            hjetpt_genvsreco_pr = TH2F("hjetpt_genvsreco" + suffix, \
                "hjetpt_genvsreco" + suffix, njetptbin_gen * 100, self.lvar2_binmin_gen[0], \
                self.lvar2_binmax_gen[-1], njetptbin_reco * 100, self.lvar2_binmin_reco[0], \
                self.lvar2_binmax_reco[-1])
            fill2dhist(df_tmp_selrecogen_pr_zbin, hjetpt_genvsreco_pr, "pt_gen_jet", "pt_jet")
            norm_pr = hjetpt_genvsreco_pr.Integral(1, -1, 1, -1)
            if norm_pr > 0:
                hjetpt_genvsreco_pr.Scale(1.0/norm_pr)
            hjetpt_genvsreco_pr.Write()

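        # Relative residuals (reco - gen)/gen of z and jet pt in bins of the generated
        # quantity: a simple check of the detector resolution for non-prompt and prompt
        # candidates.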
        for ibinshape in range(len(self.lvarshape_binmin_gen)):
            dtmp_nonprompt_zgen = seldf_singlevar(df_mc_reco_merged_nonprompt, \
                self.v_varshape_binning_gen, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape])
            suffix = "%s_%.2f_%.2f" % \
                     (self.v_varshape_binning, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape])
            hz_fracdiff = TH1F("hz_fracdiff_nonprompt" + suffix,
                               "hz_fracdiff_nonprompt" + suffix, 100, -2, 2)
            fill_hist(hz_fracdiff, (dtmp_nonprompt_zgen[self.v_varshape_binning] - \
                    dtmp_nonprompt_zgen[self.v_varshape_binning_gen])/dtmp_nonprompt_zgen[self.v_varshape_binning_gen])
            norm = hz_fracdiff.Integral(1, -1)
            if norm:
                hz_fracdiff.Scale(1.0 / norm)
            hz_fracdiff.Write()

            dtmp_prompt_zgen = seldf_singlevar(df_mc_reco_merged_prompt, \
                self.v_varshape_binning_gen, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape])
            suffix = "%s_%.2f_%.2f" % \
                     (self.v_varshape_binning, self.lvarshape_binmin_gen[ibinshape], self.lvarshape_binmax_gen[ibinshape])
            hz_fracdiff_pr = TH1F("hz_fracdiff_prompt" + suffix,
                                  "hz_fracdiff_prompt" + suffix, 100, -2, 2)
            fill_hist(hz_fracdiff_pr, (dtmp_prompt_zgen[self.v_varshape_binning] - \
                    dtmp_prompt_zgen[self.v_varshape_binning_gen])/dtmp_prompt_zgen[self.v_varshape_binning_gen])
            norm_pr = hz_fracdiff_pr.Integral(1, -1)
            if norm_pr:
                hz_fracdiff_pr.Scale(1.0 / norm_pr)
            hz_fracdiff_pr.Write()

        for ibin2 in range(self.p_nbin2_gen):
            dtmp_nonprompt_jetptgen = seldf_singlevar(df_mc_reco_merged_nonprompt, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            suffix = "%s_%.2f_%.2f" % (self.v_var2_binning,
                                       self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hjetpt_fracdiff = TH1F("hjetpt_fracdiff_nonprompt" + suffix,
                                   "hjetpt_fracdiff_nonprompt" + suffix, 100, -2, 2)
            fill_hist(hjetpt_fracdiff, (dtmp_nonprompt_jetptgen["pt_jet"] - \
                dtmp_nonprompt_jetptgen["pt_gen_jet"])/dtmp_nonprompt_jetptgen["pt_gen_jet"])
            norm = hjetpt_fracdiff.Integral(1, -1)
            if norm:
                hjetpt_fracdiff.Scale(1.0 / norm)
            hjetpt_fracdiff.Write()

            dtmp_prompt_jetptgen = seldf_singlevar(df_mc_reco_merged_prompt, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            suffix = "%s_%.2f_%.2f" % (self.v_var2_binning,
                                       self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hjetpt_fracdiff_pr = TH1F("hjetpt_fracdiff_prompt" + suffix,
                                      "hjetpt_fracdiff_prompt" + suffix, 100, -2, 2)
            fill_hist(hjetpt_fracdiff_pr, (dtmp_prompt_jetptgen["pt_jet"] - \
                dtmp_prompt_jetptgen["pt_gen_jet"])/dtmp_prompt_jetptgen["pt_gen_jet"])
            norm_pr = hjetpt_fracdiff_pr.Integral(1, -1)
            if norm_pr:
                hjetpt_fracdiff_pr.Scale(1.0 / norm_pr)
            hjetpt_fracdiff_pr.Write()

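        # MC closure test: split the prompt sample so that a fraction closure_frac is
        # kept aside as an independent "test" set (written below as the closure input),
        # while the remaining "train" set fills the closure response matrix.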
        df_mc_reco_merged_prompt_train, df_mc_reco_merged_prompt_test = \
                train_test_split(df_mc_reco_merged_prompt, test_size=self.closure_frac)
        df_tmp_selgen_pr_test, df_tmp_selreco_pr_test, df_tmp_selrecogen_pr_test = \
                self.create_df_closure(df_mc_reco_merged_prompt_test)
        _, _, df_tmp_selrecogen_pr_train = \
                self.create_df_closure(df_mc_reco_merged_prompt_train)

        fill2dhist(df_tmp_selreco_pr_test, hzvsjetpt_reco_closure_pr, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selgen_pr_test, hzvsjetpt_gen_closure_pr, self.v_varshape_binning_gen, "pt_gen_jet")
        hzvsjetpt_reco_closure_pr.Write("input_closure_reco")
        hzvsjetpt_gen_closure_pr.Write("input_closure_gen")


        for ibin2 in range(self.p_nbin2_gen):
            suffix = "%s_%.2f_%.2f" % \
                (self.v_var2_binning, self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            hz_gen_nocuts_closure = TH1F("hz_gen_nocuts_closure" + suffix,
                                         "hz_gen_nocuts_closure" + suffix,
                                         nzbin_gen, zbinarray_gen)
            hz_gen_nocuts_closure.Sumw2()
            hz_gen_cuts_closure = TH1F("hz_gen_cuts_closure" + suffix,
                                       "hz_gen_cuts_closure" + suffix,
                                       nzbin_gen, zbinarray_gen)
            hz_gen_cuts_closure.Sumw2()
            df_tmp_selgen_pr_test_bin = seldf_singlevar(df_tmp_selgen_pr_test, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            df_tmp_selrecogen_pr_test_bin = seldf_singlevar(df_tmp_selrecogen_pr_test, \
                "pt_gen_jet", self.lvar2_binmin_gen[ibin2], self.lvar2_binmax_gen[ibin2])
            fill_hist(hz_gen_nocuts_closure, df_tmp_selgen_pr_test_bin[self.v_varshape_binning_gen])
            fill_hist(hz_gen_cuts_closure, df_tmp_selrecogen_pr_test_bin[self.v_varshape_binning_gen])
            hz_gen_cuts_closure.Write()
            hz_gen_nocuts_closure.Write()

        fill_hist(hjetpt_gen_nocuts_closure, df_tmp_selgen_pr_test["pt_gen_jet"])
        fill_hist(hjetpt_gen_cuts_closure, df_tmp_selrecogen_pr_test["pt_gen_jet"])
        hjetpt_gen_nocuts_closure.Write()
        hjetpt_gen_cuts_closure.Write()

        hzvsjetpt_reco_nocuts_closure = TH2F("hzvsjetpt_reco_nocuts_closure",
                                             "hzvsjetpt_reco_nocuts_closure",
                                             nzbin_reco, zbinarray_reco,
                                             njetptbin_reco, jetptbinarray_reco)
        hzvsjetpt_reco_nocuts_closure.Sumw2()
        hzvsjetpt_reco_cuts_closure = TH2F("hzvsjetpt_reco_cuts_closure",
                                           "hzvsjetpt_reco_cuts_closure",
                                           nzbin_reco, zbinarray_reco,
                                           njetptbin_reco, jetptbinarray_reco)
        hzvsjetpt_reco_cuts_closure.Sumw2()

        fill2dhist(df_tmp_selreco_pr_test, hzvsjetpt_reco_nocuts_closure, self.v_varshape_binning, "pt_jet")
        fill2dhist(df_tmp_selrecogen_pr_test, hzvsjetpt_reco_cuts_closure, self.v_varshape_binning, "pt_jet")
        hzvsjetpt_reco_nocuts_closure.Write()
        hzvsjetpt_reco_cuts_closure.Write()

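        # Fill the prompt response matrices candidate by candidate. When doprior is
        # enabled, each entry is weighted by the inverse of the gen-level (z, jet pt)
        # content of hzvsjetpt_prior_weights, which effectively flattens the MC prior
        # used in the unfolding.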
        for row in df_tmp_selrecogen_pr.itertuples():
            response_matrix_weight = 1.0
            if self.doprior is True:
                binx = hzvsjetpt_prior_weights.GetXaxis().FindBin(getattr(row, self.v_varshape_binning_gen))
                biny = hzvsjetpt_prior_weights.GetYaxis().FindBin(row.pt_gen_jet)
                weight = hzvsjetpt_prior_weights.GetBinContent(binx, biny)

                if weight > 0.0:
                    response_matrix_weight = 1.0/weight
            response_matrix_pr.Fill(getattr(row, self.v_varshape_binning), row.pt_jet,\
                getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet, response_matrix_weight)
        for row in df_tmp_selrecogen_pr_train.itertuples():
            response_matrix_weight = 1.0
            if self.doprior is True:
                binx = hzvsjetpt_prior_weights.GetXaxis().FindBin(getattr(row, self.v_varshape_binning_gen))
                biny = hzvsjetpt_prior_weights.GetYaxis().FindBin(row.pt_gen_jet)
                weight = hzvsjetpt_prior_weights.GetBinContent(binx, biny)

                if weight > 0.0:
                    response_matrix_weight = 1.0/weight
            response_matrix_closure_pr.Fill(getattr(row, self.v_varshape_binning), row.pt_jet,\
                getattr(row, self.v_varshape_binning_gen), row.pt_gen_jet, response_matrix_weight)
        response_matrix_pr.Write("response_matrix")
        response_matrix_closure_pr.Write("response_matrix_closure")

        out_file.Close()
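
The response matrices written above ("response_matrix" and "response_matrix_nonprompt") are applied in a later analysis step. As a minimal, illustrative sketch (not the package's own analysis code), a measured (z, jet pt) distribution could be unfolded with RooUnfold's iterative Bayesian method as follows; the file paths, the "hzvsjetpt_measured" key and the number of iterations are placeholders.

# Minimal usage sketch, assuming RooUnfold is available in the ROOT installation.
from ROOT import TFile, RooUnfoldBayes  # pylint: disable=no-name-in-module

def unfold_zvsjetpt(path_response, path_data, n_iterations=4):
    file_response = TFile.Open(path_response)
    response = file_response.Get("response_matrix")  # prompt response written above
    file_data = TFile.Open(path_data)
    # hypothetical key: a TH2 of the measured z vs jet pt with the reco binning
    h_measured = file_data.Get("hzvsjetpt_measured")
    unfolder = RooUnfoldBayes(response, h_measured, n_iterations)
    h_unfolded = unfolder.Hreco()  # unfolded (z, jet pt) distribution at gen level
    return h_unfolded
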
Esempio n. 29
0
    def process_efficiency_single(self, index):
        out_file = TFile.Open(self.l_histoeff[index], "recreate")
        for ibin2 in range(self.p_nbin2_reco):
            stringbin2 = "_%s_%.2f_%.2f" % (self.v_var2_binning, \
                                        self.lvar2_binmin_reco[ibin2], \
                                        self.lvar2_binmax_reco[ibin2])
            n_bins = self.p_nptfinbins
            analysis_bin_lims_temp = self.lpt_finbinmin.copy()
            analysis_bin_lims_temp.append(self.lpt_finbinmax[n_bins-1])
            analysis_bin_lims = array.array('f', analysis_bin_lims_temp)
            h_gen_pr = TH1F("h_gen_pr" + stringbin2, "Prompt Generated in acceptance |y|<0.5", \
                            n_bins, analysis_bin_lims)
            h_presel_pr = TH1F("h_presel_pr" + stringbin2, "Prompt Reco in acc |#eta|<0.8 and sel", \
                               n_bins, analysis_bin_lims)
            h_sel_pr = TH1F("h_sel_pr" + stringbin2, "Prompt Reco and sel in acc |#eta|<0.8 and sel", \
                            n_bins, analysis_bin_lims)
            h_gen_fd = TH1F("h_gen_fd" + stringbin2, "FD Generated in acceptance |y|<0.5", \
                            n_bins, analysis_bin_lims)
            h_presel_fd = TH1F("h_presel_fd" + stringbin2, "FD Reco in acc |#eta|<0.8 and sel", \
                               n_bins, analysis_bin_lims)
            h_sel_fd = TH1F("h_sel_fd" + stringbin2, "FD Reco and sel in acc |#eta|<0.8 and sel", \
                            n_bins, analysis_bin_lims)

            bincounter = 0
            for ipt in range(self.p_nptfinbins):
                bin_id = self.bin_matching[ipt]
                df_mc_reco = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
                if self.s_evtsel is not None:
                    df_mc_reco = df_mc_reco.query(self.s_evtsel)
                if self.s_jetsel_reco is not None:
                    df_mc_reco = df_mc_reco.query(self.s_jetsel_reco)
                if self.s_trigger is not None:
                    df_mc_reco = df_mc_reco.query(self.s_trigger)
                df_mc_reco = selectdfrunlist(df_mc_reco, \
                         self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
                df_mc_gen = pickle.load(openfile(self.mptfiles_gensk[bin_id][index], "rb"))
                df_mc_gen = df_mc_gen.query(self.s_jetsel_gen)
                df_mc_gen = selectdfrunlist(df_mc_gen, \
                         self.run_param[self.runlistrigger[self.triggerbit]], "run_number")
                df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var_binning, \
                                     self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
                df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var_binning, \
                                     self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
                df_mc_reco = seldf_singlevar(df_mc_reco, self.v_var2_binning, \
                                             self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
                df_mc_gen = seldf_singlevar(df_mc_gen, self.v_var2_binning, \
                                            self.lvar2_binmin_reco[ibin2], self.lvar2_binmax_reco[ibin2])
                df_gen_sel_pr = df_mc_gen[df_mc_gen.ismcprompt == 1]
                df_reco_presel_pr = df_mc_reco[df_mc_reco.ismcprompt == 1]
                df_reco_sel_pr = None
                if self.doml is True:
                    df_reco_sel_pr = df_reco_presel_pr.query(self.l_selml[bin_id])
                else:
                    df_reco_sel_pr = df_reco_presel_pr.copy()
                df_gen_sel_fd = df_mc_gen[df_mc_gen.ismcfd == 1]
                df_reco_presel_fd = df_mc_reco[df_mc_reco.ismcfd == 1]
                df_reco_sel_fd = None
                if self.doml is True:
                    df_reco_sel_fd = df_reco_presel_fd.query(self.l_selml[bin_id])
                else:
                    df_reco_sel_fd = df_reco_presel_fd.copy()

                val = len(df_gen_sel_pr)
                err = math.sqrt(val)
                h_gen_pr.SetBinContent(bincounter + 1, val)
                h_gen_pr.SetBinError(bincounter + 1, err)
                val = len(df_reco_presel_pr)
                err = math.sqrt(val)
                h_presel_pr.SetBinContent(bincounter + 1, val)
                h_presel_pr.SetBinError(bincounter + 1, err)
                val = len(df_reco_sel_pr)
                err = math.sqrt(val)
                h_sel_pr.SetBinContent(bincounter + 1, val)
                h_sel_pr.SetBinError(bincounter + 1, err)

                val = len(df_gen_sel_fd)
                err = math.sqrt(val)
                h_gen_fd.SetBinContent(bincounter + 1, val)
                h_gen_fd.SetBinError(bincounter + 1, err)
                val = len(df_reco_presel_fd)
                err = math.sqrt(val)
                h_presel_fd.SetBinContent(bincounter + 1, val)
                h_presel_fd.SetBinError(bincounter + 1, err)
                val = len(df_reco_sel_fd)
                err = math.sqrt(val)
                h_sel_fd.SetBinContent(bincounter + 1, val)
                h_sel_fd.SetBinError(bincounter + 1, err)

                bincounter = bincounter + 1

            out_file.cd()
            h_gen_pr.Write()
            h_presel_pr.Write()
            h_sel_pr.Write()
            h_gen_fd.Write()
            h_presel_fd.Write()
            h_sel_fd.Write()
        out_file.Close()
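
The efficiency histograms above store raw generated, preselected and selected counts; the efficiency itself is obtained downstream by dividing them. A minimal sketch of that division, reading back the prompt histograms written above, is given here; the helper and its name are illustrative and not part of the package.

from ROOT import TFile  # pylint: disable=no-name-in-module

def efficiency_from_counts(path_histoeff, stringbin2):
    # Minimal sketch: prompt selection efficiency = selected / generated, bin by bin,
    # with binomial errors since the selected candidates are a subset of the
    # generated ones.
    fin = TFile.Open(path_histoeff)
    h_gen = fin.Get("h_gen_pr" + stringbin2)
    h_sel = fin.Get("h_sel_pr" + stringbin2)
    h_eff = h_sel.Clone("h_eff_pr" + stringbin2)
    h_eff.SetDirectory(0)  # keep the ratio alive after the file is closed
    h_eff.Divide(h_sel, h_gen, 1.0, 1.0, "B")
    fin.Close()
    return h_eff
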
Esempio n. 30
0
    def process_histomass_single(self, index):
        myfile = TFile.Open(self.l_histomass[index], "recreate")
        for ipt in range(self.p_nptfinbins):
            bin_id = self.bin_matching[ipt]
            df = pickle.load(openfile(self.mptfiles_recoskmldec[bin_id][index], "rb"))
            if self.doml is True:
                df = df.query(self.l_selml[bin_id])
            if self.s_evtsel is not None:
                df = df.query(self.s_evtsel)
            if self.s_jetsel_reco is not None:
                df = df.query(self.s_jetsel_reco)
            if self.s_trigger is not None:
                df = df.query(self.s_trigger)

            h_invmass_all = TH1F("hmass_%d" % ipt, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
            fill_hist(h_invmass_all, df.inv_mass)
            myfile.cd()
            h_invmass_all.Write()

            df = seldf_singlevar(df, self.v_var_binning, \
                                 self.lpt_finbinmin[ipt], self.lpt_finbinmax[ipt])
            for ibin2 in range(self.p_nbin2_reco):
                suffix = "%s%d_%d_%.2f%s_%.2f_%.2f" % \
                         (self.v_var_binning, self.lpt_finbinmin[ipt],
                          self.lpt_finbinmax[ipt], self.lpt_probcutfin[bin_id],
                          self.v_var2_binning, self.lvar2_binmin_reco[ibin2],
                          self.lvar2_binmax_reco[ibin2])
                df_bin = seldf_singlevar(df, self.v_var2_binning,
                                         self.lvar2_binmin_reco[ibin2],
                                         self.lvar2_binmax_reco[ibin2])
                df_bin = selectdfrunlist(df_bin, \
                         self.run_param[self.runlistrigger[self.triggerbit]], "run_number")

                # add the z column
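                # (z is the fraction of the jet momentum carried by the candidate,
                # typically defined as p_cand . p_jet / |p_jet|^2 built from pt, phi
                # and eta; the exact formula is implemented in z_calc)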
                df_bin["z"] = z_calc(df_bin.pt_jet, df_bin.phi_jet, df_bin.eta_jet,
                                     df_bin.pt_cand, df_bin.phi_cand, df_bin.eta_cand)

                h_invmass = TH1F("hmass" + suffix, "", self.p_num_bins,
                                 self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                fill_hist(h_invmass, df_bin.inv_mass)
                myfile.cd()
                h_invmass.Write()

                massarray = [1.0 + i * (5.0 / 5000.0) for i in range(5001)] # 5000 bins in range 1.0-6.0
                massarray_reco = array.array('d', massarray)
                zarray_reco = array.array('d', self.varshaperanges_reco)
                h_zvsinvmass = TH2F("hzvsmass" + suffix, "", \
                    5000, massarray_reco, self.p_nbinshape_reco, zarray_reco)
                h_zvsinvmass.Sumw2()
                fill2dhist(df_bin, h_zvsinvmass, "inv_mass", self.v_varshape_binning)
                h_zvsinvmass.Write()

                if self.mcordata == "mc":
                    df_bin[self.v_ismcrefl] = np.array(tag_bit_df(df_bin, self.v_bitvar,
                                                                  self.b_mcrefl), dtype=int)
                    df_bin_sig = df_bin[df_bin[self.v_ismcsignal] == 1]
                    df_bin_refl = df_bin[df_bin[self.v_ismcrefl] == 1]
                    h_invmass_sig = TH1F("hmass_sig" + suffix, "", self.p_num_bins,
                                         self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                    h_invmass_refl = TH1F("hmass_refl" + suffix, "", self.p_num_bins,
                                          self.p_mass_fit_lim[0], self.p_mass_fit_lim[1])
                    fill_hist(h_invmass_sig, df_bin_sig.inv_mass)
                    fill_hist(h_invmass_refl, df_bin_refl.inv_mass)
                    myfile.cd()
                    h_invmass_sig.Write()
                    h_invmass_refl.Write()