def preparesample(self):
    """Load the input samples and build the ML training and test sets.

    Steps, in order:

    1. Unpickle the reconstructed data (``f_reco_data``), reconstructed MC
       (``f_reco_mc``) and generated MC (``f_gen_mc``) frames; the generated
       MC is filtered with the ``p_presel_gen_eff`` query.
    2. Pass each frame through ``seldf_singlevar`` with ``v_bin``,
       ``p_binmin`` and ``p_binmax``.
    3. Pick the signal and background frames from ``[data, mc]`` using
       ``p_tagsig`` / ``p_tagbkg`` and apply the ``s_selsigml`` /
       ``s_selbkgml`` queries.
    4. Clamp ``p_nsig`` / ``p_nbkg`` to the available number of rows (a
       warning is logged when fewer rows than requested are available),
       shuffle, truncate and label the candidates (``v_sig`` is 1 for
       signal and 0 for background).
    5. Split the merged frame into train and test parts and expose the
       feature (``v_train``) and label (``v_sig``) views.

    Sets ``df_data``, ``df_mc``, ``df_mcgen``, ``df_sig``, ``df_bkg``,
    ``df_ml``, ``df_mltrain``, ``df_mltest``, ``df_sigtrain``,
    ``df_bkgtrain``, ``df_sigtest``, ``df_bkgtest``, ``df_xtrain``,
    ``df_ytrain``, ``df_xtest`` and ``df_ytest`` on the instance and may
    lower ``p_nsig`` / ``p_nbkg``. Returns nothing.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from contextlib import closing

    self.logger.info("Prepare Sample")

    # closing() only needs the object returned by openfile() to provide
    # close(); it guarantees the handle is released even if unpickling fails.
    # NOTE: pickle.load can execute arbitrary code, so these paths must only
    # ever point at trusted files.
    with closing(openfile(self.f_reco_data, "rb")) as handle:
        self.df_data = pickle.load(handle)
    with closing(openfile(self.f_reco_mc, "rb")) as handle:
        self.df_mc = pickle.load(handle)
    with closing(openfile(self.f_gen_mc, "rb")) as handle:
        self.df_mcgen = pickle.load(handle)
    self.df_mcgen = self.df_mcgen.query(self.p_presel_gen_eff)

    # The list captures the frames *before* the selection below, which is
    # why signal and background are passed through seldf_singlevar again.
    arraydf = [self.df_data, self.df_mc]
    self.df_mc = seldf_singlevar(self.df_mc, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_mcgen = seldf_singlevar(self.df_mcgen, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_data = seldf_singlevar(self.df_data, self.v_bin, self.p_binmin, self.p_binmax)

    self.df_sig, self.df_bkg = arraydf[self.p_tagsig], arraydf[self.p_tagbkg]
    self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin, self.p_binmax)
    self.df_sig = self.df_sig.query(self.s_selsigml)
    self.df_bkg = self.df_bkg.query(self.s_selbkgml)

    if self.p_nsig > len(self.df_sig):
        self.logger.warning("There are not enough signal events")
    if self.p_nbkg > len(self.df_bkg):
        self.logger.warning("There are not enough background events")
    self.p_nsig = min(len(self.df_sig), self.p_nsig)
    self.p_nbkg = min(len(self.df_bkg), self.p_nbkg)
    self.logger.info("Used number of signal events is %d", self.p_nsig)
    self.logger.info("Used number of background events is %d", self.p_nbkg)

    self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
    self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
    # Explicit copies: the columns written below must land in independent
    # frames, not in slices of the shuffled ones (chained assignment).
    self.df_sig = self.df_sig[:self.p_nsig].copy()
    self.df_bkg = self.df_bkg[:self.p_nbkg].copy()

    # Background candidates get all MC flags set to zero. This is done on the
    # truncated copy; nothing between the selection and this point reads them.
    for mc_flag in ("ismcsignal", "ismcprompt", "ismcfd", "ismcbkg"):
        self.df_bkg[mc_flag] = 0
    self.df_sig[self.v_sig] = 1
    self.df_bkg[self.v_sig] = 0

    self.df_ml = pd.concat([self.df_sig, self.df_bkg])
    self.df_mltrain, self.df_mltest = train_test_split(
        self.df_ml, test_size=self.test_frac, random_state=self.rnd_splt)
    self.df_mltrain = self.df_mltrain.reset_index(drop=True)
    self.df_mltest = self.df_mltest.reset_index(drop=True)

    self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain, self.v_sig)
    self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest, self.v_sig)
    self.logger.info("Total number of candidates: train %d and test %d",
                     len(self.df_mltrain), len(self.df_mltest))
    self.logger.info("Number of signal candidates: train %d and test %d",
                     len(self.df_sigtrain), len(self.df_sigtest))
    self.logger.info("Number of bkg candidates: train %d and test %d",
                     len(self.df_bkgtrain), len(self.df_bkgtest))

    # Feature matrices and label vectors used for training and testing.
    self.df_xtrain = self.df_mltrain[self.v_train]
    self.df_ytrain = self.df_mltrain[self.v_sig]
    self.df_xtest = self.df_mltest[self.v_train]
    self.df_ytest = self.df_mltest[self.v_sig]
def preparesample(self):
    """Build (or reload) the ML training and test samples for this bin.

    The train/test frames are cached as
    ``df_train_<p_binmin>_<p_binmax>.pkl`` and
    ``df_test_<p_binmin>_<p_binmax>.pkl`` inside ``dirmlout``. When both files
    exist and ``step_done("preparemlsamples")`` is truthy they are unpickled
    into ``df_mltrain`` / ``df_mltest``. Otherwise the samples are rebuilt:

    1. ``prepare_data_mc_mcgen()`` is called and the signal and background
       frames are picked from ``arraydf`` using ``p_tagsig`` / ``p_tagbkg``.
    2. Both are passed through ``seldf_singlevar`` (``v_bin``, ``p_binmin``,
       ``p_binmax``) and the ``s_selsigml`` / ``s_selbkgml`` queries.
    3. If ``p_equalise_sig_bkg`` is set, ``p_nsig`` and ``p_nbkg`` are capped
       at the smaller of the two sample sizes.
    4. The frames are shuffled, truncated, labelled (``v_sig`` is 1 for
       signal and 0 for background), merged into ``df_ml``, split into train
       and test parts and written to the cache files.

    In both cases the signal/background sub-frames are extracted, candidate
    counts are logged (with a warning when fewer candidates than requested
    are available), values are masked when ``p_mask_values`` is set, and the
    feature (``v_train``) and label (``v_sig``) views ``df_xtrain``,
    ``df_ytrain``, ``df_xtest`` and ``df_ytest`` are set. Returns nothing.
    """
    # Function-scope import so the block does not depend on a file-level one.
    from contextlib import closing

    self.logger.info("Prepare Sample")

    filename_train = os.path.join(
        self.dirmlout, f"df_train_{self.p_binmin}_{self.p_binmax}.pkl")
    filename_test = os.path.join(
        self.dirmlout, f"df_test_{self.p_binmin}_{self.p_binmax}.pkl")

    # step_done() is evaluated last, and only when both cache files exist,
    # exactly as in the original short-circuit chain.
    if os.path.exists(filename_train) \
            and os.path.exists(filename_test) \
            and self.step_done("preparemlsamples"):
        # closing() only needs the object returned by openfile() to provide
        # close(); the handle is released even if unpickling fails.
        # NOTE: pickle.load can execute arbitrary code, so the cache
        # directory must only ever contain trusted files.
        with closing(openfile(filename_train, "rb")) as handle:
            self.df_mltrain = pickle.load(handle)
        with closing(openfile(filename_test, "rb")) as handle:
            self.df_mltest = pickle.load(handle)
    else:
        self.prepare_data_mc_mcgen()

        self.df_sig = self.arraydf[self.p_tagsig]
        self.df_bkg = self.arraydf[self.p_tagbkg]
        self.df_sig = seldf_singlevar(self.df_sig, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_bkg = seldf_singlevar(self.df_bkg, self.v_bin, self.p_binmin, self.p_binmax)
        self.df_sig = self.df_sig.query(self.s_selsigml)
        self.df_bkg = self.df_bkg.query(self.s_selbkgml)

        if self.p_equalise_sig_bkg:
            n_common = min(len(self.df_sig), len(self.df_bkg))
            self.p_nsig = min(n_common, self.p_nsig)
            self.p_nbkg = min(n_common, self.p_nbkg)

        self.df_sig = shuffle(self.df_sig, random_state=self.rnd_shuffle)
        self.df_bkg = shuffle(self.df_bkg, random_state=self.rnd_shuffle)
        # Explicit copies: the columns written below must land in independent
        # frames, not in slices of the shuffled ones (chained assignment).
        self.df_sig = self.df_sig[:self.p_nsig].copy()
        self.df_bkg = self.df_bkg[:self.p_nbkg].copy()

        # Background candidates get all MC flags set to zero. This is done on
        # the truncated copy; nothing between the selection and this point
        # reads them.
        for mc_flag in ("ismcsignal", "ismcprompt", "ismcfd", "ismcbkg"):
            self.df_bkg[mc_flag] = 0
        self.df_sig[self.v_sig] = 1
        self.df_bkg[self.v_sig] = 0

        self.df_ml = pd.concat([self.df_sig, self.df_bkg])
        self.df_mltrain, self.df_mltest = train_test_split(
            self.df_ml, test_size=self.test_frac, random_state=self.rnd_splt)
        self.df_mltrain = self.df_mltrain.reset_index(drop=True)
        self.df_mltest = self.df_mltest.reset_index(drop=True)

        # Write for later usage. Closing the handles makes sure the pickles
        # are completely flushed to disk before anything tries to reload them.
        with closing(openfile(filename_train, "wb")) as handle:
            pickle.dump(self.df_mltrain, handle, protocol=4)
        with closing(openfile(filename_test, "wb")) as handle:
            pickle.dump(self.df_mltest, handle, protocol=4)

    # Now continue with extracting signal and background stats and report
    self.df_sigtrain, self.df_bkgtrain = split_df_sigbkg(self.df_mltrain, self.v_sig)
    self.df_sigtest, self.df_bkgtest = split_df_sigbkg(self.df_mltest, self.v_sig)
    self.logger.info("Total number of candidates: train %d and test %d",
                     len(self.df_mltrain), len(self.df_mltest))
    self.logger.info("Number of signal candidates: train %d and test %d",
                     len(self.df_sigtrain), len(self.df_sigtest))
    self.logger.info("Number of bkg candidates: train %d and test %d",
                     len(self.df_bkgtrain), len(self.df_bkgtest))

    self.logger.info("Aim for number of signal events: %d", self.p_nsig)
    self.logger.info("Aim for number of background events: %d", self.p_nbkg)

    # Compare the request against what actually ended up in train + test;
    # this also covers samples reloaded from the cache files.
    if self.p_nsig > (len(self.df_sigtrain) + len(self.df_sigtest)):
        self.logger.warning("There are not enough signal events")
    if self.p_nbkg > (len(self.df_bkgtrain) + len(self.df_bkgtest)):
        self.logger.warning("There are not enough background events")

    if self.p_mask_values:
        self.logger.info("Masking values for training and testing")
        # The return value is not used, so mask_df presumably modifies the
        # frames in place -- confirm against its definition.
        mask_df(self.df_mltrain, self.p_mask_values)
        mask_df(self.df_mltest, self.p_mask_values)

    # Final preparation of signal and background samples for training and testing
    self.df_xtrain = self.df_mltrain[self.v_train]
    self.df_ytrain = self.df_mltrain[self.v_sig]
    self.df_xtest = self.df_mltest[self.v_train]
    self.df_ytest = self.df_mltest[self.v_sig]

    # NOTE(review): presumably records this step as completed so the cached
    # files are reused on the next call -- confirm against step_done().
    self.step_done("preparemlsamples")