def test_flatten():
    tf = ROOT.TFile('tmp.root', 'RECREATE')
    tt = ROOT.TTree("a", "a")

    length = np.array([3])
    x = np.array([0, 1, 2], dtype='float64')
    tt.Branch('length', length, 'length/I')
    tt.Branch('x', x, 'x[length]/D')

    tt.Fill()
    x[0] = 3
    x[1] = 4
    x[2] = 5
    tt.Fill()

    tf.Write()
    tf.Close()

    branches = list_branches('tmp.root')

    df_ = read_root('tmp.root', flatten=True)

    assert('__array_index' in df_.columns)
    assert(len(df_) == 6)
    assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))

    # Also flatten chunked data
    for df_ in read_root('tmp.root', flatten=True, chunksize=1):
        assert(len(df_) == 3)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))

    os.remove('tmp.root')
def test_read_write():
    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_root('tmp.root')
    df_ = read_root('tmp.root')
    os.remove('tmp.root')

    df.to_root('tmp.root', key='mykey')
    df_ = read_root('tmp.root', key='mykey')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')

    tf = ROOT.TFile('tmp.root', 'recreate')
    tt = ROOT.TTree("a", "a")

    x = np.array([1])
    x[0] = 42
    tt.Branch('x', x, 'x/D')

    tt.Fill()
    x[0] = 1
    tt.Fill()
    tt.Write()
    tf.Close()

    # Read when no index is present
    df = read_root('tmp.root', columns=['x'])
    os.remove('tmp.root')
def get_list_branches(root_file, directory='', tree='DecayTree'):
    '''Return the list of branches of a TTree as seen by pandas.

    Arguments:
        root_file : name of the file; can be eos://...
        directory : name of the directory, or even path/to/TTree
        tree      : name of the TTree
    '''
    if not directory:
        df = root_pandas.read_root(root_file, stop=1, key='{0}'.format(tree))
    else:
        df = root_pandas.read_root(root_file, stop=1,
                                   key='{0}/{1}'.format(directory, tree))
    return df.columns
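
# Minimal usage sketch for get_list_branches above; 'ntuple.root', the 'Tuple'
# directory and the '_P' suffix are hypothetical placeholders, not files or
# branches referenced elsewhere here. Only one event is read (stop=1), so the
# call stays cheap even for large files.
def example_list_momentum_branches(root_file='ntuple.root'):
    branches = get_list_branches(root_file, directory='Tuple', tree='DecayTree')
    return [b for b in branches if b.endswith('_P')]
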
def test_ignore_columns():
    df = pd.DataFrame({'x': [1, 2, 3], 'y1': [2, 3, 4], 'y2': [3, 4, 5]})
    df.to_root('tmp.root')

    df = read_root('tmp.root', ignore=['y1'])
    assert(df.columns[0] == 'x' and df.columns[1] == 'y2')

    df = read_root('tmp.root', ignore=['y*'])
    assert(df.columns == ['x'])

    # Test interaction with columns kwarg
    df = read_root('tmp.root', columns=['y*'], ignore=['*1'])
    assert(df.columns == ['y2'])

    os.remove('tmp.root')
def resample_branch(options):
    # maybe use pyroot here instead of root_pandas
    import pickle
    from root_pandas import read_root

    with open(options.configfile) as f:
        config = json.load(f)

    # load resamplers into config dictionary
    for task in config["tasks"]:
        with open(task["resampler_path"], 'rb') as f:
            resamplers = pickle.load(f)
        for pid in task["pids"]:
            try:
                pid["resampler"] = resamplers[pid["kind"]]
            except KeyError:
                logging.error("No resampler found for " + task["particle"]
                              + " and " + pid["kind"] + ".")
                raise

    chunksize = 100000
    for i, chunk in enumerate(read_root(options.source_file, ignore=["*_COV_"],
                                        chunksize=chunksize)):
        for task in config["tasks"]:
            deps = chunk[task["features"]]
            for pid in task["pids"]:
                chunk[pid["name"]] = pid["resampler"].sample(deps.values.T)
        chunk.to_root(options.output_file, mode="a")
        logging.info('Processed {} entries'.format((i + 1) * chunksize))
def test_drop_nonscalar_columns():
    array = np.array([1, 2, 3])
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    bool_matrix = np.array([[True, False, True], [True, True, True]])

    dt = np.dtype([
        ('a', 'i4'),
        ('b', 'int64', array.shape),
        ('c', 'int64', matrix.shape),
        ('d', 'bool_'),
        ('e', 'bool_', matrix.shape)
    ])
    arr = np.array([
        (3, array, matrix, True, bool_matrix),
        (2, array, matrix, False, bool_matrix)],
        dtype=dt)

    path = 'tmp.root'
    array2root(arr, path, 'ntuple', mode='recreate')

    df = read_root(path, flatten=False)
    # the above line throws an error if flatten=True because nonscalar columns
    # are dropped only after the flattening is applied. However, the flattening
    # algorithm can not deal with arrays of more than one dimension.
    assert(len(df.columns) == 2)
    assert(np.all(df.index.values == np.array([0, 1])))
    assert(np.all(df.a.values == np.array([3, 2])))
    assert(np.all(df.d.values == np.array([True, False])))

    os.remove(path)
def resample_branch(options): import pickle from root_pandas import read_root try: os.remove(options.output_file) except OSError: pass with open(options.configfile) as f: config = json.load(f) #load resamplers into config dictionary for task in config["tasks"]: with open(task["resampler_path"], 'rb') as f: resamplers = pickle.load(f) for pid in task["pids"]: try: pid["resampler"] = resamplers[pid["kind"]] except KeyError: print (resamplers) logging.error("No resampler found for {kind} in {picklefile}.".format(kind=pid["kind"], picklefile=task["resampler_path"])) raise chunksize = 100000 for i, chunk in enumerate(read_root(options.source_file, tree_key=options.tree, ignore=["*_COV_"], chunksize=chunksize)): for task in config["tasks"]: deps = chunk[task["features"]] for pid in task["pids"]: chunk[pid["name"]] = pid["resampler"].sample(deps.values.T) chunk.to_root(options.output_file, mode="a", tree_key=options.tree) logging.info('Processed {} entries'.format((i+1) * chunksize))
def test_persistent_index():
    df = pd.DataFrame({'index': [42, 0, 1], 'x': [1, 2, 3]})
    df = df.set_index('index')
    df.index.name = 'MyAwesomeName'
    df.to_root('tmp.root')
    assert('__index__MyAwesomeName' in list_branches('tmp.root'))
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')

    # See what happens if the index has no name
    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_root('tmp.root')
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')
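
# A small round-trip sketch of the persistent-index behaviour exercised above,
# using a hypothetical file name. A named index is stored as a branch called
# '__index__<name>' and restored transparently by read_root.
def example_round_trip_index(path='indexed.root'):
    import pandas as pd
    from root_pandas import read_root
    df = pd.DataFrame({'x': [1.0, 2.0, 3.0]},
                      index=pd.Index([10, 20, 30], name='event'))
    df.to_root(path)
    return read_root(path)
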
def get_DataFrame(self, columns, query=None):
    """
    Returns the desired DataFrame.

    columns : list of strings
        List of columns that are needed. Must be frugal. All columns that are
        used in the query must be listed here.
    query : string
        A string to pass to the pandas.DataFrame.query method.
    """
    # decrease priority for existing columns
    for column in self.columns_needed:
        self.columns_priority[column] -= 1
    # join the new columns into the set
    self.columns_needed = list(set(list(self.columns_needed) + columns))
    # set the priority of the new columns to max
    for column in columns:
        self.columns_priority[column] = DataFrameManagerROOT.max_priority
    self._prune_columns_needed()

    columns_to_load = [x for x in self.columns_needed
                       if x not in self._raw_dataset.columns.values]
    if columns_to_load:
        loaded = root_pandas.read_root(self.filename, self.treename,
                                       columns=columns_to_load)
        assert(len(loaded.index) == len(self._raw_dataset.index)
               or len(self._raw_dataset.index) == 0)
        self._raw_dataset = pandas.concat([self._raw_dataset, loaded], axis=1)

    if query is None:
        return self._raw_dataset
    else:
        return self._raw_dataset.query(query)
def test_noexpand_prefix():
    xs = np.array([1, 2, 3])
    df = pd.DataFrame({'x': xs})
    df.to_root('tmp.root')

    # Not using the prefix should throw, as there's no matching branch name
    try:
        df = read_root('tmp.root', columns=['2*x'])
    except ValueError:
        pass
    else:
        assert False

    # Could also use TMath::Sqrt here
    df = read_root('tmp.root', columns=['noexpand:2*sqrt(x)'])
    # Note that the column name shouldn't have the noexpand prefix
    assert np.all(df['2*sqrt(x)'].values == 2*np.sqrt(xs))

    os.remove('tmp.root')
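
# A short sketch of the 'noexpand:' prefix outside of a test, assuming a
# hypothetical file 'data.root' with branches 'px' and 'py'. The prefix makes
# read_root hand the expression straight to ROOT instead of treating it as a
# branch-name pattern, so derived quantities can be read directly.
def example_noexpand(path='data.root'):
    from root_pandas import read_root
    df = read_root(path, columns=['px', 'py', 'noexpand:sqrt(px*px + py*py)'])
    # The derived column keeps the bare expression (without the prefix) as its name
    return df['sqrt(px*px + py*py)']
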
def create_resamplers(options): import os.path import pickle from root_pandas import read_root from PIDPerfScripts.Binning import GetBinScheme pid_variables = ['{}_CombDLLK', '{}_CombDLLmu', '{}_CombDLLp', '{}_CombDLLe', '{}_V3ProbNNK', '{}_V3ProbNNpi', '{}_V3ProbNNmu', '{}_V3ProbNNp', '{}_V3ProbNNe', '{}_V3ProbNNghost', '{}_V3ProbNNK_Trafo', '{}_V3ProbNNpi_Trafo', '{}_V3ProbNNmu_Trafo', '{}_V3ProbNNp_Trafo', '{}_V3ProbNNe_Trafo', '{}_V3ProbNNghost_Trafo', #transformed variables with log( var/(1-var) ) '{}_V2ProbNNK', '{}_V2ProbNNpi', '{}_V2ProbNNmu', '{}_V2ProbNNp', '{}_V2ProbNNe', '{}_V2ProbNNghost', '{}_V2ProbNNK_Trafo', '{}_V2ProbNNpi_Trafo', '{}_V2ProbNNmu_Trafo', '{}_V2ProbNNp_Trafo', '{}_V2ProbNNe_Trafo', '{}_V2ProbNNghost_Trafo'] #transformed variables with log( var/(1-var) ) kin_variables = ['{}_P', '{}_Eta','nTracks'] with open('raw_data.json') as f: locations = json.load(f) if options.particles: locations = [sample for sample in locations if sample["particle"] in options.particles] if options.both_magnet_orientations: locations = [sample for sample in locations if sample["magnet"]=="Up"] # we use both maagnet orientations on the first run for sample in locations: binning_P = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "P", None)) #last argument takes name of user-defined binning binning_ETA = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "ETA", None)) #last argument takes name of user-defined binning TODO: let user pass this argument binning_nTracks = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "nTracks", None)) #last argument takes name of user-defined binning TODO: let user pass this argument if options.both_magnet_orientations: if sample["magnet"]=="Up": data = [options.location + '/{particle}_Stripping{stripping}_MagnetUp.root' .format(**sample)] data += [options.location + '/{particle}_Stripping{stripping}_MagnetDown.root'.format(**sample)] resampler_location = options.saveto + '/{particle}_Stripping{stripping}_MagnetAny.pkl'.format(**sample) else: data = [options.location + '/{particle}_Stripping{stripping}_Magnet{magnet}.root'.format(**sample)] resampler_location = options.saveto + '/{particle}_Stripping{stripping}_Magnet{magnet}.pkl'.format(**sample) if os.path.exists(resampler_location): os.remove(resampler_location) resamplers = dict() deps = map(lambda x: x.format(sample['branch_particle']), kin_variables) pids = map(lambda x: x.format(sample['branch_particle']), pid_variables) for pid in pids: if "DLL" in pid: target_binning = np.linspace(-150, 150, 300) # binning for DLL elif "ProbNN" in pid and "Trafo" in pid: target_binning = np.linspace(-30, 30, 1000) # binning for transformed ProbNN elif "ProbNN" in pid: target_binning = np.linspace(0, 1, 100) # binning for (raw) ProbNN else: raise Exception resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning) for dataSet in data: for i, chunk in enumerate(read_root(dataSet, columns=deps + pids + ['nsig_sw'], chunksize=100000, where=options.cutstring)): # where is None if option is not set for pid in pids: resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk['nsig_sw']) logging.info('Finished chunk {}'.format(i)) with open(resampler_location, 'wb') as f: pickle.dump(resamplers, f)
def test_chunked_reading():
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]})
    df.to_root('tmp.root')

    count = 0
    for df_ in read_root('tmp.root', chunksize=2):
        assert(not df_.empty)
        count += 1

    assert count == 3
    os.remove('tmp.root')
def test_multiple_files():
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]})
    df.to_root('tmp1.root')
    df.to_root('tmp2.root')
    df.to_root('tmp3.root')

    df_ = read_root(['tmp1.root', 'tmp2.root', 'tmp3.root'])

    assert(len(df_) == 3 * len(df))

    # Also test chunked read of multiple files
    counter = 0
    for df_ in read_root(['tmp1.root', 'tmp2.root', 'tmp3.root'], chunksize=3):
        assert(len(df_) == 3)
        counter += 1
    assert(counter == 6)

    os.remove('tmp1.root')
    os.remove('tmp2.root')
    os.remove('tmp3.root')
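
# Sketch of a chunked skim over several files, combining the multi-file and
# chunksize features tested above. The input/output names and the 'x > 0'
# selection are hypothetical; mode='a' appends every processed chunk to the
# same output tree so memory use stays bounded.
def example_chunked_skim(paths=('input_1.root', 'input_2.root'),
                         output='skimmed.root'):
    from root_pandas import read_root
    for chunk in read_root(list(paths), chunksize=100000):
        chunk.query('x > 0').to_root(output, mode='a')
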
def plot(data, plotfile, mcfile=None, cuts=None, variables=None, bins=30): import numpy as np from root_numpy import root2array import matplotlib.pyplot as plt from matplotlib.colors import LogNorm from matplotlib.backends.backend_pdf import PdfPages #sns.set_palette("deep", desat=.6) #sns.set_context('talk') if cuts is None: cuts = [] if variables is None: variables = [] arr = read_root(data, where=prepare_sel(cuts)) if mcfile: arr_mc = read_root(mcfile, where=prepare_sel(cuts)) logger.info('Saving plots to {}'.format(plotfile)) with PdfPages(plotfile) as pdf: for col in arr.columns: logger.debug('Plotting ' + col) x = arr[col] n, bine, _ = plt.hist(x.values, histtype='stepfilled', bins=bins, color='blue') if mcfile: x_mc = arr_mc[col] if col in arr_mc.columns: n_mc, edges = np.histogram(arr_mc[col], bine) binned_hist(plt.gca(), n_mc, edges, histtype='stepfilled', alpha=0.7) #plt.hist(x_mc, histtype='stepfilled', bins=bins, alpha=0.8, normed=True) #plt.yscale('log') plt.xlabel(col) plt.ylim(0, max(n) * 1.05) pdf.savefig() plt.clf()
def get_event_number(config):
    """Compute the total number of events contained in the base tuples of a
    given configuration.

    Parameters
    ----------
    config : dictionary
        expected to contain the keys
        - 'filepath'
        - 'files'
        - 'pandas_kwargs'
    """
    files = [config['filepath'] + f for f in config['files']]
    df = read_root(files, key=config['pandas_kwargs']['key'],
                   columns=['SigYield_sw', 'nCandidate'])
    return df[df.nCandidate == 0].SigYield_sw.sum()
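
# Example of the configuration dictionary expected by get_event_number above;
# the path, file names and tree key are hypothetical placeholders.
EXAMPLE_EVENT_NUMBER_CONFIG = {
    'filepath': '/eos/user/x/example/tuples/',
    'files': ['B2JpsiK_2016_MagUp.root', 'B2JpsiK_2016_MagDown.root'],
    'pandas_kwargs': {'key': 'DecayTree', 'chunksize': 100000},
}
# total = get_event_number(EXAMPLE_EVENT_NUMBER_CONFIG)
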
def set_target(treeName, branch_names, target, cuts):
    for i in range(utils.IO.nTarget):
        tmp_data_frame = (rpd.read_root(utils.IO.targetName[i], treeName,
                                        columns=branch_names)).query(cuts)
        utils.IO.target_df.append(tmp_data_frame)
        for j in range(len(target)):
            if j == 0:
                X_target = tmp_data_frame[[target[j].replace('noexpand:', '')]]
            else:
                X_target = np.concatenate(
                    [X_target, tmp_data_frame[[target[j].replace('noexpand:', '')]]],
                    axis=1)
    return np.round(X_target, 5)
def test_brace_pattern_in_columns():
    reference_df = pd.DataFrame()
    reference_df['var1'] = np.array([1, 2, 3])
    reference_df['var2'] = np.array([4, 5, 6])
    reference_df['var3'] = np.array([7, 8, 9])
    reference_df['var{03}'] = np.array([10, 11, 12])
    reference_df['var{04}'] = np.array([13, 14, 15])
    reference_df['var{5}'] = np.array([16, 17, 18])
    reference_df['var01'] = np.array([1.1, 2.1, 3.1])
    reference_df['var02'] = np.array([4.1, 5.1, 6.1])
    reference_df['var03'] = np.array([7.1, 8.1, 9.1])
    reference_df['var11'] = np.array([10.1, 11.1, 12.1])
    reference_df['var12'] = np.array([13.1, 14.1, 15.1])
    reference_df['var13'] = np.array([16.1, 17.1, 18.1])
    reference_df.to_root('tmp.root')

    # Try looking for a column that doesn't exist
    with assert_raises(ValueError):
        read_root('tmp.root', columns=['var{1,2,4}'])

    # Simple expansion
    df = read_root('tmp.root', columns=['var{1,2}'])
    assert set(df.columns) == {'var1', 'var2'}
    assert_frame_equal(df[['var1', 'var2']], reference_df[['var1', 'var2']])

    # Single expansion with braces in name
    df = read_root('tmp.root', columns=['var{5}'])
    assert set(df.columns) == {'var{5}'}
    assert_frame_equal(df[['var{5}']], reference_df[['var{5}']])

    # Single expansion with braces in name
    df = read_root('tmp.root', columns=['var{03}'])
    assert set(df.columns) == {'var{03}'}
    assert_frame_equal(df[['var{03}']], reference_df[['var{03}']])

    # Multiple expansions with braces in name
    df = read_root('tmp.root', columns=[r'var{{03},2,{04}}'])
    assert set(df.columns) == {'var{03}', 'var2', 'var{04}'}
    assert_frame_equal(df[['var{03}', 'var2', 'var{04}']],
                       reference_df[['var{03}', 'var2', 'var{04}']])

    # Recursive expansions
    df = read_root('tmp.root', columns=[r'var{0{2,3},1{1,3}}'])
    assert set(df.columns) == {'var02', 'var03', 'var11', 'var13'}
    assert_frame_equal(df[['var02', 'var03', 'var11', 'var13']],
                       reference_df[['var02', 'var03', 'var11', 'var13']])

    os.remove('tmp.root')
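
# Brace patterns like the ones tested above can be combined with ordinary
# globs when reading real tuples; the file and branch names below are
# hypothetical. '{mu,pi}_P{X,Y,Z}' expands to the six momentum-component
# branches before the usual column matching is applied.
def example_brace_columns(path='ntuple.root'):
    from root_pandas import read_root
    return read_root(path, columns=['{mu,pi}_P{X,Y,Z}'])
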
def main(): channel = "mt" filename = "{0}-NOMINAL_ntuple_Data.root".format(channel) dirpath = "/eos/user/m/msajatov/data/ntuples_scp/v10" path = os.path.join(dirpath, filename) Cut.cutfile = "./cuts_2017.json" cut = Cut(cutstring="-OS- && -ISO- && -VETO- && -MT- && -TRIG-", channel=channel) branches = [ "pt_1", "pt_2", "jpt_1", "jpt_2", "bpt_1", "bpt_2", "njets", "nbtag", "m_sv", "mt_1", "mt_2", "pt_vis", "pt_tt", "mjj", "jdeta", "m_vis", "dijetpt", "met", "eta_1", "eta_2", "iso_1", "iso_2" ] + [ "evt", "by*IsolationMVArun2017v2DBoldDMwLT2017*", "pt_1", "pt_2", "q_1", "q_2", "iso_1", "iso_2", "phi_1", "phi_2", "eta_1", "eta_2", "mt_1", "njets", "decayMode_1", "decayMode_2", "dilepton_veto", "extraelec_veto", "extramuon_veto", "againstMuon*", "againstElectron*", "flagMETFilter", "trg*", "*Weight*", "*weight*", "htxs*" ] + [ "*weight*", "gen_match*", "topPtReweightWeight*", "zPtReweightWeight", "sf*", "njets", "jpt_1", "jdeta", "mjj" ] # chunksize is the number of events BEFORE the selection (where) is applied # for tt a chunksize of 5000 results in 37 events in the SR # for mt a chunksize of 2000 results in 36 events in the SR # for et a chunksize of 1000 results in 34 events in the SR df_iter = rp.read_root(paths=path, where=cut.getForDF(), columns=branches, chunksize=2000) print df_iter # get first chunk only (there must be a better way to do this but next() is not implemented for the genchunk...) for df in df_iter: print df break outpath = "../testdata/{0}_test.root".format(channel) df.to_root(outpath, key="TauCheck", mode="w")
def skim(): #open only the columns we need (for speed) over all the trees (files in folder) for i_file, file in enumerate(sorted(os.listdir(args.trees))): for i_key, key in enumerate(keys): print("Opening", key, "data in", args.trees + "/" + file) data_all = read_root(args.trees + "/" + file, key, columns=[ "station", "trackT0", "trackMomentum", "trackMomentumY", "trackPValue" ]) data_all['trackT0'] = data_all['trackT0'] * 1e-3 # ns -> us total_tv[i_key] += data_all.shape[0] # add to total from each file print("Total of", data_all.shape[0], "entries") #define the time and energy cuts (otherwise the wiggle looks wobbly!) time_cut = (data_all['trackT0'] > args.t_cut ) # us, define a time_cut with time > 30 us mom_cut = (data_all['trackMomentum'] > args.p_cut) # MeV #Apply cuts in one go! data = data_all[time_cut & mom_cut] total_tv_skim[i_key] += data.shape[ 0] # add to total from each file print("Total of", data.shape[0], "entries after energy and momentum cuts") #save the skimmed dataframe in a compressed HDF5 format # here we are appending to the file over tracks and then vertices print("Saving compressed data...") data = data.reset_index() # reset index from 0 cols_to_keep = ["trackMomentum", "trackPValue"] # only write for time and station # cols_to_keep = ["station", "trackT0", "trackMomentum", "trackMomentumY"] # only write for time and station data[cols_to_keep].to_hdf(args.df + "_" + str(i_file) + ".h5", key=key, mode='a', complevel=9, complib="zlib", format="fixed") print("Skimmed dataframe saved to disk", args.df, "\n") print("Grand total of (M)", total_tv[0] / 1e6, "tracks,", total_tv[1] / 1e6, "vertices") print("After the cuts (M)", total_tv_skim[0] / 1e6, "tracks,", total_tv_skim[1] / 1e6, "vertices")
def ev_score_toROOT(self, savepredicOnly=False): '''evaluate the score for the loaded modle''' L_varList = self.varlist() #get the model self.load_model() df = read_root(self.infile, 'sf/t', columns=L_varList, flatten=[ 'DLMS_ST', 'DLMS_HT', 'DLMS_dPhiLepW', 'DLMS_nJets30Clean' ]) print('df loaded') print(" going to evalute the score from ", self.pathToModel) if not '_SMS_' in self.infile: df.loc[:, 'mGo'] = float(self.mGo) df.loc[:, 'mLSP'] = float(self.mLSP) print('prediction will be made to mGo = ', self.mGo, ' and mLSP = ', self.mLSP) #print (df['mGo'].dtype) if self.do_multiClass: self.model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer='adam') prediction = self.model.predict_proba(df[self.var_list].values) if not savepredicOnly: for mm, mult in enumerate(self.ClassList): df.loc[:, mult] = prediction[:, mm] else: df = pd.DataFrame(prediction, columns=self.ClassList) else: self.model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) if not savepredicOnly: df.loc[:, 'DNN'] = self.model.predict(df[self.var_list]) else: df = pd.DataFrame(self.model.predict(df[self.var_list]), columns=["DNN"]) df.to_root(self.outdir + '/' + self.infile.split("/")[-1], key='sf/t') print("out put fle is wrote to ", self.outdir + '/' + self.infile.split("/")[-1])
def LoadBDTdf(dtype, polarity):
    df_list_bdt = []
    ifname = filedir + 'Data/Lb_' + dtype + '_' + polarity + '.root'
    bdtfname = ifname[0:-5] + '_MVA.root'
    if os.path.isfile(bdtfname):
        print('BDT file already created')
    else:
        print()
        print('>>> Creating file with BDT variable')
        print()
        AddBDTinfo(ifname, 'tupleout/DecayTree', bdtfname, 'Data',
                   pickled_model_path='../PIDGen_PIDCalib_MVA/xgb_reg.pkl')
    for df_bdt in read_root(bdtfname, 'DecayTree', chunksize=100000):
        df_list_bdt.append(df_bdt)
    return df_list_bdt
def get_MVAdf(self):
    '''this method creates the df with the right columns/branches from the
    root df for the MVA procedure'''
    MVAdf = read_root(paths=self.path,
                      columns=self.ids + self.label + self.feats4MVA,
                      flatten=self.flatfeats4MVA)
    # always change index to be a TB index first, just in case we only want a
    # select few of TBs, which we cant do if we have a ET index
    MVAdf.index = MVAdf.apply(lambda x: str(int(x.runNumber)) + str(int(x.eventNumber))
                              + '-' + str(int(x.nCandidate)), axis=1)
    # if specific_TBs is not empty then we need to filter out the unwanted TBs by their id
    if self.specific_TBs.shape[0] != 0:
        MVAdf = MVAdf.loc[self.specific_TBs, :]
    # we then change the index according to whether were dealing with TBs or ETs,
    # if its TBs then the index is essentially left unchanged
    MVAdf.index = MVAdf.apply(self.index_function, axis=1)
    # if specific_ETs is not empty that means we need to filter out and keep only
    # the ETs asked for by using their ids
    if self.specific_ETs.shape[0] != 0:
        MVAdf = MVAdf.loc[self.specific_ETs, :]
    return MVAdf
def ReadRootFile(dtype, polarity):
    df_list = []
    varsON = [
        'Lb_L0Global_TIS', 'Lb_L0HadronDecision_TOS',
        'Lc_Hlt1TrackMVADecision_TOS', 'Lc_Hlt1TwoTrackMVADecision_TOS',
        'Lb_Hlt2XcMuXForTauB2XcMuDecision_TOS',
        'Lb_Hlt2XcMuXForTauB2XcFakeMuDecision_TOS',
        'Lc_M', 'p_ProbNNp', 'p_ProbNNk', 'mu_PID*', '*_P', '*_PT', 'nTracks',
        'runNumber', 'eventNumber', 'Lb_ISOLATION_*',
        'mu_PX', 'mu_PY', 'mu_PZ', 'mu_ID', 'Lc_PX', 'Lc_PY', 'Lc_PZ'
    ]
    for df in read_root(filedir + 'Data/Lb_' + dtype + '_' + polarity + '.root',
                        'tupleout/DecayTree', chunksize=100000, columns=varsON):
        df_list.append(df)
    return df_list
def phsp_goofit_alt():
    import root_pandas
    path = 'root://eoslhcb.cern.ch//eos/lhcb/user/d/dmuller/K3Pi/RS_with_weight_dtime.root'
    df = root_pandas.read_root(path, 'events')
    df.rename(columns={
        'c12': vars.cos1(),
        'c34': vars.cos2(),
        'dtime': vars.ltime(mode_config.D0),
        'phi': vars.phi1(),
        'm12': vars.m12(),
        'm34': vars.m34()
    }, inplace=True)
    df[vars.m12()] = df[vars.m12()] * 1000.
    df[vars.m34()] = df[vars.m34()] * 1000.
    df['D0_Loki_BPVLTIME'] = df['D0_Loki_BPVLTIME'] / 1000.
    return df
def addbdtscore(infile, tree):
    ifile = open("discriminator_resolved.pickle")
    model = pickle.load(ifile)
    vars_to_load_ = [
        'MET', 'METSig', 'Jet1Pt', 'Jet1Eta', 'Jet1Phi', 'Jet2Pt', 'Jet2Eta',
        'Jet2Phi', 'DiJetMass', 'DiJetPt', 'DiJetEta', 'DiJetPhi', 'nJets',
        'met_Phi'
    ]
    if not ("SR" in tree or "SBand" in tree):
        vars_to_load_[0] = "RECOIL"
    df = read_root(infile, tree, columns=vars_to_load_)
    #df=df[vars_to_load_]
    print df[:1]
    out = model.decision_function(df).ravel()
    print out[:10]
    return out
def get_gen_sample(sample='mu'): den_columns = [] den_columns += ['Lambda_b0_TRUEP_E'] den_columns += ['Lambda_b0_TRUEP_X'] den_columns += ['Lambda_b0_TRUEP_Y'] den_columns += ['Lambda_b0_TRUEP_Z'] den_columns += ['Lambda_cplus_TRUEP_E'] den_columns += ['Lambda_cplus_TRUEP_X'] den_columns += ['Lambda_cplus_TRUEP_Y'] den_columns += ['Lambda_cplus_TRUEP_Z'] den_columns += [sample + 'minus_TRUEP_E'] den_columns += [sample + 'minus_TRUEP_X'] den_columns += [sample + 'minus_TRUEP_Y'] den_columns += [sample + 'minus_TRUEP_Z'] den_columns += ['nu_' + sample + '~_TRUEP_E'] den_columns += ['nu_' + sample + '~_TRUEP_X'] den_columns += ['nu_' + sample + '~_TRUEP_Y'] den_columns += ['nu_' + sample + '~_TRUEP_Z'] den_fname = '~/LbToLclnu_RunTwo/Selection/PID/FFs/GenMC/Lc' + sample.capitalize( ) + 'Nu_gen.root' df_den = rpd.read_root(den_fname, columns=den_columns, key='MCDecayTreeTuple/MCDecayTree') PLc_lab = atfk.lorentz_vector( atfk.vector(df_den['Lambda_cplus_TRUEP_X'], df_den['Lambda_cplus_TRUEP_Y'], df_den['Lambda_cplus_TRUEP_Z']), df_den['Lambda_cplus_TRUEP_E']) Pl_lab = atfk.lorentz_vector( atfk.vector(df_den[sample + 'minus_TRUEP_X'], df_den[sample + 'minus_TRUEP_Y'], df_den[sample + 'minus_TRUEP_Z']), df_den[sample + 'minus_TRUEP_E']) PNu_lab = atfk.lorentz_vector( atfk.vector(df_den["nu_" + sample + "~_TRUEP_X"], df_den["nu_" + sample + "~_TRUEP_Y"], df_den["nu_" + sample + "~_TRUEP_Z"]), df_den["nu_" + sample + "~_TRUEP_E"]) PLb_lab = PLc_lab + Pl_lab + PNu_lab df_den['Lb_True_Q2'], df_den['Lb_True_Costhetal'] = get_phasespace_vars( PLb_lab, PLc_lab, Pl_lab) return df_den[['Lb_True_Q2', 'Lb_True_Costhetal']].to_numpy(),
def resample_branch(options): import pickle from root_pandas import read_root try: os.remove(options.output_file) except OSError: pass if options.seed: np.random.seed(options.seed) with open(options.configfile) as f: config = json.load(f) #load resamplers into config dictionary for task in config["tasks"]: with open(task["resampler_path"], 'rb') as f: resamplers = pickle.load(f) for pid in task["pids"]: try: pid["resampler"] = resamplers[pid["kind"]] except KeyError: print(resamplers) logging.error( "No resampler found for {kind} in {picklefile}.". format(kind=pid["kind"], picklefile=task["resampler_path"])) raise chunksize = 100000 for i, chunk in enumerate( read_root(options.source_file, tree_key=options.input_tree, ignore=["*_COV_"], chunksize=chunksize)): for task in config["tasks"]: deps = chunk[task["features"]] for pid in task["pids"]: chunk[pid["name"]] = pid["resampler"].sample(deps.values.T) chunk.to_root(options.output_file, tree_key=options.output_tree, mode="a") logging.info('Processed {} entries'.format((i + 1) * chunksize))
def plot_time_series(df, channel): #if chn == 4 or chn == 5: #return df = df[df.error == 0] df = time_conversion(df[df.channel == channel]) df_ana = read_root(mostRecentDir.split('mx_')[0] + "analysis/ANA_" + mostRecentDir.split('/')[-2] + '.root', columns=['rate', 'drate', 'time', 'channel', 'e']) df_ana = time_conversion(df_ana[(df_ana.channel == channel) & (df_ana.e < 665) & (df_ana.e > 655)]) df = df.set_index('time').resample('10T').count().dropna().reset_index( ).rename(columns={'integral': 'Count'}) df = df.iloc[1:-1] fig, ax = plt.subplots(nrows=2, ncols=1) df.plot(x='time', y='Count', ax=ax[0]) df_ana.plot(x='time', y='rate', ax=ax[1]) plt.savefig(plotoutDir + '/time_series_channel' + str(channel) + '.png') plt.close() return
def set_data(treeName,branch_names): utils.IO.data_df.append(rpd.read_root(utils.IO.dataName[0],treeName, columns = branch_names)) utils.IO.data_df[0]['proc'] = ( np.ones_like(utils.IO.data_df[0].index)*utils.IO.dataProc[0] ).astype(np.int8) #input_df=rpd.read_root(utils.IO.dataName[0],treeName, columns = ['isSignal']) w = (np.ones_like(utils.IO.data_df[0].index)).astype(np.int8) #utils.IO.data_df[0]['weight'] = np.multiply(w,input_df['isSignal']) #utils.IO.data_df[0]['weight'] = np.multiply(w,1.) utils.IO.data_df[0]['weight'] = w y_data = utils.IO.data_df[0][['proc']] w_data = utils.IO.data_df[0][['weight']] for j in range(len(branch_names)): if j == 0: X_data = utils.IO.data_df[0][[branch_names[j].replace('noexpand:','')]] else: X_data = np.concatenate([X_data,utils.IO.data_df[0][[branch_names[j].replace('noexpand:','')]]],axis=1) return np.round(X_data,5),y_data,w_data
def loadFile(ifile): from root_pandas import read_root if 'MUTAU' in ifile: channel = 'mt' elif 'ETAU' in ifile: channel = 'et' elif 'TAUTAU' in ifile: channel = 'tt' else: raise Exception( 'Input files must have MUTAU, ETAU, or TAUTAU in the provided path. You gave {}, ya goober.' .format(ifile)) filename = ifile.split('/')[-1] print 'Loading input file...', filename input_df = read_root(ifile, columns=scaled_vars + selection_vars) ## read from TTrees into DataFrame slim_df = input_df[(input_df['njets'] > 1) & (input_df['mjj'] > 300) & (input_df['mt'] < 50)] ## preselection selection_df = slim_df[ selection_vars] ## get variables needed for selection (so they aren't normalized) weights = slim_df[ 'evtwt'] ## get just the weights (they are scaled differently) slim_df = slim_df.drop(selection_vars + ['evtwt'], axis=1) ## add the event label if 'VBF' in ifile or 'ggH' in ifile: isSignal = np.ones(len(slim_df)) else: isSignal = np.zeros(len(slim_df)) ## save the name of the process somenames = np.full(len(slim_df), filename.split('.root')[0]) ## scale event weights between 0 - 1 weights = MinMaxScaler().fit_transform(weights.values.reshape(-1, 1)) ## get lepton channel lepton = np.full(len(slim_df), channel) return slim_df, selection_df, somenames, lepton, isSignal, weights
def __init__(self, phase, tag, config, delayed_eff_mode="rel", delayed_eff_impl="add-then-calc", delayed_eff_ref=None, vtx_eff_nom_tagconf=None): self.phase = phase self.tag = tag self.config = config self.delayed_eff_mode = delayed_eff_mode if delayed_eff_impl == "new": delayed_eff_impl = "add-then-calc" self.delayed_eff_impl = delayed_eff_impl self.hardcoded = Hardcoded(phase) self.files, self.results = {}, {} for site in [1, 2, 3]: path = stage2_pbp_path(site, phase, tag, config) self.files[site] = R.TFile(path) results = read_root(path, 'results') for det in dets_for_phase(site, phase): self.results[(site, det)] = \ results.query(f'detector == {det}') cfg_path = configfile_path(tag, config) self.cfg = ConfigFile(cfg_path) if delayed_eff_impl == "old": ref_tag, ref_conf = delayed_eff_ref.split("@") self.delEffCalc = DelayedEffCalc(self.cfg['ibdDelayedEmin'], self.phase, ref_tag, ref_conf) self.promptEffCalc = PromptEffCalc(cfg_path) if vtx_eff_nom_tagconf: nom_tag, nom_conf = vtx_eff_nom_tagconf.split("@") self.vertexEffCalc = VertexEffCalc(self, phase, tag, config, nom_tag, nom_conf) else: self.vertexEffCalc = DummyVertexEffCalc()
def get_luminosity(mode, polarity, year):
    mode = get_mode(polarity, year, mode)
    # For a yet to be determined reason, some files do not contain a LumiTuple
    # so sort those ones out
    infiles = []
    for f in mode.files:
        fl = ROOT.TFile.Open(f)
        if fl.Get('GetIntegratedLuminosity/LumiTuple'):
            infiles.append(f)
        fl.Close()
    # Get the files and stuff them into a dataframe
    df = root_pandas.read_root(infiles, key='GetIntegratedLuminosity/LumiTuple')
    log.info('Luminosity {} {}: {} +- {}'.format(
        year, polarity,
        df.sum().IntegratedLuminosity, df.sum().IntegratedLuminosityErr))
def run(input_fns, output_fn, h1, h2, h3): keys = list_trees(input_fns[0]) assert len(keys) == 1, keys df = read_root(input_fns, keys[0]) df['H1_isMuon'] = df['H1_isMuon'].astype(np.bool) df['H2_isMuon'] = df['H2_isMuon'].astype(np.bool) df['H3_isMuon'] = df['H3_isMuon'].astype(np.bool) # Sort the columns so that the first is the most kaon-like assert sorted([h1, h2, h3 ]) == [h1, h2, h3 ], 'Children are ranked from kaon-like to pion-like' order = np.argsort(df[['H3_ProbK', 'H2_ProbK', 'H1_ProbK']], axis=1) for col in [c for c in df.columns if c.startswith('H1_')]: col = col[len('H1_'):] cols = [f'H1_{col}', f'H2_{col}', f'H3_{col}'] df[cols] = df[cols].values[np.arange(order.shape[0])[:, None], order] # Compute the PE and mass of all particles for head, mass in [('H1', mass_dict[h1]), ('H2', mass_dict[h2]), ('H3', mass_dict[h3])]: df.eval(f'{head}_P = sqrt({head}_PX**2 + {head}_PY**2 + {head}_PZ**2)', inplace=True) df.eval(f'{head}_PE = sqrt({mass}**2 + {head}_P**2)', inplace=True) for component in ['PE', 'PX', 'PY', 'PZ']: df.eval( f'B_{component} = H1_{component} + H2_{component} + H3_{component}', inplace=True) df.eval(f'B_M = sqrt(B_PE**2 - B_PX**2 - B_PY**2 - B_PZ**2)', inplace=True) # if [h1, h2, h3] == ['K', 'K', 'K']: # Apply ignore muons df.query('~(H1_isMuon | H2_isMuon | H3_isMuon)', inplace=True) # Apply an additional selection df.query(f'(H1_IPChi2 > 25) & (H2_IPChi2 > 25) & (H3_IPChi2 > 25)', inplace=True) # Apply a PID selection df.query( f'(H1_Prob{h1} > {pid_cut}) & (H2_Prob{h2} > {pid_cut}) & (H3_Prob{h3} > {pid_cut})', inplace=True) to_root(df, output_fn, key=f'B2{h1}{h2}{h3}', mode='w', store_index=False)
def set_data_simple(treeName, branch_names):
    for i in range(utils.IO.nData):
        utils.IO.data_df.append(
            rpd.read_root(utils.IO.dataName[i], treeName, columns=branch_names))

    for j in range(len(branch_names)):
        if j == 0:
            X_data = utils.IO.data_df[0][[branch_names[j].replace('noexpand:', '')]]
        else:
            X_data = np.concatenate(
                [X_data, utils.IO.data_df[0][[branch_names[j].replace('noexpand:', '')]]],
                axis=1)

    return np.round(X_data, 5)
def readDatasetsToDataframes(pathToFolder):
    listOfDatasets = []
    # identifiers = ["ChargedHiggs_", "TT_", "DYJets", "QCD_", "ST_", "WJets", "WW", "WZ", "ZZ"]
    identifiers = ["ChargedHiggs_", "TT_", "ST_", "WJets"]

    for identifier in identifiers:
        filePaths = glob.glob(pathToFolder + identifier + "*.root")
        dataset = read_root(filePaths, columns=COLUMNS_)
        dataset["eventType"] = eventTypeDict[identifier]
        listOfDatasets.append(dataset)

    numberOfSignalEvents = listOfDatasets[0].shape[0]
    numberOfBackgroundEvents = np.sum([x.shape[0] for x in listOfDatasets[1:]])
    if (numberOfSignalEvents > numberOfBackgroundEvents):
        listOfDatasets[0] = listOfDatasets[0].sample(n=numberOfBackgroundEvents)

    dataframe = listOfDatasets[0].append(listOfDatasets[1:])

    return dataframe
def get_LOFdf(self): '''this method creates the df with the right columns/branches in which to perform the LOF calculation on, and require the COM variables for the TBs and ETs''' LOFdf = read_root(paths=self.path, columns=self.ids + self.feats4LOF, flatten=self.flatfeats4LOF) #always change index to be a TB index first, just in case we only want a select few of TBs, which we cant do if we have a ET index LOFdf.index = LOFdf.apply(lambda x: str(int(x.runNumber)) + str( int(x.eventNumber)) + '-' + str(int(x.nCandidate)), axis=1) # if specific_TBs is not empty then we need to filter out the unwanted TBs by their id if self.specific_TBs.shape[0] != 0: LOFdf = LOFdf.loc[self.specific_TBs, :] # we then change the index according to whether were dealing with TBs or ETs, if its TBs then the index is essentially left unchanged LOFdf.index = LOFdf.apply(self.index_function, axis=1) #if specific_ETs is not empty that means we need to filter out and keep only the ETs asked for by using their ids if self.specific_ETs.shape[0] != 0: LOFdf = LOFdf.loc[self.specific_ETs, :] return LOFdf
def create_resamplers(options): import os.path import numpy as np import pickle from root_pandas import read_root from PIDPerfScripts.Binning import GetBinScheme pid_variables = ['{}_CombDLLK', '{}_CombDLLmu', '{}_CombDLLp', '{}_CombDLLe', '{}_V3ProbNNK', '{}_V3ProbNNpi', '{}_V3ProbNNmu', '{}_V3ProbNNp'] kin_variables = ['{}_P', '{}_Eta','nTracks'] with open('raw_data.json') as f: locations = json.load(f) if options.particles: locations = [sample for sample in locations if sample["particle"] in options.particles] for sample in locations: binning_P = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "P", None)) #last argument takes name of user-defined binning binning_ETA = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "ETA", None)) #last argument takes name of user-defined binning TODO: let user pass this argument binning_nTracks = rooBinning_to_list(GetBinScheme(sample['branch_particle'], "nTracks", None)) #last argument takes name of user-defined binning TODO: let user pass this argument data = options.location + '/{particle}_Stripping{stripping}_Magnet{magnet}.root'.format(**sample) resampler_location = '{particle}_Stripping{stripping}_Magnet{magnet}.pkl'.format(**sample) if os.path.exists(resampler_location): os.remove(resampler_location) resamplers = dict() deps = map(lambda x: x.format(sample['branch_particle']), kin_variables) pids = map(lambda x: x.format(sample['branch_particle']), pid_variables) for pid in pids: if "DLL" in pid: target_binning = np.linspace(-150, 150, 300) # binning for DLL elif "ProbNN" in pid: target_binning = np.linspace(0, 1, 100) # binning for ProbNN else: raise Exception resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning) for i, chunk in enumerate(read_root(data, columns=deps + pids + ['nsig_sw'], chunksize=100000, where=options.cutstring)): # where is None if option is not set for pid in pids: resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk['nsig_sw']) logging.info('Finished chunk {}'.format(i)) with open(resampler_location, 'wb') as f: pickle.dump(resamplers, f)
def tot_event_number(base_file, base_tree, weight_column='',
                     unique_events=['runNumber', 'eventNumber', 'nCandidate'],
                     preselection=None, presel_column=[]):
    columns = []
    if weight_column != '':
        columns.append(weight_column)
    for pr in presel_column:
        columns.append(pr)
    for ue in unique_events:
        columns.append(ue)

    df = read_root(base_file, key=base_tree, columns=columns)
    if preselection is not None:
        df = df.query(preselection)

    if weight_column != '':
        nevt = df.groupby(unique_events)[weight_column].head(1).sum()
    else:
        nevt = df.groupby(unique_events).ngroups

    return nevt
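
# Minimal call sketch for tot_event_number above; the file, tree, sWeight and
# preselection names are hypothetical. Entries are grouped by the
# unique_events columns and, if a weight column is given, the first weight of
# each group is summed.
def example_tot_event_number(path='tuple.root'):
    return tot_event_number(path, 'DecayTree',
                            weight_column='SigYield_sw',
                            preselection='B_M > 5200',
                            presel_column=['B_M'])
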
def weight_signal_with_resolution(w_s, y_s): proc = 999 for i in range(utils.IO.nSig): w_sig = np.asarray(w_s[np.asarray(y_s) == utils.IO.sigProc[i]]) proc = utils.IO.sigProc[i] input_df = rpd.read_root(utils.IO.signalName[i], "bbggSelectionTree", columns=['benchmark_reweight_SM']) #input_df=rpd.read_root(utils.IO.signalName[i],"bbggSelectionTree", columns = ['benchmark_reweight_2017fake']) utils.IO.signal_df[i][['weight']] = np.multiply( utils.IO.signal_df[i][['weight']], input_df[['benchmark_reweight_SM']]) #utils.IO.signal_df[i][['weight']] = np.multiply(utils.IO.signal_df[i][['weight']],41.5/7.666672541) #2017 SM #utils.IO.signal_df[i][['weight']] = np.multiply(utils.IO.signal_df[i][['weight']],35.9/57.32398753) #2016 SM utils.IO.signal_df[i][['weight']] = np.multiply( utils.IO.signal_df[i][['weight']], 59.4 / 7.719390612) #2018 SM utils.IO.signal_df[i][['weight']] = np.divide( utils.IO.signal_df[i][['weight']], utils.IO.signal_df[i][['sigmaMOverM']]) return utils.IO.signal_df[i][['weight']]
def test(raw_test_dataset, cut, output_dataset, path2tree, tree_name, year):
    test_dataset = os.getcwd() + '/dataset/' + 'test_tree.root'
    MjjRegLib.prepare_dataset(raw_test_dataset, test_dataset, path2tree, tree_name, cut)

    input_arr = read_root(test_dataset, columns=MjjRegConf.get_features(),
                          key=path2tree + tree_name)
    test_first_ev = 0
    test_last_ev = input_arr.shape[0] - 1
    test_arr = input_arr.loc[test_first_ev:test_last_ev, 'leadingJet_pt':]

    reg_model = pickle.load(
        open(os.getcwd() + '/dataset/XGB_Mjj_Reg_model_' + year + '.xgb', "rb"))
    reg_C_arr = reg_model.predict(data=test_arr)

    MjjRegLib.make_output_file(test_dataset, path2tree, tree_name, reg_C_arr,
                               output_dataset)
    print('Testing successfully completed!')
def convertROOT_2_Parquet_2_TFRecord(fileNames): for fileName in fileNames: print("Processing file:",fileName) label = fileName.split("/")[-1].split(".")[0] label = label.lstrip("omtfHits_omtfAlgo0x0006_v1") path = str(pathlib.Path(fileName).parent) path = path.rstrip("omtfHits_omtfAlgo0x0006_v1") path = path.replace("ROOT","Python/") for iChunk, dfChunk in enumerate(read_root(fileName, chunksize=int(15E6))): print("\tProcessing chunk: {}".format(iChunk)) transformColumns(dfChunk, unionFormat="new") parquetFile = path+'df.parquet_{}_chunk_{}.gzip'.format(label, iChunk) dfChunk.to_parquet(parquetFile, compression='gzip') dataset = loadDatasetFromParquet(parquetFile) dataset = dataset.map(tf.io.serialize_tensor) tfrecordFileName = path+'{}_chunk_{}.tfrecord.gzip'.format(label,iChunk) writer = tf.data.experimental.TFRecordWriter(tfrecordFileName, compression_type="GZIP") writer.write(dataset) print("Chunk done.") break print("File done.")
def fillHisto(self, path, cut, name, weight=True):
    tmp = rp.read_root(
        paths=path,
        where=cut.get(),
        columns=self.weights + self.var + ["gen_match_1", "gen_match_2",
                                           "decayMode_1", "decayMode_2"])
    if weight:
        tmp.eval("eweight = " + "*".join(self.weights), inplace=True)
        tmp["eweight"] *= float(self.lumi)
    else:
        tmp.eval("eweight = 1", inplace=True)

    tmpHist = R.TH2D(name, name, *(self.binning))
    tmpHist.SetOption("COLZ")
    rn.fill_hist(tmpHist, array=tmp[self.var].values, weights=tmp["eweight"].values)

    return tmpHist
def main():
    parser.add_argument('-i', dest='inputfile', help='Sample to run over',
                        type=str, metavar='INPUTFILE', default="")
    parser.add_argument('-c', dest='channel', help='Dataset channel',
                        choices=['mt', 'et', 'tt'], default='mt')
    parser.add_argument('-o', dest='outputfile', help='File the changes get written to',
                        type=str, metavar='OUTPUTFILE', default="output.root")
    parser.add_argument('-w', dest='weights', help='Weight that is to be recalculated',
                        choices=["all", "antilep_tauscaling", "puweight", "zweight",
                                 "trk_sf", "reco_sf", "top_weight"],
                        default="all")
    global args
    args = parser.parse_args()

    ignore_list = ['addlepton_p4']  # root_pandas can't handle vector<TLorentzVector>
    tmp = rp.read_root(paths=args.inputfile, ignore=ignore_list)

    if args.weights == "all" or args.weights == "puweight":
        tmp['puweight'] = tmp.apply(recalcPileupWeight, axis=1)
    #if args.weights == "all" or args.weights == "zweight":
    #    tmp['zweight'] = tmp.apply(recalcZWeight, axis=1)
    if args.weights == "all" or args.weights == "trk_sf":
        tmp['trk_sf'] = tmp.apply(recalcTrkSF, axis=1)
    if args.weights == "all" or args.weights == "reco_sf":
        tmp['reco_sf'] = tmp.apply(recalcRecoSF, axis=1)
    if args.weights == "all" or args.weights == "antilep_tauscaling":
        # Always run eleTauFakeRateWeight and muTauFakeRateWeight first!
        tmp['eleTauFakeRateWeight'] = tmp.apply(recalcEleTauFakeRateWeight, axis=1)
        tmp['muTauFakeRateWeight'] = tmp.apply(recalcMuTauFakeRateWeight, axis=1)
        tmp['antilep_tauscaling'] = tmp.apply(recalcAntilepTauscaling, axis=1)
def sim_skim(): # key='trackerNTup/tracker' key = 'QualityVertices' for i_file, file in enumerate(sorted(os.listdir(args.trees))): print("Opening", key, "data in", args.trees + "/" + file) data_all = read_root( args.trees + "/" + file, key, columns=["station", "trackT0", "trackMomentum", "trackMomentumY"]) data_all['trackT0'] = data_all['trackT0'] * 1e-3 # ns -> us print("Total of", data_all.shape[0], "entries") #Apply cuts in one go! data = data_all #save the skimmed dataframe in a compressed HDF5 format # here we are appending to the file over tracks and then vertices print("Saving compressed data...") data = data.reset_index() # reset index from 0 cols_to_keep = [ "station", "trackT0", "trackMomentum", "trackMomentumY" ] # only write for time and station data[cols_to_keep].to_hdf(args.df + "_" + str(i_file) + ".h5", key=key, mode='a', complevel=9, complib="zlib", format="fixed") print("Skimmed dataframe saved to disk", args.df, "\n") # print("Opening root file...", file) # data_all = read_root(args.trees+"/"+file, key) # data_all['trackT0']=data_all['trackT0']*1e-3 # ns -> us # print("Total of", data_all.shape[0], "entries") # # data_all.to_hdf(args.df+"_"+str(i_file)+".h5", key="sim", mode='a', complevel=9, complib="zlib", format="table", data_columns=True) # data_all.to_hdf(args.df+"_"+str(i_file)+".h5", key="sim", mode='a', complevel=9, complib="zlib", format="table", data_columns=True) # print("Skimmed dataframe saved to disk", args.df, "\n") print("Exiting...") sys.exit()
def draw_result(testh5, h5key_test=None, gamma=True, Range=[-0.01, 1.0]): # ****** load the file if h5key_test: columns_CSE = [] for i in range(25): if gamma: columns_CSE.append('CSE_' + str(i)) else: columns_CSE.append('CSE0_' + str(i)) Gamma = pd.read_hdf(testh5, h5key_test, columns=columns_CSE) Gamma.plot.hist(sort_columns=False, subplots=True, layout=(5, 5), sharex=True, sharey=True, legend=False, range=Range, bins=20) else: if gamma: mykey = 'gamma' Col = 'gamma_CSE' else: mykey = 'Pi0' Col = 'pi0_gamma0_CSE' print("Col = ", Col) Gamma = root_pandas.read_root(testh5, mykey, columns=Col) pd.DataFrame(Gamma[Col].values.tolist()).replace(-9999.0, 0).plot.hist( sort_columns=False, subplots=True, layout=(5, 5), sharex=True, sharey=True, legend=False, range=Range, bins=20)
def get_dataframe(infile, treename=None, **kwargs):
    """
    Get the dataframe from the input file.

    Args:
        infile (str): Name of the inputfile from which the dataframe should
            be read. Must either be a pkl or a root file. Which format will
            be read depends entirely on the ending of the filename.
        treename (str, optional): The TTree in the TFile that should be read.
            Since it is possible to store multiple trees in one file it can
            be necessary to specify which one to read. Option is only used
            for reads from .root files.

    Keyword Args:
        Forwarded to root_pandas.read_root

    See also: root_pandas.read_root

    Returns:
        pandas.DataFrame: The dataframe read from the file.
    """
    logging.debug('Getting DataFrame from {}'.format(infile))
    if not infile.endswith('.pkl') and not infile.endswith('.root'):
        logging.error('Infile does not have a parseable format: {}'
                      ' Valid formats are .root and .pkl'.format(infile))

    if infile.endswith('.pkl'):
        return pd.read_pickle(infile)
    if infile.endswith('.root'):
        try:
            from root_pandas import read_root
            if treename is None:
                # If there is more than one tree we still fail in read_root
                treename = get_treename(infile)
            return read_root(infile, key=treename, **kwargs)
        except ImportError:
            # log and bail out
            logging.error('Requested to read DataFrame from {}, but could not '
                          'import root_pandas'.format(infile))
            sys.exit(1)
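
# Usage sketch for get_dataframe above; the file names are hypothetical. The
# same call covers both supported formats: keyword arguments such as 'columns'
# are forwarded to root_pandas.read_root for .root inputs, while .pkl inputs
# are loaded directly with pandas.read_pickle.
def example_get_dataframe():
    df_root = get_dataframe('mc_tuple.root', treename='DecayTree', columns=['*_PT'])
    df_cached = get_dataframe('cached_frame.pkl')
    return df_root, df_cached
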
def write_weights(name, inputMC, inputMC_tree, outputMC, outputMC_tree, referenceBranches): logging.info("opening file "+inputMC) #Create cut-string to write the weights only in the ranges that were used for calculating the weights weightBranches = [] i = 0; for referenceBranch in referenceBranches: weightBranch = referenceBranch+"_weights" weightBranches.append(weightBranch) with open(name+referenceBranch+"_weights.pickle", "r") as inputFile: loaded = pickle.load(inputFile) rangemin = loaded["rangemin"] rangemax = loaded["rangemax"] rangemin_str = str(rangemin) rangemax_str = str(rangemax) if i == 0: cutstring = referenceBranch + " > " + rangemin_str + " && " + referenceBranch + " < " + rangemax_str else: cutstring += " && " + referenceBranch + " > " + rangemin_str + " && " + referenceBranch + " < " + rangemax_str i += 1 logging.debug("Cutstring: "+cutstring) #Get entries of inputfile (without cuts) ignoreBranches = list(weightBranches) ignoreBranches.append("total_weights") mc_frame_wo_cuts = root_pandas.read_root(inputMC, inputMC_tree, ignore=ignoreBranches) logging.info("Entries without cuts:"+ str(len(mc_frame_wo_cuts.index))) #Read inputfile with cuts mc_frame = root_pandas.read_root(inputMC, inputMC_tree, where=cutstring, ignore=ignoreBranches) logging.info("Entries with above cuts:"+ str(len(mc_frame.index))) #Write referenceBranch_weights (can be multiple branches) to output rootfile for referenceBranch in referenceBranches: weightBranch = referenceBranch+"_weights" logging.info("Applying weights for " + weightBranch) with open(name+referenceBranch+"_weights.pickle", "r") as inputFile: loaded = pickle.load(inputFile) bin_edges = loaded["bin_edges"] weights = loaded["weights"] mc_frame[weightBranch] = weights[pandas.cut(mc_frame[referenceBranch], bin_edges, labels=False)] # checking for MC events with nan event weights nanWeightEvents = mc_frame.loc[numpy.isnan(mc_frame[weightBranch])] InfWeightEvents = mc_frame.loc[numpy.isinf(mc_frame[weightBranch])] for nan_or_inf in (nanWeightEvents, InfWeightEvents): if len(nan_or_inf) > 0: logging.warning("Some events got a weight of Nan or inf in "+weightBranch) #logging.debug(nan_or_inf) #logging.warning("plotting them!") #plot = nan_or_inf.plot() #matplotlib.pyplot.savefig('./Plots/ReWeight/reweight' + weightBranch + inputMC[inputMC.rfind("/")+1:] + "_nanInfWeightEvents.png") logging.warning("REMOVING THE NAN/INF-WEIGHTED EVENTS") mc_frame = mc_frame.loc[numpy.isnan(mc_frame[weightBranch]) == False] mc_frame = mc_frame.loc[numpy.isinf(mc_frame[weightBranch]) == False] logging.info("Entries after removing NaN or Inf-Events: "+str(len(mc_frame.index))) #Normalizing the weights if len(weightBranches) > 1: totalWeight = numpy.ones(len(mc_frame.index)) for weightBranch in weightBranches: if len(weightBranches) > 1: totalWeight *= mc_frame[weightBranch] #normalizing this one weight. If you later use the product of weights, you need to normalize that product too factor = len(mc_frame.index)/sum(mc_frame[weightBranch]) mc_frame[weightBranch] = mc_frame[weightBranch] * factor #Normalizing the total weight (if applicable) if len(weightBranches) > 1: factor = len(mc_frame.index)/sum(totalWeight) totalWeight = totalWeight * factor mc_frame["total_weight"] = totalWeight logging.info("writing root file to " + outputMC) mc_frame.to_root(outputMC, outputMC_tree)
#'eminus_TRACKHITS_nHitsOT', #'eminus_TRACKHITS_nHitsTT', #'eminus_TRACKHITS_nHitsMuon', #'eminus_NEARESTCELL_NEIGHBORS_E', #'eminus_NEARESTCELL_E', #'eminus_WeightedSum_Photon_P', 'eminus_DivisionWeightedSum_Photon_P', 'eminus_AcceptedSum_Photon_P', 'eminus_P_with_AcceptedSum', #'eminus_P_with_WeightedSum', 'eminus_P_with_DivisionWeightedSum', 'eminus_TRUEP'] data = read_root(filelocation, columns=variables) #Take only subset of data data = data.sample(50000) TRUEP = np.asarray(data['eminus_TRUEP']) BremAdder_P = np.asarray(data['eminus_P']) #Remove target variable data = data.drop('eminus_TRUEP', 1) #Print training variables print "Training variables:\n", '='*20, '\n', '\n'.join(list(data.columns)) #Create training/test-sample
'eminus_ProbNN*', 'eminus_TRACKHITS_nHitsTotal', 'eminus_TRACKHITS_nHitsVelo', 'eminus_TRACKHITS_nHitsIT', 'eminus_TRACKHITS_nHitsOT', 'eminus_TRACKHITS_nHitsTT', 'eminus_TRACKHITS_nHitsMuon', 'eminus_TRACKHITS_nHitsM1', #Vector features 'eminus_RECOPHOTONS_P_VEC', #'eminus_RECOPHOTONS_PT_VEC', 'eminus_TRUEP', 'eminus_NEARESTCELL_NEIGHBORS_E', 'eminus_NEARESTCELL_E', 'eminus_PIDe', 'eminus_PIDmu', 'eminus_PIDK', 'eminus_PIDp', 'eminus_TRACK_GhostProb' ] data = read_root(filelocation, columns=variables) #training data (excluding true) #Load the classifier output #========================== classifieroutput = pd.read_pickle("/home/dberninghoff/Master-Make-Based/B2Kemu-Electrons/Pickles/Classifieroutput.pkl") #Calculate the sum of all reconstructed photon momenta weighted by the classifier output #======================================================================================= weighted_sums = [] divisionweight_sums = [] threshold = 0.4989 accepted_sums = []
def concat_df_chunks(filenames, chunksize, **kwargs):
    return chain(
        *(read_root(f, chunksize=chunksize, **kwargs) for f in filenames)
    )
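
# Usage sketch for concat_df_chunks above; the file names and the 'nPV' branch
# are hypothetical. The chained generator lets a single loop run over the
# chunks of an arbitrarily long file list with bounded memory.
def example_concat_chunks(filenames=('a.root', 'b.root')):
    total = 0
    for chunk in concat_df_chunks(list(filenames), 50000, columns=['nPV']):
        total += len(chunk)
    return total
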
# no single electron cut (170418) #s_path_to_input = './resources/ambe_no_s1_pulse_shape.txt' #df_ambe_data = pd.read_table(s_path_to_input, sep='\t', ) #s_path_to_input = './resources/list_lax_NR_0.6.2.csv' #s_path_to_input = './resources/list_lax_NR_0.8.4.csv' #s_path_to_input = './resources/list_lax_NR_0.9.1.csv' #s_path_to_input = './resources/list_lax_NR_0.9.2.csv' #s_path_to_input = './resources/data_AmBe_lowenergy.csv' #df_ambe_data = pd.read_table(s_path_to_input, sep=',') #print df_ambe_data['x'], df_ambe_data['y'], df_ambe_data['z'], df_ambe_data['distance_to_source'] s_path_to_input = './resources/data_AmBe_cs1_lt_200.root' df_ambe_data = root_pandas.read_root(s_path_to_input) # AmBe optimized df_ambe_data = df_ambe_data[((df_ambe_data['x']**2. + df_ambe_data['y']**2.) < config_xe1t.max_r**2.) & (df_ambe_data['z'] < config_xe1t.max_z) & (config_xe1t.min_z < df_ambe_data['z']) & (df_ambe_data['distance_to_source'] < 80.)] # apply cuts #df_ambe_data = df_ambe_data[df_ambe_data['CutLowEnergyAmBe']] #df_ambe_data = df_ambe_data[df_ambe_data['CutAmBeFiducial']] df_ambe_data = df_ambe_data[df_ambe_data['CutS1LowEnergyRange']] df_ambe_data = df_ambe_data[df_ambe_data['CutS2Threshold']] df_ambe_data = df_ambe_data[df_ambe_data['CutInteractionPeaksBiggest']] df_ambe_data = df_ambe_data[df_ambe_data['CutS2AreaFractionTop']] df_ambe_data = df_ambe_data[df_ambe_data['CutS2SingleScatterSimple']] df_ambe_data = df_ambe_data[df_ambe_data['CutDAQVeto']] #df_ambe_data = df_ambe_data[df_ambe_data['CutEndOfRunCheck']]
def main(): args = parse_args() config = parse_config(args.config_file) if config is None: print('No configuration file is defined. ' 'Define one with `--config-file`.') sys.exit(1) # read dataset files = config['files'] if 'filepath' in config: files = [config['filepath'] + f for f in files] kwargs = config['pandas_kwargs'] print('Reading ', end='') entries = 0 for f in files: rootfile = ROOT.TFile(f) tree = rootfile.Get(kwargs['key']) entries += tree.GetEntries() maxslices = args.max_slices chunksize = kwargs['chunksize'] total = (maxslices if maxslices is not None and maxslices < (entries / chunksize) else (entries / chunksize)) print(total * chunksize, 'events.') df = pd.concat([ df for df in tqdm( islice( read_root(files, flatten=True, **kwargs), maxslices), total=total)]) # rename the tagging particle branches df.rename(columns=dict(zip(df.columns, [c.replace(config['tagging_particle_prefix'], 'tp').replace('-', '_') for c in df.columns])), inplace=True) df['event_id'] = df.runNumber.apply(str) + '_' + df.eventNumber.apply(str) if 'invert_target' in config and config['invert_target']: df['target'] = np.sign(df.B_ID) != np.sign(df.tp_ID) else: df['target'] = np.sign(df.B_ID) == np.sign(df.tp_ID) # read features and selections try: if 'inclusive_mva_features' in config: mva_features = ['tp_' + f for f in config['inclusive_mva_features']] else: mva_features = ['tp_' + f.split(' ')[0] for f in config['selections']] except: raise ValueError('Tried to parse features for the BDT.' ' Either provide well-formatted `selections` or' ' define a `inclusive_mva_features` set.') # build BDT model and train the classifier n_cv x 3 times xgb_kwargs = config['xgb_kwargs'] n_jobs = config['n_jobs'] bootstrap_scores = [] bootstrap_d2s = [] nfold = (args.bootstrap_folds if args.bootstrap_folds is not None else config['n_cv']) print('Starting bootstrapping.') pbar = tqdm(total=nfold * 3) for _ in range(nfold): # yield 3-fold split for CV df_sets = [df.iloc[indices] for indices in NSplit(df)] cv_scores = [] for i in range(3): df1, df2, df3 = (df_sets[i % 3].copy(), df_sets[(i + 1) % 3].copy(), df_sets[(i + 2) % 3].copy()) model = XGBClassifier(nthread=n_jobs, **xgb_kwargs) sample_weight = (df1.target if 'training_weights' in config and config['training_weights'] else None) model.fit(df1[mva_features], df1.target, sample_weight=df1.SigYield_sw) df2['probas'] = model.predict_proba(df2[mva_features])[:, 1] df2.reset_index(inplace=True, drop=True) df2_max = df2.iloc[df2.groupby('event_id')['probas'].idxmax()].copy() df3['probas'] = model.predict_proba(df3[mva_features])[:, 1] df3.reset_index(inplace=True, drop=True) df3_max = df3.iloc[df3.groupby('event_id')['probas'].idxmax()].copy() # calibrate calibrator = PolynomialLogisticRegression(power=4, solver='lbfgs', n_jobs=n_jobs) calibrator.fit(df2_max.probas.reshape(-1, 1), df2_max.target, sample_weight=df2_max.SigYield_sw) df3_max['calib_probas'] = calibrator.predict_proba(df3_max.probas)[:, 1] score = tagging_power_score(df3_max.calib_probas, tot_event_number=get_event_number(df3_max), sample_weight=df3_max.SigYield_sw) bootstrap_scores.append(score) bootstrap_d2s.append(d2_score(df3_max.calib_probas, sample_weight=df3_max.SigYield_sw)) pbar.update(1) pbar.close() print(dedent("""\ Final {}-fold bootstrap performance D2 = {:<6}% ε_eff = {:<6}%""") .format(nfold, 100 * ufloat(np.mean(bootstrap_d2s), np.std(bootstrap_d2s)), 100 * ufloat(np.mean(noms(bootstrap_scores)), np.std(noms(bootstrap_scores)))))
#! /usr/bin/env python
import os
import cPickle as pkl
from root_pandas import read_root

current_dir = os.path.dirname(__file__)
bb_dir = os.path.join(current_dir, '../..')

hgg_bg = pkl.load(open(bb_dir + '/files/hgg_bg.p', "rb"))
df_data = read_root(bb_dir + '/files/BH/OutputFile_ForBrian.root', 'BH_Tree')

pkl.dump(df_data, open(bb_dir + "/files/BH/BH_paper_data.p", "wb"), protocol=-1)
#! /usr/bin/env python
import cPickle as pkl
import pandas as pd
from root_pandas import read_root

#df_signal = read_root('../../files/HiggsToGG/Tree_LowPtSUSY_Tree_HGG_BB1.root', 'HGG_Tree')
df_bg = read_root('../../files/HiggsToGG/Tree_LowPtSUSY_Tree_PPGG_BB_All.root', 'HGG_Tree')

#pkl.dump(df_signal, open("../../files/hgg_signal.p", "wb"), protocol=-1)
pkl.dump(df_bg, open("../../files/hgg_bg.p", "wb"), protocol=-1)
def create_weights(name, referenceMC, referenceMC_tree, referenceData, referenceData_tree, referenceBranches, ranges, binning, weightBranch): logging.info("(Re-)Creating the weights from the control channel") mc_frame = root_pandas.read_root(referenceMC, referenceMC_tree, columns=referenceBranches) if weightBranch == None: data_frame = root_pandas.read_root(referenceData, referenceData_tree, columns=referenceBranches) else: referenceBranches_w_weight = list(referenceBranches) referenceBranches_w_weight.append(weightBranch) data_frame = root_pandas.read_root(referenceData, referenceData_tree, columns=referenceBranches_w_weight) #Go through the Branches counter = 0 for referenceBranch in referenceBranches: branch_range = ranges[counter] #string that looks like this: [min,max] branch_range = branch_range.replace("[", "") branch_range = branch_range.replace("]", "") rangemin, rangemax = branch_range.split(",") rangemin = float(rangemin) rangemax = float(rangemax) if rangemin > rangemax: raise SystemExit("rangemin > rangemax in ", referenceBranch) # Create histograms and create control plots mc_counts, bin_edges = numpy.histogram(mc_frame[referenceBranch], range=(rangemin,rangemax), bins=binning, density=True) if weightBranch == None: data_counts, bin_edges = numpy.histogram(data_frame[referenceBranch], range=(rangemin,rangemax),bins=binning, density=True) else: data_counts, bin_edges = numpy.histogram(data_frame[referenceBranch], weights=data_frame[weightBranch], range=(rangemin,rangemax), bins=binning, density=True) weights = ( data_counts.astype(float)/float(sum(data_counts)) ) / ( mc_counts.astype(float)/float(sum(mc_counts)) ) logging.info("Weights for "+ referenceBranch +":") logging.debug(weights) # Plot without weights mc_counts_plot = numpy.append(mc_counts, 0) #append any number so that the last regular bin is shown data_counts_plot = numpy.append(data_counts, 0) matplotlib.pyplot.step(bin_edges, mc_counts_plot, where='post', color='g') matplotlib.pyplot.step(bin_edges, data_counts_plot, where='post', color='r') matplotlib.pyplot.xlim(min(bin_edges), max(bin_edges)) matplotlib.pyplot.title(referenceBranch+" normalised BEFORE weighting. Green=MC, Red=DATA") matplotlib.pyplot.savefig("./Plots/ReWeight/"+referenceBranch+"_before_reweight.png") matplotlib.pyplot.clf() matplotlib.pyplot.cla() # Plot with weights plot_weights = list(weights) #check for nan or inf weights for plot_weight in plot_weights: if numpy.isnan(plot_weight) or numpy.isinf(plot_weight): plot_weight = 0 mc_counts_weighted = mc_counts * plot_weights mc_counts_weighted_plot = numpy.append(mc_counts_weighted, 0) matplotlib.pyplot.step(bin_edges, mc_counts_weighted_plot, where='post', color='g') matplotlib.pyplot.step(bin_edges, data_counts_plot, where='post', color='r', linestyle='--') matplotlib.pyplot.xlim(min(bin_edges),max(bin_edges)) matplotlib.pyplot.title(referenceBranch+" AFTER weighting. Green=MC, Red=DATA") matplotlib.pyplot.savefig("./Plots/ReWeight/"+referenceBranch+"_after_reweight.png") matplotlib.pyplot.clf() matplotlib.pyplot.cla() #A NaN occurs when a bin in MC is empty. This happens mainly for high PT though, where data bins should also be mostly empty and it is usually best to assign a zero. #For every referenceBranch save another pickle file with open(name+referenceBranch+"_weights.pickle", "w") as outputFile: pickle.dump({"bin_edges" : bin_edges, "weights" : weights, "rangemin" : rangemin, "rangemax" : rangemax}, outputFile) logging.info(name+referenceBranch+"_weights.pickle saved") counter += 1
#!/usr/bin/env python
from root_pandas import read_root

tree = read_root("test/genfaketau/out/tmva.root", "TestTree")

signal = tree[tree.classID < .5]
background = tree[tree.classID > .5]

for i in [.05 * x for x in range(20)]:
    cut = background.BDTG.quantile(i)
    sfraction = signal[signal.BDTG > cut].size / float(signal.size)
    bfraction = background[background.BDTG < cut].size / float(background.size)
    print "{:4.2f} {:6.4f} {:6.4f} {:6.4f}".format(i, cut, sfraction, bfraction)
vectorfeatures = [
    'eminus_P',
    'eminus_RECOPHOTONS_P_VEC',
    'eminus_RECOPHOTONS_PT_VEC',
    'eminus_RECOPHOTONS_Dist2Orig*_VEC',
    'eminus_RECOPHOTONS_DOCA*_VEC',
    'eminus_RECOPHOTONS_IP_BESTPV_VEC',
    #'eminus_RECOPHOTONS_TRUE_FromThisParticle_VEC',
    'eminus_RECOPHOTONS_TRUE_PhotonFromThisParticle_VEC',
    #'eminus_RECOPHOTONS_BremAdded_VEC',
    'eminus_RECOPHOTONS_Dist2TrExtrap_Velo_VEC',
    'eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC',
    'eminus_RECOPHOTONS_Dist2TrExtrap_TT_VEC',
    'eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC',
    'eminus_RECOPHOTONS_BremAdded_VEC']

ignorefeatures = []

print "Reading data..."
datascalar = read_root(filelocation, columns=scalarfeatures, ignore=ignorefeatures)  # dataframe of scalar branches
datavector = read_root(filelocation, columns=vectorfeatures, ignore=ignorefeatures)
datavector = datavector.drop('eminus_P', axis=1)
print "Reading data complete."

# Create the unpacked dataset
print "Creating unpacked dataset..."
data = dataframe_join_vectors_to_scalars(datascalar, datavector)
print "Creating unpacked dataset complete."

# Create new variables: maximum/minimum of Dist2TrExtrap(InError)
maxDist2TrExtrapInError = np.maximum(data['eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC'])
minDist2TrExtrapInError = np.minimum(data['eminus_RECOPHOTONS_Dist2TrExtrapInError_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrapInError_TT_VEC'])
maxDist2TrExtrap = np.maximum(data['eminus_RECOPHOTONS_Dist2TrExtrap_Velo_VEC'], data['eminus_RECOPHOTONS_Dist2TrExtrap_TT_VEC'])
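A possible continuation, not part of the original fragment: attach the derived maxima/minima as new columns and write the unpacked frame back to ROOT with to_root. The output file name and new column names are assumptions.

# Hypothetical continuation: keep the derived quantities alongside the unpacked
# data and write everything back out with root_pandas.
data['eminus_RECOPHOTONS_maxDist2TrExtrapInError'] = maxDist2TrExtrapInError
data['eminus_RECOPHOTONS_minDist2TrExtrapInError'] = minDist2TrExtrapInError
data['eminus_RECOPHOTONS_maxDist2TrExtrap'] = maxDist2TrExtrap
data.to_root('unpacked_photons.root', key='unpacked')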
entries = float(tree.GetEntries(cutstring))
rel_eff = entries / entries_before
abs_eff = generator_eff * entries / before_stripping

print "\n\n============After %s============" % stage
if cutstring != "1":
    print "Additional cuts: %s" % cutstring
print "Entries: %i" % entries
print "Rel. Efficiency: %f" % rel_eff
print "Abs. Efficiency: %f" % abs_eff

# Get entries if a weight branch is found
for branch in tree.GetListOfBranches():
    if weightbranch == branch.GetName():
        weighted = True
        dataframe = read_root(rootfilename, treename, columns=[weightbranch], where=cutstring)
        entries_weight = np.sum(dataframe[weightbranch])
        rel_eff_weight = entries_weight / entries_before_w_weight
        abs_eff_weight = generator_eff * entries_weight / before_stripping
        print "\n--With Weights--"
        if cutstring != "1":
            print "Additional cuts: %s" % cutstring
        print "Entries: %.2f" % entries_weight
        print "Rel. Efficiency: %f" % rel_eff_weight
        print "Abs. Efficiency: %f" % abs_eff_weight
        # Update entries before with weight
        entries_before_w_weight = entries_weight

# Update entries before
def create_resamplers(options):
    import os.path
    import pickle
    from root_pandas import read_root
    from PIDPerfScripts.Binning import GetBinScheme

    if options.binningFile and options.binningName:
        import imp
        try:
            imp.load_source("userbinning", options.binningFile)
        except IOError:
            msg = "Failed to load binning scheme file '{0}'".format(options.binningFile)
            raise IOError(msg)
        print("Using custom binning scheme defined in {0} with name {1}".format(options.binningFile, options.binningName))
    else:
        print("Using default binning scheme")
        options.binningName = None

    pid_variables = [
        "{}_CombDLLK",
        "{}_CombDLLmu",
        "{}_CombDLLp",
        "{}_CombDLLe",
        "{}_V3ProbNNK",
        "{}_V3ProbNNpi",
        "{}_V3ProbNNmu",
        "{}_V3ProbNNp",
    ]
    kin_variables = ["{}_P", "{}_Eta", "nTracks"]

    with open("raw_data.json") as f:
        locations = json.load(f)
    if options.particles:
        locations = [sample for sample in locations if sample["particle"] in options.particles]
    if options.both_magnet_orientations:
        # we use both magnet orientations on the first run
        locations = [sample for sample in locations if sample["magnet"] == "Up"]

    for sample in locations:
        # the last argument takes the name of a user-defined binning
        binning_P = rooBinning_to_list(GetBinScheme(sample["branch_particle"], "P", options.binningName))
        binning_ETA = rooBinning_to_list(GetBinScheme(sample["branch_particle"], "ETA", options.binningName))
        binning_nTracks = rooBinning_to_list(GetBinScheme(sample["branch_particle"], "nTracks", options.binningName))

        if options.both_magnet_orientations:
            if sample["magnet"] == "Up":
                data = [options.location + "/{particle}_Stripping{stripping}_MagnetUp.root".format(**sample)]
                data += [options.location + "/{particle}_Stripping{stripping}_MagnetDown.root".format(**sample)]
                resampler_location = "{particle}_Stripping{stripping}_MagnetAny.pkl".format(**sample)
        else:
            data = [options.location + "/{particle}_Stripping{stripping}_Magnet{magnet}.root".format(**sample)]
            resampler_location = "{particle}_Stripping{stripping}_Magnet{magnet}.pkl".format(**sample)
        if os.path.exists(resampler_location):
            os.remove(resampler_location)

        resamplers = dict()
        deps = [x.format(sample["branch_particle"]) for x in kin_variables]
        pids = [x.format(sample["branch_particle"]) for x in pid_variables]
        for pid in pids:
            if "DLL" in pid:
                target_binning = np.linspace(-150, 150, 300)  # binning for DLL
            elif "ProbNN" in pid:
                target_binning = np.linspace(0, 1, 100)  # binning for ProbNN
            else:
                raise Exception("Unknown PID variable: " + pid)
            resamplers[pid] = Resampler(binning_P, binning_ETA, binning_nTracks, target_binning)

        for dataSet in data:
            # `where` is None if the option is not set
            for i, chunk in enumerate(read_root(dataSet, columns=deps + pids + ["nsig_sw"], chunksize=100000, where=options.cutstring)):
                for pid in pids:
                    resamplers[pid].learn(chunk[deps + [pid]].values.T, weights=chunk["nsig_sw"])
                logging.info("Finished chunk {}".format(i))
        with open(resampler_location, "wb") as f:
            pickle.dump(resamplers, f)
#!/usr/bin/env python
import ROOT as r
from root_pandas import read_root

quants = 'eta pt chargedPt constituents chargedConstituents'.split()
quants += 'closestdr closestpt closestparticledr closestparticlept'.split()
quants += 'signalPt signalChargedPt signalConstituents signalChargedConstituents'.split()
quants += 'isoPt isoChargedPt isoConstituents isoChargedConstituents'.split()

taus_in = ['genjet_' + q.lower() for q in quants] + ['isoMVA03', 'antiElectron', 'antiMuon', 'match', 'pt']
taus_in = ['tau_' + v for v in taus_in]
taus = read_root("test/genfaketau/out/ntuple.root", "ttjets", columns=taus_in, flatten=True)

fakes = taus[(taus.tau_match == 6)]
selection = taus[
    (taus.tau_match == 6) &
    (taus.tau_isoMVA03 >= 3) &
    (taus.tau_pt >= 20.)
]

gen_in = [q.lower() for q in quants]
gen_in = ['genjet_' + v for v in gen_in]
alljets = read_root("test/genfaketau/out/ntuple.root", "ttjets", columns=gen_in, flatten=True)

jets = alljets[
    (alljets.genjet_pt > 18) &
    (alljets.genjet_eta > -2.5) &
    (alljets.genjet_eta < 2.5) &
    (alljets.genjet_closestparticledr > 0.1) &
    (alljets.genjet_constituents <= 22)
]
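A short, hypothetical follow-up in the spirit of the snippet: compare the fake and selected taus to the generator-jet denominator built above (the ratios and their labels are assumptions, not part of the original).

# Hypothetical follow-up: rough fake-tau rates relative to the gen-jet selection.
n_jets = float(len(jets))
if n_jets > 0:
    print "fakes / gen jets:          {:6.4f}".format(len(fakes) / n_jets)
    print "selected fakes / gen jets: {:6.4f}".format(len(selection) / n_jets)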