def test_flatten():
    tf = ROOT.TFile('tmp.root', 'RECREATE')
    tt = ROOT.TTree("a", "a")

    length = np.array([3])
    x = np.array([0, 1, 2], dtype='float64')
    tt.Branch('length', length, 'length/I')
    tt.Branch('x', x, 'x[length]/D')

    tt.Fill()
    x[0] = 3
    x[1] = 4
    x[2] = 5
    tt.Fill()

    tf.Write()
    tf.Close()

    branches = list_branches('tmp.root')

    df_ = read_root('tmp.root', flatten=True)

    assert('__array_index' in df_.columns)
    assert(len(df_) == 6)
    assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))

    # Also flatten chunked data
    for df_ in read_root('tmp.root', flatten=True, chunksize=1):
        assert(len(df_) == 3)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))

    os.remove('tmp.root')
def export_root_data_contents(self, rfile):
    """
    Uses root_numpy to list the trees in a ROOT file and print their contents.

    :param rfile: Input ROOT file
    :return: None; the branches and array contents of each tree are printed,
             one tree at a time.
    """
    for tree in list_trees(rfile):
        print('Processing tree ' + tree)
        print(list_branches(rfile, tree))
        arr = root2array(rfile, treename=tree)
        print(str(arr.view(numpy.recarray)))
def read_one_event(run=6, subrun=0, event=0):
    filename = run_path + str(subrun) + '-RecoFull-Parser.root'
    root_file_entries_list = root_numpy.list_branches(filename, 'analysistree/anatree')
    reco_file_values = root_numpy.root2array(filename, 'analysistree/anatree',
                                             start=event, stop=event + 1, step=1)

    dictionary_reco_file_values = {}
    for root_file_index in range(len(root_file_entries_list)):
        dictionary_reco_file_values[root_file_entries_list[root_file_index]] = \
            reco_file_values[0][root_file_index]

    all_channel_waveform_adc = []
    count = 0
    for i in range(1280):
        if i in dictionary_reco_file_values['RecoWaveform_Channel']:
            all_channel_waveform_adc.append(
                dictionary_reco_file_values['RecoWaveform_ADC'][count * 1667:(count + 1) * 1667])
            count += 1
        else:
            all_channel_waveform_adc.append(np.zeros(1667))

    return np.array(all_channel_waveform_adc).reshape((1280, 1667))
def get_reconstruction_variables(run, subrun, event): root_file_entries_list = root_numpy.list_branches( '/eos/experiment/wa105/offline/LArSoft/Data/Reco/2018_June_24/ROOT/recofast/' + str(run) + '/' + str(run) + '-' + str(subrun) + '-RecoFast-Parser.root', 'analysistree/anatree') reco_file_values = root_numpy.root2array( '/eos/experiment/wa105/offline/LArSoft/Data/Reco/2018_June_24/ROOT/recofast/' + str(run) + '/' + str(run) + '-' + str(subrun) + '-RecoFast-Parser.root', 'analysistree/anatree', start=int(event), stop=int(event) + 1, step=1) dictionary_reco_file_values = {} for root_file_index in range(len(root_file_entries_list)): dictionary_reco_file_values[root_file_entries_list[ root_file_index]] = reco_file_values[0][root_file_index] if dictionary_reco_file_values['NumberOfTracks'] >= 1: track_number_of_hits_index_position = [ dictionary_reco_file_values['Track_NumberOfHits'][0] ] for i in range(1, dictionary_reco_file_values['NumberOfTracks']): track_number_of_hits_index_position.append( track_number_of_hits_index_position[i - 1] + dictionary_reco_file_values['Track_NumberOfHits'][i]) for key in dictionary_reco_file_values.keys(): if key[:10] == 'Track_Hit_': dictionary_reco_file_values[key] = np.split( dictionary_reco_file_values[key], track_number_of_hits_index_position) return dictionary_reco_file_values
def __init__(self, fileNames, tree="ana/hgc"):
    """Constructor.

    Arguments:
    fileNames -- List of paths to the ROOT files
    tree -- Name of the TTree object inside the ROOT files (default: 'ana/hgc')
    """
    super(HGCalNtuple, self).__init__()
    self._tree = ROOT.TChain(tree)
    self._branches = []
    branch_blacklist = ['tc_wafer', 'tc_cell',
                        'tc_waferu', 'tc_waferv',
                        'tc_cellu', 'tc_cellv',
                        'gen_PUNumInt', 'gen_TrueNumInt']
    for file_name in fileNames:
        protocol = ''
        if '/eos/user/' in file_name:
            protocol = 'root://eosuser.cern.ch/'
        elif '/eos/cms/' in file_name:
            protocol = 'root://eoscms.cern.ch/'
        self._tree.Add(protocol + file_name)
        if len(self._branches) == 0:
            self._branches = [br for br in rnp.list_branches(protocol + file_name, tree)
                              if br not in branch_blacklist]
    # print 'Cache size: {}'.format(self._tree.GetCacheSize())
    self._entries = self._tree.GetEntries()
def get_branches(filename: object, treename: object, vectors: object = None) -> object:
    """
    Return the branch expressions for a tree, expanding each of the specified
    vector branches into its .x()/.y()/.z() components.
    """
    if vectors is None:
        return rn.list_branches(filename, treename=treename)
    all_branches = rn.list_branches(filename, treename=treename)
    for vector in vectors:
        try:
            all_branches.pop(all_branches.index(vector))
        except ValueError:
            # the requested vector branch is not in the tree; skip it
            continue
        all_branches.extend([vector + '.x()', vector + '.y()', vector + '.z()'])
    return all_branches
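# Usage sketch for get_branches() above, not part of the original snippet.
# The file name 'events.root', tree name 'mytree' and vector branch 'mom' are
# hypothetical placeholders; the idea is that the expanded component
# expressions can then be passed on to root2array as formula branches.
branches = get_branches('events.root', 'mytree', vectors=['mom'])
# -> the scalar branches plus 'mom.x()', 'mom.y()', 'mom.z()'
data = rn.root2array('events.root', treename='mytree', branches=branches)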
def read_one_event( run, subrun, event ): # Reads 3x1x1 binary files and returns ADC counts for each channel in a 1280(channels) x 1667(ticks) array filename = '/eos/experiment/wa105/offline/LArSoft/Data/Reco/2018_June_24/ROOT/recofull/' + str( run) + '/' + str(run) + '-' + str(subrun) + '-RecoFull-Parser.root' root_file_entries_list = root_numpy.list_branches(filename, 'analysistree/anatree') reco_file_values = root_numpy.root2array(filename, 'analysistree/anatree', start=event, stop=event + 1, step=1) dictionary_reco_file_values = {} for root_file_index in range(len(root_file_entries_list)): dictionary_reco_file_values[root_file_entries_list[ root_file_index]] = reco_file_values[0][root_file_index] all_channel_waveform_adc = [] count = 0 for i in range(1280): if i in dictionary_reco_file_values['RecoWaveform_Channel']: all_channel_waveform_adc.append( dictionary_reco_file_values['RecoWaveform_ADC'][count * 1667:(count + 1) * 1667]) count += 1 else: all_channel_waveform_adc.append(np.zeros(1667)) return np.array(all_channel_waveform_adc).reshape((1280, 1667))
def get_reconstruction_variables(run=6, subrun=0, event=0): root_file_entries_list = root_numpy.list_branches( run_path + str(subrun) + '-RecoFast-Parser.root', 'analysistree/anatree') reco_file_values = root_numpy.root2array(run_path + str(subrun) + '-RecoFast-Parser.root', 'analysistree/anatree', start=event, stop=event + 1, step=1) dictionary_reco_file_values = {} for root_file_index in range(len(root_file_entries_list)): dictionary_reco_file_values[root_file_entries_list[ root_file_index]] = reco_file_values[0][root_file_index] if dictionary_reco_file_values['NumberOfTracks'] >= 1: track_number_of_hits_index_position = [ dictionary_reco_file_values['Track_NumberOfHits'][0] ] for i in range(1, dictionary_reco_file_values['NumberOfTracks']): track_number_of_hits_index_position.append( track_number_of_hits_index_position[i - 1] + dictionary_reco_file_values['Track_NumberOfHits'][i]) for key in dictionary_reco_file_values.keys(): if key[:10] == 'Track_Hit_': dictionary_reco_file_values[key] = np.split( dictionary_reco_file_values[key], track_number_of_hits_index_position) return dictionary_reco_file_values
def get_matching_variables(fname, tree, patterns):
    branches = list_branches(fname, tree)

    selected = []

    for p in patterns:
        for b in branches:
            if fnmatch(b, p) and b not in selected:
                selected.append(b)
    return selected
def Read_Root_file(Path_to_tree, Tree_name_array, Branches, Tree_selection=None): """Transform .root file with branches to ndarray (similar as a dictionary). Returns an array of arrays where each array will be a branch This function use root2array wich is a root_numpy's function. Parameters: Path_to_tree -- The complete path to the folder of the .root files Tree_name_array -- The name of each tree inside the folder without .root extension. Only the names in an array. For example: ['bla1', 'bla2'] Branches -- The branches can be selected one by one in array. For example: ['Brho' , 'D'] Selection -- A prelimanar selection can be apply. For example: 'ThetaLdeg >> 0.0 & D == 0.0' Excepctions: None Return: array_data -- A matrix with the name of the branches in arrays related with the data of each initial branch in the original .root file """ path_filename = list( map(lambda s: os.path.join(Path_to_tree, s) + '.root', Tree_name_array)) #The file path path_treename = rn.list_trees(path_filename[0]) #The name inside the tree print('We are reading this root files:') print(Tree_name_array) if Branches == 'All': Tree_branches = rn.list_branches( path_filename[0] ) #Asumption with all the trees have the same name for their branches print('All branches are chosen') else: Tree_branches = Branches print('These branches ') print(Branches) print('are selected') if Tree_selection != None: print('The pre-selection over trees is ', Tree_selection) array_data = rn.root2array(filenames=path_filename, treename=path_treename[0], branches=Tree_branches, selection=Tree_selection) else: print('No pre-selection applied') array_data = rn.root2array(filenames=path_filename, treename=path_treename[0], branches=Tree_branches) return array_data
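# Usage sketch for Read_Root_file(), following the examples given in its
# docstring; it is not part of the original snippet. The folder path, tree
# names and selection string are hypothetical placeholders.
array_data = Read_Root_file('/path/to/rootfiles', ['bla1', 'bla2'],
                            Branches=['Brho', 'D'],
                            Tree_selection='D == 0.0')
print(array_data['Brho'])  # one field per requested branch in the returned structured array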
def load_root_tree(files, tree=None, columns=None, ignore=None, *kargs, **kwargs): from pandas import DataFrame from root_numpy import root2array, list_trees, list_branches # check if we get a list or a single file if not isinstance(files, list): files = [files] # use the first file to define tree & branches init_file = files[0] # check to see if there is a specified tree, if not, # look for a single tree. If the choice is ambiguous, # raise an error and exit if tree == None: trees = list_trees(init_file) if len(trees) == 1: tree = trees[0] elif len(trees) == 0: raise ValueError('Error: no trees found in {}'.format(init_file)) else: raise ValueError( 'Ambiguous call: more than one tree found in {}'.format( init_file)) branches = list_branches(init_file, tree) # match existing branches to branches asked for by user if not columns: all_vars = branches else: all_vars = get_matching_variables(branches, columns) # handle branches that are asked to be ignored if ignore: if isinstance(ignore, string_types): ignore = [ignore] ignored = get_matching_variables(ignore, branches) for var in ignored: all_vars.remove(var) arr = root2array(files, tree, all_vars, *kargs, **kwargs) if 'index' in arr.dtype.names: df = DataFrame.from_records(arr, index='index') else: df = DataFrame.from_records(arr) return df
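# Usage sketch for load_root_tree() above, not part of the original snippet.
# The file, tree and branch names are hypothetical placeholders. If the files
# contain a single tree, the tree argument can be omitted.
df = load_root_tree(['ntuple_2016.root', 'ntuple_2017.root'],
                    tree='DecayTree',
                    columns=['B_PT', 'B_ETA'])
print(df.head())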
def _LoadRoot(filepath):
    if not useRootNumpy:
        raise IOError("root_numpy not available - can't load ROOT file")
    data = BDSAsciiData()
    trees = _rnp.list_trees(filepath)

    if 'optics' in trees:
        branches = _rnp.list_branches(filepath, 'optics')
        treedata = _rnp.root2array(filepath, 'optics')
    elif 'orbit' in trees:
        branches = _rnp.list_branches(filepath, 'orbit')
        treedata = _rnp.root2array(filepath, 'orbit')
    else:
        raise IOError("This file doesn't have the required tree 'optics'.")

    for element in range(len(treedata[branches[0]])):
        elementlist = []
        for branch in branches:
            if element == 0:
                data._AddProperty(branch)
            elementlist.append(treedata[branch][element])
        data.append(elementlist)
    return data
def get_matching_variables(file, tree, patterns):
    from fnmatch import fnmatch
    from root_numpy import list_branches

    branches = list_branches(file, tree)

    selected = []

    for p in patterns:
        for b in branches:
            if fnmatch(b, p) and b not in selected:
                selected.append(b)
    return selected
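# Usage sketch for get_matching_variables() above, not part of the original
# snippet. The file, tree and patterns are hypothetical placeholders.
wanted = get_matching_variables('ntuple.root', 'DecayTree', ['mu_*', 'Jet_pt'])
# returns the branches of 'DecayTree' whose names match any shell-style
# pattern, e.g. something like ['mu_pt', 'mu_eta', 'Jet_pt']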
def test_persistent_index():
    df = pd.DataFrame({'index': [42, 0, 1], 'x': [1, 2, 3]})
    df = df.set_index('index')
    df.index.name = 'MyAwesomeName'
    df.to_root('tmp.root')
    assert('__index__MyAwesomeName' in list_branches('tmp.root'))
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')

    # See what happens if the index has no name
    df = pd.DataFrame({'x': [1, 2, 3]})
    df.to_root('tmp.root')
    df_ = read_root('tmp.root')
    assert_frame_equal(df, df_)
    os.remove('tmp.root')
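# Sketch of the index round-trip that test_persistent_index checks, not part of
# the original snippet: a named DataFrame index is written out as a branch
# called '__index__<name>' and restored on read. The file name and index name
# below are placeholders.
df = pd.DataFrame({'x': [1, 2, 3]}, index=pd.Index([10, 20, 30], name='evt'))
df.to_root('indexed.root')
print(list_branches('indexed.root'))   # includes '__index__evt'
df_back = read_root('indexed.root')    # the 'evt' index is recovered
os.remove('indexed.root')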
def to_pandas_old(data_in, index=None, columns=None): """Convert data from numpy or root to pandas dataframe. Convert data safely to pandas, whatever the format is. Parameters ---------- data_in : any reasonable data The data to be converted """ # TODO: generalize root_index_name = '__index__' data_in = dev_tool.entries_to_str(data_in) if is_root(data_in): root_index = None if root_index_name in root_numpy.list_branches( filename=data_in['filenames'], treename=data_in.get('treename')): root_index = root_numpy.root2array( filenames=data_in['filenames'], treename=data_in.get('treename'), selection=data_in.get('selection'), branches=root_index_name) data_in = root_numpy.root2array(**data_in) # why **? it's a root dict if is_list(data_in): data_in = np.array(data_in) if is_ndarray(data_in): if ((isinstance(columns, (list, tuple)) and len(columns) == 1) or isinstance(columns, basestring)): data_in = to_ndarray(data_in) data_in = pd.DataFrame(data_in, columns=columns, index=root_index) if index is not None: data_in = data_in.loc[index] elif isinstance(data_in, pd.DataFrame): pass else: raise TypeError("Could not convert data to pandas. Data: " + data_in) return data_in
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and *kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become
    the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as
        shell-patterns, allowing formula columns such as `noexpand:2*x`.
        The column in the returned DataFrame will not have the `noexpand:`
        prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields
        DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten
        arrays in the specified columns into individual entries. All arrays
        specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will
        refer to the number of unflattened entries, so you will be iterating
        over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in
        the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Notes
    -----

    >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        all_vars = branches
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = list(filter(lambda x: x.startswith('__index__'), branches))
        if index_branches:
            columns = columns[:]
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if any(map(lambda x: x.startswith('__index__'), ignored)):
            raise ValueError('__index__* branch is being ignored!')
        for var in ignored:
            all_vars.remove(var)

    def do_flatten(arr, flatten):
        if flatten is True:
            warnings.warn("The option flatten=True is deprecated. Please specify the "
                          "branches you would like to flatten in a list: "
                          "flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            nonscalar = get_nonscalar_columns(arr)
            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
                              .format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths, key, all_vars, start=chunk * chunksize,
                                 stop=(chunk + 1) * chunksize, selection=where,
                                 *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
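# Usage sketch for read_root() above, not part of the original snippet. The
# file, tree and branch names are hypothetical placeholders. It reads two
# columns in chunks and flattens the array-valued branch 'trk_pt', so each
# chunk carries one row per array element plus the '__array_index' column.
for chunk in read_root('ntuple.root', 'DecayTree',
                       columns=['nTracks', 'trk_pt'],
                       flatten=['trk_pt'],
                       chunksize=100000):
    print(len(chunk), chunk['__array_index'].max())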
def read_root(path, tree_key=None, columns=None, ignore=None, chunksize=None, where=None, *kargs, **kwargs): """ Read a ROOT file into a pandas DataFrame. Further *kargs and *kwargs are passed to root_numpy's root2array. If the root file contains a branch called index, it will become the DataFrame's index. Parameters ---------- path: string The path to the root file tree_key: string The key of the tree to load. columns: str or sequence of str A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read. ignore: str or sequence of str A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument) chunksize: int If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows where: str Only rows that match the expression will be read Returns ------- DataFrame created from matching data in the specified TTree Notes ----- >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100') """ if not tree_key: branches = list_trees(path) if len(branches) == 1: tree_key = branches[0] else: raise ValueError('More than one tree found in {}'.format(path)) branches = list_branches(path, tree_key) if not columns: all_vars = branches else: # index is always loaded if it exists if isinstance(columns, string_types): columns = [columns] if 'index' in branches: columns = columns[:] columns.append('index') columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns)))) all_vars = get_matching_variables(branches, columns) if ignore: if isinstance(ignore, string_types): ignore = [ignore] ignored = get_matching_variables(branches, ignore, fail=False) ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored)))) if 'index' in ignored: raise ValueError('index variable is being ignored!') for var in ignored: all_vars.remove(var) if chunksize: f = ROOT.TFile(path) n_entries = f.Get(tree_key).GetEntries() f.Close() def genchunks(): for chunk in range(int(ceil(float(n_entries) / chunksize))): arr = root2array(path, tree_key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *kargs, **kwargs) yield convert_to_dataframe(arr) return genchunks() arr = root2array(path, tree_key, all_vars, selection=where, *kargs, **kwargs) return convert_to_dataframe(arr)
def test_flatten(): tf = ROOT.TFile('tmp.root', 'RECREATE') tt = ROOT.TTree("a", "a") length = np.array([3]) x = np.array([0, 1, 2], dtype='float64') y = np.array([6, 7, 8], dtype='float64') tt.Branch('length', length, 'length/I') tt.Branch('x', x, 'x[length]/D') tt.Branch('y', y, 'y[length]/D') tt.Fill() x[0] = 3 x[1] = 4 x[2] = 5 y[0] = 9 y[1] = 10 y[2] = 11 tt.Fill() tf.Write() tf.Close() branches = list_branches('tmp.root') # flatten one out of two array branches with warnings.catch_warnings(): warnings.simplefilter("ignore") df_ = read_root('tmp.root', flatten=['x']) assert('__array_index' in df_.columns) assert(len(df_) == 6) assert('length' in df_.columns.values) assert('x' in df_.columns.values) assert('y' not in df_.columns.values) assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5]))) # flatten both array branches df_ = read_root('tmp.root', flatten=['x','y']) assert('__array_index' in df_.columns) assert(len(df_) == 6) assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) assert('length' in df_.columns.values) assert('x' in df_.columns.values) assert('y' in df_.columns.values) assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5]))) assert(np.all(df_['y'] == np.array([6, 7, 8, 9, 10, 11]))) # Also flatten chunked data for df_ in read_root('tmp.root', flatten=['x'], chunksize=1): assert(len(df_) == 3) assert(np.all(df_['__array_index'] == np.array([0, 1, 2]))) # Also test deprecated behaviour with warnings.catch_warnings(): warnings.simplefilter("ignore") df_ = read_root('tmp.root', flatten=True) assert('__array_index' in df_.columns) assert(len(df_) == 6) assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) os.remove('tmp.root')
graph.GetXaxis().SetRangeUser(0.001, 1) graph.GetXaxis().SetTitle("sig. eff.") graph.GetYaxis().SetRangeUser(0.001, 1) graph.GetYaxis().SetTitle("bkg. eff.") print "AUC:", auc(eff_sig, eff_bkg) return graph fin = r.TFile("nelsonVar_histos.root", "READ") bkg_tree = fin.Get("bkg_tree") sig_tree = fin.Get("sig_1000mev_tree") bkg_np_arr = rn.tree2array(bkg_tree) sig_np_arr = rn.tree2array(sig_tree) branches = rn.list_branches("nelsonVar_histos.root", "bkg_tree") print branches features = [] for branch in branches: if 'cylinder' in branch: features.append(branch) sig_dic = {} bkg_dic = {} for branch, bkg_data, sig_data in zip(branches, zip(*bkg_np_arr), zip(*sig_np_arr)): sig_dic[branch] = sig_data bkg_dic[branch] = bkg_data
signal = False myrange = 20 njets = 3 # signal = False # myrange = range(0,25) print('signal?', signal) print('njets?', njets) if signal: filename = '../data/vbfroot/data-CxAOD-0.root' else: filename = '../data/ggfroot/data-CxAOD-0.root' branch_names = root_numpy.list_branches(filename, 'Nominal') branch = {name: number for number, name in enumerate(branch_names)} # vbf_reg = scrape_folder('../data/vbfroot/', branch, scrape_regular, maxfiles=1) arr = root2array(filename, 'Nominal') mycount = 0 for i in range(len(arr)): if check_event(arr, branch, i, njets): print(i, 'weight', arr[i][branch['eventWeight']]) event3d(arr, branch, i) mycount += 1 if mycount == myrange: break
def generate_data_sample(numbers_particles, n_tracks, selection, file_path, log_path, readed_files_txt): """ Generates data sample from different decays. Parameters ---------- numbers_particles : pandas.DataFrame Number of particles of each type in each data file. n_tracks : int Number of tracks of each particle type. selection : string Selection criteria for the particles. file_path : string Name of the data sample file. log_path : string Name of the log file. readed_files_txt : string Name of the file which contains https of all read files. Files from this file will not be read. This is needed for the warm start. Return ------ 1 """ # Estimate how many track of the each particle from the each file should be taken particles = numbers_particles.columns.drop(['http', 'tree_name']) part = 1. * n_tracks / numbers_particles[particles].sum() # Try to create or open LOG file if not os.path.exists(log_path): LOG = open(log_path, 'w') LOG.write('Particles pdgs: ' + str(particles) + '\n') LOG.write('Selection: ' + selection + '\n') LOG.write('Number of tracks: ' + str(n_tracks) + '\n') LOG.flush() else: LOG = open(log_path, 'a') # Try create or open file with the READED data files. if not os.path.exists(readed_files_txt): READED = open(readed_files_txt, 'w') READED.write("") READED.close() READED = list(numpy.loadtxt(readed_files_txt, dtype='S', delimiter='\n', comments='#', ndmin=1)) else: READED = list(numpy.loadtxt(readed_files_txt, dtype='S', delimiter='\n', comments='#', ndmin=1)) # Count how many track have been taken try: data = pandas.read_csv(file_path, usecols=['MCParticleType']) numbers_per_particle = {} for pdg in particles: numbers_per_particle[pdg] = len(data[numpy.abs(data.MCParticleType.values) == int(pdg)]) data = pandas.DataFrame() except: data = pandas.DataFrame() numbers_per_particle = {} for i in particles: numbers_per_particle[i] = 0 for index in numbers_particles.index: success = 0 while success != 1: try: file_http = numbers_particles.loc[index]['http'] tree_name = numbers_particles.loc[index]['tree_name'] # A file was readed before? if file_http in READED: success = 1 continue branches = root_numpy.list_branches(file_http, treename=tree_name) branches = numpy.array(branches) data_array = root_numpy.root2array(filenames=file_http, treename=tree_name, branches=branches[branches != 'piplus_OWNPV_COV_'], selection=selection) data = pandas.DataFrame(data=data_array, columns=branches[branches != 'piplus_OWNPV_COV_']) LOG.write(file_http + '\n') LOG.flush() data_iter = pandas.DataFrame(columns=branches[branches != 'piplus_OWNPV_COV_']) data_iter_index = [] for one_particle in particles: p_type = numpy.abs(data['MCParticleType'].values) data_particle = data[p_type == int(one_particle)] number = numbers_particles.loc[index][one_particle] number_take = int(round(part[one_particle] * number)) data_particle_take_index, _ = train_test_split(data_particle.index, train_size=number_take, random_state=42) data_iter_index += list(data_particle_take_index) numbers_per_particle[one_particle] += number_take data_iter = data.loc[data_iter_index] if os.path.exists(file_path): data_iter.to_csv(file_path, mode='a', header=False) else: data_iter.to_csv(file_path, mode='a', header=True) del data_iter, data, data_array gc.collect() READED.append(file_http) numpy.array(READED).tofile(readed_files_txt, sep="\n") LOG.write('Tracks selected: ' + str(numbers_per_particle) + '\n') LOG.flush() success = 1 except: LOG.write('Unexpected error \n') LOG.flush() return 1
tree_wjets = file_wjets.Get('events') tree_ww = file_ww.Get('events') tree_wz = file_wz.Get('events') tree_zz = file_zz.Get('events') ''' ''' ------------------------------ Define Branches, select the candidates and do cuts over the branches ----------------------- ''' SamplesList = [ 'data', 'dy', 'qcd', 'singletop', 'ttbar', 'wjets', 'ww', 'wz', 'zz' ] #Tree branches by hand or all directly: #Tree_Branches = ['Muon_Px', 'Muon_Py', 'Muon_Pz', 'Muon_E', 'Muon_Charge', 'Muon_Iso'] #for example Tree_Branches = rn.list_branches('files/data.root') #Select the candidates, two muons, zero electrons and zero photons with some requirements Candidates_Selection = 'NMuon == 2 & NElectron == 0 & NPhoton == 0 & NJet == 0' IsoTrigger_Selection = 'triggerIsoMu24 == 1.0' HadroMC_Selection = 'MChadronicWDecayQuark_px == 0.0' #Combine all the cuts in the selection: Tree_Selection = [ Candidates_Selection, IsoTrigger_Selection, HadroMC_Selection ] Tree_Selection = '&'.join(Tree_Selection) print('Choosing candidates ...') #array_data = rn.root2array(filenames = Path_to_tree + 'data.root', treename = 'events', branches = ['Muon_Px' , 'NMuon'], selection = 'NMuon == 2.0' , object_selection = {'Muon_Px == 0.0' : 'Muon_Px'})
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and *kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become
    the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as
        shell-patterns, allowing formula columns such as `noexpand:2*x`.
        The column in the returned DataFrame will not have the `noexpand:`
        prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields
        DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten
        arrays in the specified columns into individual entries. All arrays
        specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will
        refer to the number of unflattened entries, so you will be iterating
        over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in
        the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Notes
    -----

    >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches, ensuring the key exists
    for seed_path in paths:
        trees = list_trees(seed_path)
        if key and key not in trees:
            continue
        break
    else:
        if key:
            raise OSError('{} not found in any of the given paths'.format(key))
        else:
            raise OSError('No trees found in any of the given paths')

    if not key:
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        all_vars = branches
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = list(filter(lambda x: x.startswith('__index__'), branches))
        if index_branches:
            columns = columns[:]
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if any(map(lambda x: x.startswith('__index__'), ignored)):
            raise ValueError('__index__* branch is being ignored!')
        for var in ignored:
            all_vars.remove(var)

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        n_chunks = int(ceil(float(n_entries) / chunksize))
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        class genchunk(object):
            def __len__(self):
                return n_chunks

            def __iter__(self):
                current_index = 0
                for chunk in range(n_chunks):
                    arr = root2array(paths, key, all_vars, start=chunk * chunksize,
                                     stop=(chunk + 1) * chunksize, selection=where,
                                     *args, **kwargs)
                    if len(arr) == 0:
                        continue
                    if flatten:
                        arr = do_flatten(arr, flatten)
                    yield convert_to_dataframe(arr, start_index=current_index)
                    current_index += len(arr)

        return genchunk()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
def test_flatten(): tf = ROOT.TFile('tmp.root', 'RECREATE') tt = ROOT.TTree("a", "a") length = np.array([3]) x = np.array([0, 1, 2], dtype='float64') y = np.array([6, 7, 8], dtype='float64') tt.Branch('length', length, 'length/I') tt.Branch('x', x, 'x[length]/D') tt.Branch('y', y, 'y[length]/D') tt.Fill() x[0] = 3 x[1] = 4 x[2] = 5 y[0] = 9 y[1] = 10 y[2] = 11 tt.Fill() tf.Write() tf.Close() branches = list_branches('tmp.root') assert (set(branches) == {'length', 'x', 'y'}) # flatten one out of two array branches with warnings.catch_warnings(): warnings.simplefilter("ignore") df_ = read_root('tmp.root', flatten=['x']) assert ('__array_index' in df_.columns) assert (len(df_) == 6) assert ('length' in df_.columns.values) assert ('x' in df_.columns.values) assert ('y' not in df_.columns.values) assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) assert (np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5]))) # flatten both array branches df_ = read_root('tmp.root', flatten=['x', 'y']) assert ('__array_index' in df_.columns) assert (len(df_) == 6) assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) assert ('length' in df_.columns.values) assert ('x' in df_.columns.values) assert ('y' in df_.columns.values) assert (np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5]))) assert (np.all(df_['y'] == np.array([6, 7, 8, 9, 10, 11]))) # Also flatten chunked data for df_ in read_root('tmp.root', flatten=['x'], chunksize=1): assert (len(df_) == 3) assert (np.all(df_['__array_index'] == np.array([0, 1, 2]))) # Also test deprecated behaviour with warnings.catch_warnings(): warnings.simplefilter("ignore") df_ = read_root('tmp.root', flatten=True) assert ('__array_index' in df_.columns) assert (len(df_) == 6) assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2]))) os.remove('tmp.root')
def generate_data_sample(numbers_particles, n_tracks, selection, file_path, log_path, readed_files_txt): """ Generates data sample from different decays. Parameters ---------- numbers_particles : pandas.DataFrame Number of particles of each type in each data file. n_tracks : int Number of tracks of each particle type. selection : string Selection criteria for the particles. file_path : string Name of the data sample file. log_path : string Name of the log file. readed_files_txt : string Name of the file which contains https of all read files. Files from this file will not be read. This is needed for the warm start. Return ------ 1 """ # Estimate how many track of the each particle from the each file should be taken particles = numbers_particles.columns.drop(['http', 'tree_name']) part = 1. * n_tracks / numbers_particles[particles].sum() # Try to create or open LOG file if not os.path.exists(log_path): LOG = open(log_path, 'w') LOG.write('Particles pdgs: ' + str(particles) + '\n') LOG.write('Selection: ' + selection + '\n') LOG.write('Number of tracks: ' + str(n_tracks) + '\n') LOG.flush() else: LOG = open(log_path, 'a') # Try create or open file with the READED data files. if not os.path.exists(readed_files_txt): READED = open(readed_files_txt, 'w') READED.write("") READED.close() READED = list( numpy.loadtxt(readed_files_txt, dtype='S', delimiter='\n', comments='#', ndmin=1)) else: READED = list( numpy.loadtxt(readed_files_txt, dtype='S', delimiter='\n', comments='#', ndmin=1)) # Count how many track have been taken try: data = pandas.read_csv(file_path, usecols=['MCParticleType']) numbers_per_particle = {} for pdg in particles: numbers_per_particle[pdg] = len( data[numpy.abs(data.MCParticleType.values) == int(pdg)]) data = pandas.DataFrame() except: data = pandas.DataFrame() numbers_per_particle = {} for i in particles: numbers_per_particle[i] = 0 for index in numbers_particles.index: success = 0 while success != 1: try: file_http = numbers_particles.loc[index]['http'] tree_name = numbers_particles.loc[index]['tree_name'] # A file was readed before? if file_http in READED: success = 1 continue branches = root_numpy.list_branches(file_http, treename=tree_name) branches = numpy.array(branches) data_array = root_numpy.root2array( filenames=file_http, treename=tree_name, branches=branches[branches != 'piplus_OWNPV_COV_'], selection=selection) data = pandas.DataFrame( data=data_array, columns=branches[branches != 'piplus_OWNPV_COV_']) LOG.write(file_http + '\n') LOG.flush() data_iter = pandas.DataFrame( columns=branches[branches != 'piplus_OWNPV_COV_']) data_iter_index = [] for one_particle in particles: p_type = numpy.abs(data['MCParticleType'].values) data_particle = data[p_type == int(one_particle)] number = numbers_particles.loc[index][one_particle] number_take = int(round(part[one_particle] * number)) data_particle_take_index, _ = train_test_split( data_particle.index, train_size=number_take, random_state=42) data_iter_index += list(data_particle_take_index) numbers_per_particle[one_particle] += number_take data_iter = data.loc[data_iter_index] if os.path.exists(file_path): data_iter.to_csv(file_path, mode='a', header=False) else: data_iter.to_csv(file_path, mode='a', header=True) del data_iter, data, data_array gc.collect() READED.append(file_http) numpy.array(READED).tofile(readed_files_txt, sep="\n") LOG.write('Tracks selected: ' + str(numbers_per_particle) + '\n') LOG.flush() success = 1 except: LOG.write('Unexpected error \n') LOG.flush() return 1
def test_list_branches():
    branches = rnp.list_branches(load('single1.root'))
    assert_equal(branches, ['n_int', 'f_float', 'd_double'])
def get_info(self):
    trees = rn.list_trees(self.file)
    for tree in trees:
        print(str.capitalize(tree) + ":")
        print(rn.list_branches(self.file, treename=tree))
def find_product(self, product, tree="Events"):
    branches = rn.list_branches(self.file, treename=tree)
    return [branch for branch in branches
            if str.lower(product) in str.lower(branch)]
# the variables to reweight used_branch = ['C_PT', 'C_Y', 'nSPDHits', 'Dp_M', 'pK_M'] #set up data sets """ root2reweight are the mc need to reweight rootfixdata are the data to weight to root2addweight usually is the denominator for efficiency estimation (e.g: total mc truth) """ #used root files root2reweight = 'root/endlamdab2DpK.root' rootfixdata = 'root/endSplot54t59Xibc.root' root2addweight = 'root/aaaalamdab2DpK.root' all_branch = root_numpy.list_branches(root2addweight, 'DecayTree') original = root_numpy.root2array(root2reweight, branches=used_branch) target = root_numpy.root2array(rootfixdata, branches=used_branch) used = root_numpy.root2array(root2addweight, branches=used_branch) original = pandas.DataFrame(original) target = pandas.DataFrame(target) used = pandas.DataFrame(used) used_tmp = read_root(root2addweight, columns=all_branch) #set up the orignal weights (weights for the orignal file and the target files) original_weights = numpy.ones(len(original)) tree01 = root_numpy.root2array('root/endSplot54t59Xibc.root', 'DecayTree') target_weights = tree01['sig_sw']
def converter(arguments): """ Process converting standard-format ROOT file to HDF5 file with cell content. Arguments: path: Path to the ROOT file to be converted. args: Namespace containing command-line arguments, to configure the reading and writing of files. Returns: Converted data in numpy array format """ global args # Unpack arguments index, counter, path, start, stop = arguments # Suppress warnings like these when loading the files: TClass::Init:0: RuntimeWarning: no dictionary for class [bla] is available ROOT.gErrorIgnoreLevel = ROOT.kError # Split indexes into 10 sets. index_edges = list(map(int, np.linspace(start, stop, 10, endpoint=True))) index_ranges = zip(index_edges[:-1], index_edges[1:]) import root_numpy # Read-in data from ROOT.TTree all_branches = root_numpy.list_branches(path, args.tree) # Any branches that needs to be removed is defined in "variableLists.py" # remove = remove_branches() remove = remove_branches_pho() # remove = remove_branchesData() keep_branches = sorted(list(set(all_branches) - set(remove))) one_evts_array = [] for i, (loop_start, loop_stop) in enumerate(index_ranges): array = root_numpy.root2array(path, args.tree, start=loop_start, stop=loop_stop, selection=args.selection, branches=keep_branches, warn_missing_tree=True) ROOT.gErrorIgnoreLevel = ROOT.kInfo n_evts = len(array) # If NO events survived, it's probably the selection if n_evts == 0: print("n_evts = 0") return # If only one event survives ( can happen with small files) the tracks can't be saved properly???, for now add all of these and save them later if n_evts == 1: one_evts_array.append(array) continue # Convert to HDF5-ready format. data = convert_to_hdf5(array) if (args.tree == 'el_tree') and args.datatype == 'MC': scale = scale_eventWeight(data['mcChannelNumber'][0]) data['event_totalWeight'] *= scale # Save output of every subprocess to a file filename = '{:s}_{:04d}.h5'.format(args.tag, index) if counter == 0 and i == 0: saveToFile(filename, data) else: appendToFile(filename, data) del data, array gc.collect() # Add all arrays with only one event and save them to the output file if len(one_evts_array) > 1: one_evts_array = np.concatenate(one_evts_array) one_evts_data = convert_to_hdf5(one_evts_array) filename = '{:s}_{:04d}.h5'.format(args.tag, index) appendToFile(filename, one_evts_data)
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and *kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become
    the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as
        shell-patterns, allowing formula columns such as `noexpand:2*x`.
        The column in the returned DataFrame will not have the `noexpand:`
        prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}).
        All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields
        DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten
        arrays in the specified columns into individual entries. All arrays
        specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will
        refer to the number of unflattened entries, so you will be iterating
        over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in
        the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Notes
    -----

    >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        all_vars = branches
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = list(filter(lambda x: x.startswith('__index__'), branches))
        if index_branches:
            columns = columns[:]
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if any(map(lambda x: x.startswith('__index__'), ignored)):
            raise ValueError('__index__* branch is being ignored!')
        for var in ignored:
            all_vars.remove(var)

    def do_flatten(arr, flatten):
        if flatten is True:
            warnings.warn("The option flatten=True is deprecated. Please specify the "
                          "branches you would like to flatten in a list: "
                          "flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            nonscalar = get_nonscalar_columns(arr)
            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
                              .format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths, key, all_vars, start=chunk * chunksize,
                                 stop=(chunk + 1) * chunksize, selection=where,
                                 *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
path_file_to_save = "/users/LHCb/corentin/radiative_dataset/data/" #path to MC : path_mod = "/data/lhcb/marin/hhpi0gamma/radiativehhpi0RG_R16S28r1p1_MC.root" #path to experimental dataset : path_exp = "/users/LHCb/corentin/radiative_dataset/data/selected_exp10000.root" path_branch_name_shared = path_file_to_save + "branch_names_shared.csv" path_branch_name_scalar = path_file_to_save + "branch_names_scalar.csv" path_branch_name_real = path_file_to_save + "branch_names_real.csv" path_branch_name_nonconstant = path_file_to_save + "branch_names_nonconstant.csv" path_branch_name_selec = path_file_to_save + "branch_names_selec.csv" path_branch_name_noncorr = path_file_to_save + "branch_names_noncorr.csv" data_mod = root2array(filenames=path_mod) branch_name_mod = list_branches(filename=path_mod) branch_name_exp = list_branches(filename=path_exp) branch_name_shared = [] branch_name_scalar = [] branch_name_scalar_exp = [] branch_name_real = [] for i in range(len(branch_name_mod)): if branch_name_mod[i] in branch_name_exp: branch_name_shared += [branch_name_mod[i]] if type(data_mod[0][i]) in [ np.float64, np.int64, np.int32, np.float32 ]: branch_name_scalar += [branch_name_mod[i]] print 'created branch_name_shared of length : ' + str(len(branch_name_shared))
rp.to_root(DF_test, 'NNFlatTree_TestSample.root', key='NNFlatTree') DF_test_VBF = DF_test[DF_test['ggFVBF'] == 1] DF_test_ggF = DF_test[DF_test['ggFVBF'] == 0] DF_train_VBF = DF_train[DF_train['ggFVBF'] == 1] DF_train_ggF = DF_train[DF_train['ggFVBF'] == 0] rp.to_root(DF_test_VBF, 'NNFlatTree_VBF1000.root', key='NNFlatTree') rp.to_root(DF_test_ggF, 'NNFlatTree_ggF1000.root', key='NNFlatTree') ### Vectorial Tree from Reader ### if runForVBFggF: VT_name = VT_path + 'VBF_H1000.root' DF_VT_VBF1000 = pd.DataFrame( root2array(VT_name, 'Nominal', branches=list_branches(VT_name))) NNTreeMakerForTestTrain(VT_name, 'VBFH1000', DF_test_VBF, DF_train_VBF) VT_name = VT_path + 'ggF_H1000.root' DF_VT_ggF1000 = pd.DataFrame( root2array(VT_name, 'Nominal', branches=list_branches(VT_name))) NNTreeMakerForTestTrain(VT_name, 'ggFH1000', DF_test_ggF, DF_train_ggF) ### add NNScore to VT from Reader for bkg (only Test DF) ### Samples = [ #'ZeeB_Sh221', 'ZeeC_Sh221', 'ZeeL_Sh221', 'Zee_Sh221', #'ZmumuB_Sh221', 'ZmumuC_Sh221', 'ZmumuL_Sh221', 'Zmumu_Sh221', 'WqqWlv_Sh221', 'WqqZll_Sh221', 'WlvZqq_Sh221', 'ZqqZll_Sh221', #'stops_PwPy8', 'stopWt_PwPy8', 'ttbar_nonallhad_PwPy8', #'ZtautauB_Sh221', 'ZtautauC_Sh221', 'ZtautauL_Sh221', #'Ztautau_Sh221', #'data15', 'data16', #'VBF_H2000', 'ggF_H2000', 'VBF_H3000', 'ggF_H3000',