Ejemplo n.º 1
2
def test_flatten():
    """Check that ``read_root(..., flatten=True)`` expands an array branch.

    A two-entry tree with a scalar ``length`` branch and a variable-length
    ``x[length]`` branch is written to ``tmp.root``. The file is then read
    back flattened, once in a single call and once in chunks of one entry,
    and the row count and the ``__array_index`` column are checked.

    The temporary file is removed even when an assertion fails.
    """
    path = 'tmp.root'
    try:
        tf = ROOT.TFile(path, 'RECREATE')
        tt = ROOT.TTree("a", "a")

        # 'length/I' declares a 32-bit integer leaf, so the buffer handed to
        # ROOT uses the matching dtype rather than NumPy's platform default.
        length = np.array([3], dtype='int32')
        x = np.array([0, 1, 2], dtype='float64')
        tt.Branch('length', length, 'length/I')
        tt.Branch('x', x, 'x[length]/D')

        tt.Fill()
        # Update in place: the branch keeps reading from the same buffer.
        x[:] = [3, 4, 5]
        tt.Fill()

        tf.Write()
        tf.Close()

        # Listing the branches of the freshly written file must not fail.
        list_branches(path)

        df_ = read_root(path, flatten=True)

        assert('__array_index' in df_.columns)
        assert(len(df_) == 6)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))

        # Also flatten chunked data

        for df_ in read_root(path, flatten=True, chunksize=1):
            assert(len(df_) == 3)
            assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))
    finally:
        if os.path.exists(path):
            os.remove(path)
Ejemplo n.º 2
0
 def export_root_data_contents(self, rfile):
     """
     List every tree of a ROOT file, print its branches and its contents,
     and collect the contents.

     :param rfile: Input ROOT file
     :return: a dictionary keyed by tree name; each value is the array that
         ``root2array`` returned for that tree.
     """
     contents = {}
     for tree in list_trees(rfile):
         # Single-argument print() calls behave the same under Python 2
         # and Python 3.
         print('Processing tree ' + tree)
         print(list_branches(rfile, tree))
         arr = root2array(rfile, treename=tree)
         print(str(arr.view(numpy.recarray)))
         contents[tree] = arr
     return contents
def read_one_event(run=6, subrun=0, event=0):
    """Return the ADC waveforms of one event as a 1280 x 1667 array.

    The file ``run_path + str(subrun) + '-RecoFull-Parser.root'`` is opened
    and entry ``event`` of the ``analysistree/anatree`` tree is read.
    Row ``i`` of the result holds the next unread 1667-sample slice of
    ``RecoWaveform_ADC`` when ``i`` occurs in ``RecoWaveform_Channel``, and
    zeros otherwise.

    ``run`` is accepted but not used here; the location comes from the
    module-level ``run_path``.
    """
    n_channels = 1280
    n_ticks = 1667
    tree_path = 'analysistree/anatree'
    filename = run_path + str(subrun) + '-RecoFull-Parser.root'

    branch_names = root_numpy.list_branches(filename, tree_path)
    records = root_numpy.root2array(filename,
                                    tree_path,
                                    start=event,
                                    stop=event + 1,
                                    step=1)

    # Map each branch name to the value at the same position in the record.
    values = {
        name: records[0][position]
        for position, name in enumerate(branch_names)
    }

    waveforms = []
    filled = 0  # number of waveform slices consumed so far
    for channel in range(n_channels):
        if channel not in values['RecoWaveform_Channel']:
            waveforms.append(np.zeros(n_ticks))
            continue
        first = filled * n_ticks
        waveforms.append(values['RecoWaveform_ADC'][first:first + n_ticks])
        filled += 1

    return np.array(waveforms).reshape((n_channels, n_ticks))
Ejemplo n.º 4
0
def get_reconstruction_variables(run, subrun, event):
    """Read one entry of the RecoFast tree into a ``{branch: value}`` dict.

    The file path is built from ``run`` and ``subrun``; entry ``int(event)``
    of ``analysistree/anatree`` is read. When ``NumberOfTracks`` is at least
    1, every value whose key starts with ``'Track_Hit_'`` is replaced by the
    result of ``np.split`` at the cumulative sums of ``Track_NumberOfHits``.
    """
    tree_path = 'analysistree/anatree'
    filename = (
        '/eos/experiment/wa105/offline/LArSoft/Data/Reco/2018_June_24/ROOT/recofast/'
        + str(run) + '/' + str(run) + '-' + str(subrun) +
        '-RecoFast-Parser.root')

    branch_names = root_numpy.list_branches(filename, tree_path)
    records = root_numpy.root2array(filename,
                                    tree_path,
                                    start=int(event),
                                    stop=int(event) + 1,
                                    step=1)

    # Map each branch name to the value at the same position in the record.
    variables = {
        name: records[0][position]
        for position, name in enumerate(branch_names)
    }

    n_tracks = variables['NumberOfTracks']
    if not (n_tracks >= 1):
        return variables

    # Running total of hits per track: the indices at which to cut.
    hits_per_track = variables['Track_NumberOfHits']
    split_points = [hits_per_track[0]]
    for track in range(1, n_tracks):
        split_points.append(split_points[-1] + hits_per_track[track])

    for name in variables.keys():
        if name[:10] == 'Track_Hit_':
            variables[name] = np.split(variables[name], split_points)

    return variables
Ejemplo n.º 5
0
    def __init__(self, fileNames, tree="ana/hgc"):
        """Chain the given ROOT files and record the usable branch names.

        Arguments:
        fileNames -- Iterable of paths to the ROOT files
        tree      -- Name of the TTree object inside the ROOT files (default: 'ana/hgc')
        """
        super(HGCalNtuple, self).__init__()
        self._tree = ROOT.TChain(tree)
        self._branches = []
        excluded_branches = (
            'tc_wafer',
            'tc_cell',
            'tc_waferu',
            'tc_waferv',
            'tc_cellu',
            'tc_cellv',
            'gen_PUNumInt',
            'gen_TrueNumInt',
        )

        for file_name in fileNames:
            # Paths under /eos/user/ or /eos/cms/ get the matching root:// prefix.
            if '/eos/user/' in file_name:
                prefix = 'root://eosuser.cern.ch/'
            elif '/eos/cms/' in file_name:
                prefix = 'root://eoscms.cern.ch/'
            else:
                prefix = ''
            location = prefix + file_name

            self._tree.Add(location)
            # Branch names are taken from the first file that yields any.
            if not self._branches:
                self._branches = [
                    name for name in rnp.list_branches(location, tree)
                    if name not in excluded_branches
                ]

        self._entries = self._tree.GetEntries()
Ejemplo n.º 6
0
def get_branches(filename: object, treename: object, vectors: object = None) -> object:
    """Return the branch names of a tree, expanding selected vector branches.

    Parameters:
    filename -- path of the ROOT file
    treename -- name of the tree whose branches are listed
    vectors  -- optional iterable of branch names; each one found in the
                tree is removed and replaced, at the end of the list, by
                ``name.x()``, ``name.y()`` and ``name.z()``

    Returns:
    The list of branch names. With ``vectors=None`` it is the unmodified
    result of ``rn.list_branches``.
    """
    all_branches = rn.list_branches(filename, treename=treename)
    if vectors is None:
        return all_branches

    for vector in vectors:
        try:
            all_branches.remove(vector)
        except ValueError:
            # Not a branch of this tree: nothing to expand.
            continue

        all_branches.extend([vector + '.x()', vector + '.y()', vector + '.z()'])

    return all_branches
Ejemplo n.º 7
0
def read_one_event(run, subrun, event):
    """Return the ADC counts of one event as a 1280 (channels) x 1667 (ticks) array.

    Reads entry ``event`` of ``analysistree/anatree`` from the RecoFull
    parser file of the given ``run`` and ``subrun`` (3x1x1 data, per the
    original note). Row ``i`` holds the next unread 1667-sample slice of
    ``RecoWaveform_ADC`` when ``i`` occurs in ``RecoWaveform_Channel``, and
    zeros otherwise.
    """
    channel_count = 1280
    tick_count = 1667
    tree_path = 'analysistree/anatree'
    filename = ''.join([
        '/eos/experiment/wa105/offline/LArSoft/Data/Reco/2018_June_24/ROOT/recofull/',
        str(run), '/', str(run), '-', str(subrun), '-RecoFull-Parser.root',
    ])

    branch_names = root_numpy.list_branches(filename, tree_path)
    records = root_numpy.root2array(filename,
                                    tree_path,
                                    start=event,
                                    stop=event + 1,
                                    step=1)

    # Pair every branch name with the value at the same position in the record.
    event_values = dict(
        (name, records[0][position])
        for position, name in enumerate(branch_names))

    rows = []
    consumed = 0  # number of waveform slices already handed out
    for channel in range(channel_count):
        if channel in event_values['RecoWaveform_Channel']:
            begin = consumed * tick_count
            end = (consumed + 1) * tick_count
            rows.append(event_values['RecoWaveform_ADC'][begin:end])
            consumed += 1
        else:
            rows.append(np.zeros(tick_count))

    return np.array(rows).reshape((channel_count, tick_count))
def get_reconstruction_variables(run=6, subrun=0, event=0):
    """Read one entry of the RecoFast tree into a ``{branch: value}`` dict.

    The file is ``run_path + str(subrun) + '-RecoFast-Parser.root'``; entry
    ``event`` of ``analysistree/anatree`` is read. When ``NumberOfTracks`` is
    at least 1, every value whose key starts with ``'Track_Hit_'`` is
    replaced by the result of ``np.split`` at the cumulative sums of
    ``Track_NumberOfHits``.

    ``run`` is accepted but not used here; the location comes from the
    module-level ``run_path``.
    """
    tree_path = 'analysistree/anatree'
    filename = run_path + str(subrun) + '-RecoFast-Parser.root'

    names = root_numpy.list_branches(filename, tree_path)
    records = root_numpy.root2array(filename,
                                    tree_path,
                                    start=event,
                                    stop=event + 1,
                                    step=1)

    result = {}
    for position, name in enumerate(names):
        result[name] = records[0][position]

    track_count = result['NumberOfTracks']
    if track_count >= 1:
        # Cumulative hit counts, one per track, used as cut positions.
        hit_counts = result['Track_NumberOfHits']
        boundaries = [hit_counts[0]]
        for track in range(1, track_count):
            boundaries.append(boundaries[track - 1] + hit_counts[track])

        for name in result.keys():
            if name[:10] != 'Track_Hit_':
                continue
            result[name] = np.split(result[name], boundaries)
    return result
Ejemplo n.º 9
0
def get_matching_variables(fname, tree, patterns):
    """Return the branches of ``tree`` in ``fname`` matching any pattern.

    ``patterns`` are shell-style patterns as understood by ``fnmatch``. The
    result has no duplicates and is ordered by pattern first, then by the
    order in which ``list_branches`` reports the branches.
    """
    available = list_branches(fname, tree)

    matched = []
    for pattern in patterns:
        hits = [name for name in available if fnmatch(name, pattern)]
        for name in hits:
            if name not in matched:
                matched.append(name)
    return matched
Ejemplo n.º 10
0
def get_matching_variables(fname, tree, patterns):
    """Select the branches of a tree whose names match shell-style patterns.

    Every pattern is tried in turn against every branch reported by
    ``list_branches(fname, tree)``. A branch is returned once, at the
    position of its first match.
    """
    branch_names = list_branches(fname, tree)
    chosen = []

    for pattern in patterns:
        for candidate in branch_names:
            if not fnmatch(candidate, pattern):
                continue
            if candidate in chosen:
                continue
            chosen.append(candidate)

    return chosen
Ejemplo n.º 11
0
def Read_Root_file(Path_to_tree,
                   Tree_name_array,
                   Branches,
                   Tree_selection=None):
    """Read branches of one or more .root files into a structured ndarray.

        The data is loaded with root2array, a root_numpy function. The tree
        name is the first tree listed in the first file, and the same name
        is used for every file.

        Parameters:
        Path_to_tree -- The complete path to the folder of the .root files
        Tree_name_array -- The names of the files inside the folder, without the .root extension. For example: ['bla1', 'bla2']
        Branches -- Either the string 'All' (every branch of the first file) or a list of branch names. For example: ['Brho', 'D']
        Tree_selection -- An optional preliminary selection passed on to root2array. For example: 'ThetaLdeg > 0.0 && D == 0.0'

        Exceptions:
        None raised explicitly; errors from os.path or root_numpy propagate.

        Return:
        array_data -- The array returned by root2array for the chosen branches

        """

    path_filename = [
        os.path.join(Path_to_tree, name) + '.root' for name in Tree_name_array
    ]  #The file paths
    path_treename = rn.list_trees(path_filename[0])  #The trees in the first file

    print('We are reading this root files:')
    print(Tree_name_array)

    if Branches == 'All':
        # Assumption: all the files use the same branch names. The tree name
        # is passed so that the branches come from the tree that is read below.
        Tree_branches = rn.list_branches(path_filename[0],
                                         treename=path_treename[0])
        print('All branches are chosen')
    else:
        Tree_branches = Branches
        print('These branches ')
        print(Branches)
        print('are selected')

    if Tree_selection is not None:
        print('The pre-selection over trees is ', Tree_selection)
    else:
        print('No pre-selection applied')

    # selection=None is root2array's default, so a single call covers both cases.
    array_data = rn.root2array(filenames=path_filename,
                               treename=path_treename[0],
                               branches=Tree_branches,
                               selection=Tree_selection)

    return array_data
Ejemplo n.º 12
0
def load_root_tree(files,
                   tree=None,
                   columns=None,
                   ignore=None,
                   *kargs,
                   **kwargs):
    """Load a tree from one or more ROOT files into a pandas DataFrame.

    Parameters
    ----------
    files : str or list of str
        Path(s) of the ROOT file(s). The first file defines the tree and
        the available branches.
    tree : str, optional
        Name of the tree. If omitted, the first file must contain exactly
        one tree.
    columns : sequence of str, optional
        Patterns selecting the branches to load; all branches if empty.
    ignore : str or sequence of str, optional
        Patterns selecting branches to leave out.
    *kargs, **kwargs
        Passed on to ``root_numpy.root2array``.

    Returns
    -------
    pandas.DataFrame
        The loaded data. A branch called ``index`` becomes the index.

    Raises
    ------
    ValueError
        If ``tree`` is omitted and the first file holds zero or several trees.
    """
    from pandas import DataFrame
    from root_numpy import root2array, list_trees, list_branches

    # check if we get a list or a single file
    if not isinstance(files, list):
        files = [files]
    # use the first file to define tree & branches
    init_file = files[0]

    # check to see if there is a specified tree, if not,
    # look for a single tree. If the choice is ambiguous,
    # raise an error and exit
    if tree is None:
        trees = list_trees(init_file)
        if len(trees) == 1:
            tree = trees[0]
        elif len(trees) == 0:
            raise ValueError('Error: no trees found in {}'.format(init_file))
        else:
            raise ValueError(
                'Ambiguous call: more than one tree found in {}'.format(
                    init_file))

    branches = list_branches(init_file, tree)

    # match existing branches to branches asked for by user
    if not columns:
        all_vars = branches
    else:
        all_vars = get_matching_variables(branches, columns)

    # handle branches that are asked to be ignored
    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        # Same argument order as the `columns` lookup above: the branch
        # list first, the patterns second.
        ignored = get_matching_variables(branches, ignore)
        # Filter instead of list.remove so that an ignored branch that was
        # never selected does not raise.
        all_vars = [var for var in all_vars if var not in ignored]

    arr = root2array(files, tree, all_vars, *kargs, **kwargs)

    if 'index' in arr.dtype.names:
        df = DataFrame.from_records(arr, index='index')
    else:
        df = DataFrame.from_records(arr)
    return df
Ejemplo n.º 13
0
def _LoadRoot(filepath):
    """Load the 'optics' tree, or else the 'orbit' tree, into a BDSAsciiData.

    Each branch of the chosen tree is registered as a property of the
    returned object while the first entry is processed, and each tree entry
    is appended as a list holding one value per branch.

    Raises IOError when root_numpy is not available or when the file holds
    neither tree.
    """
    if not useRootNumpy:
        raise IOError("root_numpy not available - can't load ROOT file")
    data = BDSAsciiData()
    trees = _rnp.list_trees(filepath)

    # 'optics' takes precedence over 'orbit'.
    for candidate in ('optics', 'orbit'):
        if candidate in trees:
            treename = candidate
            break
    else:
        raise IOError("This file doesn't have the required tree 'optics'.")

    branches = _rnp.list_branches(filepath, treename)
    treedata = _rnp.root2array(filepath, treename)

    n_elements = len(treedata[branches[0]])
    for element in range(n_elements):
        row = []
        for branch in branches:
            # Properties are declared while the first entry is processed.
            if element == 0:
                data._AddProperty(branch)
            row.append(treedata[branch][element])
        data.append(row)
    return data
Ejemplo n.º 14
0
def get_matching_variables(file, tree, patterns):
    """Return the branches of ``tree`` in ``file`` matching any pattern.

    ``patterns`` are shell-style patterns as understood by ``fnmatch``.
    Branches appear once, ordered by pattern and then by the order in which
    ``list_branches`` reports them.
    """
    from fnmatch import fnmatch
    from root_numpy import list_branches

    branch_names = list_branches(file, tree)
    result = []

    for pattern in patterns:
        for name in branch_names:
            already_selected = name in result
            if fnmatch(name, pattern) and not already_selected:
                result.append(name)

    return result
Ejemplo n.º 15
0
def test_persistent_index():
    """Check that a DataFrame index survives a ``to_root``/``read_root`` round trip.

    Covers a named index, which must be stored in a branch called
    ``__index__<name>``, and the default unnamed index. The temporary file
    is removed after each case, even when an assertion fails.
    """
    path = 'tmp.root'

    try:
        df = pd.DataFrame({'index': [42, 0, 1], 'x': [1, 2, 3]})
        df = df.set_index('index')
        df.index.name = 'MyAwesomeName'
        df.to_root(path)
        assert('__index__MyAwesomeName' in list_branches(path))
        df_ = read_root(path)
        assert_frame_equal(df, df_)
    finally:
        if os.path.exists(path):
            os.remove(path)

    # See what happens if the index has no name
    try:
        df = pd.DataFrame({'x': [1, 2, 3]})
        df.to_root(path)
        df_ = read_root(path)
        assert_frame_equal(df, df_)
    finally:
        if os.path.exists(path):
            os.remove(path)
Ejemplo n.º 16
0
def to_pandas_old(data_in, index=None, columns=None):
    """Convert data from numpy or root to pandas dataframe.

    Convert data safely to pandas, whatever the format is.

    Parameters
    ----------
    data_in : any reasonable data
        The data to be converted: a root dict (as recognised by `is_root`),
        a list, an ndarray or a DataFrame.
    index : optional
        If given, rows are selected with ``.loc[index]`` after an array was
        converted. Not applied when `data_in` is already a DataFrame.
    columns : optional
        Column names passed on to the DataFrame constructor.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    TypeError
        If the data is none of the supported kinds.
    """
    # TODO: generalize
    root_index_name = '__index__'
    # Only a root file can supply an index. Default to None so that list and
    # ndarray inputs do not hit an unbound name below.
    root_index = None

    data_in = dev_tool.entries_to_str(data_in)
    if is_root(data_in):
        if root_index_name in root_numpy.list_branches(
                filename=data_in['filenames'],
                treename=data_in.get('treename')):
            root_index = root_numpy.root2array(
                filenames=data_in['filenames'],
                treename=data_in.get('treename'),
                selection=data_in.get('selection'),
                branches=root_index_name)
        data_in = root_numpy.root2array(**data_in)  # why **? it's a root dict

    if is_list(data_in):
        data_in = np.array(data_in)
    if is_ndarray(data_in):
        if ((isinstance(columns, (list, tuple)) and len(columns) == 1)
                or isinstance(columns, basestring)):

            data_in = to_ndarray(data_in)
        data_in = pd.DataFrame(data_in, columns=columns, index=root_index)
        if index is not None:
            data_in = data_in.loc[index]
    elif not isinstance(data_in, pd.DataFrame):
        # str() keeps the message intact for non-string data.
        raise TypeError("Could not convert data to pandas. Data: " +
                        str(data_in))
    return data_in
Ejemplo n.º 17
0
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s)
    key: string
        The key of the tree to load.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument).
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
        individual entries. All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.
        Passing `True` flattens every array column and is deprecated.

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or,
        when `chunksize` is given, a generator yielding one DataFrame per chunk.

    Raises
    ------
    ValueError
        If `key` is omitted and the first file holds zero or several trees,
        or if `ignore` matches an `__index__*` branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """

    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        all_vars = branches
    else:
        if isinstance(columns, string_types):
            columns = [columns]
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = [x for x in branches if x.startswith('__index__')]
        if index_branches:
            # Copy into a list: the caller's sequence is left untouched and
            # tuples are accepted as well.
            columns = list(columns)
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if any(x.startswith('__index__') for x in ignored):
            raise ValueError('__index__* branch is being ignored!')
        # Filter instead of list.remove: an ignored branch that was not
        # selected through `columns` must not raise.
        all_vars = [var for var in all_vars if var not in ignored]

    def do_flatten(arr, flatten):
        """Stretch the array columns of `arr` and append `__array_index`."""
        if flatten is True:
            warnings.warn(" The option flatten=True is deprecated. Please specify the branches you would like "
                          "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            nonscalar = get_nonscalar_columns(arr)
            # Keep scalar columns plus the array columns that were asked for.
            fields = [x for x in arr.dtype.names if (x not in nonscalar or x in flatten)]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn("Ignored the following non-scalar branches: {bad_names}"
                      .format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_, '__array_index', idx, usemask=False, asrecarray=True)
        return arr

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            """Yield one DataFrame per block of `chunksize` tree entries."""
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)
        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
Ejemplo n.º 18
0
def read_root(path, tree_key=None, columns=None, ignore=None, chunksize=None, where=None, *kargs, **kwargs):
    """
    Read a ROOT file into a pandas DataFrame.
    Further *kargs and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch called index, it will become the DataFrame's index.

    Parameters
    ----------
    path: string
        The path to the root file
    tree_key: string
        The key of the tree to load.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored (overriding the columns argument)
    chunksize: int
        If this parameter is specified, an iterator is returned that yields DataFrames with `chunksize` rows
    where: str
        Only rows that match the expression will be read

    Returns
    -------
        DataFrame created from matching data in the specified TTree, or,
        when `chunksize` is given, a generator yielding one DataFrame per chunk.

    Raises
    ------
    ValueError
        If `tree_key` is omitted and the file holds zero or several trees,
        or if `ignore` matches the `index` branch.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """
    if not tree_key:
        trees = list_trees(path)
        if len(trees) == 1:
            tree_key = trees[0]
        elif len(trees) == 0:
            # An empty file used to be reported as holding several trees.
            raise ValueError('No trees found in {}'.format(path))
        else:
            raise ValueError('More than one tree found in {}'.format(path))

    branches = list_branches(path, tree_key)

    if not columns:
        all_vars = branches
    else:
        # index is always loaded if it exists
        if isinstance(columns, string_types):
            columns = [columns]
        if 'index' in branches:
            # Copy into a list: the caller's sequence is left untouched and
            # tuples are accepted as well.
            columns = list(columns)
            columns.append('index')
        columns = list(itertools.chain.from_iterable(list(map(expand_braces, columns))))
        all_vars = get_matching_variables(branches, columns)

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(list(map(expand_braces, ignored))))
        if 'index' in ignored:
            raise ValueError('index variable is being ignored!')
        # Filter instead of list.remove: an ignored branch that was not
        # selected through `columns` must not raise.
        all_vars = [var for var in all_vars if var not in ignored]

    if chunksize:
        f = ROOT.TFile(path)
        try:
            n_entries = f.Get(tree_key).GetEntries()
        finally:
            # Close the file even if the tree lookup fails.
            f.Close()

        def genchunks():
            """Yield one DataFrame per block of `chunksize` tree entries."""
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(path, tree_key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *kargs, **kwargs)
                yield convert_to_dataframe(arr)
        return genchunks()

    arr = root2array(path, tree_key, all_vars, selection=where, *kargs, **kwargs)
    return convert_to_dataframe(arr)
Ejemplo n.º 19
0
def test_flatten():
    """Check ``read_root``'s ``flatten`` option on a tree with two array branches.

    A two-entry tree with a scalar ``length`` branch and two variable-length
    branches ``x[length]`` and ``y[length]`` is written to ``tmp.root``. The
    file is read back flattening one branch, both branches, in chunks, and
    with the deprecated ``flatten=True`` form.

    The temporary file is removed even when an assertion fails.
    """
    path = 'tmp.root'
    try:
        tf = ROOT.TFile(path, 'RECREATE')
        tt = ROOT.TTree("a", "a")

        # 'length/I' declares a 32-bit integer leaf, so the buffer handed to
        # ROOT uses the matching dtype rather than NumPy's platform default.
        length = np.array([3], dtype='int32')
        x = np.array([0, 1, 2], dtype='float64')
        y = np.array([6, 7, 8], dtype='float64')
        tt.Branch('length', length, 'length/I')
        tt.Branch('x', x, 'x[length]/D')
        tt.Branch('y', y, 'y[length]/D')
        tt.Fill()
        # Update in place: the branches keep reading from the same buffers.
        x[:] = [3, 4, 5]
        y[:] = [9, 10, 11]
        tt.Fill()

        tf.Write()
        tf.Close()

        # Listing the branches of the freshly written file must not fail.
        list_branches(path)

        # flatten one out of two array branches
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df_ = read_root(path, flatten=['x'])
        assert('__array_index' in df_.columns)
        assert(len(df_) == 6)
        assert('length' in df_.columns.values)
        assert('x' in df_.columns.values)
        assert('y' not in df_.columns.values)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
        assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))

        # flatten both array branches
        df_ = read_root(path, flatten=['x', 'y'])
        assert('__array_index' in df_.columns)
        assert(len(df_) == 6)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
        assert('length' in df_.columns.values)
        assert('x' in df_.columns.values)
        assert('y' in df_.columns.values)
        assert(np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))
        assert(np.all(df_['y'] == np.array([6, 7, 8, 9, 10, 11])))

        # Also flatten chunked data
        for df_ in read_root(path, flatten=['x'], chunksize=1):
            assert(len(df_) == 3)
            assert(np.all(df_['__array_index'] == np.array([0, 1, 2])))

        # Also test deprecated behaviour
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df_ = read_root(path, flatten=True)
        assert('__array_index' in df_.columns)
        assert(len(df_) == 6)
        assert(np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
    finally:
        if os.path.exists(path):
            os.remove(path)
Ejemplo n.º 20
0
    graph.GetXaxis().SetRangeUser(0.001, 1)
    graph.GetXaxis().SetTitle("sig. eff.")
    graph.GetYaxis().SetRangeUser(0.001, 1)
    graph.GetYaxis().SetTitle("bkg. eff.")
    print "AUC:", auc(eff_sig, eff_bkg)
    return graph


# Load the background and signal trees and convert them to NumPy arrays.
fin = r.TFile("nelsonVar_histos.root", "READ")
bkg_tree = fin.Get("bkg_tree")
sig_tree = fin.Get("sig_1000mev_tree")

bkg_np_arr = rn.tree2array(bkg_tree)
sig_np_arr = rn.tree2array(sig_tree)

branches = rn.list_branches("nelsonVar_histos.root", "bkg_tree")
# A single-argument print() call behaves the same under Python 2 and 3.
print(branches)

# Only the branches whose name contains 'cylinder' are used as features.
features = [branch for branch in branches if 'cylinder' in branch]

sig_dic = {}
bkg_dic = {}

# zip(*array) transposes the records, giving one tuple of values per branch;
# these are paired with the branch names of the background tree.
for branch, bkg_data, sig_data in zip(branches, zip(*bkg_np_arr),
                                      zip(*sig_np_arr)):
    sig_dic[branch] = sig_data
    bkg_dic[branch] = bkg_data
Ejemplo n.º 21
0
    signal = False
    myrange = 20
    njets = 3

    # signal = False
    # myrange = range(0,25)

    print('signal?', signal)
    print('njets?', njets)

    if signal:
        filename = '../data/vbfroot/data-CxAOD-0.root'
    else:
        filename = '../data/ggfroot/data-CxAOD-0.root'

    branch_names = root_numpy.list_branches(filename, 'Nominal')
    branch = {name: number for number, name in enumerate(branch_names)}

    # vbf_reg = scrape_folder('../data/vbfroot/', branch, scrape_regular, maxfiles=1)
    arr = root2array(filename, 'Nominal')

    mycount = 0
    for i in range(len(arr)):
        if check_event(arr, branch, i, njets):
            print(i, 'weight', arr[i][branch['eventWeight']])
            event3d(arr, branch, i)
            mycount += 1

        if mycount == myrange:
            break
Ejemplo n.º 22
0
def generate_data_sample(numbers_particles, n_tracks, selection, file_path, log_path, readed_files_txt):
    """
    Generates data sample from different decays.

    For every row of `numbers_particles` the referenced ROOT tree is read,
    a share of the tracks of each particle type is drawn with
    `train_test_split` and the drawn rows are appended to the CSV file
    `file_path`. A file that fails is retried until it succeeds; each failure
    is written to the log.

    Parameters
    ----------
    numbers_particles : pandas.DataFrame
        Number of particles of each type in each data file. Must also hold
        the columns 'http' and 'tree_name'.
    n_tracks : int
        Number of tracks of each particle type.
    selection : string
        Selection criteria for the particles.
    file_path : string
        Name of the data sample file.
    log_path : string
        Name of the log file.
    readed_files_txt : string
        Name of the file which contains https of all read files. Files from this file will not be read. This is needed for the warm start.

    Return
    ------
    1
    """

    # Estimate how many tracks of each particle should be taken from each file
    particles = numbers_particles.columns.drop(['http', 'tree_name'])
    part = 1. * n_tracks / numbers_particles[particles].sum()

    # Create the log file with a header, or append to an existing one
    is_new_log = not os.path.exists(log_path)
    LOG = open(log_path, 'w' if is_new_log else 'a')

    try:
        if is_new_log:
            LOG.write('Particles pdgs: ' + str(particles) + '\n')
            LOG.write('Selection: ' + selection + '\n')
            LOG.write('Number of tracks: ' + str(n_tracks) + '\n')
            LOG.flush()

        # Make sure the list of already read files exists, then load it
        if not os.path.exists(readed_files_txt):
            with open(readed_files_txt, 'w') as new_file:
                new_file.write("")

        # NOTE(review): dtype='S' yields byte strings; under Python 3 these
        # never compare equal to str URLs, so the warm start would not skip
        # anything. Confirm the interpreter this runs on.
        READED = list(numpy.loadtxt(readed_files_txt, dtype='S', delimiter='\n', comments='#', ndmin=1))

        # Count how many tracks have been taken already
        try:
            data = pandas.read_csv(file_path, usecols=['MCParticleType'])

            numbers_per_particle = {}
            for pdg in particles:
                numbers_per_particle[pdg] = len(data[numpy.abs(data.MCParticleType.values) == int(pdg)])

        except Exception:
            # No usable sample file yet: start counting from zero.
            numbers_per_particle = {}
            for pdg in particles:
                numbers_per_particle[pdg] = 0

        data = pandas.DataFrame()

        for index in numbers_particles.index:

            success = 0
            while success != 1:
                try:
                    file_http = numbers_particles.loc[index]['http']
                    tree_name = numbers_particles.loc[index]['tree_name']

                    # A file was read before?
                    if file_http in READED:
                        success = 1
                        continue

                    branches = root_numpy.list_branches(file_http, treename=tree_name)
                    branches = numpy.array(branches)
                    # The branch 'piplus_OWNPV_COV_' is never loaded.
                    kept_branches = branches[branches != 'piplus_OWNPV_COV_']

                    data_array = root_numpy.root2array(filenames=file_http,
                                                       treename=tree_name,
                                                       branches=kept_branches,
                                                       selection=selection)

                    data = pandas.DataFrame(data=data_array, columns=kept_branches)

                    LOG.write(file_http + '\n')
                    LOG.flush()

                    data_iter_index = []

                    for one_particle in particles:

                        p_type = numpy.abs(data['MCParticleType'].values)
                        data_particle = data[p_type == int(one_particle)]

                        number = numbers_particles.loc[index][one_particle]
                        number_take = int(round(part[one_particle] * number))

                        data_particle_take_index, _ = train_test_split(data_particle.index,
                                                                       train_size=number_take,
                                                                       random_state=42)

                        data_iter_index += list(data_particle_take_index)
                        numbers_per_particle[one_particle] += number_take

                    data_iter = data.loc[data_iter_index]

                    # The header is written only when the sample file is created.
                    data_iter.to_csv(file_path, mode='a', header=not os.path.exists(file_path))

                    del data_iter, data, data_array
                    gc.collect()

                    READED.append(file_http)
                    numpy.array(READED).tofile(readed_files_txt, sep="\n")

                    LOG.write('Tracks selected: ' + str(numbers_per_particle) + '\n')
                    LOG.flush()

                    success = 1

                except Exception as err:
                    # Retry the same file. Catching Exception rather than
                    # everything lets KeyboardInterrupt and SystemExit stop
                    # this otherwise unbounded retry loop.
                    LOG.write('Unexpected error \n')
                    LOG.write(repr(err) + '\n')
                    LOG.flush()
    finally:
        LOG.close()

    return 1
Ejemplo n.º 23
0
tree_wjets = file_wjets.Get('events')
tree_ww = file_ww.Get('events')
tree_wz = file_wz.Get('events')
tree_zz = file_zz.Get('events')
'''
'''
------------------------------ Define Branches, select the candidates and do cuts over the branches -----------------------
'''

# Names of the samples handled by this script.
SamplesList = [
    'data',
    'dy',
    'qcd',
    'singletop',
    'ttbar',
    'wjets',
    'ww',
    'wz',
    'zz',
]

# Branches can be listed by hand, for example
#   Tree_Branches = ['Muon_Px', 'Muon_Py', 'Muon_Pz', 'Muon_E', 'Muon_Charge', 'Muon_Iso']
# or, as here, taken directly from the data file.
Tree_Branches = rn.list_branches('files/data.root')

# Candidate selection: two muons, no electrons, no photons and no jets.
Candidates_Selection = 'NMuon == 2 & NElectron == 0 & NPhoton == 0 & NJet == 0'
IsoTrigger_Selection = 'triggerIsoMu24 == 1.0'
HadroMC_Selection = 'MChadronicWDecayQuark_px == 0.0'

# All cuts combined into a single selection string.
Tree_Selection = '&'.join(
    (Candidates_Selection, IsoTrigger_Selection, HadroMC_Selection))

print('Choosing candidates ...')
#array_data = rn.root2array(filenames = Path_to_tree + 'data.root', treename = 'events', branches = ['Muon_Px' , 'NMuon'], selection = 'NMuon == 2.0' , object_selection = {'Muon_Px == 0.0' : 'Muon_Px'})
Ejemplo n.º 24
0
def read_root(paths, key=None, columns=None, ignore=None, chunksize=None, where=None, flatten=False, *args, **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s). The first path that contains `key`
        (or simply the first path when no key is given) is used to discover
        the trees and branches.
    key: string
        The key of the tree to load. May be omitted when the file holds exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored
        (overriding the columns argument). Patterns that match branches which were not selected
        in the first place are harmless.
    chunksize: int
        If this parameter is specified, an iterable is returned that yields DataFrames built from
        `chunksize` tree entries each. It supports len(), which gives the number of chunks;
        chunks that turn out empty are skipped while iterating.
    where: str
        Only rows that match the expression will be read.
    flatten: sequence of str
        A sequence of column names. Will use root_numpy.stretch to flatten arrays in the specified columns into
        individual entries. All arrays specified in the columns must have the same length for this to work.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Raises
    ------
    OSError
        If `key` is not found in any of the given paths, or no path was given.
    ValueError
        If no key is given and the seed file holds zero or several trees,
        or if an `__index__*` branch would be ignored.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """

    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches, ensuring the key exists
    for seed_path in paths:
        trees = list_trees(seed_path)
        if key and key not in trees:
            continue
        break
    else:
        if key:
            raise OSError('{} not found in any of the given paths'.format(key))
        else:
            raise OSError('No trees found in any of the given paths')

    if not key:
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError('More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        # Work on a copy so that dropping ignored names never touches `branches`
        all_vars = list(branches)
    else:
        # Copy into a list: the caller's sequence must not be modified and
        # tuples have to be accepted as well
        if isinstance(columns, string_types):
            columns = [columns]
        else:
            columns = list(columns)
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = [b for b in branches if b.startswith('__index__')]
        if index_branches:
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(itertools.chain.from_iterable(map(expand_braces, columns)))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(itertools.chain.from_iterable(map(expand_braces, ignored)))
        if any(name.startswith('__index__') for name in ignored):
            raise ValueError('__index__* branch is being ignored!')
        for var in ignored:
            # An ignore pattern may match a branch that `columns` never
            # selected; list.remove would raise ValueError for it.
            if var in all_vars:
                all_vars.remove(var)

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()
        n_chunks = int(ceil(float(n_entries) / chunksize))
        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        class genchunk(object):
            def __len__(self):
                return n_chunks

            def __iter__(self):
                # Running row offset handed to convert_to_dataframe as start_index
                current_index = 0
                for chunk in range(n_chunks):
                    arr = root2array(paths, key, all_vars, start=chunk * chunksize, stop=(chunk+1) * chunksize, selection=where, *args, **kwargs)
                    if len(arr) == 0:
                        continue
                    if flatten:
                        arr = do_flatten(arr, flatten)
                    yield convert_to_dataframe(arr, start_index=current_index)
                    current_index += len(arr)

        return genchunk()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
Ejemplo n.º 25
0
def test_flatten():
    """
    Check read_root's `flatten` option on a tree with two array branches.

    A two-entry tree with the branches `length`, `x[length]` and `y[length]`
    is written to 'tmp.root' and read back flattening one branch, both
    branches, in chunks, and with the deprecated `flatten=True`.
    The temporary file is removed even when an assertion fails, so a failing
    run does not leave 'tmp.root' behind.
    """
    tf = ROOT.TFile('tmp.root', 'RECREATE')
    try:
        tt = ROOT.TTree("a", "a")

        length = np.array([3])
        x = np.array([0, 1, 2], dtype='float64')
        y = np.array([6, 7, 8], dtype='float64')
        tt.Branch('length', length, 'length/I')
        tt.Branch('x', x, 'x[length]/D')
        tt.Branch('y', y, 'y[length]/D')
        tt.Fill()
        # Second entry: the buffers are updated in place before filling again
        x[0] = 3
        x[1] = 4
        x[2] = 5
        y[0] = 9
        y[1] = 10
        y[2] = 11
        tt.Fill()

        tf.Write()
        tf.Close()

        branches = list_branches('tmp.root')
        assert (set(branches) == {'length', 'x', 'y'})

        # flatten one out of two array branches
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df_ = read_root('tmp.root', flatten=['x'])
        assert ('__array_index' in df_.columns)
        assert (len(df_) == 6)
        assert ('length' in df_.columns.values)
        assert ('x' in df_.columns.values)
        assert ('y' not in df_.columns.values)
        assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
        assert (np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))

        # flatten both array branches
        df_ = read_root('tmp.root', flatten=['x', 'y'])
        assert ('__array_index' in df_.columns)
        assert (len(df_) == 6)
        assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
        assert ('length' in df_.columns.values)
        assert ('x' in df_.columns.values)
        assert ('y' in df_.columns.values)
        assert (np.all(df_['x'] == np.array([0, 1, 2, 3, 4, 5])))
        assert (np.all(df_['y'] == np.array([6, 7, 8, 9, 10, 11])))

        # Also flatten chunked data
        for df_ in read_root('tmp.root', flatten=['x'], chunksize=1):
            assert (len(df_) == 3)
            assert (np.all(df_['__array_index'] == np.array([0, 1, 2])))

        # Also test deprecated behaviour
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            df_ = read_root('tmp.root', flatten=True)
        assert ('__array_index' in df_.columns)
        assert (len(df_) == 6)
        assert (np.all(df_['__array_index'] == np.array([0, 1, 2, 0, 1, 2])))
    finally:
        # Clean up on success and on failure alike; the existence check keeps
        # a missing file from masking the original error.
        if os.path.exists('tmp.root'):
            os.remove('tmp.root')
Ejemplo n.º 26
0
def _load_processed_files(readed_files_txt):
    """Return the file addresses listed in `readed_files_txt`, creating the file when it is missing."""
    if not os.path.exists(readed_files_txt):
        open(readed_files_txt, 'w').close()
        return []
    with open(readed_files_txt) as readed_file:
        entries = [line.strip() for line in readed_file]
    # Blank lines and '#' comment lines carry no file address.
    return [entry for entry in entries if entry and not entry.startswith('#')]


def _count_taken_tracks(file_path, particles):
    """Count, per particle type, the tracks already stored in the sample file."""
    try:
        taken = pandas.read_csv(file_path, usecols=['MCParticleType'])
        p_type = numpy.abs(taken['MCParticleType'].values)
        return {pdg: int((p_type == int(pdg)).sum()) for pdg in particles}
    except Exception:
        # Best effort: a missing, empty or unreadable sample file means that
        # nothing has been taken yet.
        return {pdg: 0 for pdg in particles}


def generate_data_sample(numbers_particles, n_tracks, selection, file_path,
                         log_path, readed_files_txt):
    """
    Generates data sample from different decays.

    Every file listed in `numbers_particles` is read, a fixed share of the
    tracks of each particle type is selected and appended to the CSV file
    `file_path`. Files already listed in `readed_files_txt` are skipped, which
    allows the function to be restarted (warm start).

    Parameters
    ----------
    numbers_particles : pandas.DataFrame
        Number of particles of each type in each data file. Must contain the
        columns 'http' and 'tree_name'; every other column is taken as a
        particle type.
    n_tracks : int
        Number of tracks of each particle type.
    selection : string
        Selection criteria for the particles.
    file_path : string
        Name of the data sample file.
    log_path : string
        Name of the log file.
    readed_files_txt : string
        Name of the file which contains https of all read files, one per line.
        Files from this file will not be read. This is needed for the warm start.

    Return
    ------
    1

    Notes
    -----
    A file that fails to be processed is retried until it succeeds; every
    failure is written to the log together with the error that caused it.
    When more tracks are requested than a file provides, all available tracks
    of that particle type are taken.
    """

    # Estimate how many track of the each particle from the each file should be taken
    particles = numbers_particles.columns.drop(['http', 'tree_name'])
    part = 1. * n_tracks / numbers_particles[particles].sum()

    # Append mode creates the log when it is missing; the header is only
    # written for a new log.
    is_new_log = not os.path.exists(log_path)
    LOG = open(log_path, 'a')
    try:
        if is_new_log:
            LOG.write('Particles pdgs: ' + str(particles) + '\n')
            LOG.write('Selection: ' + selection + '\n')
            LOG.write('Number of tracks: ' + str(n_tracks) + '\n')
            LOG.flush()

        # Files that have been processed by a previous run
        READED = _load_processed_files(readed_files_txt)

        # Count how many track have been taken
        numbers_per_particle = _count_taken_tracks(file_path, particles)

        for index in numbers_particles.index:

            file_http = numbers_particles.loc[index]['http']
            tree_name = numbers_particles.loc[index]['tree_name']

            # A file was read before?
            if file_http in READED:
                continue

            # NOTE(review): there is no retry limit, a persistent error keeps
            # this loop running (and logging) until the process is stopped.
            success = False
            while not success:
                try:
                    branches = numpy.array(
                        root_numpy.list_branches(file_http,
                                                 treename=tree_name))
                    branches = branches[branches != 'piplus_OWNPV_COV_']

                    data_array = root_numpy.root2array(filenames=file_http,
                                                       treename=tree_name,
                                                       branches=branches,
                                                       selection=selection)

                    data = pandas.DataFrame(data=data_array, columns=branches)

                    LOG.write(file_http + '\n')
                    LOG.flush()

                    p_type = numpy.abs(data['MCParticleType'].values)
                    data_iter_index = []
                    taken_now = {}

                    for one_particle in particles:

                        data_particle = data[p_type == int(one_particle)]

                        number = numbers_particles.loc[index][one_particle]
                        number_take = int(round(part[one_particle] * number))

                        # train_test_split rejects an empty train set and a
                        # train set as large as the input, so both cases are
                        # handled here instead of failing on every retry.
                        if number_take <= 0:
                            continue
                        if number_take >= len(data_particle):
                            take_index = data_particle.index
                        else:
                            take_index, _ = train_test_split(
                                data_particle.index,
                                train_size=number_take,
                                random_state=42)

                        data_iter_index += list(take_index)
                        taken_now[one_particle] = len(take_index)

                    data_iter = data.loc[data_iter_index]

                    # The header is only needed when the sample file is created
                    data_iter.to_csv(file_path,
                                     mode='a',
                                     header=not os.path.exists(file_path))

                    # Counters are updated only after the tracks are stored,
                    # so a failed attempt is not counted twice on retry.
                    for one_particle, n_taken in taken_now.items():
                        numbers_per_particle[one_particle] += n_taken

                    del data_iter, data, data_array
                    gc.collect()

                    with open(readed_files_txt, 'w') as readed_file:
                        readed_file.write('\n'.join(READED + [file_http]))
                    READED.append(file_http)

                    LOG.write('Tracks selected: ' + str(numbers_per_particle) +
                              '\n')
                    LOG.flush()

                    success = True

                except Exception as error:

                    LOG.write('Unexpected error \n')
                    LOG.write(repr(error) + '\n')
                    LOG.flush()
    finally:
        LOG.close()

    return 1
Ejemplo n.º 27
0
def test_list_branches():
    """list_branches on 'single1.root' must return the three expected branch names, in this order."""
    expected = ['n_int', 'f_float', 'd_double']
    assert_equal(rnp.list_branches(load('single1.root')), expected)
Ejemplo n.º 28
0
 def get_info(self):
     """Print, for every tree in ``self.file``, its capitalized name followed by a colon and then its branches."""
     for tree_name in rn.list_trees(self.file):
         heading = str.capitalize(tree_name) + ":"
         print(heading)
         tree_branches = rn.list_branches(self.file, treename=tree_name)
         print(tree_branches)
Ejemplo n.º 29
0
 def find_product(self, product, tree="Events"):
     """Return the branches of ``tree`` in ``self.file`` whose name contains ``product``, compared case-insensitively."""
     matches = []
     for name in rn.list_branches(self.file, treename=tree):
         if str.lower(product) in str.lower(name):
             matches.append(name)
     return matches
Ejemplo n.º 30
0

# Branches used as reweighting variables.
used_branch = ['C_PT', 'C_Y', 'nSPDHits', 'Dp_M', 'pK_M']

# Input ROOT files:
#   root2reweight  - the MC sample that needs to be reweighted
#   rootfixdata    - the data sample the MC is reweighted to
#   root2addweight - usually the denominator of the efficiency estimation
#                    (e.g. the total MC truth)
root2reweight = 'root/endlamdab2DpK.root'
rootfixdata = 'root/endSplot54t59Xibc.root'
root2addweight = 'root/aaaalamdab2DpK.root'
all_branch = root_numpy.list_branches(root2addweight, 'DecayTree')

# Read the reweighting branches of each sample straight into a DataFrame.
original = pandas.DataFrame(
    root_numpy.root2array(root2reweight, branches=used_branch))
target = pandas.DataFrame(
    root_numpy.root2array(rootfixdata, branches=used_branch))
used = pandas.DataFrame(
    root_numpy.root2array(root2addweight, branches=used_branch))
# Every branch of the 'DecayTree' of root2addweight, read through read_root.
used_tmp = read_root(root2addweight, columns=all_branch)

# Initial weights: ones for the original sample, the 'sig_sw' column of the
# DecayTree for the target sample.
original_weights = numpy.ones(len(original))
tree01 = root_numpy.root2array('root/endSplot54t59Xibc.root', 'DecayTree')
target_weights = tree01['sig_sw']
Ejemplo n.º 31
0
def converter(arguments):
    """
    Convert one entry range of a standard-format ROOT file to HDF5 output.

    The range between `start` and `stop` is cut at 10 evenly spaced edges,
    giving 9 consecutive sub-ranges that are read, converted and written one
    after the other. The tree name, selection, data type and output tag are
    taken from the module-level `args` namespace.

    Arguments:
        arguments: Tuple `(index, counter, path, start, stop)`; `path` is the
            ROOT file to read, `index` is used in the output file name and
            `counter` decides whether the first write creates the file or
            appends to it.

    Returns:
        None. The function stops early, without processing the remaining
        sub-ranges, as soon as one sub-range yields no events.
    """
    global args
    index, counter, path, start, stop = arguments

    # Suppress warnings like these when loading the files: TClass::Init:0: RuntimeWarning: no dictionary for class [bla] is available
    ROOT.gErrorIgnoreLevel = ROOT.kError

    # 10 edges between start and stop, paired below into consecutive sub-ranges.
    edges = [int(edge) for edge in np.linspace(start, stop, 10, endpoint=True)]
    sub_ranges = zip(edges[:-1], edges[1:])

    import root_numpy
    all_branches = root_numpy.list_branches(path, args.tree)

    # The branches to drop are provided by remove_branches_pho();
    # remove_branches() and remove_branchesData() are the alternatives.
    remove = remove_branches_pho()
    keep_branches = sorted(set(all_branches) - set(remove))

    def output_name():
        # Name of the HDF5 file written for this subprocess
        return '{:s}_{:04d}.h5'.format(args.tag, index)

    single_event_arrays = []
    for i, (loop_start, loop_stop) in enumerate(sub_ranges):
        array = root_numpy.root2array(path,
                                      args.tree,
                                      start=loop_start,
                                      stop=loop_stop,
                                      selection=args.selection,
                                      branches=keep_branches,
                                      warn_missing_tree=True)

        ROOT.gErrorIgnoreLevel = ROOT.kInfo

        n_evts = len(array)
        if n_evts == 0:
            # If NO events survived, it's probably the selection
            print("n_evts = 0")
            return
        if n_evts == 1:
            # A single surviving event (can happen with small files) is not
            # converted right away; it is collected and handled after the loop.
            single_event_arrays.append(array)
            continue

        # Convert to HDF5-ready format.
        data = convert_to_hdf5(array)

        if args.tree == 'el_tree' and args.datatype == 'MC':
            scale = scale_eventWeight(data['mcChannelNumber'][0])
            data['event_totalWeight'] *= scale

        # Save output of every subprocess to a file: the very first write
        # creates it, every later one appends.
        filename = output_name()
        is_first_write = counter == 0 and i == 0
        if is_first_write:
            saveToFile(filename, data)
        else:
            appendToFile(filename, data)

        del data, array
        gc.collect()

    # The collected single-event arrays are merged and appended, but only when
    # more than one of them was collected.
    if len(single_event_arrays) > 1:
        merged = np.concatenate(single_event_arrays)
        merged_data = convert_to_hdf5(merged)
        filename = output_name()
        appendToFile(filename, merged_data)
Ejemplo n.º 32
0
def read_root(paths,
              key=None,
              columns=None,
              ignore=None,
              chunksize=None,
              where=None,
              flatten=False,
              *args,
              **kwargs):
    """
    Read a ROOT file, or list of ROOT files, into a pandas DataFrame.
    Further *args and **kwargs are passed to root_numpy's root2array.
    If the root file contains a branch matching __index__*, it will become the DataFrame's index.

    Parameters
    ----------
    paths: string or list
        The path(s) to the root file(s). The first path is used to discover
        the trees and branches.
    key: string
        The key of the tree to load. May be omitted when the file holds exactly one tree.
    columns: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). Matching columns are read.
        The columns beginning with `noexpand:` are not interpreted as shell-patterns,
        allowing formula columns such as `noexpand:2*x`. The column in the returned DataFrame
        will not have the `noexpand:` prefix.
    ignore: str or sequence of str
        A sequence of shell-patterns (can contain *, ?, [] or {}). All matching columns are ignored
        (overriding the columns argument). Patterns that match branches which were not selected
        in the first place are harmless.
    chunksize: int
        If this parameter is specified, a generator is returned that yields DataFrames built from
        `chunksize` tree entries each.
    where: str
        Only rows that match the expression will be read.
    flatten: str or sequence of str
        A column name or a sequence of column names. Will use root_numpy.stretch to flatten arrays in the
        specified columns into individual entries. All arrays specified in the columns must have the same
        length for this to work. Non-scalar columns that are not listed are dropped with a UserWarning.
        Be careful if you combine this with chunksize, as chunksize will refer to the number of unflattened entries,
        so you will be iterating over a number of entries that is potentially larger than chunksize.
        The index of each element within its former array will be saved in the __array_index column.
        Passing True is deprecated (FutureWarning); it stretches the array without selecting fields.

    Returns
    -------
        DataFrame created from matching data in the specified TTree

    Raises
    ------
    ValueError
        If no key is given and the first file holds zero or several trees,
        or if an `__index__*` branch would be ignored.

    Notes
    -----

        >>> df = read_root('test.root', 'MyTree', columns=['A{B,C}*', 'D'], where='ABB > 100')

    """

    if not isinstance(paths, list):
        paths = [paths]
    # Use a single file to search for trees and branches
    seed_path = paths[0]

    if not key:
        trees = list_trees(seed_path)
        if len(trees) == 1:
            key = trees[0]
        elif len(trees) == 0:
            raise ValueError('No trees found in {}'.format(seed_path))
        else:
            raise ValueError(
                'More than one tree found in {}'.format(seed_path))

    branches = list_branches(seed_path, key)

    if not columns:
        # Work on a copy so that dropping ignored names never touches `branches`
        all_vars = list(branches)
    else:
        # Copy into a list: the caller's sequence must not be modified and
        # tuples have to be accepted as well
        if isinstance(columns, string_types):
            columns = [columns]
        else:
            columns = list(columns)
        # __index__* is always loaded if it exists
        # XXX Figure out what should happen with multi-dimensional indices
        index_branches = [b for b in branches if b.startswith('__index__')]
        if index_branches:
            columns.append(index_branches[0])
        columns, noexpand = filter_noexpand_columns(columns)
        columns = list(
            itertools.chain.from_iterable(map(expand_braces, columns)))
        all_vars = get_matching_variables(branches, columns) + noexpand

    if ignore:
        if isinstance(ignore, string_types):
            ignore = [ignore]
        ignored = get_matching_variables(branches, ignore, fail=False)
        ignored = list(
            itertools.chain.from_iterable(map(expand_braces, ignored)))
        if any(name.startswith('__index__') for name in ignored):
            raise ValueError('__index__* branch is being ignored!')
        for var in ignored:
            # An ignore pattern may match a branch that `columns` never
            # selected; list.remove would raise ValueError for it.
            if var in all_vars:
                all_vars.remove(var)

    def do_flatten(arr, flatten):
        """Stretch the array columns of `arr` and append the `__array_index` field."""
        if flatten is True:
            warnings.warn(
                " The option flatten=True is deprecated. Please specify the branches you would like "
                "to flatten in a list: flatten=['foo', 'bar']", FutureWarning)
            arr_, idx = stretch(arr, return_indices=True)
        else:
            # A bare string would otherwise be searched as a substring
            if isinstance(flatten, string_types):
                flatten = [flatten]
            nonscalar = get_nonscalar_columns(arr)
            # Keep the scalar columns plus the array columns to be flattened
            fields = [
                x for x in arr.dtype.names
                if (x not in nonscalar or x in flatten)
            ]
            will_drop = [x for x in arr.dtype.names if x not in fields]
            if will_drop:
                warnings.warn(
                    "Ignored the following non-scalar branches: {bad_names}".
                    format(bad_names=", ".join(will_drop)), UserWarning)
            arr_, idx = stretch(arr, fields=fields, return_indices=True)
        arr = append_fields(arr_,
                            '__array_index',
                            idx,
                            usemask=False,
                            asrecarray=True)
        return arr

    if chunksize:
        tchain = ROOT.TChain(key)
        for path in paths:
            tchain.Add(path)
        n_entries = tchain.GetEntries()

        # XXX could explicitly clean up the opened TFiles with TChain::Reset

        def genchunks():
            for chunk in range(int(ceil(float(n_entries) / chunksize))):
                arr = root2array(paths,
                                 key,
                                 all_vars,
                                 start=chunk * chunksize,
                                 stop=(chunk + 1) * chunksize,
                                 selection=where,
                                 *args,
                                 **kwargs)
                if flatten:
                    arr = do_flatten(arr, flatten)
                yield convert_to_dataframe(arr)

        return genchunks()

    arr = root2array(paths, key, all_vars, selection=where, *args, **kwargs)
    if flatten:
        arr = do_flatten(arr, flatten)
    return convert_to_dataframe(arr)
Ejemplo n.º 33
0
# Directory in which the branch-name lists are saved.
path_file_to_save = "/users/LHCb/corentin/radiative_dataset/data/"
# Path to the MC sample:
path_mod = "/data/lhcb/marin/hhpi0gamma/radiativehhpi0RG_R16S28r1p1_MC.root"
# Path to the experimental dataset:
path_exp = "/users/LHCb/corentin/radiative_dataset/data/selected_exp10000.root"

# Output files, one per list of branch names.
path_branch_name_shared = path_file_to_save + "branch_names_shared.csv"
path_branch_name_scalar = path_file_to_save + "branch_names_scalar.csv"
path_branch_name_real = path_file_to_save + "branch_names_real.csv"
path_branch_name_nonconstant = path_file_to_save + "branch_names_nonconstant.csv"
path_branch_name_selec = path_file_to_save + "branch_names_selec.csv"
path_branch_name_noncorr = path_file_to_save + "branch_names_noncorr.csv"

data_mod = root2array(filenames=path_mod)

branch_name_mod = list_branches(filename=path_mod)
branch_name_exp = list_branches(filename=path_exp)

branch_name_shared = []
branch_name_scalar = []
branch_name_scalar_exp = []
branch_name_real = []

# Numpy types that mark a branch as scalar.
scalar_types = (np.float64, np.int64, np.int32, np.float32)

# Collect the MC branches that also exist in the experimental file and, among
# those, the ones whose value in the first MC entry has a scalar type.
# NOTE(review): assumes the fields of data_mod come in the same order as
# branch_name_mod - confirm.
for i, branch in enumerate(branch_name_mod):
    if branch in branch_name_exp:
        branch_name_shared.append(branch)
        if type(data_mod[0][i]) in scalar_types:
            branch_name_scalar.append(branch)
# Parenthesized single-argument print works on both Python 2 and Python 3.
print('created branch_name_shared of length : ' + str(len(branch_name_shared)))
Ejemplo n.º 34
0
def test_list_branches():
    """The branches listed for 'single1.root' must equal the three expected names, in this order."""
    sample_file = load('single1.root')
    found = rnp.list_branches(sample_file)
    assert_equal(found, ['n_int', 'f_float', 'd_double'])
Ejemplo n.º 35
0
# Write the complete test sample.
rp.to_root(DF_test, 'NNFlatTree_TestSample.root', key='NNFlatTree')

# Split the test and train samples on the 'ggFVBF' column: 1 -> VBF, 0 -> ggF.
DF_test_VBF = DF_test.loc[DF_test['ggFVBF'] == 1]
DF_test_ggF = DF_test.loc[DF_test['ggFVBF'] == 0]

DF_train_VBF = DF_train.loc[DF_train['ggFVBF'] == 1]
DF_train_ggF = DF_train.loc[DF_train['ggFVBF'] == 0]

# Write the two test subsets.
rp.to_root(DF_test_VBF, 'NNFlatTree_VBF1000.root', key='NNFlatTree')
rp.to_root(DF_test_ggF, 'NNFlatTree_ggF1000.root', key='NNFlatTree')

### Vectorial Tree from Reader ###
if runForVBFggF:
    # VBF file: read the 'Nominal' tree, then run the tree maker on it.
    VT_name = VT_path + 'VBF_H1000.root'
    DF_VT_VBF1000 = pd.DataFrame(data=root2array(
        VT_name, 'Nominal', branches=list_branches(VT_name)))
    NNTreeMakerForTestTrain(VT_name, 'VBFH1000', DF_test_VBF, DF_train_VBF)

    # ggF file: same steps.
    VT_name = VT_path + 'ggF_H1000.root'
    DF_VT_ggF1000 = pd.DataFrame(data=root2array(
        VT_name, 'Nominal', branches=list_branches(VT_name)))
    NNTreeMakerForTestTrain(VT_name, 'ggFH1000', DF_test_ggF, DF_train_ggF)

### add NNScore to VT from Reader for bkg (only Test DF) ###
Samples = [
    #'ZeeB_Sh221', 'ZeeC_Sh221', 'ZeeL_Sh221', 'Zee_Sh221',
    #'ZmumuB_Sh221', 'ZmumuC_Sh221', 'ZmumuL_Sh221', 'Zmumu_Sh221', 'WqqWlv_Sh221', 'WqqZll_Sh221', 'WlvZqq_Sh221', 'ZqqZll_Sh221',
    #'stops_PwPy8', 'stopWt_PwPy8', 'ttbar_nonallhad_PwPy8',
    #'ZtautauB_Sh221', 'ZtautauC_Sh221', 'ZtautauL_Sh221', #'Ztautau_Sh221',
    #'data15', 'data16',
    #'VBF_H2000', 'ggF_H2000', 'VBF_H3000', 'ggF_H3000',