Exemple #1
0
 def process(self, blob):
     """Add an NDArray and a Table of growing size to the blob.

     Both objects hold ``np.arange(self.index + 1)``; the table
     additionally stores the current ``self.index`` under ``"i"``.
     ``self.index`` is incremented before the blob is returned.
     """
     n_entries = self.index + 1
     blob["Arr"] = NDArray(np.arange(n_entries), h5loc="/arr")
     tab_data = {"a": np.arange(n_entries), "i": self.index}
     blob["Tab"] = Table(tab_data, h5loc="/tab")
     self.index += 1
     return blob
Exemple #2
0
 def process(self, blob):
     """Fill the blob with a table, a split table and an NDArray.

     The current counter ``self.i`` is written to column ``"a"`` of
     ``/tab`` and column ``"b"`` of ``/split_tab`` (created with
     ``split_h5=True``); ``/arr`` receives ``np.arange(self.i + 1)``.
     The counter is incremented afterwards.
     """
     counter = self.i
     blob["Tab"] = Table({"a": counter}, h5loc="/tab")
     split_data = {"b": counter}
     blob["SplitTab"] = Table(split_data, h5loc="/split_tab", split_h5=True)
     blob["Arr"] = NDArray(np.arange(counter + 1), h5loc="/arr")
     self.i += 1
     return blob
Exemple #3
0
 def test_init_array(self):
     """An NDArray built from a 3D array is indexable and has default metadata.

     Checks element access with a full index tuple and that ``h5loc``,
     ``title`` and ``group_id`` take the values "/misc",
     "Unnamed NDArray" and ``None`` when not passed to the constructor.
     """
     # Fixed values (instead of random ones) so that individual elements
     # can be asserted below.
     arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
     ndarr = NDArray(arr)
     assert 1 == ndarr[0, 0, 0]
     assert 6 == ndarr[1, 0, 1]
     assert 7 == ndarr[1, 1, 0]
     assert "/misc" == ndarr.h5loc
     assert "Unnamed NDArray" == ndarr.title
     assert ndarr.group_id is None
Exemple #4
0
    def get_blob(self, index):
        """Assemble the blob for the entry at position ``index``.

        The group ID is looked up as ``self.group_ids[index]``. Every
        ``Table`` node of ``self.h5file`` is then visited:

        * tables with a ``group_id`` or ``event_id`` column are sliced
          using the start/length pairs cached in ``self._tab_indices``,
        * tables with neither column are read once, cached in
          ``self._singletons`` and attached to every blob,
        * locations owning an ``_indices`` table (column-wise split
          tables) and ``*_indices`` tables (NDArrays) are only noted
          during the walk and read in the two loops that follow it.

        ``self.header`` is stored under ``'Header'`` if it is not ``None``.
        When both ``self.shuffle`` and ``self.reset_index`` are truthy, the
        ``group_id`` of each sliced table/array is overwritten with
        ``index``.

        Parameters
        ----------
        index
            Position used to index ``self.group_ids`` (and
            ``self.cut_mask`` when that is not ``None``).

        Returns
        -------
        Blob or None
            The filled blob, or ``None`` when ``self.cut_mask`` is set and
            its entry at ``index`` is falsy.
        """
        blob = Blob()
        group_id = self.group_ids[index]
        if self.cut_mask is not None:
            self.log.debug('Cut masks found, applying...')
            mask = self.cut_mask
            if not mask[index]:
                self.log.info('Cut mask blacklists this event, skipping...')
                # NOTE: callers receive ``None`` here, not an empty Blob.
                return

        # skip groups with separate columns
        # and deal with them later
        # this should be solved using hdf5 attributes in near future
        split_table_locs = []
        ndarray_locs = []
        for tab in self.h5file.walk_nodes(classname="Table"):
            h5loc = tab._v_pathname
            loc, tabname = os.path.split(h5loc)
            # NOTE(review): ``self.indices`` is filled below with location
            # paths as keys, while ``tabname`` is the last path component
            # only -- confirm this check can actually match.
            if tabname in self.indices:
                self.log.info("index table '%s' already read, skip..." % h5loc)
                continue
            # Any other table below a location already noted as a split
            # table is handled in the split-table loop further down.
            if loc in split_table_locs:
                self.log.info("get_blob: '%s' is noted, skip..." % h5loc)
                continue
            # A table named exactly "_indices" marks its parent location
            # as a column-wise split table; keep the index node for later.
            if tabname == "_indices":
                self.log.debug("get_blob: found index table '%s'" % h5loc)
                split_table_locs.append(loc)
                self.indices[loc] = self.h5file.get_node(h5loc)
                continue
            # A table named "<something>_indices" is treated as the index
            # table of an NDArray stored at the path without that suffix.
            if tabname.endswith("_indices"):
                self.log.debug(
                    "get_blob: found index table '%s' for NDArray" % h5loc
                )
                # ``str.replace`` removes every occurrence of "_indices"
                # in the path, not only the trailing one.
                ndarr_loc = h5loc.replace("_indices", '')
                ndarray_locs.append(ndarr_loc)
                if ndarr_loc in self.indices:
                    self.log.info(
                        "index table for NDArray '%s' already read, skip..." %
                        ndarr_loc
                    )
                    continue
                # Cache both index columns so later calls do not have to
                # read the index table again.
                _index_table = self.h5file.get_node(h5loc)
                self.indices[ndarr_loc] = {
                    "index": _index_table.col('index')[:],
                    "n_items": _index_table.col('n_items')[:]
                }
                continue
            tabname = camelise(tabname)

            # Pick the column identifying the group; ``group_id`` takes
            # precedence over ``event_id``.
            index_column = None
            if 'group_id' in tab.dtype.names:
                index_column = 'group_id'
            elif 'event_id' in tab.dtype.names:
                index_column = 'event_id'

            if index_column is not None:
                try:
                    if h5loc not in self._tab_indices:
                        self._read_tab_indices(h5loc)
                    # Element 0 is used as the start offsets, element 1 as
                    # the number of rows, both indexed by ``group_id``.
                    tab_idx_start = self._tab_indices[h5loc][0][group_id]
                    tab_n_items = self._tab_indices[h5loc][1][group_id]
                    if tab_n_items == 0:
                        continue
                    arr = tab[tab_idx_start:tab_idx_start + tab_n_items]
                except IndexError:
                    self.log.debug("No data for h5loc '%s'" % h5loc)
                    continue
                except NotImplementedError:
                    # 64-bit unsigned integer columns like ``group_id``
                    # are not yet supported in conditions
                    # NOTE(review): the ``try`` body above slices by
                    # position and shows no condition-based query --
                    # confirm whether this fallback is still reachable.
                    self.log.debug(
                        "get_blob: found uint64 column at '{}'...".
                        format(h5loc)
                    )
                    arr = tab.read()
                    arr = arr[arr[index_column] == group_id]
                except ValueError:
                    # "there are no columns taking part
                    # in condition ``group_id == 0``"
                    self.log.info(
                        "get_blob: no `%s` column found in '%s'! "
                        "skipping... " % (index_column, h5loc)
                    )
                    continue
            else:
                # No group/event column: read the whole table once and
                # hand the same cached Table object to every blob.
                if h5loc not in self._singletons:
                    # NOTE(review): uses ``log`` rather than ``self.log``
                    # as elsewhere in this method -- presumably a
                    # module-level logger; confirm it is defined.
                    log.info(
                        "Caching H5 singleton: {} ({})".format(tabname, h5loc)
                    )
                    self._singletons[h5loc] = Table(
                        tab.read(),
                        h5loc=h5loc,
                        split_h5=False,
                        name=tabname,
                        h5singleton=True
                    )
                blob[tabname] = self._singletons[h5loc]
                continue

            self.log.debug("h5loc: '{}'".format(h5loc))
            tab = Table(arr, h5loc=h5loc, split_h5=False, name=tabname)
            if self.shuffle and self.reset_index:
                tab.group_id[:] = index
            blob[tabname] = tab

        # skipped locs are now column wise datasets (usually hits)
        # currently hardcoded, in future using hdf5 attributes
        # to get the right constructor
        for loc in split_table_locs:
            # if some events are missing (group_id not continuous),
            # this does not work as intended
            # idx, n_items = self.indices[loc][group_id]
            idx = self.indices[loc].col('index')[group_id]
            n_items = self.indices[loc].col('n_items')[group_id]
            end = idx + n_items
            node = self.h5file.get_node(loc)
            # Every child except the index table is read as one column.
            columns = (c for c in node._v_children if c != '_indices')
            data = {}
            for col in columns:
                data[col] = self.h5file.get_node(loc + '/' + col)[idx:end]
            tabname = camelise(loc.split('/')[-1])
            s_tab = Table(data, h5loc=loc, split_h5=True, name=tabname)
            if self.shuffle and self.reset_index:
                s_tab.group_id[:] = index
            blob[tabname] = s_tab

        if self.header is not None:
            blob['Header'] = self.header

        for ndarr_loc in ndarray_locs:
            self.log.info("Reading %s" % ndarr_loc)
            try:
                idx = self.indices[ndarr_loc]['index'][group_id]
                n_items = self.indices[ndarr_loc]['n_items'][group_id]
            except IndexError:
                # No index entry for this group_id: leave the array out
                # of the blob.
                continue
            end = idx + n_items
            ndarr = self.h5file.get_node(ndarr_loc)
            ndarr_name = camelise(ndarr_loc.split('/')[-1])
            _ndarr = NDArray(
                ndarr[idx:end],
                h5loc=ndarr_loc,
                title=ndarr.title,
                group_id=group_id
            )
            if self.shuffle and self.reset_index:
                _ndarr.group_id = index
            blob[ndarr_name] = _ndarr

        return blob
Exemple #5
0
            def process(self, blob):
                """Count the call and add ``"Arr"`` only on the fifth one.

                ``self.i`` is incremented first; the NDArray ``[1, 2, 3]``
                at ``/arr`` is attached only when the counter equals 5.
                """
                self.i += 1
                if self.i != 5:
                    return blob
                blob["Arr"] = NDArray([1, 2, 3], h5loc="/arr")
                return blob
Exemple #6
0
 def process(self, blob):
     """Attach ``np.arange(self.index + 1)`` as ``"Arr"`` and advance the index."""
     length = self.index + 1
     payload = NDArray(np.arange(length), h5loc="/arr")
     blob["Arr"] = payload
     self.index += 1
     return blob
Exemple #7
0
 def process(self, blob):
     """Store ``arr`` shifted by ten times the current index under ``"Foo"``.

     The NDArray is written to ``/foo`` with the title "Yep";
     ``self.index`` is incremented afterwards.
     """
     offset = self.index * 10
     shifted = NDArray(arr + offset, h5loc="/foo", title="Yep")
     blob["Foo"] = shifted
     self.index += 1
     return blob
Exemple #8
0
 def process(self, blob):
     """Store ``arr`` shifted by ten times the current index under ``"foo"``.

     The NDArray targets the nested location ``/foo/bar/baz``;
     ``self.index`` is incremented afterwards.
     """
     offset = self.index * 10
     shifted = NDArray(arr + offset, h5loc="/foo/bar/baz")
     blob["foo"] = shifted
     self.index += 1
     return blob
Exemple #9
0
 def process(self, blob):
     """Wrap ``arr`` in an NDArray with default metadata and store it as ``"foo"``."""
     wrapped = NDArray(arr)
     blob["foo"] = wrapped
     return blob
Exemple #10
0
 def test_slicing_preserves_attribute(self):
     """A slice of an NDArray keeps ``h5loc``, ``title`` and ``group_id``."""
     source = NDArray([1, 2, 3], h5loc="/foo", title="Foo", group_id=23)
     sliced = source[:1]
     assert sliced.h5loc == "/foo"
     assert sliced.title == "Foo"
     assert sliced.group_id == 23
Exemple #11
0
 def test_attributes(self):
     """Constructor keyword arguments are exposed as attributes."""
     created = NDArray([1], h5loc="/foo", title="Foo", group_id=23)
     assert created.h5loc == "/foo"
     assert created.title == "Foo"
     assert created.group_id == 23
Exemple #12
0
 def test_init(self):
     arr = np.random.random((2, 3, 4))
     ndarr = NDArray(arr)