def process(self, blob): blob["Arr"] = NDArray(np.arange(self.index + 1), h5loc="/arr") blob["Tab"] = Table( {"a": np.arange(self.index + 1), "i": self.index}, h5loc="/tab" ) self.index += 1 return blob
def process(self, blob): blob["Tab"] = Table({"a": self.i}, h5loc="/tab") blob["SplitTab"] = Table( {"b": self.i}, h5loc="/split_tab", split_h5=True ) blob["Arr"] = NDArray(np.arange(self.i + 1), h5loc="/arr") self.i += 1 return blob
def test_init_array(self):
    # the random array was dead code: it was immediately shadowed below
    arr = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
    ndarr = NDArray(arr)
    assert 1 == ndarr[0, 0, 0]
    assert 6 == ndarr[1, 0, 1]
    assert 7 == ndarr[1, 1, 0]
    assert "/misc" == ndarr.h5loc
    assert "Unnamed NDArray" == ndarr.title
    assert ndarr.group_id is None
def get_blob(self, index):
    blob = Blob()
    group_id = self.group_ids[index]

    if self.cut_mask is not None:
        self.log.debug('Cut masks found, applying...')
        mask = self.cut_mask
        if not mask[index]:
            self.log.info('Cut mask blacklists this event, skipping...')
            return

    # skip groups with separate columns
    # and deal with them later
    # this should be solved using hdf5 attributes in near future
    split_table_locs = []
    ndarray_locs = []
    for tab in self.h5file.walk_nodes(classname="Table"):
        h5loc = tab._v_pathname
        loc, tabname = os.path.split(h5loc)

        if tabname in self.indices:
            self.log.info("index table '%s' already read, skip..." % h5loc)
            continue

        if loc in split_table_locs:
            self.log.info("get_blob: '%s' is noted, skip..." % h5loc)
            continue

        if tabname == "_indices":
            self.log.debug("get_blob: found index table '%s'" % h5loc)
            split_table_locs.append(loc)
            self.indices[loc] = self.h5file.get_node(h5loc)
            continue

        if tabname.endswith("_indices"):
            self.log.debug(
                "get_blob: found index table '%s' for NDArray" % h5loc
            )
            ndarr_loc = h5loc.replace("_indices", '')
            ndarray_locs.append(ndarr_loc)
            if ndarr_loc in self.indices:
                self.log.info(
                    "index table for NDArray '%s' already read, skip..." %
                    ndarr_loc
                )
                continue
            _index_table = self.h5file.get_node(h5loc)
            self.indices[ndarr_loc] = {
                "index": _index_table.col('index')[:],
                "n_items": _index_table.col('n_items')[:]
            }
            continue

        tabname = camelise(tabname)

        index_column = None
        if 'group_id' in tab.dtype.names:
            index_column = 'group_id'
        elif 'event_id' in tab.dtype.names:
            index_column = 'event_id'

        if index_column is not None:
            try:
                if h5loc not in self._tab_indices:
                    self._read_tab_indices(h5loc)
                tab_idx_start = self._tab_indices[h5loc][0][group_id]
                tab_n_items = self._tab_indices[h5loc][1][group_id]
                if tab_n_items == 0:
                    continue
                arr = tab[tab_idx_start:tab_idx_start + tab_n_items]
            except IndexError:
                self.log.debug("No data for h5loc '%s'" % h5loc)
                continue
            except NotImplementedError:
                # 64-bit unsigned integer columns like ``group_id``
                # are not yet supported in conditions
                self.log.debug(
                    "get_blob: found uint64 column at '{}'...".format(h5loc)
                )
                arr = tab.read()
                arr = arr[arr[index_column] == group_id]
            except ValueError:
                # "there are no columns taking part
                # in condition ``group_id == 0``"
                self.log.info(
                    "get_blob: no `%s` column found in '%s'! "
                    "skipping..." % (index_column, h5loc)
                )
                continue
        else:
            if h5loc not in self._singletons:
                log.info(
                    "Caching H5 singleton: {} ({})".format(tabname, h5loc)
                )
                self._singletons[h5loc] = Table(
                    tab.read(),
                    h5loc=h5loc,
                    split_h5=False,
                    name=tabname,
                    h5singleton=True
                )
            blob[tabname] = self._singletons[h5loc]
            continue

        self.log.debug("h5loc: '{}'".format(h5loc))
        tab = Table(arr, h5loc=h5loc, split_h5=False, name=tabname)
        if self.shuffle and self.reset_index:
            tab.group_id[:] = index
        blob[tabname] = tab

    # skipped locs are now column wise datasets (usually hits)
    # currently hardcoded, in future using hdf5 attributes
    # to get the right constructor
    for loc in split_table_locs:
        # if some events are missing (group_id not continuous),
        # this does not work as intended
        # idx, n_items = self.indices[loc][group_id]
        idx = self.indices[loc].col('index')[group_id]
        n_items = self.indices[loc].col('n_items')[group_id]
        end = idx + n_items
        node = self.h5file.get_node(loc)
        columns = (c for c in node._v_children if c != '_indices')
        data = {}
        for col in columns:
            data[col] = self.h5file.get_node(loc + '/' + col)[idx:end]
        tabname = camelise(loc.split('/')[-1])
        s_tab = Table(data, h5loc=loc, split_h5=True, name=tabname)
        if self.shuffle and self.reset_index:
            s_tab.group_id[:] = index
        blob[tabname] = s_tab

    if self.header is not None:
        blob['Header'] = self.header

    for ndarr_loc in ndarray_locs:
        self.log.info("Reading %s" % ndarr_loc)
        try:
            idx = self.indices[ndarr_loc]['index'][group_id]
            n_items = self.indices[ndarr_loc]['n_items'][group_id]
        except IndexError:
            continue
        end = idx + n_items
        ndarr = self.h5file.get_node(ndarr_loc)
        ndarr_name = camelise(ndarr_loc.split('/')[-1])
        _ndarr = NDArray(
            ndarr[idx:end],
            h5loc=ndarr_loc,
            title=ndarr.title,
            group_id=group_id
        )
        if self.shuffle and self.reset_index:
            _ndarr.group_id = index
        blob[ndarr_name] = _ndarr

    return blob
def process(self, blob):
    self.i += 1
    if self.i == 5:
        blob["Arr"] = NDArray([1, 2, 3], h5loc="/arr")
    return blob
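# The `_indices` bookkeeping in `get_blob` above maps each event group to a
# contiguous slice of a flat, column-wise dataset: `index` is the offset of
# the group's first item, `n_items` its length. A self-contained sketch of
# that lookup (plain numpy with hypothetical names, not the km3pipe API):
import numpy as np


def demo_index_table_lookup():
    # Flat data for three groups holding 1, 3 and 2 items respectively.
    data = np.array([10, 20, 21, 22, 30, 31])
    # Per-group bookkeeping, as stored in an `_indices` table.
    index = np.array([0, 1, 4])
    n_items = np.array([1, 3, 2])

    group_id = 1
    idx, n = index[group_id], n_items[group_id]
    # The same slice pattern get_blob uses for split tables and NDArrays.
    assert (data[idx:idx + n] == [20, 21, 22]).all()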
def process(self, blob): blob["Arr"] = NDArray(np.arange(self.index + 1), h5loc="/arr") self.index += 1 return blob
def process(self, blob): blob["Foo"] = NDArray(arr + self.index * 10, h5loc="/foo", title="Yep") self.index += 1 return blob
def process(self, blob): blob["foo"] = NDArray(arr + self.index * 10, h5loc="/foo/bar/baz") self.index += 1 return blob
def process(self, blob): blob["foo"] = NDArray(arr) return blob
def test_slicing_preserves_attribute(self):
    ndarr = NDArray([1, 2, 3], h5loc="/foo", title="Foo", group_id=23)
    a = ndarr[:1]
    assert "/foo" == a.h5loc
    assert "Foo" == a.title
    assert 23 == a.group_id
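# test_slicing_preserves_attribute above requires NDArray to carry its
# metadata (h5loc, title, group_id) through numpy slicing. A minimal sketch
# of the standard numpy subclassing pattern that makes such a test pass;
# the class here is hypothetical, not the km3pipe implementation:
import numpy as np


class MetaArray(np.ndarray):
    def __new__(cls, data, h5loc="/misc", title="Unnamed", group_id=None):
        obj = np.asarray(data).view(cls)
        obj.h5loc = h5loc
        obj.title = title
        obj.group_id = group_id
        return obj

    def __array_finalize__(self, obj):
        # Called for views and slices: copy metadata from the source array.
        if obj is None:
            return
        self.h5loc = getattr(obj, "h5loc", "/misc")
        self.title = getattr(obj, "title", "Unnamed")
        self.group_id = getattr(obj, "group_id", None)


# a = MetaArray([1, 2, 3], h5loc="/foo", title="Foo", group_id=23)
# a[:1].h5loc == "/foo"    # metadata survives slicing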
def test_attributes(self):
    ndarr = NDArray([1], h5loc="/foo", title="Foo", group_id=23)
    assert "/foo" == ndarr.h5loc
    assert "Foo" == ndarr.title
    assert 23 == ndarr.group_id
def test_init(self):
    # smoke test: construction from a plain numpy array should not raise
    arr = np.random.random((2, 3, 4))
    ndarr = NDArray(arr)