def test_reader_with_limit(self):
    """Check ReaderWithLimit with limits below and above the dataset size.

    A source dataset with a single ``label`` field holding 0..99 is piped
    through a ``ReaderWithLimit`` into an initially empty destination
    dataset. With ``num_iter=10`` the test expects labels 0..9 and
    ``data_finished`` to be false; with ``num_iter=110`` it expects all
    100 labels and ``data_finished`` to be true.
    """
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    # 1. Feed the full dataset.
    src_init = core.Net('src_init')
    src_values = Struct(('label', np.array(range(100))))
    src_blobs = NewRecord(src_init, src_values)
    src_ds = Dataset(src_blobs)
    FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)

    # 2. Read with a limit smaller than the size of the dataset.
    dst_init = core.Net('dst_init')
    dst_ds = Dataset(src_values.clone_schema())
    dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
    # sorted() returns a list, and on Python 3 a list never compares equal
    # to a range object, so the expected value must be materialized.
    self.assertEqual(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(10)))

    # 3. Read with a limit larger than the size of the dataset.
    # Re-running dst_init is presumably what empties the destination
    # before the second pass -- the assertion below expects exactly 0..99.
    ws.run(dst_init)
    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertEqual(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(100)))
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
def test_reader_with_limit(self):
    """Check ReaderWithLimit with small, large and absent iteration limits.

    The source dataset comes from ``init_dataset(ws)``; the assertions
    below expect its ``label`` field to hold 0..99. Each step pipes a
    ``ReaderWithLimit`` into the destination dataset and checks the
    labels written and, where applicable, the ``data_finished`` flag.
    """
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    # 1. Feed the full dataset.
    src_ds = init_dataset(ws)

    # 2. Read with a limit smaller than the size of the dataset.
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(src_ds.content().clone_schema())
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    def written_labels():
        # Fetch the destination labels, sorted so that the comparison
        # does not depend on the order in which rows were written.
        return sorted(ws.blobs[str(dst_ds.content().label())].fetch())

    # WorkspaceType.GLOBAL is required because we are fetching
    # reader.data_finished() after the TaskGroup finishes.
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
    self.assertEqual(written_labels(), list(range(10)))

    # 3. Read with a limit larger than the size of the dataset.
    # NOTE(review): this step passes num_runtime_threads while the other
    # steps pass num_threads -- confirm the difference is intentional.
    ws.run(dst_init)
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
        pipe(reader, dst_ds.writer(), num_runtime_threads=8)
    session.run(tg)
    self.assertEqual(written_labels(), list(range(100)))
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

    # 4. Read without a counter (num_iter=None).
    ws.run(dst_init)
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=None)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertEqual(written_labels(), list(range(100)))
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

    # 5. Run the same task group again without re-running dst_init; the
    # destination is expected to end up holding every label twice.
    session.run(tg)
    self.assertEqual(written_labels(), sorted(list(range(100)) * 2))
def test_composite_reader_builder(self):
    """Pipe a CompositeReaderBuilder reader into a dataset and verify it.

    Three ``TestReaderBuilder`` sources named ``src_0``..``src_2`` are
    created with offsets 0, 100 and 200 and size 100 each. The composite
    reader is piped into a destination dataset with one sub-record per
    source, and each sub-record's labels are expected to equal
    ``range(offset, offset + size)`` once sorted.
    """
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    num_srcs = 3
    size = 100

    # Build names, offsets and builders together, one source at a time.
    names = []
    offsets = []
    src_ds_builders = []
    for idx in range(num_srcs):
        src_name = "src_{}".format(idx)
        src_offset = idx * size
        names.append(src_name)
        offsets.append(src_offset)
        src_ds_builders.append(
            TestReaderBuilder(offset=src_offset, size=size, name=src_name))

    # Create an identically sized empty destination dataset.
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_fields = [
            (src_name, builder.schema())
            for src_name, builder in zip(names, src_ds_builders)
        ]
        dst_ds = Dataset(schema.Struct(*dst_fields))
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        reader_builder = CompositeReaderBuilder(names, src_ds_builders)
        reader_builder.setup(ws=ws)
        composite_reader = reader_builder.new_reader()
        pipe(composite_reader, dst_ds.writer(), num_runtime_threads=3)
    session.run(tg)

    # Every source's slice of the destination must hold that source's range.
    for src_name, src_offset in zip(names, offsets):
        label_blob = str(dst_ds.content()[src_name].label())
        written_data = sorted(ws.fetch_blob(label_blob))
        expected = range(src_offset, src_offset + size)
        npt.assert_array_equal(expected, written_data)
def read_all_data(ws, reader, session):
    """Pipe ``reader`` into a fresh dataset and return its ``label`` blob.

    A destination dataset with a clone of the reader's schema is created
    under the ``dst`` name scope and initialized empty via ``session``.
    The reader is then piped into it inside a GLOBAL-workspace TaskGroup.

    Args:
        ws: workspace from which the resulting label blob is fetched.
        reader: reader to pipe from; its schema must expose ``label()``.
        session: session used to run the init net and the task group.

    Returns:
        The fetched contents of the destination dataset's label blob.
    """
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_schema = reader.schema().clone_schema()
        dst_ds = Dataset(dst_schema)
        dst_ds.init_empty(dst_init)
    session.run(dst_init)

    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        writer = dst_ds.writer()
        pipe(reader, writer, num_runtime_threads=8)
    session.run(tg)

    label_blob_name = str(dst_ds.content().label())
    return ws.blobs[label_blob_name].fetch()
def test_composite_reader(self):
    """Pipe a CompositeReader over three datasets and verify the output.

    Three source datasets named ``src_0``..``src_2`` are created through
    ``init_dataset`` with offsets 0, 100 and 200 and size 100 each. Their
    contents are captured up front, the composite reader is piped into a
    destination dataset with one sub-record per source, and each
    sub-record's sorted labels must match the captured source data.
    """
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    num_srcs = 3
    size = 100
    names = ["src_{}".format(idx) for idx in range(num_srcs)]
    offsets = [idx * size for idx in range(num_srcs)]

    # Initialize every source first, then fetch, so the sanity check
    # below can detect a later source clobbering an earlier one.
    src_dses = []
    for src_name, src_offset in zip(names, offsets):
        src_dses.append(
            init_dataset(ws, offset=src_offset, size=size, name=src_name))
    data = []
    for src in src_dses:
        data.append(ws.fetch_blob(str(src.field_blobs[0])))

    # Sanity check we didn't overwrite anything.
    for fetched, src_offset in zip(data, offsets):
        npt.assert_array_equal(fetched, range(src_offset, src_offset + size))

    # Create an identically sized empty destination dataset.
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_fields = [
            (src_name, src_ds.content().clone_schema())
            for src_name, src_ds in zip(names, src_dses)
        ]
        dst_ds = Dataset(schema.Struct(*dst_fields))
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        src_readers = [src_ds.reader() for src_ds in src_dses]
        reader = CompositeReader(names, src_readers)
        pipe(reader, dst_ds.writer(), num_runtime_threads=3)
    session.run(tg)

    # Compare each destination sub-record against its captured source.
    for src_name, expected in zip(names, data):
        label_blob = str(dst_ds.content()[src_name].label())
        written_data = sorted(ws.fetch_blob(label_blob))
        npt.assert_array_equal(expected, written_data)
class CachedReader(Reader):
    """
    Reader with persistent in-file cache.

    Example usage:
    cached_reader = CachedReader(reader)
    build_cache_step = cached_reader.build_cache('/tmp/cache.db')
    with LocalSession() as session:
        session.run(build_cache_step)

    Every time a new reader is created, build_cache is expected to be
    called before setup_ex and before the reader is used. build_cache
    checks whether the cache file exists and, if it is missing, produces a
    step that initializes it by reading data from the original reader.
    All subsequent attempts to read will ignore the original reader (i.e.
    no additional data will be read from it).
    """

    def __init__(self, reader, db_type='leveldb', name='cached_reader',
                 batch_size=100):
        """
        Args:
            reader: original reader; its schema becomes this reader's
                schema and it is the data source when building the cache.
            db_type: database type passed to the Save/Load operators.
            name: name given to the backing Dataset and used as the name
                scope for its blobs.
            batch_size: batch size passed to the dataset reader created
                in setup_ex. Defaults to 100, the previously fixed value.
        """
        super(CachedReader, self).__init__(reader.schema())
        self.original_reader = reader
        self.cache_path = None
        self.ds_reader = None
        self.ds = Dataset(self._schema, name)
        self.db_type = db_type
        self.name = name
        self.batch_size = batch_size
        self.field_names = self._schema.field_names()

    def setup_ex(self, init_net, finish_net):
        """Add dataset init and cache loading to init_net, create reader.

        finish_net is accepted but not used here.
        """
        assert self.cache_path, 'build_cache must be called first'
        self._init_dataset(init_net)
        self._load_from_file(init_net)
        self.ds_reader = self.ds.reader(init_net, batch_size=self.batch_size)

    def read(self, read_net):
        """Delegate reading to the dataset reader created by setup_ex."""
        assert self.ds_reader, 'setup must be called first'
        return self.ds_reader.read(read_net)

    def has_cache(self):
        """Return True if a cache path is set and exists on disk."""
        # bool() keeps the result a real boolean instead of leaking a
        # falsy cache_path (None or '') to the caller.
        return bool(self.cache_path) and os.path.exists(self.cache_path)

    def build_cache(self, cache_path, overwrite=False):
        """Return an execution step that builds the cache when needed.

        Args:
            cache_path: path of the cache file. It is ignored when a cache
                already exists at the previously recorded path and
                overwrite is False.
            overwrite: rebuild the cache even if the file already exists.

        Returns:
            An empty execution step when the cache already exists and
            overwrite is False; otherwise a step that initializes the
            dataset, copies the original reader into it and saves it to
            the cache file.
        """
        if not self.has_cache() or overwrite:
            self.cache_path = cache_path
        if self.has_cache() and not overwrite:
            # cache already exists, no need to rebuild it
            return core.execution_step('build_step', [])

        init_net = core.Net('init')
        self._init_dataset(init_net)
        with Cluster(), core.NameScope(self.name), TaskGroup() as copy_tg:
            pipe(self.original_reader, self.ds.writer(), num_threads=16)
        copy_step = copy_tg.to_task().get_step()
        save_net = core.Net('save')
        self._save_to_file(save_net)

        return core.execution_step('build_cache',
                                   [init_net, copy_step, save_net])

    def _init_dataset(self, init_net):
        """Initialize the backing dataset as empty under self.name scope."""
        with core.NameScope(self.name):
            self.ds.init_empty(init_net)

    def _save_to_file(self, net):
        """Add a Save op writing the dataset's field blobs to the cache."""
        net.Save(
            self.ds.content().field_blobs(),
            [],
            db=self.cache_path,
            db_type=self.db_type,
            blob_name_overrides=self.field_names,
            absolute_path=True,
        )

    def _load_from_file(self, net):
        """Add a Load op reading the dataset's field blobs from the cache."""
        net.Load(
            [],
            self.ds.content().field_blobs(),
            db=self.cache_path,
            db_type=self.db_type,
            absolute_path=True,
            source_blob_names=self.field_names,
        )