def test_reader_with_limit(self):
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    """ 1. feed full dataset """
    src_ds = init_dataset(ws)

    """ 2. Read with limit smaller than size of dataset """
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(src_ds.content().clone_schema())
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    # WorkspaceType.GLOBAL is required because we are fetching
    # reader.data_finished() after the TaskGroup finishes.
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(10))
    )

    """ 3. Read with limit larger than size of dataset """
    ws.run(dst_init)
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
        pipe(reader, dst_ds.writer(), num_runtime_threads=8)
    session.run(tg)
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(100))
    )
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

    """ 4. Read without counter """
    ws.run(dst_init)
    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=None)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(100))
    )
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

    """ 5. Read using the same reader without resetting workspace """
    session.run(tg)
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        sorted(list(range(100)) * 2)
    )

def init_dataset(ws, size=100, offset=0, name=None):
    # The default arguments keep the original behaviour (100 labels under 'src');
    # the size/offset/name keywords are accepted because callers below pass them.
    name = name or 'src'
    src_init = core.Net('{}_init'.format(name))
    with core.NameScope(name):
        src_values = Struct(('label', np.array(range(offset, offset + size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs, name=name)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds

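# NOTE: a sketch of the imports these snippets appear to rely on. The module paths
# are inferred from the names used in the functions above and below and may differ
# between Caffe2 versions; treat this as an assumption, not a verified header.
import numpy as np
import numpy.testing as npt

from caffe2.python import core, workspace, schema
from caffe2.python.checkpoint import Job
from caffe2.python.dataio import CompositeReader, ReaderWithLimit
from caffe2.python.dataset import Dataset
from caffe2.python.net_builder import ops
from caffe2.python.pipeline import pipe
from caffe2.python.record_queue import RecordQueue
from caffe2.python.schema import (
    ConstRecord, FeedRecord, FetchRecord, InitEmptyRecord, Map, NewRecord,
    Scalar, Struct, from_blob_list,
)
from caffe2.python.session import LocalSession
from caffe2.python.task import Node, Task, TaskGroup, WorkspaceType
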
def __init__(self, reader, db_type='leveldb', name='cached_reader'):
    """Wrap `reader` so that the records it produces can be cached in a
    local Dataset backed by a DB of type `db_type` and re-read from there."""
    super(CachedReader, self).__init__(reader.schema())
    self.original_reader = reader
    self.cache_path = None
    self.ds_reader = None
    self.ds = Dataset(self._schema, name)
    self.db_type = db_type
    self.name = name
    self.field_names = self._schema.field_names()

def test_local_session(self):
    init_net = core.Net('init')
    src_values = Struct(
        ('uid', np.array([1, 2, 6])),
        ('value', np.array([1.4, 1.6, 1.7]))
    )
    expected_dst = Struct(
        ('uid', np.array([2, 4, 12])),
        ('value', np.array([0.0, 0.0, 0.0]))
    )

    with core.NameScope('init'):
        src_blobs = NewRecord(init_net, src_values)
        dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())

    def proc1(rec):
        net = core.Net('proc1')
        with core.NameScope('proc1'):
            out = NewRecord(net, rec)
            net.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
        return [net], out

    def proc2(rec):
        net = core.Net('proc2')
        with core.NameScope('proc2'):
            out = NewRecord(net, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            net.Sub([rec.value(), rec.value()], [out.value()])
        return [net], out

    src_ds = Dataset(src_blobs)
    dst_ds = Dataset(dst_blobs)

    with TaskGroup() as tg:
        out1 = pipe(src_ds.reader(), processor=proc1)
        out2 = pipe(out1, processor=proc2)
        pipe(out2, dst_ds.writer())

    ws = workspace.C.Workspace()
    FeedRecord(src_blobs, src_values, ws)
    session = LocalSession(ws)
    session.run(init_net)
    session.run(tg)
    output = FetchRecord(dst_blobs, ws=ws)

    for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
        np.testing.assert_array_equal(a, b)

def make_source_dataset(ws, size=100, offset=0, name=None):
    name = name or "src"
    src_init = core.Net("{}_init".format(name))
    with core.NameScope(name):
        src_values = Struct(('label', np.array(range(offset, offset + size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs, name=name)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds

def read_all_data(ws, reader, session):
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(reader.schema().clone_schema())
        dst_ds.init_empty(dst_init)
    session.run(dst_init)

    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        pipe(reader, dst_ds.writer(), num_runtime_threads=8)
    session.run(tg)

    return ws.blobs[str(dst_ds.content().label())].fetch()

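# Hypothetical usage sketch (not part of the original tests): it only combines the
# helpers defined above -- make_source_dataset(), ReaderWithLimit and read_all_data() --
# to drain a limited reader into a fresh destination dataset. The size, name and
# iteration limit below are illustrative only.
def example_read_limited(ws, session):
    src_ds = make_source_dataset(ws, size=20, name='example_src')
    # Cap the reader at 5 read calls before draining it.
    limited_reader = ReaderWithLimit(src_ds.reader(), num_iter=5)
    return read_all_data(ws, limited_reader, session)
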
def _test_limit_reader_init_shared(self, size):
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    # Build test dataset
    src_ds = init_dataset(ws, size=size)

    # Create an identically sized empty destination dataset
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(src_ds.content().clone_schema())
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    return ws, session, src_ds, dst_init, dst_ds

def test_reader_with_limit(self):
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    """ 1. feed full dataset """
    src_init = core.Net('src_init')
    with core.NameScope('src'):
        src_values = Struct(('label', np.array(range(100))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)

    """ 2. Read with limit smaller than size of dataset """
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(src_values.clone_schema())
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(10)))

    """ 3. Read with limit larger than size of dataset """
    ws.run(dst_init)
    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(100)))
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

def build_pipeline(node_id):
    with Node('trainer_%d' % node_id):
        with Job.current().init_group, Task():
            data_arr = Struct(('val', np.array(list(range(10)))))
            data = ConstRecord(ops, data_arr)
            ds = Dataset(data, name='dataset:%d' % node_id)
            full_reader = ds.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            ops.Add([total, rec.val()], [total])

        epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(epoch_reader, processor=inc_total)
        Job.current().add_stop_signal(epoch_reader.data_finished())

    return [total]

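# Hypothetical wrapper (not part of the original snippet) showing the intended use of
# build_pipeline(): each call registers one node's tasks inside an enclosing Job.
# Depending on the Caffe2 version, the resulting job may additionally need to be
# compiled (e.g. job.compile(LocalSession)) before a JobRunner can execute it.
def build_multi_node_job(num_nodes):
    with Job() as job:
        for node_id in range(num_nodes):
            build_pipeline(node_id)
    return job
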
def build_job():
    with Node('reader'):
        with Job() as job:
            with job.init_group:
                init_net = core.Net('init_net')
                data_arr = Struct(('val', np.array(range(10))))
                data = ConstRecord(init_net, data_arr)
                ds = Dataset(data)
                full_reader = ds.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                net = core.Net('inc_total')
                net.Add([total, rec.val()], [total])
                return [net]

            epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(epoch_reader, processor=inc_total)
            job.add_stop_signal(epoch_reader.data_finished())

        total_fetcher = Task(step=core.Net('empty'), outputs=[total])

    return job, total_fetcher

def test_composite_reader(self):
    ws = workspace.C.Workspace()
    session = LocalSession(ws)
    num_srcs = 3
    names = ["src_{}".format(i) for i in range(num_srcs)]
    size = 100
    offsets = [i * size for i in range(num_srcs)]
    src_dses = [
        init_dataset(ws, offset=offset, size=size, name=name)
        for (name, offset) in zip(names, offsets)
    ]

    data = [ws.fetch_blob(str(src.field_blobs[0])) for src in src_dses]
    # Sanity check we didn't overwrite anything
    for d, offset in zip(data, offsets):
        npt.assert_array_equal(d, range(offset, offset + size))

    # Create an identically sized empty destination dataset
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(schema.Struct(
            *[(name, src_ds.content().clone_schema())
              for name, src_ds in zip(names, src_dses)]
        ))
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        reader = CompositeReader(
            names, [src_ds.reader() for src_ds in src_dses])
        pipe(reader, dst_ds.writer(), num_runtime_threads=3)
    session.run(tg)

    for i in range(num_srcs):
        written_data = sorted(
            ws.fetch_blob(str(dst_ds.content()[names[i]].label())))
        npt.assert_array_equal(data[i], written_data)

def test_record_queue(self):
    num_prod = 8
    num_consume = 3
    schema = Struct(
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
    )
    contents_raw = [
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
    ]
    contents = from_blob_list(schema, contents_raw)
    ds = Dataset(schema)
    net = core.Net('init')
    ds.init_empty(net)

    content_blobs = NewRecord(net, contents)
    FeedRecord(content_blobs, contents)
    writer = ds.writer(init_net=net)
    writer.write_record(net, content_blobs)
    reader = ds.reader(init_net=net)

    # prepare receiving dataset
    rec_dataset = Dataset(contents, name='rec')
    rec_dataset.init_empty(init_net=net)
    rec_dataset_writer = rec_dataset.writer(init_net=net)

    workspace.RunNetOnce(net)

    queue = RecordQueue(contents, num_threads=num_prod)

    def process(net, fields):
        new_fields = []
        for f in fields.field_blobs():
            new_f = net.Copy(f)
            new_fields.append(new_f)
        new_fields = from_blob_list(fields, new_fields)
        return new_fields

    q_reader, q_step, q_exit, fields = queue.build(reader, process)
    producer_step = core.execution_step('producer', [q_step, q_exit])

    consumer_steps = []
    for i in range(num_consume):
        name = 'queue_reader_' + str(i)
        net_consume = core.Net(name)
        should_stop, fields = q_reader.read_record(net_consume)
        step_consume = core.execution_step(name, net_consume)

        name = 'dataset_writer_' + str(i)
        net_dataset = core.Net(name)
        rec_dataset_writer.write(net_dataset, fields.field_blobs())
        step_dataset = core.execution_step(name, net_dataset)

        step = core.execution_step(
            'consumer_' + str(i),
            [step_consume, step_dataset],
            should_stop_blob=should_stop)
        consumer_steps.append(step)
    consumer_step = core.execution_step(
        'consumers', consumer_steps, concurrent_substeps=True)

    work_steps = core.execution_step(
        'work', [producer_step, consumer_step], concurrent_substeps=True)

    plan = core.Plan('test')
    plan.AddStep(work_steps)
    core.workspace.RunPlan(plan)
    data = workspace.FetchBlobs(rec_dataset.get_blobs())

    self.assertEqual(6, sum(data[0]))
    self.assertEqual(150, sum(data[1]))
    self.assertAlmostEqual(15, sum(data[2]), places=5)