def test_reader_with_limit(self):
    ws = workspace.C.Workspace()
    session = LocalSession(ws)

    """ 1. Feed the full dataset. """
    src_init = core.Net('src_init')
    src_values = Struct(('label', np.array(range(100))))
    src_blobs = NewRecord(src_init, src_values)
    src_ds = Dataset(src_blobs)
    FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)

    """ 2. Read with a limit smaller than the size of the dataset. """
    dst_init = core.Net('dst_init')
    dst_ds = Dataset(src_values.clone_schema())
    dst_ds.init_empty(dst_init)
    ws.run(dst_init)

    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)

    # The limit (10) stops reading before the source is exhausted,
    # so data_finished() is False and only 10 labels were copied.
    self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(10)))

    """ 3. Read with a limit larger than the size of the dataset. """
    ws.run(dst_init)
    with TaskGroup() as tg:
        reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
        pipe(reader, dst_ds.writer(), num_threads=8)
    session.run(tg)

    # The limit (110) exceeds the dataset size (100), so the source is
    # fully consumed and data_finished() is True.
    self.assertEquals(
        sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
        list(range(100)))
    self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

def test_pack_unpack(self, input):
    """
    Tests if packing and unpacking of the whole dataset is an identity.
    """
    (schema, contents, num_records) = input
    dataset_fields = schema.field_names()

    net = core.Net('pack_unpack_net')
    batch = NewRecord(net, contents)
    FeedRecord(batch, contents)

    packed = net.PackRecords(
        batch.field_blobs(), 1, fields=dataset_fields)
    unpacked = packed.UnPackRecords(
        [], len(dataset_fields), fields=dataset_fields)

    workspace.RunNetOnce(net)

    for initial_tensor, unpacked_tensor in zip(
        batch.field_blobs(), unpacked
    ):
        npt.assert_array_equal(
            workspace.FetchBlob(initial_tensor),
            workspace.FetchBlob(unpacked_tensor))

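# A minimal, concrete sketch of the same PackRecords / UnPackRecords round
# trip, not part of the original tests. The hypothesis-driven test above
# receives an arbitrary (schema, contents, num_records) tuple; the toy
# single-field schema and the _example_* name below are illustrative
# assumptions only, reusing names already used by these tests (Struct,
# Scalar, from_blob_list, NewRecord, FeedRecord, core, workspace, npt).
def _example_pack_unpack_sketch():
    toy_schema = Struct(('x', Scalar(np.float32)))
    toy_contents = from_blob_list(
        toy_schema, [np.array([1.0, 2.0, 3.0], dtype=np.float32)])

    net = core.Net('toy_pack_unpack')
    batch = NewRecord(net, toy_contents)
    FeedRecord(batch, toy_contents)

    # Pack the field blobs into a single blob of packed records, then
    # unpack it back into one blob per schema field.
    packed = net.PackRecords(
        batch.field_blobs(), 1, fields=toy_schema.field_names())
    unpacked = packed.UnPackRecords(
        [], len(toy_schema.field_names()), fields=toy_schema.field_names())

    workspace.RunNetOnce(net)
    npt.assert_array_equal(
        workspace.FetchBlob(batch.field_blobs()[0]),
        workspace.FetchBlob(unpacked))
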
def init_dataset(ws, size=100):
    src_init = core.Net('src_init')
    with core.NameScope('src'):
        src_values = Struct(('label', np.array(range(size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds

def test_dequeue_many(self):
    init_net = core.Net('init')
    N = 17
    NUM_DEQUEUE_RECORDS = 3
    src_values = Struct(
        ('uid', np.array(range(N))),
        ('value', 0.1 * np.array(range(N))))
    expected_dst = Struct(
        ('uid', 2 * np.array(range(N))),
        ('value', np.array(N * [0.0])))

    with core.NameScope('init'):
        src_blobs = NewRecord(init_net, src_values)
        dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())
        counter = init_net.Const(0)
        ONE = init_net.Const(1)

    def proc1(rec):
        with core.NameScope('proc1'):
            out = NewRecord(ops, rec)
            ops.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
        return out

    def proc2(rec):
        with core.NameScope('proc2'):
            out = NewRecord(ops, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            ops.Sub([rec.value(), rec.value()], [out.value()])
            ops.Add([counter, ONE], [counter])
        return out

    src_ds = Dataset(src_blobs)
    dst_ds = Dataset(dst_blobs)

    with TaskGroup() as tg:
        out1 = pipe(
            src_ds.reader(),
            output=Queue(
                capacity=11, num_dequeue_records=NUM_DEQUEUE_RECORDS),
            processor=proc1)
        out2 = pipe(out1, processor=proc2)
        pipe(out2, dst_ds.writer())

    ws = workspace.C.Workspace()
    FeedRecord(src_blobs, src_values, ws)
    session = LocalSession(ws)
    session.run(init_net)
    session.run(tg)

    output = FetchRecord(dst_blobs, ws=ws)
    num_dequeues = ws.blobs[str(counter)].fetch()

    self.assertEquals(
        num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS)))

    for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
        np.testing.assert_array_equal(a, b)

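# A worked note on the dequeue count asserted above (added commentary, not
# original code): the queue hands out at most NUM_DEQUEUE_RECORDS records
# per dequeue, so draining N = 17 records with NUM_DEQUEUE_RECORDS = 3
# takes ceil(17 / 3) = 6 dequeues (five full groups of 3 plus one group of
# 2), which is exactly what the `counter` blob incremented in proc2 counts.
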
def make_source_dataset(ws, size=100, offset=0, name=None):
    name = name or "src"
    src_init = core.Net("{}_init".format(name))
    with core.NameScope(name):
        src_values = Struct(('label', np.array(range(offset, offset + size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs, name=name)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds

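# A minimal usage sketch for make_source_dataset, not part of the original
# tests. It assumes the imports already used above (workspace, core,
# Dataset, FeedRecord, NewRecord, Struct); the _example_* name and the
# "example_src" dataset name are illustrative only.
def _example_make_source_dataset_usage():
    ws = workspace.C.Workspace()
    # Build a 10-element source dataset whose 'label' field holds 5..14.
    src_ds = make_source_dataset(ws, size=10, offset=5, name="example_src")
    # The backing blobs live in `ws`, so the label field can be fetched
    # directly from the workspace, as the tests above do.
    labels = ws.blobs[str(src_ds.content().label())].fetch()
    assert list(labels) == list(range(5, 15))
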
def test_local_session(self):
    init_net = core.Net('init')
    src_values = Struct(
        ('uid', np.array([1, 2, 6])),
        ('value', np.array([1.4, 1.6, 1.7])))
    expected_dst = Struct(
        ('uid', np.array([2, 4, 12])),
        ('value', np.array([0.0, 0.0, 0.0])))

    with core.NameScope('init'):
        src_blobs = NewRecord(init_net, src_values)
        dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())

    def proc1(rec):
        net = core.Net('proc1')
        with core.NameScope('proc1'):
            out = NewRecord(net, rec)
            net.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
        return [net], out

    def proc2(rec):
        net = core.Net('proc2')
        with core.NameScope('proc2'):
            out = NewRecord(net, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            net.Sub([rec.value(), rec.value()], [out.value()])
        return [net], out

    src_ds = Dataset(src_blobs)
    dst_ds = Dataset(dst_blobs)

    with TaskGroup() as tg:
        out1 = pipe(src_ds.reader(), processor=proc1)
        out2 = pipe(out1, processor=proc2)
        pipe(out2, dst_ds.writer())

    ws = workspace.C.Workspace()
    FeedRecord(src_blobs, src_values, ws)
    session = LocalSession(ws)
    session.run(init_net)
    session.run(tg)

    output = FetchRecord(dst_blobs, ws=ws)
    for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
        np.testing.assert_array_equal(a, b)

def test_dataset_ops(self):
    """
    1. Defining the schema of our dataset.

    This example schema could represent, for example, a search query log.
    """
    schema = Struct(
        # fixed size vector, which will be stored as a matrix when batched
        ('dense', Scalar((np.float32, 3))),
        # could represent a feature map from feature ID to float value
        ('floats', Map(
            Scalar(np.int32), Scalar(np.float32)
        )),
        # could represent a multi-valued categorical feature map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # could represent a multi-valued, weighted categorical feature map
        (
            'id_score_pairs', Map(
                Scalar(np.int32),
                Map(
                    Scalar(np.int64),
                    Scalar(np.float32),
                    keys_name='ids',
                    values_name='scores'
                ),
            )
        ),
        # additional scalar information
        (
            'metadata', Struct(
                ('user_id', Scalar(np.int64)),
                ('user_embed', Scalar((np.float32, 2))),
                ('query', Scalar(str)),
            )
        ),
    )
    """
    This is what the flattened fields of this schema look like, along with
    their types. Each one of these fields will be stored, read and written
    as a tensor.
    """
    expected_fields = [
        ('dense', (np.float32, 3)),
        ('floats:lengths', np.int32),
        ('floats:values:keys', np.int32),
        ('floats:values:values', np.float32),
        ('int_lists:lengths', np.int32),
        ('int_lists:values:keys', np.int32),
        ('int_lists:values:values:lengths', np.int32),
        ('int_lists:values:values:values', np.int64),
        ('id_score_pairs:lengths', np.int32),
        ('id_score_pairs:values:keys', np.int32),
        ('id_score_pairs:values:values:lengths', np.int32),
        ('id_score_pairs:values:values:values:ids', np.int64),
        ('id_score_pairs:values:values:values:scores', np.float32),
        ('metadata:user_id', np.int64),
        ('metadata:user_embed', (np.float32, 2)),
        ('metadata:query', str),
    ]
    zipped = zip(
        expected_fields, schema.field_names(), schema.field_types())
    for (ref_name, ref_type), name, dtype in zipped:
        self.assertEquals(ref_name, name)
        self.assertEquals(np.dtype(ref_type), dtype)
    """
    2. The contents of our dataset.

    Contents as defined below could represent, for example, a log of
    search queries along with dense, sparse features and metadata.
    The dataset below has 3 top-level entries.
    """
    contents_raw = [
        # dense
        [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
        # floats
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        # int lists
        [2, 0, 1],  # len
        [11, 12, 31],  # key
        [2, 4, 3],  # value:len
        [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
        # id score pairs
        [1, 2, 2],  # len
        [11, 21, 22, 31, 32],  # key
        [1, 1, 2, 2, 3],  # value:len
        [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
        [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
        # metadata
        [123, 234, 456],  # user_id
        [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
        ['dog posts', 'friends who like to', 'posts about ca'],  # query
    ]
    # convert the above content to ndarrays, checking against the schema
    contents = from_blob_list(schema, contents_raw)
    """
    3. Creating and appending to the dataset.

    We first create an empty dataset with the given schema.
    Then, a Writer is used to append these entries to the dataset.
    """
    ds = dataset.Dataset(schema)
    net = core.Net('init')
    with core.NameScope('init'):
        ds.init_empty(net)

    content_blobs = NewRecord(net, contents)
    FeedRecord(content_blobs, contents)
    writer = ds.writer(init_net=net)
    writer.write_record(net, content_blobs)
    workspace.RunNetOnce(net)
    """
    4. Iterating through the dataset contents.
    If we were to iterate through the top-level entries of our dataset,
    this is what we should expect to see:
    """
    entries_raw = [
        (
            [[1.1, 1.2, 1.3]],  # dense
            [1], [11], [1.1],  # floats
            [2], [11, 12], [2, 4],
            [111, 112, 121, 122, 123, 124],  # int lists
            [1], [11], [1], [111], [11.1],  # id score pairs
            [123], [[0.2, 0.8]], ['dog posts'],  # metadata
        ),
        (
            [[2.1, 2.2, 2.3]],  # dense
            [2], [21, 22], [2.1, 2.2],  # floats
            [0], [], [], [],  # int lists
            [2], [21, 22], [1, 2], [211, 221, 222],
            [21.1, 22.1, 22.2],  # id score pairs
            [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
        ),
        (
            [[3.1, 3.2, 3.3]],  # dense
            [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
            [1], [31], [3], [311, 312, 313],  # int lists
            [2], [31, 32], [2, 3],
            [311, 312, 321, 322, 323],
            [31.1, 31.2, 32.1, 32.2, 32.3],  # id score pairs
            [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
        ),
        # after the end of the dataset, we will keep getting empty vectors
        ([], ) * 16,
        ([], ) * 16,
    ]
    entries = [from_blob_list(schema, e) for e in entries_raw]
    """
    Let's go ahead and create the reading nets.
    We will run the `read` net multiple times and assert that we are
    reading the entries the way we stated above.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')
    reader = ds.reader(read_init_net)
    should_continue, batch = reader.read_record(read_next_net)

    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for entry in entries:
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    """
    5. Reading/writing in a single plan.

    If all operations on the data are expressible as Caffe2 operators, we
    don't need to load the data into Python; we can iterate through the
    dataset in a single Plan. Here we will process the dataset a little
    and store it in a second dataset. We can reuse the same Reader since
    it supports reset.
    """
    reset_net = core.Net('reset_net')
    reader.reset(reset_net)
    read_step, batch = reader.execution_step()
    """ We will add the line number * 1000 to the feature ids. """
    process_net = core.Net('process')
    line_no = Const(process_net, 0, dtype=np.int32)
    const_one = Const(process_net, 1000, dtype=np.int32)
    process_net.Add([line_no, const_one], [line_no])
    field = batch.floats.keys.get()
    process_net.Print(field, [])
    process_net.Add([field, line_no], field, broadcast=1, axis=0)
    """ Let's create a second dataset and append to it. """
    ds2 = dataset.Dataset(schema, name='dataset2')
    ds2.init_empty(reset_net)
    writer = ds2.writer(reset_net)
    writer.write_record(process_net, batch)
    # commit is not necessary for DatasetWriter, but we add it for
    # generality of the example
    commit_net = core.Net('commit')
    writer.commit(commit_net)
    """ Time to create and run a plan which will do the processing. """
    plan = core.Plan('process')
    plan.AddStep(core.execution_step('reset', reset_net))
    plan.AddStep(read_step.AddNet(process_net))
    plan.AddStep(core.execution_step('commit', commit_net))
    workspace.RunPlan(plan)
    """ Now we should have dataset2 populated. """
    ds2_data = FetchRecord(ds2.content())
    field = ds2_data.floats.keys
    field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
    _assert_records_equal(contents, ds2_data)
    """
    6. Slicing a dataset.

    You can create a new schema from pieces of another schema and reuse
    the same data.
    """
    subschema = Struct(('top_level', schema.int_lists.values))
    int_list_contents = contents.int_lists.values.field_names()
    self.assertEquals(len(subschema.field_names()), len(int_list_contents))
    """
    7. Random access of a dataset.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for i in range(len(entries)):
        k = idx[i] if i in idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    # one more read past the end flips the should_stop blob
    workspace.RunNet(str(read_next_net))
    self.assertEquals(True, workspace.FetchBlob(should_stop))

    """
    8. Random access of a dataset with loop_over = True.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for _ in range(len(entries) * 3):
        workspace.RunNet(str(read_next_net))
        self.assertEquals(False, workspace.FetchBlob(should_stop))

    """
    9. Sort and shuffle a dataset.

    This sorts the dataset using the score of a certain column, and then
    shuffles within each chunk of size batch_size * shuffle_size before
    shuffling the chunks.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    reader = ds.random_reader(read_init_net)
    reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
    reader.computeoffset(read_init_net)

    should_continue, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    expected_idx = np.array([2, 1, 0])
    for i in range(len(entries)):
        k = expected_idx[i] if i in expected_idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)

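# A condensed sketch of the basic write/read cycle exercised by
# test_dataset_ops above. It is not part of the original tests; it reuses
# the same imports (core, workspace, dataset, Struct, Scalar, NewRecord,
# FeedRecord, FetchRecord, from_blob_list), and the 'toy' names are
# illustrative assumptions only.
def _example_dataset_roundtrip():
    toy_schema = Struct(('label', Scalar(np.int32)))
    toy_contents = from_blob_list(
        toy_schema, [np.array([1, 2, 3], dtype=np.int32)])

    # Create an empty dataset, feed the contents and append them.
    ds = dataset.Dataset(toy_schema, name='toy')
    init_net = core.Net('toy_init')
    ds.init_empty(init_net)
    blobs = NewRecord(init_net, toy_contents)
    FeedRecord(blobs, toy_contents)
    ds.writer(init_net=init_net).write_record(init_net, blobs)
    workspace.RunNetOnce(init_net)

    # Each run of the read net yields one top-level entry; reading past
    # the end yields empty tensors, as the `entries_raw` padding showed.
    read_init_net = core.Net('toy_read_init')
    read_next_net = core.Net('toy_read_next')
    reader = ds.reader(read_init_net)
    _, batch = reader.read_record(read_next_net)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    workspace.RunNet(str(read_next_net))
    np.testing.assert_array_equal(FetchRecord(batch).label.get(), [1])
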
def test_record_queue(self):
    num_prod = 8
    num_consume = 3
    schema = Struct(
        ('floats', Map(
            Scalar(np.int32), Scalar(np.float32))),
    )
    contents_raw = [
        [1, 2, 3],  # len
        [11, 21, 22, 31, 32, 33],  # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
    ]
    contents = from_blob_list(schema, contents_raw)

    ds = Dataset(schema)
    net = core.Net('init')
    ds.init_empty(net)

    content_blobs = NewRecord(net, contents)
    FeedRecord(content_blobs, contents)
    writer = ds.writer(init_net=net)
    writer.write_record(net, content_blobs)
    reader = ds.reader(init_net=net)

    # prepare receiving dataset
    rec_dataset = Dataset(contents, name='rec')
    rec_dataset.init_empty(init_net=net)
    rec_dataset_writer = rec_dataset.writer(init_net=net)

    workspace.RunNetOnce(net)

    queue = RecordQueue(contents, num_threads=num_prod)

    def process(net, fields):
        new_fields = []
        for f in fields.field_blobs():
            new_f = net.Copy(f)
            new_fields.append(new_f)
        new_fields = from_blob_list(fields, new_fields)
        return new_fields

    q_reader, q_step, q_exit, fields = queue.build(reader, process)
    producer_step = core.execution_step('producer', [q_step, q_exit])

    consumer_steps = []
    for i in range(num_consume):
        name = 'queue_reader_' + str(i)
        net_consume = core.Net(name)
        should_stop, fields = q_reader.read_record(net_consume)
        step_consume = core.execution_step(name, net_consume)

        name = 'dataset_writer_' + str(i)
        net_dataset = core.Net(name)
        rec_dataset_writer.write(net_dataset, fields.field_blobs())
        step_dataset = core.execution_step(name, net_dataset)

        step = core.execution_step(
            'consumer_' + str(i),
            [step_consume, step_dataset],
            should_stop_blob=should_stop)
        consumer_steps.append(step)

    consumer_step = core.execution_step(
        'consumers', consumer_steps, concurrent_substeps=True)
    work_steps = core.execution_step(
        'work', [producer_step, consumer_step], concurrent_substeps=True)

    plan = core.Plan('test')
    plan.AddStep(work_steps)
    core.workspace.RunPlan(plan)

    data = workspace.FetchBlobs(rec_dataset.get_blobs())
    self.assertEqual(6, sum(data[0]))
    self.assertEqual(150, sum(data[1]))
    self.assertAlmostEqual(15, sum(data[2]), places=5)

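# A worked note on the sums asserted above (added commentary, not original
# code): each of the three source records passes through the queue once, so
# the receiving dataset ends up holding the original contents_raw:
#   sum of lengths = 1 + 2 + 3                          = 6
#   sum of keys    = 11 + 21 + 22 + 31 + 32 + 33        = 150
#   sum of values  = 1.1 + 2.1 + 2.2 + 3.1 + 3.2 + 3.3  = 15.0 (approx)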