def _dataset(draw, min_elements=3, max_elements=10, **kwargs):
    schema = Struct(
        # Dense Features Map
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # Sparse Features Map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # Complex Type
        ('text', Scalar(str)),
    )

    num_records = draw(
        st.integers(min_value=min_elements, max_value=max_elements))

    raw_dense_features_map_contents = draw(_dense_features_map(num_records))

    raw_sparse_features_map_contents = draw(_sparse_features_map(num_records))

    raw_text_contents = [
        draw(
            st.lists(st.text(alphabet=string.ascii_lowercase),
                     min_size=num_records,
                     max_size=num_records))
    ]

    # Concatenate all raw contents to a single one
    contents_raw = raw_dense_features_map_contents + \
        raw_sparse_features_map_contents + raw_text_contents

    contents = from_blob_list(schema, contents_raw)

    return (schema, contents, num_records)

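# A minimal, illustrative sketch of how a strategy like `_dataset` above would
# typically be consumed. It assumes `_dataset` is wrapped with Hypothesis's
# `@st.composite` (implied by its `draw` argument but not shown in the source);
# the test name and assertion below are hypothetical, not from the source.
from hypothesis import given


@given(dataset=_dataset())
def test_generated_dataset_matches_schema(dataset):
    schema_, contents, num_records = dataset
    # from_blob_list should have produced one blob per flattened schema field.
    assert len(schema_.field_names()) == len(contents.field_blobs())
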
def read_record_ex(self, local_init_net, local_finish_net):
    """Experimental extension to the interface. Don't use yet"""
    nets, should_stop, fields = self.read_ex(
        local_init_net, local_finish_net)
    if self._schema:
        fields = from_blob_list(self._schema, fields)
    return nets, should_stop, fields

def process(net, fields):
    new_fields = []
    for f in fields.field_blobs():
        new_f = net.Copy(f)
        new_fields.append(new_f)
    new_fields = from_blob_list(fields, new_fields)
    return new_fields

def testPreservesMetadata(self):
    s = schema.Struct(
        ('a', schema.Scalar(np.float32)),
        ('b', schema.Scalar(np.int32,
                            metadata=schema.Metadata(categorical_limit=5))),
        ('c', schema.List(
            schema.Scalar(
                np.int32,
                metadata=schema.Metadata(categorical_limit=6)))))
    # attach metadata to lengths field
    s.c.lengths.set_metadata(schema.Metadata(categorical_limit=7))

    self.assertEqual(None, s.a.metadata)
    self.assertEqual(5, s.b.metadata.categorical_limit)
    self.assertEqual(6, s.c.value.metadata.categorical_limit)
    self.assertEqual(7, s.c.lengths.metadata.categorical_limit)

    sc = s.clone()
    self.assertEqual(None, sc.a.metadata)
    self.assertEqual(5, sc.b.metadata.categorical_limit)
    self.assertEqual(6, sc.c.value.metadata.categorical_limit)
    self.assertEqual(7, sc.c.lengths.metadata.categorical_limit)

    sv = schema.from_blob_list(s, [
        np.array([3.4]), np.array([2]), np.array([3]), np.array([1, 2, 3])
    ])
    self.assertEqual(None, sv.a.metadata)
    self.assertEqual(5, sv.b.metadata.categorical_limit)
    self.assertEqual(6, sv.c.value.metadata.categorical_limit)
    self.assertEqual(7, sv.c.lengths.metadata.categorical_limit)

def get(self):
    assert self._values is not None, 'Output value not set yet.'
    if self._is_scalar:
        return self._values[0]
    elif self._schema:
        return from_blob_list(self._schema, self._values)
    else:
        return self._values

def __init__(self, model, input_record, axis=1, add_axis=0,
             name='concat', **kwargs):
    super(Concat, self).__init__(model, name, input_record, **kwargs)
    self.axis = axis
    self.add_axis = add_axis
    assert not (axis == 0 and add_axis == 1), \
        "It's not allowed to add axis=0"
    assert isinstance(input_record, schema.Struct),\
        "Incorrect input type. Expected Struct, but received: {0}".\
        format(input_record)

    shapes = []
    for field_name, field_type in viewitems(input_record.fields):
        assert isinstance(field_type, schema.Scalar),\
            "Incorrect input type for {}. Expected Scalar, but got: {}".\
            format(field_name, field_type)
        # Assume that first dimension is batch, so actual axis in shape is
        # axis - 1
        shape = list(field_type.field_type().shape)
        if add_axis:
            shape.insert(axis - 1, 1)
        assert len(shape) >= axis,\
            "Concat expects the input tensor to have at least as many " \
            "dimensions as the concat axis"
        shapes.append(shape)
    logger.info('Concat Layer input shapes: ' + str(shapes))

    if axis == 0:
        self.output_schema = schema.from_blob_list(
            input_record[0],
            [self.get_next_blob_reference('output')])
        return

    concat_dim = 0
    for shape in shapes:
        concat_dim += shape[axis - 1]
        shape[axis - 1] = 0
        assert shape == shapes[0],\
            "Shapes {0} and {1} are not compatible for Concat".\
            format(shape, shapes[0])
    output_dims = shapes[0]
    output_dims[axis - 1] = concat_dim
    logger.info('Concat Layer output_dims: ' + str(output_dims))

    self.output_schema = schema.Scalar(
        (np.float32, output_dims),
        self.get_next_blob_reference('output'))

    record_to_concat = input_record.fields.values()
    concated_feature_to_index = get_concatenated_feature_to_index(
        record_to_concat)
    if concated_feature_to_index:
        metadata = schema.Metadata(feature_specs=schema.FeatureSpec(
            feature_to_index=concated_feature_to_index))
        self.output_schema.set_metadata(metadata)

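# A small worked example of the Concat shape bookkeeping above, with
# illustrative values (not taken from the source): concatenating two
# per-example features of widths 2 and 3 along axis=1 gives width 5.
shapes = [[2], [3]]      # per-field shapes with the batch dimension stripped
axis = 1

concat_dim = 0
for shape in shapes:
    concat_dim += shape[axis - 1]   # 2, then 2 + 3 = 5

output_dims = list(shapes[0])
output_dims[axis - 1] = concat_dim
assert output_dims == [5]
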
def fetch(self):
    assert self._fetch_func is not None, (
        'Cannot fetch value for this output.')
    fetched_vals = [self._fetch_func(v) for v in self._values]
    if self._is_scalar:
        return fetched_vals[0]
    elif self._schema:
        return from_blob_list(self._schema, fetched_vals)
    else:
        return fetched_vals

def testPreservesEmptyFields(self):
    s = schema.Struct(
        ('a', schema.Scalar(np.float32)),
        ('b', schema.Struct()),
    )

    sc = s.clone()
    self.assertIn("a", sc.fields)
    self.assertIn("b", sc.fields)

    sv = schema.from_blob_list(s, [np.array([3.4])])
    self.assertIn("a", sv.fields)
    self.assertIn("b", sv.fields)
    self.assertEqual(0, len(sv.b.fields))

def __init__(self, model, input_record, num_to_collect,
             name='last_n_window_collector', **kwargs):
    super(LastNWindowCollector, self).__init__(model, name, input_record,
                                               **kwargs)
    assert num_to_collect > 0
    self.num_to_collect = num_to_collect
    assert isinstance(input_record, schema.Scalar), \
        "Got {!r}".format(input_record)

    self.last_n = self.create_param(param_name='last_n',
                                    shape=[0],
                                    initializer=('ConstantFill', {}),
                                    optimizer=model.NoOptim)

    self.next_blob = self.create_param(
        param_name='next',
        shape=[],
        initializer=('ConstantFill', {
            'value': 0,
            'dtype': core.DataType.INT32
        }),
        optimizer=model.NoOptim)

    self.mutex = self.create_param(
        param_name='mutex',
        shape=None,
        initializer=('CreateMutex',),
        optimizer=model.NoOptim,
    )

    self.num_visited_blob = self.create_param(
        param_name='num_visited',
        shape=[],
        initializer=('ConstantFill', {
            'value': 0,
            'dtype': core.DataType.INT64,
        }),
        optimizer=model.NoOptim,
    )

    self.output_schema = schema.Struct(
        ('last_n', schema.from_blob_list(input_record, [self.last_n])),
        ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
        ('mutex', schema.Scalar(blob=self.mutex)),
    )

def __init__(self, model, input_record, axis=1, add_axis=0,
             name='concat', **kwargs):
    super(Concat, self).__init__(model, name, input_record, **kwargs)
    self.axis = axis
    self.add_axis = add_axis
    assert not (axis == 0 and add_axis == 1), \
        "It's not allowed to add axis=0"
    assert isinstance(input_record, schema.Struct),\
        "Incorrect input type. Expected Struct, but received: {0}".\
        format(input_record)

    shapes = []
    for field_name, field_type in viewitems(input_record.fields):
        assert isinstance(field_type, schema.Scalar),\
            "Incorrect input type for {}. Expected Scalar, but got: {}".\
            format(field_name, field_type)
        # Assume that first dimension is batch, so actual axis in shape is
        # axis - 1
        assert len(field_type.field_type().shape) >= axis,\
            "Concat expects the input tensor to have at least as many " \
            "dimensions as the concat axis"
        shapes.append(list(field_type.field_type().shape))

    if add_axis:
        for i in range(len(shapes)):
            shapes[i].insert(axis, 1)

    if axis == 0:
        self.output_schema = schema.from_blob_list(
            input_record[0],
            [model.net.NextScopedBlob(name + '_output')])
        return

    concat_dim = 0
    for shape in shapes:
        concat_dim += shape[axis - 1]
        shape[axis - 1] = 0
        assert shape == shapes[0],\
            "Shapes {0} and {1} are not compatible for Concat".\
            format(shape, shapes[0])
    output_dims = shapes[0]
    output_dims[axis - 1] = concat_dim

    self.output_schema = schema.Scalar(
        (np.float32, output_dims),
        model.net.NextScopedBlob(name + '_output'))

def __init__(self, model, input_record, axis=1, add_axis=0,
             name='concat', **kwargs):
    super(Concat, self).__init__(model, name, input_record, **kwargs)
    self.axis = axis
    self.add_axis = add_axis
    assert not (axis == 0 and add_axis == 1), \
        "It's not allowed to add axis=0"
    assert isinstance(input_record, schema.Struct),\
        "Incorrect input type. Expected Struct, but received: {0}".\
        format(input_record)

    shapes = []
    for field_name, field_type in viewitems(input_record.fields):
        assert isinstance(field_type, schema.Scalar),\
            "Incorrect input type for {}. Expected Scalar, but got: {}".\
            format(field_name, field_type)
        # Assume that first dimension is batch, so actual axis in shape is
        # axis - 1
        shape = list(field_type.field_type().shape)
        if add_axis:
            shape.insert(axis - 1, 1)
        assert len(shape) >= axis,\
            "Concat expects the input tensor to have at least as many " \
            "dimensions as the concat axis"
        shapes.append(shape)
    logger.info('Concat Layer input shapes: ' + str(shapes))

    if axis == 0:
        self.output_schema = schema.from_blob_list(
            input_record[0],
            [self.get_next_blob_reference('output')]
        )
        return

    concat_dim = 0
    for shape in shapes:
        concat_dim += shape[axis - 1]
        shape[axis - 1] = 0
        assert shape == shapes[0],\
            "Shapes {0} and {1} are not compatible for Concat".\
            format(shape, shapes[0])
    output_dims = shapes[0]
    output_dims[axis - 1] = concat_dim
    logger.info('Concat Layer output_dims: ' + str(output_dims))

    self.output_schema = schema.Scalar(
        (np.float32, output_dims),
        self.get_next_blob_reference('output'))

def __init__(self, model, input_record, num_to_collect,
             name='last_n_window_collector', **kwargs):
    super(LastNWindowCollector, self).__init__(model, name, input_record,
                                               **kwargs)
    assert num_to_collect > 0
    self.num_to_collect = num_to_collect
    assert isinstance(input_record, schema.Scalar), \
        "Got {!r}".format(input_record)

    self.last_n = model.net.NextScopedBlob(self.name + "_last_n")
    self.next_blob = model.net.NextScopedBlob(self.name + "_next")

    self.params.append(
        LayerParameter(
            parameter=self.last_n,
            initializer=core.CreateOperator('ConstantFill', [], self.last_n,
                                            shape=[0]),
            optimizer=model.NoOptim,
        ))
    self.params.append(
        LayerParameter(
            parameter=self.next_blob,
            initializer=core.CreateOperator(
                'ConstantFill',
                [],
                self.next_blob,
                shape=[],
                value=0,
                dtype=core.DataType.INT32,
            ),
            optimizer=model.NoOptim,
        ))

    self.output_schema = schema.from_blob_list(
        input_record, [model.net.NextScopedBlob(name + "_output")])

def test_record_queue(self):
    num_prod = 8
    num_consume = 3
    schema = Struct(
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
    )
    contents_raw = [
        [1, 2, 3],                        # len
        [11, 21, 22, 31, 32, 33],         # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],   # value
    ]
    contents = from_blob_list(schema, contents_raw)

    ds = Dataset(schema)
    net = core.Net('init')
    ds.init_empty(net)

    content_blobs = NewRecord(net, contents)
    FeedRecord(content_blobs, contents)
    writer = ds.writer(init_net=net)
    writer.write_record(net, content_blobs)
    reader = ds.reader(init_net=net)

    # prepare receiving dataset
    rec_dataset = Dataset(contents, name='rec')
    rec_dataset.init_empty(init_net=net)
    rec_dataset_writer = rec_dataset.writer(init_net=net)

    workspace.RunNetOnce(net)

    queue = RecordQueue(contents, num_threads=num_prod)

    def process(net, fields):
        new_fields = []
        for f in fields.field_blobs():
            new_f = net.Copy(f)
            new_fields.append(new_f)
        new_fields = from_blob_list(fields, new_fields)
        return new_fields

    q_reader, q_step, q_exit, fields = queue.build(reader, process)
    producer_step = core.execution_step('producer', [q_step, q_exit])

    consumer_steps = []
    for i in range(num_consume):
        name = 'queue_reader_' + str(i)
        net_consume = core.Net(name)
        should_stop, fields = q_reader.read_record(net_consume)
        step_consume = core.execution_step(name, net_consume)

        name = 'dataset_writer_' + str(i)
        net_dataset = core.Net(name)
        rec_dataset_writer.write(net_dataset, fields.field_blobs())
        step_dataset = core.execution_step(name, net_dataset)

        step = core.execution_step('consumer_' + str(i),
                                   [step_consume, step_dataset],
                                   should_stop_blob=should_stop)
        consumer_steps.append(step)
    consumer_step = core.execution_step('consumers', consumer_steps,
                                        concurrent_substeps=True)

    work_steps = core.execution_step('work', [producer_step, consumer_step],
                                     concurrent_substeps=True)

    plan = core.Plan('test')
    plan.AddStep(work_steps)
    core.workspace.RunPlan(plan)
    data = workspace.FetchBlobs(rec_dataset.get_blobs())

    self.assertEqual(6, sum(data[0]))
    self.assertEqual(150, sum(data[1]))
    self.assertAlmostEqual(15, sum(data[2]), places=5)

def content(self):
    """
    Return a Record of BlobReferences pointing to the full content of
    this dataset.
    """
    return from_blob_list(self.schema, self.field_blobs)

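# A minimal sketch of consuming content(): the returned record can be
# materialized into numpy arrays with FetchRecord, as the dataset tests in
# this collection do. Assumes a populated dataset.Dataset named `ds` and the
# usual caffe2 helpers in scope; illustrative only, not from the source.
full = FetchRecord(ds.content())
for field_name, values in zip(full.field_names(), full.field_blobs()):
    print(field_name, len(values))
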
def read_record(self, read_net):
    should_stop, fields = self.read(read_net)
    if self._schema:
        fields = from_blob_list(self._schema, fields)
    return should_stop, fields

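# A condensed, illustrative reader loop built on read_record(), following the
# pattern used by the dataset tests below. Assumes a populated dataset.Dataset
# named `ds` and that the stop blob becomes True once the reader runs past the
# end of the data; this is a sketch, not code taken verbatim from the source.
read_init_net = core.Net('read_init')
read_next_net = core.Net('read_next')
reader = ds.reader(read_init_net)
should_stop, batch = reader.read_record(read_next_net)

workspace.RunNetOnce(read_init_net)
workspace.CreateNet(read_next_net, True)
while True:
    workspace.RunNet(str(read_next_net))
    if workspace.FetchBlob(should_stop):
        break
    print(FetchRecord(batch))
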
def read_record_ex(self, local_init_net, local_finish_net):
    nets, should_stop, fields = self.read_ex(
        local_init_net, local_finish_net)
    if self._schema:
        fields = from_blob_list(self._schema, fields)
    return nets, should_stop, fields

def test_dataset_ops(self):
    """
    1. Defining the schema of our dataset.

    This example schema could represent, for example, a search query log.
    """
    schema = Struct(
        # fixed size vector, which will be stored as a matrix when batched
        ('dense', Scalar((np.float32, 3))),
        # could represent a feature map from feature ID to float value
        ('floats', Map(
            Scalar(np.int32), Scalar(np.float32)
        )),
        # could represent a multi-valued categorical feature map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # could represent a multi-valued, weighted categorical feature map
        (
            'id_score_pairs',
            Map(
                Scalar(np.int32),
                Map(
                    Scalar(np.int64),
                    Scalar(np.float32),
                    keys_name='ids',
                    values_name='scores'
                ),
            )
        ),
        # additional scalar information
        (
            'metadata',
            Struct(
                ('user_id', Scalar(np.int64)),
                ('user_embed', Scalar((np.float32, 2))),
                ('query', Scalar(str)),
            )
        ),
    )
    """
    This is what the flattened fields for this schema look like, along
    with their types. Each one of these fields will be stored, read and
    written as a tensor.
    """
    expected_fields = [
        ('dense', (np.float32, 3)),
        ('floats:lengths', np.int32),
        ('floats:values:keys', np.int32),
        ('floats:values:values', np.float32),
        ('int_lists:lengths', np.int32),
        ('int_lists:values:keys', np.int32),
        ('int_lists:values:values:lengths', np.int32),
        ('int_lists:values:values:values', np.int64),
        ('id_score_pairs:lengths', np.int32),
        ('id_score_pairs:values:keys', np.int32),
        ('id_score_pairs:values:values:lengths', np.int32),
        ('id_score_pairs:values:values:values:ids', np.int64),
        ('id_score_pairs:values:values:values:scores', np.float32),
        ('metadata:user_id', np.int64),
        ('metadata:user_embed', (np.float32, 2)),
        ('metadata:query', str),
    ]
    zipped = zip(
        expected_fields, schema.field_names(), schema.field_types()
    )
    for (ref_name, ref_type), name, dtype in zipped:
        self.assertEquals(ref_name, name)
        self.assertEquals(np.dtype(ref_type), dtype)
    """
    2. The contents of our dataset.

    Contents as defined below could represent, for example, a log of
    search queries along with dense, sparse features and metadata.
    The dataset below has 3 top-level entries.
    """
    contents_raw = [
        # dense
        [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
        # floats
        [1, 2, 3],                        # len
        [11, 21, 22, 31, 32, 33],         # key
        [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],   # value
        # int lists
        [2, 0, 1],     # len
        [11, 12, 31],  # key
        [2, 4, 3],     # value:len
        [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
        # id score pairs
        [1, 2, 2],             # len
        [11, 21, 22, 31, 32],  # key
        [1, 1, 2, 2, 3],       # value:len
        [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
        [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
        # metadata
        [123, 234, 456],                       # user_id
        [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
        ['dog posts', 'friends who like to', 'posts about ca'],  # query
    ]
    # convert the above content to ndarrays, checking against the schema
    contents = from_blob_list(schema, contents_raw)
    """
    3. Creating and appending to the dataset.

    We first create an empty dataset with the given schema.
    Then, a Writer is used to append these entries to the dataset.
    """
    ds = dataset.Dataset(schema)
    net = core.Net('init')
    with core.NameScope('init'):
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
    workspace.RunNetOnce(net)
    """
    4. Iterating through the dataset contents.
    If we were to iterate through the top level entries of our dataset,
    this is what we should expect to see:
    """
    entries_raw = [
        (
            [[1.1, 1.2, 1.3]],  # dense
            [1], [11], [1.1],  # floats
            [2], [11, 12], [2, 4], [111, 112, 121, 122, 123, 124],  # intlst
            [1], [11], [1], [111], [11.1],  # id score pairs
            [123], [[0.2, 0.8]], ['dog posts'],  # metadata
        ),
        (
            [[2.1, 2.2, 2.3]],  # dense
            [2], [21, 22], [2.1, 2.2],  # floats
            [0], [], [], [],  # int list
            [2], [21, 22], [1, 2], [211, 221, 222], [21.1, 22.1, 22.2],
            [234], [[0.5, 0.5]], ['friends who like to'],  # metadata
        ),
        (
            [[3.1, 3.2, 3.3]],  # dense
            [3], [31, 32, 33], [3.1, 3.2, 3.3],  # floats
            [1], [31], [3], [311, 312, 313],  # int lst
            [2], [31, 32], [2, 3], [311, 312, 321, 322, 323],
            [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
            [456], [[0.7, 0.3]], ['posts about ca'],  # metadata
        ),
        # after the end of the dataset, we will keep getting empty vectors
        ([], ) * 16,
        ([], ) * 16,
    ]
    entries = [from_blob_list(schema, e) for e in entries_raw]
    """
    Let's go ahead and create the reading nets.
    We will run `read` net multiple times and assert that we are reading
    the entries the way we stated above.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')
    reader = ds.reader(read_init_net)
    should_continue, batch = reader.read_record(read_next_net)

    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for entry in entries:
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    """
    5. Reading/writing in a single plan

    If all operations on the data are expressible as Caffe2 operators,
    we don't need to load the data into Python; we can iterate through
    the dataset in a single Plan.

    Here we will process the dataset a little and store it in a second
    dataset. We can reuse the same Reader since it supports reset.
    """
    reset_net = core.Net('reset_net')
    reader.reset(reset_net)
    read_step, batch = reader.execution_step()
    """ We will add the line number * 1000 to the feature ids. """
    process_net = core.Net('process')
    line_no = Const(process_net, 0, dtype=np.int32)
    const_one = Const(process_net, 1000, dtype=np.int32)
    process_net.Add([line_no, const_one], [line_no])
    field = batch.floats.keys.get()
    process_net.Print(field, [])
    process_net.Add([field, line_no], field, broadcast=1, axis=0)
    """ Let's create a second dataset and append to it. """
    ds2 = dataset.Dataset(schema, name='dataset2')
    ds2.init_empty(reset_net)
    writer = ds2.writer(reset_net)
    writer.write_record(process_net, batch)
    # commit is not necessary for DatasetWriter but we will add it for
    # generality of the example
    commit_net = core.Net('commit')
    writer.commit(commit_net)
    """ Time to create and run a plan which will do the processing """
    plan = core.Plan('process')
    plan.AddStep(core.execution_step('reset', reset_net))
    plan.AddStep(read_step.AddNet(process_net))
    plan.AddStep(core.execution_step('commit', commit_net))
    workspace.RunPlan(plan)
    """ Now we should have dataset2 populated. """
    ds2_data = FetchRecord(ds2.content())
    field = ds2_data.floats.keys
    field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
    _assert_records_equal(contents, ds2_data)
    """
    6. Slicing a dataset

    You can create a new schema from pieces of another schema and reuse
    the same data.
    """
    subschema = Struct(('top_level', schema.int_lists.values))
    int_list_contents = contents.int_lists.values.field_names()
    self.assertEquals(len(subschema.field_names()), len(int_list_contents))
    """
    7. Random Access a dataset
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for i in range(len(entries)):
        k = idx[i] if i in idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    workspace.RunNet(str(read_next_net))
    self.assertEquals(True, workspace.FetchBlob(should_stop))
    """
    8. Random Access a dataset with loop_over = true
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    idx = np.array([2, 1, 0])
    indices_blob = Const(read_init_net, idx, name='indices')
    reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
    reader.computeoffset(read_init_net)

    should_stop, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    for _ in range(len(entries) * 3):
        workspace.RunNet(str(read_next_net))
        self.assertEquals(False, workspace.FetchBlob(should_stop))
    """
    9. Sort and shuffle a dataset

    This sorts the dataset using the values of a given column, and then
    shuffles within each chunk of size batch_size * shuffle_size before
    shuffling the chunks.
    """
    read_init_net = core.Net('read_init')
    read_next_net = core.Net('read_next')

    reader = ds.random_reader(read_init_net)
    reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
    reader.computeoffset(read_init_net)

    should_continue, batch = reader.read_record(read_next_net)

    workspace.CreateNet(read_init_net, True)
    workspace.RunNetOnce(read_init_net)
    workspace.CreateNet(read_next_net, True)

    expected_idx = np.array([2, 1, 0])
    for i in range(len(entries)):
        k = expected_idx[i] if i in expected_idx else i
        entry = entries[k]
        workspace.RunNet(str(read_next_net))
        actual = FetchRecord(batch)
        _assert_records_equal(actual, entry)
    """
    Trim a dataset
    """
    trim_net = core.Net('trim_ds')
    ds.trim(trim_net, multiple_of=2)
    workspace.RunNetOnce(trim_net)
    trimmed = FetchRecord(ds.content())
    EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
    actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
    self.assertEquals(EXPECTED_SIZES, actual_sizes)

def __init__(self, model, input_record, num_to_collect,
             name='reservoir_sampling', **kwargs):
    super(ReservoirSampling, self).__init__(model, name, input_record,
                                            **kwargs)
    assert num_to_collect > 0
    self.num_to_collect = num_to_collect

    self.reservoir = model.net.NextScopedBlob(name + "_reservoir")
    self.num_visited_blob = model.net.NextScopedBlob(name + "_num_visited")
    self.mutex = model.net.NextScopedBlob(name + "_mutex")

    self.params.append(
        LayerParameter(
            parameter=self.reservoir,
            initializer=core.CreateOperator('ConstantFill', [],
                                            self.reservoir, shape=[0]),
            optimizer=model.NoOptim,
        ))
    self.params.append(
        LayerParameter(
            parameter=self.num_visited_blob,
            initializer=core.CreateOperator(
                'ConstantFill',
                [],
                self.num_visited_blob,
                shape=[],
                value=0,
                dtype=core.DataType.INT64,
            ),
            optimizer=model.NoOptim,
        ))
    self.params.append(
        LayerParameter(
            parameter=self.mutex,
            initializer=core.CreateOperator("CreateMutex", [], self.mutex),
            optimizer=model.NoOptim,
        ),
    )

    self.extra_input_blobs = []
    self.extra_output_blobs = []
    if 'object_id' in input_record:
        self.extra_input_blobs.append(input_record.object_id())
        object_to_pos = model.net.NextScopedBlob(name + "_object_to_pos")
        pos_to_object = model.net.NextScopedBlob(name + "_pos_to_object")
        self.extra_input_blobs.extend([object_to_pos, pos_to_object])
        self.extra_output_blobs.extend([object_to_pos, pos_to_object])
        self.params.append(
            LayerParameter(
                parameter=object_to_pos,
                initializer=core.CreateOperator(
                    'CreateMap',
                    [],
                    object_to_pos,
                    key_dtype=core.DataType.INT64,
                    valued_dtype=core.DataType.INT32,
                ),
                optimizer=model.NoOptim,
            ))
        self.params.append(
            LayerParameter(
                parameter=pos_to_object,
                initializer=core.CreateOperator(
                    'ConstantFill',
                    [],
                    pos_to_object,
                    shape=[0],
                    value=0,
                    dtype=core.DataType.INT64,
                ),
                optimizer=model.NoOptim,
            ))

    self.output_schema = schema.Struct(
        ('reservoir',
         schema.from_blob_list(input_record.data, [self.reservoir])),
        ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
        ('mutex', schema.Scalar(blob=self.mutex)),
    )

def __init__(self, model, input_record, num_to_collect,
             name='reservoir_sampling', **kwargs):
    super(ReservoirSampling, self).__init__(
        model, name, input_record, **kwargs)
    assert num_to_collect > 0
    self.num_to_collect = num_to_collect

    self.reservoir = self.create_param(
        param_name='reservoir',
        shape=[0],
        initializer=('ConstantFill',),
        optimizer=model.NoOptim,
    )
    self.num_visited_blob = self.create_param(
        param_name='num_visited',
        shape=[],
        initializer=('ConstantFill', {
            'value': 0,
            'dtype': core.DataType.INT64,
        }),
        optimizer=model.NoOptim,
    )
    self.mutex = self.create_param(
        param_name='mutex',
        shape=None,
        initializer=('CreateMutex',),
        optimizer=model.NoOptim,
    )

    self.extra_input_blobs = []
    self.extra_output_blobs = []
    if 'object_id' in input_record:
        object_to_pos = self.create_param(
            param_name='object_to_pos',
            shape=None,
            initializer=('CreateMap', {
                'key_dtype': core.DataType.INT64,
                'valued_dtype': core.DataType.INT32,
            }),
            optimizer=model.NoOptim,
        )
        pos_to_object = self.create_param(
            param_name='pos_to_object',
            shape=[0],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )
        self.extra_input_blobs.append(input_record.object_id())
        self.extra_input_blobs.extend([object_to_pos, pos_to_object])
        self.extra_output_blobs.extend([object_to_pos, pos_to_object])

    self.output_schema = schema.Struct(
        (
            'reservoir',
            schema.from_blob_list(input_record.data, [self.reservoir])
        ),
        ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
        ('mutex', schema.Scalar(blob=self.mutex)),
    )

def __init__(self, model, input_record, num_to_collect,
             name='reservoir_sampling', **kwargs):
    super(ReservoirSampling, self).__init__(model, name, input_record,
                                            **kwargs)
    assert num_to_collect > 0
    self.num_to_collect = num_to_collect

    self.reservoir = self.create_param(
        param_name='reservoir',
        shape=[0],
        initializer=('ConstantFill',),
        optimizer=model.NoOptim,
    )
    self.num_visited_blob = self.create_param(
        param_name='num_visited',
        shape=[],
        initializer=('ConstantFill', {
            'value': 0,
            'dtype': core.DataType.INT64,
        }),
        optimizer=model.NoOptim,
    )
    self.mutex = self.create_param(
        param_name='mutex',
        shape=[],
        initializer=('CreateMutex',),
        optimizer=model.NoOptim,
    )

    self.extra_input_blobs = []
    self.extra_output_blobs = []
    if 'object_id' in input_record:
        object_to_pos = self.create_param(
            param_name='object_to_pos',
            shape=None,
            initializer=('CreateMap', {
                'key_dtype': core.DataType.INT64,
                'valued_dtype': core.DataType.INT32,
            }),
            optimizer=model.NoOptim,
        )
        pos_to_object = self.create_param(
            param_name='pos_to_object',
            shape=[0],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )
        self.extra_input_blobs.append(input_record.object_id())
        self.extra_input_blobs.extend([object_to_pos, pos_to_object])
        self.extra_output_blobs.extend([object_to_pos, pos_to_object])

    self.output_schema = schema.Struct(
        ('reservoir',
         schema.from_blob_list(input_record.data, [self.reservoir])),
        ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
        ('mutex', schema.Scalar(blob=self.mutex)),
    )
