Example #1
def _dataset(draw, min_elements=3, max_elements=10, **kwargs):
    schema = Struct(
        # Dense Features Map
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # Sparse Features Map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # Complex Type
        ('text', Scalar(str)),
    )

    num_records = draw(
        st.integers(min_value=min_elements, max_value=max_elements))

    raw_dense_features_map_contents = draw(_dense_features_map(num_records))

    raw_sparse_features_map_contents = draw(_sparse_features_map(num_records))

    raw_text_contents = [
        draw(
            st.lists(st.text(alphabet=string.ascii_lowercase),
                     min_size=num_records,
                     max_size=num_records))
    ]

    # Concatenate all raw contents to a single one
    contents_raw = raw_dense_features_map_contents + raw_sparse_features_map_contents + raw_text_contents

    contents = from_blob_list(schema, contents_raw)

    return (schema, contents, num_records)
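The generator above follows Hypothesis's composite-strategy pattern: it takes `draw` as its first argument, so `@st.composite` is presumably applied where `_dataset` is defined. Below is a minimal, self-contained sketch of that pattern with illustrative names only (no Caffe2 dependency), showing how such a strategy is typically consumed with `@given`:

import numpy as np
import hypothesis.strategies as st
from hypothesis import given

@st.composite
def _tiny_dataset(draw, min_elements=3, max_elements=10):
    # Draw the number of records first, then draw per-record values of that
    # size, mirroring how num_records drives the draws in the example above.
    num_records = draw(
        st.integers(min_value=min_elements, max_value=max_elements))
    floats = draw(
        st.lists(st.floats(allow_nan=False, allow_infinity=False),
                 min_size=num_records, max_size=num_records))
    return num_records, np.array(floats, dtype=np.float32)

@given(_tiny_dataset())
def test_tiny_dataset(data):
    num_records, values = data
    assert values.shape[0] == num_records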
Example #2
 def read_record_ex(self, local_init_net, local_finish_net):
     """Experimental extension to the interface. Don't use yet"""
     nets, should_stop, fields = self.read_ex(
         local_init_net, local_finish_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return nets, should_stop, fields
Example #3
 def process(net, fields):
     new_fields = []
     for f in fields.field_blobs():
         new_f = net.Copy(f)
         new_fields.append(new_f)
     new_fields = from_blob_list(fields, new_fields)
     return new_fields
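The `process` callback above rebuilds a record from copied blobs with `from_blob_list(fields, new_fields)`. A minimal, hedged sketch of that contract (assuming an environment where `caffe2.python` is importable, e.g. an older Caffe2/PyTorch build): the value list must be parallel to the flattened scalar fields of the schema or record, and the result is a new record with the same structure and those values attached.

import numpy as np
from caffe2.python import schema

s = schema.Struct(
    ('x', schema.Scalar(np.float32)),
    ('y', schema.Scalar(np.int32)),
)
record = schema.from_blob_list(s, [
    np.array([1.5], dtype=np.float32),  # values for 'x'
    np.array([2], dtype=np.int32),      # values for 'y'
])
assert record.field_names() == ['x', 'y']
assert len(record.field_blobs()) == 2  # one entry per flattened scalar field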
Example #4
 def read_record_ex(self, local_init_net, local_finish_net):
     """Experimental extension to the interface. Don't use yet"""
     nets, should_stop, fields = self.read_ex(local_init_net,
                                              local_finish_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return nets, should_stop, fields
Example #5
 def process(net, fields):
     new_fields = []
     for f in fields.field_blobs():
         new_f = net.Copy(f)
         new_fields.append(new_f)
     new_fields = from_blob_list(fields, new_fields)
     return new_fields
Example #6
    def testPreservesMetadata(self):
        s = schema.Struct(
            ('a', schema.Scalar(np.float32)),
            ('b',
             schema.Scalar(np.int32,
                           metadata=schema.Metadata(categorical_limit=5))),
            ('c',
             schema.List(
                 schema.Scalar(
                     np.int32,
                     metadata=schema.Metadata(categorical_limit=6)))))
        # attach metadata to lengths field
        s.c.lengths.set_metadata(schema.Metadata(categorical_limit=7))

        self.assertEqual(None, s.a.metadata)
        self.assertEqual(5, s.b.metadata.categorical_limit)
        self.assertEqual(6, s.c.value.metadata.categorical_limit)
        self.assertEqual(7, s.c.lengths.metadata.categorical_limit)
        sc = s.clone()
        self.assertEqual(None, sc.a.metadata)
        self.assertEqual(5, sc.b.metadata.categorical_limit)
        self.assertEqual(6, sc.c.value.metadata.categorical_limit)
        self.assertEqual(7, sc.c.lengths.metadata.categorical_limit)
        sv = schema.from_blob_list(s, [
            np.array([3.4]),
            np.array([2]),
            np.array([3]),
            np.array([1, 2, 3])
        ])
        self.assertEqual(None, sv.a.metadata)
        self.assertEqual(5, sv.b.metadata.categorical_limit)
        self.assertEqual(6, sv.c.value.metadata.categorical_limit)
        self.assertEqual(7, sv.c.lengths.metadata.categorical_limit)
Example #7
File: task.py  Project: Ralfhund/caffe2
 def get(self):
     assert self._values is not None, 'Output value not set yet.'
     if self._is_scalar:
         return self._values[0]
     elif self._schema:
         return from_blob_list(self._schema, self._values)
     else:
         return self._values
Example #8
File: task.py  Project: WoodoLee/TorchCraft
 def get(self):
     assert self._values is not None, 'Output value not set yet.'
     if self._is_scalar:
         return self._values[0]
     elif self._schema:
         return from_blob_list(self._schema, self._values)
     else:
         return self._values
Example #9
    def __init__(self,
                 model,
                 input_record,
                 axis=1,
                 add_axis=0,
                 name='concat',
                 **kwargs):
        super(Concat, self).__init__(model, name, input_record, **kwargs)
        self.axis = axis
        self.add_axis = add_axis
        assert not (axis == 0 and add_axis == 1), \
            "It's not allowed to add axis=0"
        assert isinstance(input_record, schema.Struct),\
            "Incorrect input type. Expected Struct, but received: {0}".\
            format(input_record)

        shapes = []
        for field_name, field_type in viewitems(input_record.fields):
            assert isinstance(field_type, schema.Scalar),\
                "Incorrect input type for {}. Expected Scalar, but got: {}".\
                format(field_name, field_type)
            # Assume that first dimension is batch, so actual axis in shape is
            # axis - 1
            shape = list(field_type.field_type().shape)
            if add_axis:
                shape.insert(axis - 1, 1)
            assert len(shape) >= axis,\
                "Concat expects that limited dimensions of the input tensor"
            shapes.append(shape)
        logger.info('Concat Layer input shapes: ' + str(shapes))

        if axis == 0:
            self.output_schema = schema.from_blob_list(
                input_record[0], [self.get_next_blob_reference('output')])
            return

        concat_dim = 0
        for shape in shapes:
            concat_dim += shape[axis - 1]
            shape[axis - 1] = 0
            assert shape == shapes[0],\
                "Shapes {0} and {1} are not compatible for Concat".\
                format(shape, shapes[0])
        output_dims = shapes[0]
        output_dims[axis - 1] = concat_dim

        logger.info('Concat Layer output_dims: ' + str(output_dims))
        self.output_schema = schema.Scalar(
            (np.float32, output_dims), self.get_next_blob_reference('output'))

        record_to_concat = input_record.fields.values()
        concated_feature_to_index = get_concatenated_feature_to_index(
            record_to_concat)
        if concated_feature_to_index:
            metadata = schema.Metadata(feature_specs=schema.FeatureSpec(
                feature_to_index=concated_feature_to_index))
            self.output_schema.set_metadata(metadata)
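The shape bookkeeping in the constructor above can be summarized with a small standalone sketch (pure Python, no Caffe2 required; the helper name `concat_output_dims` is illustrative): all inputs must agree on every dimension except the concat dimension, and the output's concat dimension is the sum over inputs. The shapes exclude the batch dimension, which is why the layer indexes with `axis - 1`.

def concat_output_dims(shapes, axis):
    # shapes exclude the batch dimension, so the concat index inside each
    # shape is axis - 1 (axis counts the batch as dimension 0)
    concat_dim = sum(s[axis - 1] for s in shapes)
    # zero out the concat dimension and require the remaining dims to match
    rest = [s[:axis - 1] + [0] + s[axis:] for s in shapes]
    assert all(r == rest[0] for r in rest), "incompatible shapes for Concat"
    out = list(shapes[0])
    out[axis - 1] = concat_dim
    return out

# e.g. concatenating (batch, 2, 4) with (batch, 3, 4) along axis=1
# gives (batch, 5, 4)
assert concat_output_dims([[2, 4], [3, 4]], axis=1) == [5, 4]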
Example #10
File: task.py  Project: Ralfhund/caffe2
 def fetch(self):
     assert self._fetch_func is not None, (
         'Cannot fetch value for this output.')
     fetched_vals = [self._fetch_func(v) for v in self._values]
     if self._is_scalar:
         return fetched_vals[0]
     elif self._schema:
         return from_blob_list(self._schema, fetched_vals)
     else:
         return fetched_vals
Example #11
File: task.py  Project: WoodoLee/TorchCraft
 def fetch(self):
     assert self._fetch_func is not None, (
         'Cannot fetch value for this output.')
     fetched_vals = [self._fetch_func(v) for v in self._values]
     if self._is_scalar:
         return fetched_vals[0]
     elif self._schema:
         return from_blob_list(self._schema, fetched_vals)
     else:
         return fetched_vals
Example #12
 def testPreservesEmptyFields(self):
     s = schema.Struct(
         ('a', schema.Scalar(np.float32)),
         ('b', schema.Struct()),
     )
     sc = s.clone()
     self.assertIn("a", sc.fields)
     self.assertIn("b", sc.fields)
     sv = schema.from_blob_list(s, [np.array([3.4])])
     self.assertIn("a", sv.fields)
     self.assertIn("b", sv.fields)
     self.assertEqual(0, len(sv.b.fields))
Example #13
 def testPreservesEmptyFields(self):
     s = schema.Struct(
         ('a', schema.Scalar(np.float32)),
         ('b', schema.Struct()),
     )
     sc = s.clone()
     self.assertIn("a", sc.fields)
     self.assertIn("b", sc.fields)
     sv = schema.from_blob_list(s, [np.array([3.4])])
     self.assertIn("a", sv.fields)
     self.assertIn("b", sv.fields)
     self.assertEqual(0, len(sv.b.fields))
Example #14
    def __init__(self,
                 model,
                 input_record,
                 num_to_collect,
                 name='last_n_window_collector',
                 **kwargs):
        super(LastNWindowCollector, self).__init__(model, name, input_record,
                                                   **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect
        assert isinstance(input_record, schema.Scalar), \
            "Got {!r}".format(input_record)

        self.last_n = self.create_param(param_name='last_n',
                                        shape=[0],
                                        initializer=('ConstantFill', {}),
                                        optimizer=model.NoOptim)

        self.next_blob = self.create_param(param_name='next',
                                           shape=[],
                                           initializer=('ConstantFill', {
                                               'value':
                                               0,
                                               'dtype':
                                               core.DataType.INT32
                                           }),
                                           optimizer=model.NoOptim)

        self.mutex = self.create_param(
            param_name='mutex',
            shape=None,
            initializer=('CreateMutex', ),
            optimizer=model.NoOptim,
        )

        self.num_visited_blob = self.create_param(
            param_name='num_visited',
            shape=[],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )

        self.output_schema = schema.Struct(
            ('last_n', schema.from_blob_list(input_record, [self.last_n])),
            ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
            ('mutex', schema.Scalar(blob=self.mutex)),
        )
Example #15
    def __init__(self,
                 model,
                 input_record,
                 axis=1,
                 add_axis=0,
                 name='concat',
                 **kwargs):
        super(Concat, self).__init__(model, name, input_record, **kwargs)
        self.axis = axis
        self.add_axis = add_axis
        assert not (axis == 0 and add_axis == 1), \
            "It's not allowed to add axis=0"
        assert isinstance(input_record, schema.Struct),\
            "Incorrect input type. Excpected Struct, but received: {0}".\
            format(input_record)

        shapes = []
        for field_name, field_type in viewitems(input_record.fields):
            assert isinstance(field_type, schema.Scalar),\
                "Incorrect input type for {}. Excpected Scalar, but got: {}".\
                format(field_name, field_type)
            # Assume that first dimension is batch, so actual axis in shape is
            # axis - 1
            assert len(field_type.field_type().shape) >= axis,\
                "Concat expects that limited dimensions of the input tensor"
            shapes.append(list(field_type.field_type().shape))

        if add_axis:
            for i in range(len(shapes)):
                shapes[i].insert(axis, 1)

        if axis == 0:
            self.output_schema = schema.from_blob_list(
                input_record[0], [model.net.NextScopedBlob(name + '_output')])
            return

        concat_dim = 0
        for shape in shapes:
            concat_dim += shape[axis - 1]
            shape[axis - 1] = 0
            assert shape == shapes[0],\
                "Shapes {0} and {1} are not compatible for Concat".\
                format(shape, shapes[0])
        output_dims = shapes[0]
        output_dims[axis - 1] = concat_dim

        self.output_schema = schema.Scalar(
            (np.float32, output_dims),
            model.net.NextScopedBlob(name + '_output'))
Example #16
File: concat.py  Project: Ralfhund/caffe2
    def __init__(self, model, input_record, axis=1, add_axis=0,
                 name='concat', **kwargs):
        super(Concat, self).__init__(model, name, input_record, **kwargs)
        self.axis = axis
        self.add_axis = add_axis
        assert not (axis == 0 and add_axis == 1), \
            "It's not allowed to add axis=0"
        assert isinstance(input_record, schema.Struct),\
            "Incorrect input type. Excpected Struct, but received: {0}".\
            format(input_record)

        shapes = []
        for field_name, field_type in viewitems(input_record.fields):
            assert isinstance(field_type, schema.Scalar),\
                "Incorrect input type for {}. Excpected Scalar, but got: {}".\
                format(field_name, field_type)
            # Assume that first dimension is batch, so actual axis in shape is
            # axis - 1
            shape = list(field_type.field_type().shape)
            if add_axis:
                shape.insert(axis - 1, 1)
            assert len(shape) >= axis,\
                "Concat expects that limited dimensions of the input tensor"
            shapes.append(shape)
        logger.info('Concat Layer input shapes: ' + str(shapes))

        if axis == 0:
            self.output_schema = schema.from_blob_list(
                input_record[0],
                [self.get_next_blob_reference('output')]
            )
            return

        concat_dim = 0
        for shape in shapes:
            concat_dim += shape[axis - 1]
            shape[axis - 1] = 0
            assert shape == shapes[0],\
                "Shapes {0} and {1} are not compatible for Concat".\
                format(shape, shapes[0])
        output_dims = shapes[0]
        output_dims[axis - 1] = concat_dim

        logger.info('Concat Layer output_dims: ' + str(output_dims))
        self.output_schema = schema.Scalar(
            (np.float32, output_dims),
            self.get_next_blob_reference('output'))
Example #17
    def __init__(self, model, input_record, num_to_collect,
                 name='last_n_window_collector', **kwargs):
        super(LastNWindowCollector, self).__init__(
            model, name, input_record, **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect
        assert isinstance(input_record, schema.Scalar), \
            "Got {!r}".format(input_record)

        self.last_n = self.create_param(param_name='last_n',
                                        shape=[0],
                                        initializer=('ConstantFill', {}),
                                        optimizer=model.NoOptim)

        self.next_blob = self.create_param(
            param_name='next',
            shape=[],
            initializer=('ConstantFill',
                         {'value': 0, 'dtype': core.DataType.INT32}),
            optimizer=model.NoOptim
        )

        self.mutex = self.create_param(
            param_name='mutex',
            shape=None,
            initializer=('CreateMutex',),
            optimizer=model.NoOptim,
        )

        self.num_visited_blob = self.create_param(
            param_name='num_visited',
            shape=[],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )

        self.output_schema = schema.Struct(
            (
                'last_n',
                schema.from_blob_list(input_record, [self.last_n])
            ),
            ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
            ('mutex', schema.Scalar(blob=self.mutex)),
        )
Example #18
def _dataset(draw, min_elements=3, max_elements=10, **kwargs):
    schema = Struct(
        # Dense Features Map
        ('floats', Map(
            Scalar(np.int32), Scalar(np.float32)
        )),
        # Sparse Features Map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # Complex Type
        ('text', Scalar(str)),
    )

    num_records = draw(
        st.integers(min_value=min_elements,
                    max_value=max_elements)
    )

    raw_dense_features_map_contents = draw(_dense_features_map(num_records))

    raw_sparse_features_map_contents = draw(_sparse_features_map(num_records))

    raw_text_contents = [
        draw(
            st.lists(
                st.text(alphabet=string.ascii_lowercase),
                min_size=num_records,
                max_size=num_records
            )
        )
    ]

    # Concatenate all raw contents to a single one
    contents_raw = raw_dense_features_map_contents + raw_sparse_features_map_contents + raw_text_contents

    contents = from_blob_list(schema, contents_raw)

    return (schema, contents, num_records)
Example #19
    def __init__(self,
                 model,
                 input_record,
                 num_to_collect,
                 name='last_n_window_collector',
                 **kwargs):
        super(LastNWindowCollector, self).__init__(model, name, input_record,
                                                   **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect
        assert isinstance(input_record, schema.Scalar), \
            "Got {!r}".format(input_record)

        self.last_n = model.net.NextScopedBlob(self.name + "_last_n")
        self.next_blob = model.net.NextScopedBlob(self.name + "_next")

        self.params.append(
            LayerParameter(
                parameter=self.last_n,
                initializer=core.CreateOperator('ConstantFill', [],
                                                self.last_n,
                                                shape=[0]),
                optimizer=model.NoOptim,
            ))
        self.params.append(
            LayerParameter(
                parameter=self.next_blob,
                initializer=core.CreateOperator(
                    'ConstantFill',
                    [],
                    self.next_blob,
                    shape=[],
                    value=0,
                    dtype=core.DataType.INT32,
                ),
                optimizer=model.NoOptim,
            ))

        self.output_schema = schema.from_blob_list(
            input_record, [model.net.NextScopedBlob(name + "_output")])
Example #20
    def testPreservesMetadata(self):
        s = schema.Struct(
            ('a', schema.Scalar(np.float32)), (
                'b', schema.Scalar(
                    np.int32,
                    metadata=schema.Metadata(categorical_limit=5)
                )
            ), (
                'c', schema.List(
                    schema.Scalar(
                        np.int32,
                        metadata=schema.Metadata(categorical_limit=6)
                    )
                )
            )
        )
        # attach metadata to lengths field
        s.c.lengths.set_metadata(schema.Metadata(categorical_limit=7))

        self.assertEqual(None, s.a.metadata)
        self.assertEqual(5, s.b.metadata.categorical_limit)
        self.assertEqual(6, s.c.value.metadata.categorical_limit)
        self.assertEqual(7, s.c.lengths.metadata.categorical_limit)
        sc = s.clone()
        self.assertEqual(None, sc.a.metadata)
        self.assertEqual(5, sc.b.metadata.categorical_limit)
        self.assertEqual(6, sc.c.value.metadata.categorical_limit)
        self.assertEqual(7, sc.c.lengths.metadata.categorical_limit)
        sv = schema.from_blob_list(
            s, [
                np.array([3.4]), np.array([2]), np.array([3]),
                np.array([1, 2, 3])
            ]
        )
        self.assertEqual(None, sv.a.metadata)
        self.assertEqual(5, sv.b.metadata.categorical_limit)
        self.assertEqual(6, sv.c.value.metadata.categorical_limit)
        self.assertEqual(7, sv.c.lengths.metadata.categorical_limit)
Example #21
    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))), )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step('consumer_' + str(i),
                                       [step_consume, step_dataset],
                                       should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step('consumers',
                                            consumer_steps,
                                            concurrent_substeps=True)

        work_steps = core.execution_step('work',
                                         [producer_step, consumer_step],
                                         concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)
Example #22
File: dataset.py  Project: gtgalone/pytorch
 def content(self):
     """
     Return a Record of BlobReferences pointing to the full content of
     this dataset.
     """
     return from_blob_list(self.schema, self.field_blobs)
Example #23
 def read_record(self, read_net):
     should_stop, fields = self.read(read_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return should_stop, fields
Example #24
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32), Scalar(np.float32)
            )),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            (
                'id_score_pairs', Map(
                    Scalar(np.int32),
                    Map(
                        Scalar(np.int64),
                        Scalar(np.float32),
                        keys_name='ids',
                        values_name='scores'
                    ),
                )
            ),
            # additional scalar information
            (
                'metadata', Struct(
                    ('user_id', Scalar(np.int64)),
                    ('user_embed', Scalar((np.float32, 2))),
                    ('query', Scalar(str)),
                )
            ),
        )
        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields, schema.field_names(), schema.field_types()
        )
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run the `read` net multiple times and assert that we are
        reading the entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all operations on the data are expressible as Caffe2 operators,
        we don't need to load the data into Python; we can iterate through
        the dataset in a single Plan.

        Here we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader, since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter, but we add it here for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sorts the dataset using the score of a certain column,
        and then shuffles within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
Example #25
File: dataio.py  Project: gtgalone/pytorch
 def read_record_ex(self, local_init_net, local_finish_net):
     nets, should_stop, fields = self.read_ex(
         local_init_net, local_finish_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return nets, should_stop, fields
Example #26
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32), Scalar(np.float32)
            )),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            (
                'id_score_pairs', Map(
                    Scalar(np.int32),
                    Map(
                        Scalar(np.int64),
                        Scalar(np.float32),
                        keys_name='ids',
                        values_name='scores'
                    ),
                )
            ),
            # additional scalar information
            (
                'metadata', Struct(
                    ('user_id', Scalar(np.int64)),
                    ('user_embed', Scalar((np.float32, 2))),
                    ('query', Scalar(str)),
                )
            ),
        )
        """
        This is what the flattened fields for this schema look like, along
        with their types. Each one of these fields will be stored, read and
        written as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields, schema.field_names(), schema.field_types()
        )
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The dataset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run the `read` net multiple times and assert that we are
        reading the entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all operations on the data are expressible as Caffe2 operators,
        we don't need to load the data into Python; we can iterate through
        the dataset in a single Plan.

        Here we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader, since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter, but we add it here for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sorts the dataset using the score of a certain column,
        and then shuffles within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)

        """
        Trim a dataset
        """
        trim_net = core.Net('trim_ds')
        ds.trim(trim_net, multiple_of=2)
        workspace.RunNetOnce(trim_net)
        trimmed = FetchRecord(ds.content())
        EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
        actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
        self.assertEquals(EXPECTED_SIZES, actual_sizes)
Example #27
    def __init__(self,
                 model,
                 input_record,
                 num_to_collect,
                 name='reservoir_sampling',
                 **kwargs):
        super(ReservoirSampling, self).__init__(model, name, input_record,
                                                **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect

        self.reservoir = model.net.NextScopedBlob(name + "_reservoir")
        self.num_visited_blob = model.net.NextScopedBlob(name + "_num_visited")
        self.mutex = model.net.NextScopedBlob(name + "_mutex")

        self.params.append(
            LayerParameter(
                parameter=self.reservoir,
                initializer=core.CreateOperator('ConstantFill', [],
                                                self.reservoir,
                                                shape=[0]),
                optimizer=model.NoOptim,
            ))
        self.params.append(
            LayerParameter(
                parameter=self.num_visited_blob,
                initializer=core.CreateOperator(
                    'ConstantFill',
                    [],
                    self.num_visited_blob,
                    shape=[],
                    value=0,
                    dtype=core.DataType.INT64,
                ),
                optimizer=model.NoOptim,
            ))
        self.params.append(
            LayerParameter(
                parameter=self.mutex,
                initializer=core.CreateOperator("CreateMutex", [], self.mutex),
                optimizer=model.NoOptim,
            ), )

        self.extra_input_blobs = []
        self.extra_output_blobs = []
        if 'object_id' in input_record:
            self.extra_input_blobs.append(input_record.object_id())
            object_to_pos = model.net.NextScopedBlob(name + "_object_to_pos")
            pos_to_object = model.net.NextScopedBlob(name + "_pos_to_object")
            self.extra_input_blobs.extend([object_to_pos, pos_to_object])
            self.extra_output_blobs.extend([object_to_pos, pos_to_object])
            self.params.append(
                LayerParameter(
                    parameter=object_to_pos,
                    initializer=core.CreateOperator(
                        'CreateMap',
                        [],
                        object_to_pos,
                        key_dtype=core.DataType.INT64,
                        valued_dtype=core.DataType.INT32,
                    ),
                    optimizer=model.NoOptim,
                ))
            self.params.append(
                LayerParameter(
                    parameter=pos_to_object,
                    initializer=core.CreateOperator(
                        'ConstantFill',
                        [],
                        pos_to_object,
                        shape=[0],
                        value=0,
                        dtype=core.DataType.INT64,
                    ),
                    optimizer=model.NoOptim,
                ))

        self.output_schema = schema.Struct(
            ('reservoir',
             schema.from_blob_list(input_record.data, [self.reservoir])),
            ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
            ('mutex', schema.Scalar(blob=self.mutex)),
        )
Example #28
    def __init__(self, model, input_record, num_to_collect,
                 name='reservoir_sampling', **kwargs):
        super(ReservoirSampling, self).__init__(
            model, name, input_record, **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect

        self.reservoir = self.create_param(
            param_name='reservoir',
            shape=[0],
            initializer=('ConstantFill',),
            optimizer=model.NoOptim,
        )
        self.num_visited_blob = self.create_param(
            param_name='num_visited',
            shape=[],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )
        self.mutex = self.create_param(
            param_name='mutex',
            shape=None,
            initializer=('CreateMutex',),
            optimizer=model.NoOptim,
        )

        self.extra_input_blobs = []
        self.extra_output_blobs = []
        if 'object_id' in input_record:
            object_to_pos = self.create_param(
                param_name='object_to_pos',
                shape=None,
                initializer=('CreateMap', {
                    'key_dtype': core.DataType.INT64,
                    'valued_dtype': core.DataType.INT32,
                }),
                optimizer=model.NoOptim,
            )
            pos_to_object = self.create_param(
                param_name='pos_to_object',
                shape=[0],
                initializer=('ConstantFill', {
                    'value': 0,
                    'dtype': core.DataType.INT64,
                }),
                optimizer=model.NoOptim,
            )
            self.extra_input_blobs.append(input_record.object_id())
            self.extra_input_blobs.extend([object_to_pos, pos_to_object])
            self.extra_output_blobs.extend([object_to_pos, pos_to_object])

        self.output_schema = schema.Struct(
            (
                'reservoir',
                schema.from_blob_list(input_record.data, [self.reservoir])
            ),
            ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
            ('mutex', schema.Scalar(blob=self.mutex)),
        )
Example #29
 def read_record_ex(self, local_init_net, local_finish_net):
     nets, should_stop, fields = self.read_ex(local_init_net,
                                              local_finish_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return nets, should_stop, fields
Example #30
    def __init__(self,
                 model,
                 input_record,
                 num_to_collect,
                 name='reservoir_sampling',
                 **kwargs):
        super(ReservoirSampling, self).__init__(model, name, input_record,
                                                **kwargs)
        assert num_to_collect > 0
        self.num_to_collect = num_to_collect

        self.reservoir = self.create_param(
            param_name='reservoir',
            shape=[0],
            initializer=('ConstantFill', ),
            optimizer=model.NoOptim,
        )
        self.num_visited_blob = self.create_param(
            param_name='num_visited',
            shape=[],
            initializer=('ConstantFill', {
                'value': 0,
                'dtype': core.DataType.INT64,
            }),
            optimizer=model.NoOptim,
        )
        self.mutex = self.create_param(
            param_name='mutex',
            shape=[],
            initializer=('CreateMutex', ),
            optimizer=model.NoOptim,
        )

        self.extra_input_blobs = []
        self.extra_output_blobs = []
        if 'object_id' in input_record:
            object_to_pos = self.create_param(
                param_name='object_to_pos',
                shape=None,
                initializer=('CreateMap', {
                    'key_dtype': core.DataType.INT64,
                    'valued_dtype': core.DataType.INT32,
                }),
                optimizer=model.NoOptim,
            )
            pos_to_object = self.create_param(
                param_name='pos_to_object',
                shape=[0],
                initializer=('ConstantFill', {
                    'value': 0,
                    'dtype': core.DataType.INT64,
                }),
                optimizer=model.NoOptim,
            )
            self.extra_input_blobs.append(input_record.object_id())
            self.extra_input_blobs.extend([object_to_pos, pos_to_object])
            self.extra_output_blobs.extend([object_to_pos, pos_to_object])

        self.output_schema = schema.Struct(
            ('reservoir',
             schema.from_blob_list(input_record.data, [self.reservoir])),
            ('num_visited', schema.Scalar(blob=self.num_visited_blob)),
            ('mutex', schema.Scalar(blob=self.mutex)),
        )
Example #31
 def content(self):
     """
     Return a Record of BlobReferences pointing to the full content of
     this dataset.
     """
     return from_blob_list(self.schema, self.field_blobs)
Example #32
 def read_record(self, read_net):
     should_stop, fields = self.read(read_net)
     if self._schema:
         fields = from_blob_list(self._schema, fields)
     return should_stop, fields
Example #33
    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(
                Scalar(np.int32),
                Scalar(np.float32))),
        )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step(
                'consumer_' + str(i),
                [step_consume, step_dataset],
                should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step(
            'consumers', consumer_steps, concurrent_substeps=True)

        work_steps = core.execution_step(
            'work', [producer_step, consumer_step], concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)