Beispiel #1
0
    def test_reader_with_limit(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)

        """ 1. feed full dataset """
        src_init = core.Net('src_init')
        src_values = Struct(('label', np.array(range(100))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
        ws.run(src_init)

        """ 2. Read with limit smaller than size of dataset """
        dst_init = core.Net('dst_init')
        dst_ds = Dataset(src_values.clone_schema())
        dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(10))

        """ 3. Read with limit larger than size of dataset """
        ws.run(dst_init)
        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(100))
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
Beispiel #2
0
    def test_reader_with_limit(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        """ 1. feed full dataset """
        src_init = core.Net('src_init')
        src_values = Struct(('label', np.array(range(100))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
        ws.run(src_init)
        """ 2. Read with limit smaller than size of dataset """
        dst_init = core.Net('dst_init')
        dst_ds = Dataset(src_values.clone_schema())
        dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()), range(10))
        """ 3. Read with limit larger than size of dataset """
        ws.run(dst_init)
        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            range(100))
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
Beispiel #3
0
    def test_dequeue_many(self):
        init_net = core.Net('init')
        N = 17
        NUM_DEQUEUE_RECORDS = 3
        src_values = Struct(
            ('uid', np.array(range(N))),
            ('value', 0.1 * np.array(range(N))))
        expected_dst = Struct(
            ('uid', 2 * np.array(range(N))),
            ('value', np.array(N * [0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())
            counter = init_net.Const(0)
            ONE = init_net.Const(1)

        def proc1(rec):
            with core.NameScope('proc1'):
                out = NewRecord(ops, rec)
            ops.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return out

        def proc2(rec):
            with core.NameScope('proc2'):
                out = NewRecord(ops, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            ops.Sub([rec.value(), rec.value()], [out.value()])
            ops.Add([counter, ONE], [counter])
            return out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(
                src_ds.reader(),
                output=Queue(
                    capacity=11, num_dequeue_records=NUM_DEQUEUE_RECORDS),
                processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)
        num_dequeues = ws.blobs[str(counter)].fetch()

        self.assertEquals(
            num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS)))

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
Beispiel #4
0
def build_job(node_id):
    all_outputs = []
    with Job() as job:
        with Node('reader' + str(node_id)):
            with job.init_group:
                init_net = core.Net('init_net' + str(node_id))
                data_arr = Struct(('val', np.array(range(10))))
                data = ConstRecord(init_net, data_arr)
                ds = Dataset(data, name='dataset' + str(node_id))
                full_reader = ds.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                net = core.Net('inc_total' + str(node_id))
                net.Add([total, rec.val()], [total])
                return [net]

            epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(epoch_reader, processor=inc_total)
            job.add_stop_signal(epoch_reader.data_finished())
            all_outputs.append(total)

    total_fetcher = Task(step=core.Net('empty'), outputs=all_outputs)
    return job, total_fetcher
Beispiel #5
0
def _dataset(draw, min_elements=3, max_elements=10, **kwargs):
    schema = Struct(
        # Dense Features Map
        ('floats', Map(Scalar(np.int32), Scalar(np.float32))),
        # Sparse Features Map
        ('int_lists', Map(
            Scalar(np.int32),
            List(Scalar(np.int64)),
        )),
        # Complex Type
        ('text', Scalar(str)),
    )

    num_records = draw(
        st.integers(min_value=min_elements, max_value=max_elements))

    raw_dense_features_map_contents = draw(_dense_features_map(num_records))

    raw_sparse_features_map_contents = draw(_sparse_features_map(num_records))

    raw_text_contents = [
        draw(
            st.lists(st.text(alphabet=string.ascii_lowercase),
                     min_size=num_records,
                     max_size=num_records))
    ]

    # Concatenate all raw contents to a single one
    contents_raw = raw_dense_features_map_contents + raw_sparse_features_map_contents + raw_text_contents

    contents = from_blob_list(schema, contents_raw)

    return (schema, contents, num_records)
Beispiel #6
0
    def test_local_session(self):
        init_net = core.Net('init')
        src_values = Struct(
            ('uid', np.array([1, 2, 6])),
            ('value', np.array([1.4, 1.6, 1.7])))
        expected_dst = Struct(
            ('uid', np.array([2, 4, 12])),
            ('value', np.array([0.0, 0.0, 0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())

        def proc1(rec):
            net = core.Net('proc1')
            with core.NameScope('proc1'):
                out = NewRecord(net, rec)
            net.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return [net], out

        def proc2(rec):
            net = core.Net('proc2')
            with core.NameScope('proc2'):
                out = NewRecord(net, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            net.Sub([rec.value(), rec.value()], [out.value()])
            return [net], out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(src_ds.reader(), processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
def init_dataset(ws, size=100):
    src_init = core.Net('src_init')
    with core.NameScope('src'):
        src_values = Struct(('label', np.array(range(size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds
Beispiel #8
0
def make_source_dataset(ws, size=100, offset=0, name=None):
    name = name or "src"
    src_init = core.Net("{}_init".format(name))
    with core.NameScope(name):
        src_values = Struct(('label', np.array(range(offset, offset + size))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs, name=name)
        FeedRecord(src_blobs, src_values, ws)
    ws.run(src_init)
    return src_ds
Beispiel #9
0
 def __init__(self, names, readers):
     """
     Args:
         names: list[str] names of readers; used as schema keys
         readers: list[Reader] Reader instances, must have schema
     """
     assert len(names) == len(readers)
     super(CompositeReader, self).__init__(schema=Struct(*[
         (name, reader.schema()) for name, reader in zip(names, readers)
     ]))
     self._names = names
     self._readers = readers
Beispiel #10
0
    def test_text_file_reader(self):
        schema = Struct(
            ('field1', Scalar(dtype=str)),
            ('field2', Scalar(dtype=str)),
            ('field3', Scalar(dtype=np.float32)))
        num_fields = 3
        col_data = [
            ['l1f1', 'l2f1', 'l3f1', 'l4f1'],
            ['l1f2', 'l2f2', 'l3f2', 'l4f2'],
            [0.456, 0.789, 0.10101, -24342.64],
        ]
        row_data = list(zip(*col_data))
        txt_file = tempfile.NamedTemporaryFile(delete=False)
        txt_file.write(
            '\n'.join(
                '\t'.join(str(x) for x in f)
                for f in row_data
            ) + '\n'
        )
        txt_file.close()

        for num_passes in range(1, 3):
            for batch_size in range(1, len(row_data) + 2):
                init_net = core.Net('init_net')
                reader = TextFileReader(
                    init_net,
                    filename=txt_file.name,
                    schema=schema,
                    batch_size=batch_size,
                    num_passes=num_passes)
                workspace.RunNetOnce(init_net)

                net = core.Net('read_net')
                should_stop, record = reader.read_record(net)

                results = [np.array([])] * num_fields
                while True:
                    workspace.RunNetOnce(net)
                    arrays = FetchRecord(record).field_blobs()
                    for i in range(num_fields):
                        results[i] = np.append(results[i], arrays[i])
                    if workspace.FetchBlob(should_stop):
                        break
                for i in range(num_fields):
                    col_batch = np.tile(col_data[i], num_passes)
                    if col_batch.dtype in (np.float32, np.float64):
                        np.testing.assert_array_almost_equal(
                            col_batch, results[i], decimal=3)
                    else:
                        np.testing.assert_array_equal(col_batch, results[i])

        os.remove(txt_file.name)
Beispiel #11
0
 def __init__(self, names, reader_builders):
     """
     Args:
         names: list[str] names of readers; used as schema keys
         reader_builders: list[ReaderBuilder] ReaderBuilder instances;
             must have schema
     """
     super(CompositeReaderBuilder, self).__init__()
     self._names = names
     self._reader_builders = reader_builders
     self._schema = Struct(
         *[(name, reader_builder.schema())
           for name, reader_builder in zip(names, reader_builders)])
    def test_dequeue_many(self):
        init_net = core.Net('init')
        N = 17
        NUM_DEQUEUE_RECORDS = 3
        src_values = Struct(
            ('uid', np.array(range(N))),
            ('value', 0.1 * np.array(range(N))))
        expected_dst = Struct(
            ('uid', 2 * np.array(range(N))),
            ('value', np.array(N * [0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())
            counter = init_net.Const(0)
            ONE = init_net.Const(1)

        def proc1(rec):
            with core.NameScope('proc1'):
                out = NewRecord(ops, rec)
            ops.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return out

        def proc2(rec):
            with core.NameScope('proc2'):
                out = NewRecord(ops, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            ops.Sub([rec.value(), rec.value()], [out.value()])
            ops.Add([counter, ONE], [counter])
            return out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(
                src_ds.reader(),
                output=Queue(
                    capacity=11, num_dequeue_records=NUM_DEQUEUE_RECORDS),
                processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)
        num_dequeues = ws.blobs[str(counter)].fetch()

        self.assertEquals(
            num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS)))

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
def build_pipeline(node_id):
    with Node('trainer_%d' % node_id):
        with Job.current().init_group, Task():
            data_arr = Struct(('val', np.array(list(range(10)))))
            data = ConstRecord(ops, data_arr)
            ds = Dataset(data, name='dataset:%d' % node_id)
            full_reader = ds.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            ops.Add([total, rec.val()], [total])

        epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(epoch_reader, processor=inc_total)
        Job.current().add_stop_signal(epoch_reader.data_finished())
    return [total]
    def test_local_session(self):
        init_net = core.Net('init')
        src_values = Struct(
            ('uid', np.array([1, 2, 6])),
            ('value', np.array([1.4, 1.6, 1.7])))
        expected_dst = Struct(
            ('uid', np.array([2, 4, 12])),
            ('value', np.array([0.0, 0.0, 0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())

        def proc1(rec):
            net = core.Net('proc1')
            with core.NameScope('proc1'):
                out = NewRecord(net, rec)
            net.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return [net], out

        def proc2(rec):
            net = core.Net('proc2')
            with core.NameScope('proc2'):
                out = NewRecord(net, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            net.Sub([rec.value(), rec.value()], [out.value()])
            return [net], out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(src_ds.reader(), processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
Beispiel #15
0
 def __init__(self):
     Reader.__init__(self, schema=Struct(('iter', np.int64)))
     self.counter = None
     self.should_stop = None
Beispiel #16
0
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32), Scalar(np.float32)
            )),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            (
                'id_score_pairs', Map(
                    Scalar(np.int32),
                    Map(
                        Scalar(np.int64),
                        Scalar(np.float32),
                        keys_name='ids',
                        values_name='scores'
                    ),
                )
            ),
            # additional scalar information
            (
                'metadata', Struct(
                    ('user_id', Scalar(np.int64)),
                    ('user_embed', Scalar((np.float32, 2))),
                    ('query', Scalar(str)),
                )
            ),
        )
        """
        This is what the flattened fields for this schema look like, along
        with its type. Each one of these fields will be stored, read and
        writen as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields, schema.field_names(), schema.field_types()
        )
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The datset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run `read` net multiple times and assert that we are reading the
        entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all of operations on the data are expressible as Caffe2 operators,
        we don't need to load the data to python, iterating through the dataset
        in a single Plan.

        Where we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter but will add it for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sort the dataset using the score of a certain column,
        and then shuffle within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)

        """
        Trim a dataset
        """
        trim_net = core.Net('trim_ds')
        ds.trim(trim_net, multiple_of=2)
        workspace.RunNetOnce(trim_net)
        trimmed = FetchRecord(ds.content())
        EXPECTED_SIZES = [2, 2, 3, 3, 2, 2, 2, 6, 2, 3, 3, 4, 4, 2, 2, 2]
        actual_sizes = [d.shape[0] for d in trimmed.field_blobs()]
        self.assertEquals(EXPECTED_SIZES, actual_sizes)
Beispiel #17
0
    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))), )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step('consumer_' + str(i),
                                       [step_consume, step_dataset],
                                       should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step('consumers',
                                            consumer_steps,
                                            concurrent_substeps=True)

        work_steps = core.execution_step('work',
                                         [producer_step, consumer_step],
                                         concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)
Beispiel #18
0
    def test_dataset_ops(self):
        """
        1. Defining the schema of our dataset.

        This example schema could represent, for example, a search query log.
        """
        schema = Struct(
            # fixed size vector, which will be stored as a matrix when batched
            ('dense', Scalar((np.float32, 3))),
            # could represent a feature map from feature ID to float value
            ('floats', Map(
                Scalar(np.int32), Scalar(np.float32)
            )),
            # could represent a multi-valued categorical feature map
            ('int_lists', Map(
                Scalar(np.int32),
                List(Scalar(np.int64)),
            )),
            # could represent a multi-valued, weighted categorical feature map
            (
                'id_score_pairs', Map(
                    Scalar(np.int32),
                    Map(
                        Scalar(np.int64),
                        Scalar(np.float32),
                        keys_name='ids',
                        values_name='scores'
                    ),
                )
            ),
            # additional scalar information
            (
                'metadata', Struct(
                    ('user_id', Scalar(np.int64)),
                    ('user_embed', Scalar((np.float32, 2))),
                    ('query', Scalar(str)),
                )
            ),
        )
        """
        This is what the flattened fields for this schema look like, along
        with its type. Each one of these fields will be stored, read and
        writen as a tensor.
        """
        expected_fields = [
            ('dense', (np.float32, 3)),
            ('floats:lengths', np.int32),
            ('floats:values:keys', np.int32),
            ('floats:values:values', np.float32),
            ('int_lists:lengths', np.int32),
            ('int_lists:values:keys', np.int32),
            ('int_lists:values:values:lengths', np.int32),
            ('int_lists:values:values:values', np.int64),
            ('id_score_pairs:lengths', np.int32),
            ('id_score_pairs:values:keys', np.int32),
            ('id_score_pairs:values:values:lengths', np.int32),
            ('id_score_pairs:values:values:values:ids', np.int64),
            ('id_score_pairs:values:values:values:scores', np.float32),
            ('metadata:user_id', np.int64),
            ('metadata:user_embed', (np.float32, 2)),
            ('metadata:query', str),
        ]
        zipped = zip(
            expected_fields, schema.field_names(), schema.field_types()
        )
        for (ref_name, ref_type), name, dtype in zipped:
            self.assertEquals(ref_name, name)
            self.assertEquals(np.dtype(ref_type), dtype)
        """
        2. The contents of our dataset.

        Contents as defined below could represent, for example, a log of
        search queries along with dense, sparse features and metadata.
        The datset below has 3 top-level entries.
        """
        contents_raw = [
            # dense
            [[1.1, 1.2, 1.3], [2.1, 2.2, 2.3], [3.1, 3.2, 3.3]],
            # floats
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
            # int lists
            [2, 0, 1],  # len
            [11, 12, 31],  # key
            [2, 4, 3],  # value:len
            [111, 112, 121, 122, 123, 124, 311, 312, 313],  # value:value
            # id score pairs
            [1, 2, 2],  # len
            [11, 21, 22, 31, 32],  # key
            [1, 1, 2, 2, 3],  # value:len
            [111, 211, 221, 222, 311, 312, 321, 322, 323],  # value:ids
            [11.1, 21.1, 22.1, 22.2, 31.1, 31.2, 32.1, 32.2, 32.3],  # val:score
            # metadata
            [123, 234, 456],  # user_id
            [[0.2, 0.8], [0.5, 0.5], [0.7, 0.3]],  # user_embed
            ['dog posts', 'friends who like to', 'posts about ca'],  # query
        ]
        # convert the above content to ndarrays, checking against the schema
        contents = from_blob_list(schema, contents_raw)
        """
        3. Creating and appending to the dataset.
        We first create an empty dataset with the given schema.
        Then, a Writer is used to append these entries to the dataset.
        """
        ds = dataset.Dataset(schema)
        net = core.Net('init')
        with core.NameScope('init'):
            ds.init_empty(net)

            content_blobs = NewRecord(net, contents)
            FeedRecord(content_blobs, contents)
            writer = ds.writer(init_net=net)
            writer.write_record(net, content_blobs)
        workspace.RunNetOnce(net)
        """
        4. Iterating through the dataset contents.

        If we were to iterate through the top level entries of our dataset,
        this is what we should expect to see:
        """
        entries_raw = [
            (
                [[1.1, 1.2, 1.3]],  # dense
                [1],
                [11],
                [1.1],  # floats
                [2],
                [11, 12],
                [2, 4],
                [111, 112, 121, 122, 123, 124],  # intlst
                [1],
                [11],
                [1],
                [111],
                [11.1],  # id score pairs
                [123],
                [[0.2, 0.8]],
                ['dog posts'],  # metadata
            ),
            (
                [[2.1, 2.2, 2.3]],  # dense
                [2],
                [21, 22],
                [2.1, 2.2],  # floats
                [0],
                [],
                [],
                [],  # int list
                [2],
                [21, 22],
                [1, 2],
                [211, 221, 222],
                [21.1, 22.1, 22.2],
                [234],
                [[0.5, 0.5]],
                ['friends who like to'],  # metadata
            ),
            (
                [[3.1, 3.2, 3.3]],  # dense
                [3],
                [31, 32, 33],
                [3.1, 3.2, 3.3],  # floats
                [1],
                [31],
                [3],
                [311, 312, 313],  # int lst
                [2],
                [31, 32],
                [2, 3],
                [311, 312, 321, 322, 323],
                [31.1, 31.2, 32.1, 32.2, 32.3],  # id score list
                [456],
                [[0.7, 0.3]],
                ['posts about ca'],  # metadata
            ),
            # after the end of the dataset, we will keep getting empty vectors
            ([], ) * 16,
            ([], ) * 16,
        ]
        entries = [from_blob_list(schema, e) for e in entries_raw]
        """
        Let's go ahead and create the reading nets.
        We will run `read` net multiple times and assert that we are reading the
        entries the way we stated above.
        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')
        reader = ds.reader(read_init_net)
        should_continue, batch = reader.read_record(read_next_net)

        workspace.RunNetOnce(read_init_net)
        workspace.CreateNet(read_next_net, True)

        for entry in entries:
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        """
        5. Reading/writing in a single plan

        If all of operations on the data are expressible as Caffe2 operators,
        we don't need to load the data to python, iterating through the dataset
        in a single Plan.

        Where we will process the dataset a little and store it in a second
        dataset. We can reuse the same Reader since it supports reset.
        """
        reset_net = core.Net('reset_net')
        reader.reset(reset_net)
        read_step, batch = reader.execution_step()
        """ We will add the line number * 1000 to the feature ids. """
        process_net = core.Net('process')
        line_no = Const(process_net, 0, dtype=np.int32)
        const_one = Const(process_net, 1000, dtype=np.int32)
        process_net.Add([line_no, const_one], [line_no])
        field = batch.floats.keys.get()
        process_net.Print(field, [])
        process_net.Add([field, line_no], field, broadcast=1, axis=0)
        """ Lets create a second dataset and append to it. """
        ds2 = dataset.Dataset(schema, name='dataset2')
        ds2.init_empty(reset_net)
        writer = ds2.writer(reset_net)
        writer.write_record(process_net, batch)
        # commit is not necessary for DatasetWriter but will add it for
        # generality of the example
        commit_net = core.Net('commit')
        writer.commit(commit_net)
        """ Time to create and run a plan which will do the processing """
        plan = core.Plan('process')
        plan.AddStep(core.execution_step('reset', reset_net))
        plan.AddStep(read_step.AddNet(process_net))
        plan.AddStep(core.execution_step('commit', commit_net))
        workspace.RunPlan(plan)
        """
        Now we should have dataset2 populated.
        """
        ds2_data = FetchRecord(ds2.content())
        field = ds2_data.floats.keys
        field.set(blob=field.get() - [1000, 2000, 2000, 3000, 3000, 3000])
        _assert_records_equal(contents, ds2_data)
        """
        6. Slicing a dataset

        You can create a new schema from pieces of another schema and reuse
        the same data.
        """
        subschema = Struct(('top_level', schema.int_lists.values))
        int_list_contents = contents.int_lists.values.field_names()
        self.assertEquals(len(subschema.field_names()), len(int_list_contents))
        """
        7. Random Access a dataset

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for i in range(len(entries)):
            k = idx[i] if i in idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)
        workspace.RunNet(str(read_next_net))
        self.assertEquals(True, workspace.FetchBlob(should_stop))
        """
        8. Random Access a dataset with loop_over = true

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        idx = np.array([2, 1, 0])
        indices_blob = Const(read_init_net, idx, name='indices')
        reader = ds.random_reader(read_init_net, indices_blob, loop_over=True)
        reader.computeoffset(read_init_net)

        should_stop, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        for _ in range(len(entries) * 3):
            workspace.RunNet(str(read_next_net))
            self.assertEquals(False, workspace.FetchBlob(should_stop))
        """
        9. Sort and shuffle a dataset

        This sort the dataset using the score of a certain column,
        and then shuffle within each chunk of size batch_size * shuffle_size
        before shuffling the chunks.

        """
        read_init_net = core.Net('read_init')
        read_next_net = core.Net('read_next')

        reader = ds.random_reader(read_init_net)
        reader.sort_and_shuffle(read_init_net, 'int_lists:lengths', 1, 2)
        reader.computeoffset(read_init_net)

        should_continue, batch = reader.read_record(read_next_net)

        workspace.CreateNet(read_init_net, True)
        workspace.RunNetOnce(read_init_net)

        workspace.CreateNet(read_next_net, True)

        expected_idx = np.array([2, 1, 0])
        for i in range(len(entries)):
            k = expected_idx[i] if i in expected_idx else i
            entry = entries[k]
            workspace.RunNet(str(read_next_net))
            actual = FetchRecord(batch)
            _assert_records_equal(actual, entry)