Example #1
    def test_reader_with_limit(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)

        """ 1. feed full dataset """
        src_init = core.Net('src_init')
        src_values = Struct(('label', np.array(range(100))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
        ws.run(src_init)

        """ 2. Read with limit smaller than size of dataset """
        dst_init = core.Net('dst_init')
        dst_ds = Dataset(src_values.clone_schema())
        dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
        self.assertEqual(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(10)))

        """ 3. Read with limit larger than size of dataset """
        ws.run(dst_init)
        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertEqual(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(100)))
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
Example #2
def make_destination_dataset(ws, schema, name=None):
    name = name or 'dst'
    dst_init = core.Net('{}_init'.format(name))
    with core.NameScope(name):
        dst_ds = Dataset(schema, name=name)
        dst_ds.init_empty(dst_init)
    ws.run(dst_init)
    return dst_ds
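
A hedged usage sketch of this helper, combining it with pipe(...) as the surrounding examples do; the ws, session and src_ds objects are assumed to already exist (e.g. created via init_dataset as in the other examples on this page):

# Hypothetical usage; assumes ws, session and a source dataset src_ds exist.
dst_ds = make_destination_dataset(ws, src_ds.content().clone_schema())

with TaskGroup() as tg:
    pipe(src_ds.reader(), dst_ds.writer(), num_runtime_threads=4)
session.run(tg)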
Example #3
    def test_dequeue_many(self):
        init_net = core.Net('init')
        N = 17
        NUM_DEQUEUE_RECORDS = 3
        src_values = Struct(
            ('uid', np.array(range(N))),
            ('value', 0.1 * np.array(range(N))))
        expected_dst = Struct(
            ('uid', 2 * np.array(range(N))),
            ('value', np.array(N * [0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())
            counter = init_net.Const(0)
            ONE = init_net.Const(1)

        def proc1(rec):
            with core.NameScope('proc1'):
                out = NewRecord(ops, rec)
            ops.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return out

        def proc2(rec):
            with core.NameScope('proc2'):
                out = NewRecord(ops, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            ops.Sub([rec.value(), rec.value()], [out.value()])
            ops.Add([counter, ONE], [counter])
            return out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(
                src_ds.reader(),
                output=Queue(
                    capacity=11, num_dequeue_records=NUM_DEQUEUE_RECORDS),
                processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)
        num_dequeues = ws.blobs[str(counter)].fetch()

        self.assertEquals(
            num_dequeues, int(math.ceil(float(N) / NUM_DEQUEUE_RECORDS)))

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
Example #4
def read_all_data(ws, reader, session):
    dst_init = core.Net('dst_init')
    with core.NameScope('dst'):
        dst_ds = Dataset(reader.schema().clone_schema())
        dst_ds.init_empty(dst_init)
    session.run(dst_init)

    with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
        pipe(reader, dst_ds.writer(), num_runtime_threads=8)
    session.run(tg)

    return ws.blobs[str(dst_ds.content().label())].fetch()
Example #5
    def _test_limit_reader_init_shared(self, size):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)

        # Build test dataset
        src_ds = init_dataset(ws, size=size)

        # Create an identically sized empty destination dataset
        dst_init = core.Net('dst_init')
        with core.NameScope('dst'):
            dst_ds = Dataset(src_ds.content().clone_schema())
            dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        return ws, session, src_ds, dst_init, dst_ds
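
A hedged sketch of a follow-up test that would consume this helper, modeled on the ReaderWithLimit examples elsewhere on this page (the method name, iteration count, and expected values are illustrative assumptions):

    def _test_limit_reader_shared_sketch(self, size=100, num_iter=10):
        # Hypothetical follow-up; mirrors the ReaderWithLimit tests above.
        ws, session, src_ds, dst_init, dst_ds = \
            self._test_limit_reader_init_shared(size)

        with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=num_iter)
            pipe(reader, dst_ds.writer(), num_runtime_threads=8)
        session.run(tg)

        # As in test_reader_with_limit above: num_iter reads of one record each.
        read_data = ws.blobs[str(dst_ds.content().label())].fetch()
        self.assertEqual(sorted(read_data), list(range(min(size, num_iter))))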
Example #6
def build_pipeline(node_id):
    with Node('trainer_%d' % node_id):
        with Job.current().init_group, Task():
            data_arr = Struct(('val', np.array(list(range(10)))))
            data = ConstRecord(ops, data_arr)
            ds = Dataset(data, name='dataset:%d' % node_id)
            full_reader = ds.reader(ops)
            total = ops.Const([100])

        def inc_total(rec):
            ops.Add([total, rec.val()], [total])

        epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
        pipe(epoch_reader, processor=inc_total)
        Job.current().add_stop_signal(epoch_reader.data_finished())
    return [total]
Example #7
    def __init__(
        self,
        db_path,
        db_type,
        name=None,
        batch_size=100,
    ):
        assert db_path is not None, "db_path can't be None."
        assert db_type in C.registered_dbs(), \
            "db_type [{db_type}] is not available. \n" \
            "Choose one of these: {registered_dbs}.".format(
                db_type=db_type,
                registered_dbs=C.registered_dbs(),
        )

        self.db_path = db_path
        self.db_type = db_type
        self.name = name or '{db_name}_{default_name_suffix}'.format(
            db_name=self._extract_db_name_from_db_path(),
            default_name_suffix=self.default_name_suffix,
        )
        self.batch_size = batch_size

        # self.db_path and self.db_type must be set before
        # self._init_reader_schema(...) is called.
        super(DBFileReader, self).__init__(self._init_reader_schema())
        self.ds = Dataset(self._schema, self.name + '_dataset')
        self.ds_reader = None
Example #8
    def test_local_session(self):
        init_net = core.Net('init')
        src_values = Struct(
            ('uid', np.array([1, 2, 6])),
            ('value', np.array([1.4, 1.6, 1.7])))
        expected_dst = Struct(
            ('uid', np.array([2, 4, 12])),
            ('value', np.array([0.0, 0.0, 0.0])))

        with core.NameScope('init'):
            src_blobs = NewRecord(init_net, src_values)
            dst_blobs = InitEmptyRecord(init_net, src_values.clone_schema())

        def proc1(rec):
            net = core.Net('proc1')
            with core.NameScope('proc1'):
                out = NewRecord(net, rec)
            net.Add([rec.uid(), rec.uid()], [out.uid()])
            out.value.set(blob=rec.value(), unsafe=True)
            return [net], out

        def proc2(rec):
            net = core.Net('proc2')
            with core.NameScope('proc2'):
                out = NewRecord(net, rec)
            out.uid.set(blob=rec.uid(), unsafe=True)
            net.Sub([rec.value(), rec.value()], [out.value()])
            return [net], out

        src_ds = Dataset(src_blobs)
        dst_ds = Dataset(dst_blobs)

        with TaskGroup() as tg:
            out1 = pipe(src_ds.reader(), processor=proc1)
            out2 = pipe(out1, processor=proc2)
            pipe(out2, dst_ds.writer())

        ws = workspace.C.Workspace()
        FeedRecord(src_blobs, src_values, ws)
        session = LocalSession(ws)
        session.run(init_net)
        session.run(tg)
        output = FetchRecord(dst_blobs, ws=ws)

        for a, b in zip(output.field_blobs(), expected_dst.field_blobs()):
            np.testing.assert_array_equal(a, b)
Example #9
    def __init__(self, reader, db_type='leveldb', name='cached_reader'):
        super(CachedReader, self).__init__(reader.schema())
        self.original_reader = reader
        self.cache_path = None
        self.ds_reader = None
        self.ds = Dataset(self._schema, name)
        self.db_type = db_type
        self.name = name
        self.field_names = self._schema.field_names()
Example #10
    def test_composite_reader_builder(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        num_srcs = 3
        names = ["src_{}".format(i) for i in range(num_srcs)]
        size = 100
        offsets = [i * size for i in range(num_srcs)]
        src_ds_builders = [
            TestReaderBuilder(offset=offset, size=size, name=name)
            for (name, offset) in zip(names, offsets)
        ]

        # Create an identically sized empty destination dataset
        dst_init = core.Net('dst_init')
        with core.NameScope('dst'):
            dst_ds = Dataset(
                schema.Struct(*[(name, src_ds_builder.schema())
                                for name, src_ds_builder in zip(
                                    names, src_ds_builders)]))
            dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader_builder = CompositeReaderBuilder(names, src_ds_builders)
            reader_builder.setup(ws=ws)
            pipe(reader_builder.new_reader(),
                 dst_ds.writer(),
                 num_runtime_threads=3)
        session.run(tg)

        for name, offset in zip(names, offsets):
            written_data = sorted(
                ws.fetch_blob(str(dst_ds.content()[name].label())))
            npt.assert_array_equal(range(offset, offset + size), written_data)
Example #11
def build_job():
    with Node('reader'):
        with Job() as job:
            with job.init_group:
                init_net = core.Net('init_net')
                data_arr = Struct(('val', np.array(range(10))))
                data = ConstRecord(init_net, data_arr)
                ds = Dataset(data)
                full_reader = ds.reader(init_net)
                total = init_net.Const([100])
                Task(step=init_net)

            def inc_total(rec):
                net = core.Net('inc_total')
                net.Add([total, rec.val()], [total])
                return [net]

            epoch_reader = ReaderWithLimit(full_reader, num_iter=3)
            pipe(epoch_reader, processor=inc_total)
            job.add_stop_signal(epoch_reader.data_finished())

        total_fetcher = Task(step=core.Net('empty'), outputs=[total])
    return job, total_fetcher
Example #15
    def test_composite_reader(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        num_srcs = 3
        names = ["src_{}".format(i) for i in range(num_srcs)]
        size = 100
        offsets = [i * size for i in range(num_srcs)]
        src_dses = [
            init_dataset(ws, offset=offset, size=size, name=name)
            for (name, offset) in zip(names, offsets)
        ]

        data = [ws.fetch_blob(str(src.field_blobs[0])) for src in src_dses]
        # Sanity check we didn't overwrite anything
        for d, offset in zip(data, offsets):
            npt.assert_array_equal(d, range(offset, offset + size))

        # Create an identically sized empty destination dataset
        dst_init = core.Net('dst_init')
        with core.NameScope('dst'):
            dst_ds = Dataset(
                schema.Struct(*[(name, src_ds.content().clone_schema())
                                for name, src_ds in zip(names, src_dses)]))
            dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader = CompositeReader(names,
                                     [src_ds.reader() for src_ds in src_dses])
            pipe(reader, dst_ds.writer(), num_runtime_threads=3)
        session.run(tg)

        for i in range(num_srcs):
            written_data = sorted(
                ws.fetch_blob(str(dst_ds.content()[names[i]].label())))
            npt.assert_array_equal(data[i], written_data)
Example #16
    def test_reader_with_limit(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)
        """ 1. feed full dataset """
        src_init = core.Net('src_init')
        src_values = Struct(('label', np.array(range(100))))
        src_blobs = NewRecord(src_init, src_values)
        src_ds = Dataset(src_blobs)
        FeedRecord(src_blobs, src_values, ws)
        ws.run(src_init)
        """ 2. Read with limit smaller than size of dataset """
        dst_init = core.Net('dst_init')
        dst_ds = Dataset(src_values.clone_schema())
        dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
        self.assertEqual(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(10)))
        """ 3. Read with limit larger than size of dataset """
        ws.run(dst_init)
        with TaskGroup() as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertEqual(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(100)))
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())
Example #17
class DBFileReader(Reader):

    default_name_suffix = 'db_file_reader'
    """Reader reads from a DB file.

    Example usage:
    db_file_reader = DBFileReader(db_path='/tmp/cache.db', db_type='LevelDB')

    Args:
        db_path: str.
        db_type: str. DB type of the file. A db_type is registered by
            `REGISTER_CAFFE2_DB(<db_type>, <DB Class>)`.
        name: str or None. Name of the DBFileReader.
            Optional name to prepend to blobs that will store the data.
            Defaults to '<db_name>_<default_name_suffix>'.
        batch_size: int.
            How many examples are read each time the read_net is run.
        loop_over: bool.
            If True, goes through the examples in random order endlessly.
        field_names: List[str]. Must be specified if schema.field_names()
            is not in alphabetical order; otherwise, the schema is restored
            automatically with schema.field_names() sorted in alphabetical
            order.
    """
    def __init__(
        self,
        db_path,
        db_type,
        name=None,
        batch_size=100,
        loop_over=False,
        field_names=None,
    ):
        assert db_path is not None, "db_path can't be None."
        assert db_type in C.registered_dbs(), \
            "db_type [{db_type}] is not available. \n" \
            "Choose one of these: {registered_dbs}.".format(
                db_type=db_type,
                registered_dbs=C.registered_dbs(),
        )

        self.db_path = os.path.expanduser(db_path)
        self.db_type = db_type
        self.name = name or '{db_name}_{default_name_suffix}'.format(
            db_name=self._extract_db_name_from_db_path(),
            default_name_suffix=self.default_name_suffix,
        )
        self.batch_size = batch_size
        self.loop_over = loop_over

        # self.db_path and self.db_type must be set before
        # self._init_reader_schema(...) is called.
        super(DBFileReader,
              self).__init__(self._init_reader_schema(field_names))
        self.ds = Dataset(self._schema, self.name + '_dataset')
        self.ds_reader = None

    def _init_name(self, name):
        return name or self._extract_db_name_from_db_path() + '_db_file_reader'

    def _init_reader_schema(self, field_names=None):
        """Restore a reader schema from the DB file.

        If `field_names` is given, restore the schema from it.

        Otherwise, load blobs from the DB file into the workspace
        and restore the schema from the blob names.
        It is also assumed that:
        1). Each field of the schema has corresponding blobs
            stored in the DB file.
        2). Each blob loaded from the DB file corresponds to
            a field of the schema.
        3). field_names in the original schema are in alphabetical order,
            since blob names loaded into the workspace from the DB file
            will be in alphabetical order.

        Load a set of blobs from a DB file. From names of these blobs,
        restore the DB file schema using `from_column_list(...)`.

        Returns:
            schema: schema.Struct. Used in Reader.__init__(...).
        """
        if field_names:
            return from_column_list(field_names)

        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        with core.NameScope(self.name):
            # blob_prefix avoids name conflicts in the workspace
            blob_prefix = scope.CurrentNameScope()
        workspace.RunOperatorOnce(
            core.CreateOperator(
                'Load',
                [],
                [],
                absolute_path=True,
                db=self.db_path,
                db_type=self.db_type,
                load_all=True,
                add_prefix=blob_prefix,
            ))
        col_names = [
            blob_name[len(blob_prefix):] for blob_name in workspace.Blobs()
            if blob_name.startswith(blob_prefix)
        ]
        schema = from_column_list(col_names)
        return schema

    def setup_ex(self, init_net, finish_net):
        """From the Dataset, create a _DatasetReader and setup a init_net.

        Make sure the _init_field_blobs_as_empty(...) is only called once.

        Because the underlying NewRecord(...) creats blobs by calling
        NextScopedBlob(...), so that references to previously-initiated
        empty blobs will be lost, causing accessibility issue.
        """
        if self.ds_reader:
            self.ds_reader.setup_ex(init_net, finish_net)
        else:
            self._init_field_blobs_as_empty(init_net)
            self._feed_field_blobs_from_db_file(init_net)
            self.ds_reader = self.ds.random_reader(
                init_net,
                batch_size=self.batch_size,
                loop_over=self.loop_over,
            )
            self.ds_reader.sort_and_shuffle(init_net)
            self.ds_reader.computeoffset(init_net)

    def read(self, read_net):
        assert self.ds_reader, 'setup_ex must be called first'
        return self.ds_reader.read(read_net)

    def _init_field_blobs_as_empty(self, init_net):
        """Initialize dataset field blobs by creating an empty record"""
        with core.NameScope(self.name):
            self.ds.init_empty(init_net)

    def _feed_field_blobs_from_db_file(self, net):
        """Load from the DB file at db_path and feed dataset field blobs"""
        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        net.Load(
            [],
            self.ds.get_blobs(),
            db=self.db_path,
            db_type=self.db_type,
            absolute_path=True,
            source_blob_names=self.ds.field_names(),
        )

    def _extract_db_name_from_db_path(self):
        """Extract DB name from DB path

            E.g. given self.db_path=`/tmp/sample.db`,
            it returns `sample`.

            Returns:
                db_name: str.
        """
        return os.path.basename(self.db_path).rsplit('.', 1)[0]
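
A hedged usage sketch for this reader, assuming a LevelDB file already exists at the given path with blobs that match the schema; it simply drains the reader into a destination Dataset the way the tests above do (path, batch size and thread count are illustrative):

# Hypothetical usage of DBFileReader.
reader = DBFileReader(db_path='/tmp/cache.db', db_type='LevelDB', batch_size=64)

ws = workspace.C.Workspace()
session = LocalSession(ws)

dst_init = core.Net('dst_init')
with core.NameScope('dst'):
    dst_ds = Dataset(reader.schema().clone_schema())
    dst_ds.init_empty(dst_init)
ws.run(dst_init)

with TaskGroup() as tg:
    pipe(reader, dst_ds.writer(), num_runtime_threads=4)
session.run(tg)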
Example #18
class DBFileReader(Reader):

    default_name_suffix = 'db_file_reader'

    """Reader reads from a DB file.

    Example usage:
    db_file_reader = DBFileReader(db_path='/tmp/cache.db', db_type='LevelDB')

    Args:
        db_path: str.
        db_type: str. DB type of the file. A db_type is registered by
            `REGISTER_CAFFE2_DB(<db_type>, <DB Class>)`.
        name: str or None. Name of the DBFileReader.
            Optional name to prepend to blobs that will store the data.
            Defaults to '<db_name>_<default_name_suffix>'.
        batch_size: int.
            How many examples are read each time the read_net is run.
    """
    def __init__(
        self,
        db_path,
        db_type,
        name=None,
        batch_size=100,
    ):
        assert db_path is not None, "db_path can't be None."
        assert db_type in C.registered_dbs(), \
            "db_type [{db_type}] is not available. \n" \
            "Choose one of these: {registered_dbs}.".format(
                db_type=db_type,
                registered_dbs=C.registered_dbs(),
        )

        self.db_path = db_path
        self.db_type = db_type
        self.name = name or '{db_name}_{default_name_suffix}'.format(
            db_name=self._extract_db_name_from_db_path(),
            default_name_suffix=self.default_name_suffix,
        )
        self.batch_size = batch_size

        # self.db_path and self.db_type must be set before
        # self._init_reader_schema(...) is called.
        super(DBFileReader, self).__init__(self._init_reader_schema())
        self.ds = Dataset(self._schema, self.name + '_dataset')
        self.ds_reader = None

    def _init_name(self, name):
        return name or self._extract_db_name_from_db_path(
        ) + '_db_file_reader'

    def _init_reader_schema(self):
        """Restore a reader schema from the DB file.

        Here it is assumed that:
        1). Each field of the schema has corresponding blobs
            stored in the DB file.
        2). Each blob loaded from the DB file corresponds to
            a field of the schema.

        Load a set of blobs from a DB file. From names of these blobs,
        restore the DB file schema using `from_column_list(...)`.

        Returns:
            schema: schema.Struct. Used in Reader.__init__(...).
        """
        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        with core.NameScope(self.name):
            # blob_prefix avoids name conflicts in the workspace
            blob_prefix = scope.CurrentNameScope()
        workspace.RunOperatorOnce(
            core.CreateOperator(
                'Load',
                [],
                [],
                absolute_path=True,
                db=self.db_path,
                db_type=self.db_type,
                load_all=True,
                add_prefix=blob_prefix,
            )
        )
        col_names = [
            blob_name[len(blob_prefix):] for blob_name in workspace.Blobs()
            if blob_name.startswith(blob_prefix)
        ]
        schema = from_column_list(col_names)
        return schema

    def setup_ex(self, init_net, finish_net):
        """From the Dataset, create a _DatasetReader and setup a init_net.

        Make sure the _init_field_blobs_as_empty(...) is only called once.

        Because the underlying NewRecord(...) creats blobs by calling
        NextScopedBlob(...), so that references to previously-initiated
        empty blobs will be lost, causing accessibility issue.
        """
        if self.ds_reader:
            self.ds_reader.setup_ex(init_net, finish_net)
        else:
            self._init_field_blobs_as_empty(init_net)
            self._feed_field_blobs_from_db_file(init_net)
            self.ds_reader = self.ds.reader(
                init_net,
                batch_size=self.batch_size,
            )

    def read(self, read_net):
        assert self.ds_reader, 'setup_ex must be called first'
        return self.ds_reader.read(read_net)

    def _init_field_blobs_as_empty(self, init_net):
        """Initialize dataset field blobs by creating an empty record"""
        with core.NameScope(self.name):
            self.ds.init_empty(init_net)

    def _feed_field_blobs_from_db_file(self, net):
        """Load from the DB file at db_path and feed dataset field blobs"""
        assert os.path.exists(self.db_path), \
            'db_path [{db_path}] does not exist'.format(db_path=self.db_path)
        net.Load(
            [],
            self.ds.get_blobs(),
            db=self.db_path,
            db_type=self.db_type,
            absolute_path=True,
            source_blob_names=self.ds.field_names(),
        )

    def _extract_db_name_from_db_path(self):
        """Extract DB name from DB path

            E.g. given self.db_path=`/tmp/sample.db`,
            it returns `sample`.

            Returns:
                db_name: str.
        """
        return os.path.basename(self.db_path).rsplit('.', 1)[0]
Example #19
    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(
                Scalar(np.int32),
                Scalar(np.float32))),
        )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step(
                'consumer_' + str(i),
                [step_consume, step_dataset],
                should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step(
            'consumers', consumer_steps, concurrent_substeps=True)

        work_steps = core.execution_step(
            'work', [producer_step, consumer_step], concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)
Example #20
class CachedReader(Reader):
    """
    Reader with persistent in-file cache.

    Example usage:
    cached_reader = CachedReader(reader)
    build_cache_step = cached_reader.build_cache('/tmp/cache.db')
    with LocalSession() as session:
        session.run(build_cache_step)

    Every time a new reader is created, build_cache is expected to be called
    before setup_ex and before the reader is used. build_cache checks for the
    existence of the provided file path and, if it is missing, initializes it
    by reading data from the original reader. All subsequent reads ignore the
    original reader (i.e. no additional data is read from it).
    """

    def __init__(self, reader, db_type='leveldb', name='cached_reader'):
        super(CachedReader, self).__init__(reader.schema())
        self.original_reader = reader
        self.cache_path = None
        self.ds_reader = None
        self.ds = Dataset(self._schema, name)
        self.db_type = db_type
        self.name = name
        self.field_names = self._schema.field_names()

    def setup_ex(self, init_net, finish_net):
        assert self.cache_path, 'build_cache must be called first'
        self._init_dataset(init_net)
        self._load_from_file(init_net)
        self.ds_reader = self.ds.reader(init_net, batch_size=100)

    def read(self, read_net):
        assert self.ds_reader, 'setup must be called first'
        return self.ds_reader.read(read_net)

    def has_cache(self):
        return self.cache_path and os.path.exists(self.cache_path)

    def build_cache(self, cache_path, overwrite=False):
        if not self.has_cache() or overwrite:
            self.cache_path = cache_path
        if self.has_cache() and not overwrite:
            # cache already exists, no need to rebuild it
            return core.execution_step('build_step', [])

        init_net = core.Net('init')
        self._init_dataset(init_net)
        with Cluster(), core.NameScope(self.name), TaskGroup() as copy_tg:
            pipe(self.original_reader, self.ds.writer(), num_threads=16)
            copy_step = copy_tg.to_task().get_step()
        save_net = core.Net('save')
        self._save_to_file(save_net)

        return core.execution_step('build_cache', [init_net, copy_step, save_net])

    def _init_dataset(self, init_net):
        with core.NameScope(self.name):
            self.ds.init_empty(init_net)

    def _save_to_file(self, net):
        net.Save(
            self.ds.content().field_blobs(),
            [],
            db=self.cache_path,
            db_type=self.db_type,
            blob_name_overrides=self.field_names,
            absolute_path=True,
        )

    def _load_from_file(self, net):
        net.Load(
            [],
            self.ds.content().field_blobs(),
            db=self.cache_path,
            db_type=self.db_type,
            absolute_path=True,
            source_blob_names=self.field_names,
        )
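
A hedged sketch expanding the docstring's usage example: build the cache once, then pipe the cached reader into a destination Dataset as in the tests above (the original_reader, path and thread count are illustrative assumptions):

# Hypothetical usage; assumes `original_reader` is any Reader with a schema.
cached_reader = CachedReader(original_reader, db_type='leveldb', name='my_cache')
build_cache_step = cached_reader.build_cache('/tmp/cache.db')

ws = workspace.C.Workspace()
session = LocalSession(ws)
session.run(build_cache_step)

dst_init = core.Net('dst_init')
with core.NameScope('cache_dst'):
    dst_ds = Dataset(cached_reader.schema().clone_schema())
    dst_ds.init_empty(dst_init)
ws.run(dst_init)

with TaskGroup() as tg:
    pipe(cached_reader, dst_ds.writer(), num_runtime_threads=4)
session.run(tg)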
Example #21
    def test_reader_with_limit(self):
        ws = workspace.C.Workspace()
        session = LocalSession(ws)

        """ 1. feed full dataset """
        src_ds = init_dataset(ws)

        """ 2. Read with limit smaller than size of dataset """
        dst_init = core.Net('dst_init')
        with core.NameScope('dst'):
            dst_ds = Dataset(src_ds.content().clone_schema())
            dst_ds.init_empty(dst_init)
        ws.run(dst_init)

        # WorkspaceType.GLOBAL is required because we are fetching
        # reader.data_finished() after the TaskGroup finishes.
        with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=10)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)

        self.assertFalse(ws.blobs[str(reader.data_finished())].fetch())
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(10))
        )

        """ 3. Read with limit larger than size of dataset """
        ws.run(dst_init)
        with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=110)
            pipe(reader, dst_ds.writer(), num_runtime_threads=8)
        session.run(tg)
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(100))
        )
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

        """ 4. Read without counter """
        ws.run(dst_init)
        with TaskGroup(workspace_type=WorkspaceType.GLOBAL) as tg:
            reader = ReaderWithLimit(src_ds.reader(), num_iter=None)
            pipe(reader, dst_ds.writer(), num_threads=8)
        session.run(tg)
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            list(range(100))
        )
        self.assertTrue(ws.blobs[str(reader.data_finished())].fetch())

        """ 5. Read using the same reader without resetting workspace """
        session.run(tg)
        self.assertEquals(
            sorted(ws.blobs[str(dst_ds.content().label())].fetch()),
            sorted(list(range(100)) * 2)
        )
Example #22
    def test_record_queue(self):
        num_prod = 8
        num_consume = 3
        schema = Struct(
            ('floats', Map(Scalar(np.int32), Scalar(np.float32))), )
        contents_raw = [
            [1, 2, 3],  # len
            [11, 21, 22, 31, 32, 33],  # key
            [1.1, 2.1, 2.2, 3.1, 3.2, 3.3],  # value
        ]
        contents = from_blob_list(schema, contents_raw)
        ds = Dataset(schema)
        net = core.Net('init')
        ds.init_empty(net)

        content_blobs = NewRecord(net, contents)
        FeedRecord(content_blobs, contents)
        writer = ds.writer(init_net=net)
        writer.write_record(net, content_blobs)
        reader = ds.reader(init_net=net)

        # prepare receiving dataset
        rec_dataset = Dataset(contents, name='rec')
        rec_dataset.init_empty(init_net=net)
        rec_dataset_writer = rec_dataset.writer(init_net=net)

        workspace.RunNetOnce(net)

        queue = RecordQueue(contents, num_threads=num_prod)

        def process(net, fields):
            new_fields = []
            for f in fields.field_blobs():
                new_f = net.Copy(f)
                new_fields.append(new_f)
            new_fields = from_blob_list(fields, new_fields)
            return new_fields

        q_reader, q_step, q_exit, fields = queue.build(reader, process)
        producer_step = core.execution_step('producer', [q_step, q_exit])

        consumer_steps = []
        for i in range(num_consume):
            name = 'queue_reader_' + str(i)
            net_consume = core.Net(name)
            should_stop, fields = q_reader.read_record(net_consume)
            step_consume = core.execution_step(name, net_consume)

            name = 'dataset_writer_' + str(i)
            net_dataset = core.Net(name)
            rec_dataset_writer.write(net_dataset, fields.field_blobs())
            step_dataset = core.execution_step(name, net_dataset)

            step = core.execution_step('consumer_' + str(i),
                                       [step_consume, step_dataset],
                                       should_stop_blob=should_stop)
            consumer_steps.append(step)
        consumer_step = core.execution_step('consumers',
                                            consumer_steps,
                                            concurrent_substeps=True)

        work_steps = core.execution_step('work',
                                         [producer_step, consumer_step],
                                         concurrent_substeps=True)

        plan = core.Plan('test')
        plan.AddStep(work_steps)
        core.workspace.RunPlan(plan)
        data = workspace.FetchBlobs(rec_dataset.get_blobs())
        self.assertEqual(6, sum(data[0]))
        self.assertEqual(150, sum(data[1]))
        self.assertAlmostEqual(15, sum(data[2]), places=5)