def _test_round(self, dumped_index, start_time, end_time):
        partition_num = self.data_source.data_source_meta.partition_num
        for i in range(partition_num):
            self.manifest_manager.forward_peer_dumped_index(i, dumped_index)
        visitor = data_block_visitor.DataBlockVisitor(
            self.data_source.data_source_meta.name, self.db_database,
            self.db_base_dir, self.db_addr, self.db_username, self.db_password,
            True)
        reps = visitor.LoadDataBlockRepByTimeFrame(start_time, end_time)
        metas = [
            meta for meta in self.data_block_matas
            if (not (meta.start_time > end_time or meta.end_time < start_time)
                and meta.data_block_index <= dumped_index)
        ]
        self.assertEqual(len(reps), len(metas))
        for meta in metas:
            self.assertTrue(meta.block_id in reps)
            rep = reps[meta.block_id]
            self.assertEqual(meta.block_id, rep.block_id)
            self.assertEqual(meta.start_time, rep.start_time)
            self.assertEqual(meta.end_time, rep.end_time)
            self.assertEqual(meta.partition_id, rep.partition_id)
            self.assertEqual(meta, rep.data_block_meta)
            data_block_fpath = os.path.join(
                common.data_source_data_block_dir(self.data_source),
                common.partition_repr(meta.partition_id),
                meta.block_id + common.DataBlockSuffix)
            self.assertEqual(data_block_fpath, rep.data_block_fpath)

        for i in range(0, 100):
            rep = visitor.LoadDataBlockReqByIndex(
                random.randint(0, partition_num - 1),
                random.randint(0, dumped_index))
            try:
                meta = [meta for meta in self.data_block_matas if \
                        meta.block_id == rep.block_id][0]
            except Exception as e:
                print(e)
            self.assertEqual(meta.block_id, rep.block_id)
            self.assertEqual(meta.start_time, rep.start_time)
            self.assertEqual(meta.end_time, rep.end_time)
            self.assertEqual(meta.partition_id, rep.partition_id)
            self.assertEqual(meta, rep.data_block_meta)
            data_block_fpath = os.path.join(
                common.data_source_data_block_dir(self.data_source),
                common.partition_repr(meta.partition_id),
                meta.block_id + common.DataBlockSuffix)
            self.assertEqual(data_block_fpath, rep.data_block_fpath)
            self.assertIsNone(
                visitor.LoadDataBlockReqByIndex(
                    random.randint(0, partition_num - 1),
                    random.randint(dumped_index, 10000)))
Exemple #2
0
    def _create_data_block(self, partition_id):
        dbm = data_block_manager.DataBlockManager(self.data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)

        leader_index = 0
        follower_index = 65536
        for i in range(64):
            builder = DataBlockBuilder(
                    common.data_source_data_block_dir(self.data_source),
                    self.data_source.data_source_meta.name,
                    partition_id, i,
                    dj_pb.WriterOptions(output_writer='TF_RECORD'), None
                )
            builder.set_data_block_manager(dbm)
            for j in range(4):
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = random.randint(0, 10)
                feat['event_time'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                        int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            self.data_block_matas.append(builder.finish_data_block())
Exemple #3
0
    def add_data_block(self, partition_id, x, y):
        dbm = self._dbms[partition_id]

        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name, partition_id,
            dbm.get_dumped_data_block_count(),
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
        builder.set_data_block_manager(dbm)
        for i in range(x.shape[0]):
            feat = {}
            exam_id = '{}'.format(i).encode()
            feat['example_id'] = Feature(
                bytes_list=BytesList(value=[exam_id]))
            feat['event_time'] = Feature(
                int64_list = Int64List(value=[i])
            )
            feat['x'] = Feature(float_list=FloatList(value=list(x[i])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[i]]))

            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()), i, 0)

        return builder.finish_data_block()
 def _make_data_block_builder(self, meta):
     assert self._partition_id == meta.partition_id, \
         "partition id of building data block meta mismatch "\
         "{} != {}".format(self._partition_id, meta.partition_id)
     builder = None
     expt = None
     try:
         builder = DataBlockBuilder(
                 common.data_source_data_block_dir(self._data_source),
                 self._data_source.data_source_meta.name,
                 self._partition_id,
                 meta.data_block_index,
                 self._data_block_builder_options
             )
         builder.init_by_meta(meta)
         builder.set_data_block_manager(self._data_block_manager)
         yield builder
     except Exception as e: # pylint: disable=broad-except
         logging.warning("Failed make data block builder, " \
                          "reason %s", e)
         expt = e
     if builder is not None:
         del builder
     if expt is not None:
         raise expt
 def _get_data_block_builder(self, create_if_no_existed):
     if self._data_block_builder is None and create_if_no_existed:
         data_block_index = \
                 self._data_block_manager.get_dumped_data_block_count()
         self._data_block_builder = DataBlockBuilder(
             common.data_source_data_block_dir(self._data_source),
             self._data_source.data_source_meta.name, self._partition_id,
             data_block_index, self._data_block_builder_options,
             self._example_joiner_options.data_block_dump_threshold)
         self._data_block_builder.set_data_block_manager(
             self._data_block_manager)
         self._data_block_builder.set_follower_restart_index(
             self._follower_restart_index)
     return self._data_block_builder
Exemple #6
0
    def _create_data_block(self, data_source, partition_id, x, y):
        data_block_metas = []
        dbm = data_block_manager.DataBlockManager(data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)
        N = 200
        chunk_size = x.shape[0] // N

        leader_index = 0
        follower_index = N * chunk_size * 10
        for i in range(N):
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(data_source),
                data_source.data_source_meta.name,
                partition_id, i,
                dj_pb.WriterOptions(output_writer="TF_RECORD"), None
            )
            builder.set_data_block_manager(dbm)
            for j in range(chunk_size):
                feat = {}
                idx =  i * chunk_size + j
                exam_id = '{}'.format(idx).encode()
                feat['example_id'] = Feature(
                    bytes_list=BytesList(value=[exam_id]))
                evt_time = random.randint(1, 1000)
                feat['event_time'] = Feature(
                    int64_list = Int64List(value=[evt_time])
                )
                feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
                if y is not None:
                    feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))

                feat['leader_index'] = Feature(
                    int64_list = Int64List(value=[leader_index])
                )
                feat['follower_index'] = Feature(
                    int64_list = Int64List(value=[follower_index])
                )
                example = Example(features=Features(feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                   leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            data_block_metas.append(builder.finish_data_block())
        self.max_index = follower_index
        return data_block_metas
Exemple #7
0
 def generate_follower_data_block(self):
     dbm = data_block_manager.DataBlockManager(self.data_source_f, 0)
     self.assertEqual(dbm.get_dumped_data_block_count(), 0)
     self.assertEqual(dbm.get_lastest_data_block_meta(), None)
     leader_index = 0
     follower_index = 65536
     self.dumped_metas = []
     for i in range(5):
         builder = DataBlockBuilder(
             common.data_source_data_block_dir(self.data_source_f),
             self.data_source_f.data_source_meta.name, 0, i,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         builder.set_data_block_manager(dbm)
         for j in range(1024):
             feat = {}
             example_id = '{}'.format(i * 1024 + j).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + i * 1024 + j
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             feat['leader_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[leader_index]))
             feat['follower_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[follower_index]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 leader_index, follower_index)
             leader_index += 3
             follower_index += 1
         meta = builder.finish_data_block()
         self.dumped_metas.append(meta)
     self.leader_start_index = 0
     self.leader_end_index = leader_index
     self.assertEqual(dbm.get_dumped_data_block_count(), 5)
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(dbm.get_data_block_meta_by_index(idx), meta)
Exemple #8
0
 def _partition_data_block_dir(self, partition_id):
     return os.path.join(data_source_data_block_dir(self._data_source),
                         partition_repr(partition_id))
    def test_data_block_manager(self):
        data_block_datas = []
        data_block_metas = []
        leader_index = 0
        follower_index = 65536
        for i in range(5):
            fill_examples = []
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(self.data_source),
                self.data_source.data_source_meta.name, 0, i,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(self.data_block_manager)
            for j in range(1024):
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = 150000000 + i * 1024 + j
                feat['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(
                    feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                fill_examples.append((example, {
                    'example_id': example_id,
                    'event_time': event_time,
                    'leader_index': leader_index,
                    'follower_index': follower_index
                }))
                leader_index += 1
                follower_index += 1
            meta = builder.finish_data_block()
            data_block_datas.append(fill_examples)
            data_block_metas.append(meta)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
        self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                         data_block_metas[-1])
        for (idx, meta) in enumerate(data_block_metas):
            self.assertEqual(
                self.data_block_manager.get_data_block_meta_by_index(idx),
                meta)
            self.assertEqual(
                meta.block_id,
                common.encode_block_id(self.data_source.data_source_meta.name,
                                       meta))
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(5), None)
        data_block_dir = os.path.join(
            common.data_source_data_block_dir(self.data_source),
            common.partition_repr(0))
        for (i, meta) in enumerate(data_block_metas):
            data_block_fpath = os.path.join(
                data_block_dir, meta.block_id) + common.DataBlockSuffix
            data_block_meta_fpath = os.path.join(
                data_block_dir,
                common.encode_data_block_meta_fname(
                    self.data_source.data_source_meta.name, 0,
                    meta.data_block_index))
            self.assertTrue(gfile.Exists(data_block_fpath))
            self.assertTrue(gfile.Exists(data_block_meta_fpath))
            fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
            remote_meta = text_format.Parse(
                next(fiter).decode(), dj_pb.DataBlockMeta())
            self.assertEqual(meta, remote_meta)
            for (j, record) in enumerate(
                    tf.io.tf_record_iterator(data_block_fpath)):
                example = tf.train.Example()
                example.ParseFromString(record)
                stored_data = data_block_datas[i][j]
                self.assertEqual(example, stored_data[0])
                feat = example.features.feature
                stored_feat = stored_data[1]
                self.assertTrue('example_id' in feat)
                self.assertTrue('example_id' in stored_feat)
                self.assertEqual(stored_feat['example_id'],
                                 '{}'.format(i * 1024 + j).encode())
                self.assertEqual(stored_feat['example_id'],
                                 feat['example_id'].bytes_list.value[0])
                self.assertTrue('event_time' in feat)
                self.assertTrue('event_time' in stored_feat)
                self.assertEqual(stored_feat['event_time'],
                                 feat['event_time'].int64_list.value[0])
                self.assertTrue('leader_index' in feat)
                self.assertTrue('leader_index' in stored_feat)
                self.assertEqual(stored_feat['leader_index'],
                                 feat['leader_index'].int64_list.value[0])
                self.assertTrue('follower_index' in feat)
                self.assertTrue('follower_index' in stored_feat)
                self.assertEqual(stored_feat['follower_index'],
                                 feat['follower_index'].int64_list.value[0])
            self.assertEqual(j, 1023)

        data_block_manager2 = data_block_manager.DataBlockManager(
            self.data_source, 0)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
Exemple #10
0
 def test_data_block_dumper(self):
     self.generate_follower_data_block()
     self.generate_leader_raw_data()
     dbd = data_block_dumper.DataBlockDumperManager(
         self.etcd, self.data_source_l, 0,
         dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                              read_ahead_size=1 << 20,
                              read_batch_size=128),
         dj_pb.WriterOptions(output_writer='TF_RECORD'))
     self.assertEqual(dbd.get_next_data_block_index(), 0)
     for (idx, meta) in enumerate(self.dumped_metas):
         success, next_index = dbd.add_synced_data_block_meta(meta)
         self.assertTrue(success)
         self.assertEqual(next_index, idx + 1)
     self.assertTrue(dbd.need_dump())
     self.assertEqual(dbd.get_next_data_block_index(),
                      len(self.dumped_metas))
     with dbd.make_data_block_dumper() as dumper:
         dumper()
     dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
     dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      len(self.dumped_metas))
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      dbm_l.get_dumped_data_block_count())
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(meta.data_block_index, idx)
         self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
         self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
         meta_fpth_l = os.path.join(
             common.data_source_data_block_dir(self.data_source_l),
             common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_l.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_l)
         meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_l, meta)
         meta_fpth_f = os.path.join(
             common.data_source_data_block_dir(self.data_source_f),
             common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_f.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_f)
         meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_f, meta)
         data_fpth_l = os.path.join(
             common.data_source_data_block_dir(self.data_source_l),
             common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_l))
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
         data_fpth_f = os.path.join(
             common.data_source_data_block_dir(self.data_source_f),
             common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_f))
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)