def _build_data_block_meta(self):
    """Write the data block meta to a temporary file, then publish it.

    The meta in ``self._data_block_meta`` is rendered with
    ``text_format.MessageToString``, encoded, and written as one record to
    the path returned by ``self._get_tmp_fpath()``. If a data block manager
    is attached, the temporary file is handed to its
    ``commit_data_block_meta``; otherwise it is renamed to the encoded meta
    file name inside ``self._get_data_block_dir()``.
    """
    staging_fpath = self._get_tmp_fpath()
    block_meta = self._data_block_meta
    with tf.io.TFRecordWriter(staging_fpath) as writer:
        meta_text = text_format.MessageToString(block_meta)
        writer.write(meta_text.encode())

    if self._data_block_manager is None:
        # No manager attached: move the temporary file to its final name.
        final_fname = encode_data_block_meta_fname(
            self._data_source_name,
            self._partition_id,
            block_meta.data_block_index)
        final_fpath = os.path.join(self._get_data_block_dir(), final_fname)
        gfile.Rename(staging_fpath, final_fpath)
        return

    self._data_block_manager.commit_data_block_meta(
        staging_fpath, block_meta)
Beispiel #2
0
 def LoadDataBlockReqByIndex(self, partition_id, data_block_index):
     """Look up one data block of a partition by its index.

     Args:
         partition_id: partition to search; must satisfy
             ``0 <= partition_id < partition_num`` of the data source meta.
         data_block_index: index used to build the meta file name.

     Returns:
         A ``DataBlockRep`` for the block, or ``None`` when
         ``load_data_block_meta`` returns ``None`` or when
         ``self._filter_by_visible`` reports a true value for the block.

     Raises:
         IndexError: if ``partition_id`` is outside the partition range.
     """
     partition_count = self._data_source.data_source_meta.partition_num
     below_range = partition_id < 0
     if below_range or partition_id >= partition_count:
         raise IndexError("partition {} out range".format(partition_id))

     block_dir = self._partition_data_block_dir(partition_id)
     meta_fpath = os.path.join(
         block_dir,
         encode_data_block_meta_fname(self._data_source_name(),
                                      partition_id,
                                      data_block_index))
     block_meta = load_data_block_meta(meta_fpath)
     # The manifest is synced regardless of whether the meta was found.
     raw_data_manifest = self._sync_raw_data_manifest(partition_id)

     if block_meta is None:
         return None
     if self._filter_by_visible(block_meta.data_block_index,
                                raw_data_manifest):
         return None

     block_fname = encode_data_block_fname(self._data_source_name(),
                                           block_meta)
     return DataBlockRep(self._data_source_name(), block_fname,
                         partition_id, block_dir)
Beispiel #3
0
 def __init__(self,
              data_source_name,
              data_block_fname,
              partition_id,
              dirpath,
              check_existed=True):
     """Describe a data block from its file name and validate it.

     The block id is the file name without ``DataBlockSuffix``. It is
     decoded with ``decode_block_id`` and the decoded fields are checked
     against the arguments before being stored on the instance.

     Args:
         data_source_name: name expected in the decoded block id.
         data_block_fname: data block file name; must end with
             ``DataBlockSuffix``.
         partition_id: partition expected in the decoded block id.
         dirpath: directory joined with the data block and meta file names.
         check_existed: when true, require the meta file to exist and not
             be a directory.

     Raises:
         AssertionError: if ``data_block_fname`` lacks ``DataBlockSuffix``.
         ValueError: on a data source name mismatch, a partition mismatch,
             a time frame whose start is after its end, or (with
             ``check_existed``) a missing meta file.
     """
     assert data_block_fname.endswith(DataBlockSuffix), \
         "data block fname {} should has suffix {}".format(
             data_block_fname, DataBlockSuffix)
     block_id = data_block_fname[:-len(DataBlockSuffix)]
     decoded = decode_block_id(block_id)

     if decoded["data_source_name"] != data_source_name:
         message = ("{} invalid. Data source name mismatch "
                    "{} != {}")
         raise ValueError(message.format(
             data_block_fname, decoded["data_source_name"],
             data_source_name))
     self._data_source_name = data_source_name

     if decoded["partition_id"] != partition_id:
         message = ("{} invalid. partition mismatch "
                    "{} != {}")
         raise ValueError(message.format(
             data_block_fname, decoded["partition_id"], partition_id))
     self._partition_id = partition_id

     begin_time = decoded["time_frame"][0]
     finish_time = decoded["time_frame"][1]
     if begin_time > finish_time:
         message = ("{} invalid. time frame error start_time {} > "
                    "end_time {}")
         raise ValueError(message.format(
             data_block_fname, begin_time, finish_time))
     self._start_time = begin_time
     self._end_time = finish_time

     self._data_block_index = decoded["data_block_index"]
     self._block_id = block_id

     meta_fpath = os.path.join(
         dirpath,
         encode_data_block_meta_fname(self._data_source_name,
                                      self._partition_id,
                                      self._data_block_index))
     if check_existed:
         # A usable meta file exists and is not a directory.
         meta_is_file = gfile.Exists(meta_fpath) and \
             not gfile.IsDirectory(meta_fpath)
         if not meta_is_file:
             message = ("{} invalid. the corresponding meta file "
                        "is not existed")
             raise ValueError(message.format(data_block_fname))

     self._data_block_meta_fpath = meta_fpath
     self._data_block_meta = None
     self._data_block_fpath = os.path.join(dirpath, data_block_fname)
 def _get_data_block_meta_path(self, data_block_index):
     """Return the meta file path for ``data_block_index``.

     The file name is produced by ``encode_data_block_meta_fname`` from the
     data source's name, ``self._partition_id`` and ``data_block_index``,
     then joined onto ``self._data_block_dir()``.
     """
     source_name = self._data_source.data_source_meta.name
     fname = encode_data_block_meta_fname(source_name,
                                          self._partition_id,
                                          data_block_index)
     block_dir = self._data_block_dir()
     return os.path.join(block_dir, fname)
    def test_data_block_manager(self):
        """Build five data blocks through the manager and verify the result.

        Five blocks of 1024 examples each are written for partition 0 with a
        ``DataBlockBuilder`` bound to ``self.data_block_manager``. The test
        then checks the manager's dumped count and cached metas, reads the
        meta and data files back from disk and compares every record with
        what was written, and finally checks that a freshly constructed
        ``DataBlockManager`` for the same partition reports the same count.
        """
        data_block_datas = []
        data_block_metas = []
        leader_index = 0
        follower_index = 65536
        for i in range(5):
            fill_examples = []
            builder = DataBlockBuilder(
                self.data_source.data_block_dir,
                self.data_source.data_source_meta.name, 0, i,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(self.data_block_manager)
            for j in range(1024):
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = 150000000 + i * 1024 + j
                feat['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(
                    feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                # Keep both the proto and the plain values for later checks.
                fill_examples.append((example, {
                    'example_id': example_id,
                    'event_time': event_time,
                    'leader_index': leader_index,
                    'follower_index': follower_index
                }))
                leader_index += 1
                follower_index += 1
            meta = builder.finish_data_block()
            data_block_datas.append(fill_examples)
            data_block_metas.append(meta)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
        self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                         data_block_metas[-1])
        for (idx, meta) in enumerate(data_block_metas):
            self.assertEqual(
                self.data_block_manager.get_data_block_meta_by_index(idx),
                meta)
            self.assertEqual(
                meta.block_id,
                common.encode_block_id(self.data_source.data_source_meta.name,
                                       meta))
        # Index 5 is one past the last dumped block.
        self.assertIsNone(
            self.data_block_manager.get_data_block_meta_by_index(5))
        data_block_dir = os.path.join(self.data_source.data_block_dir,
                                      common.partition_repr(0))
        for (i, meta) in enumerate(data_block_metas):
            data_block_fpath = os.path.join(
                data_block_dir, meta.block_id) + common.DataBlockSuffix
            data_block_meta_fpath = os.path.join(
                data_block_dir,
                common.encode_data_block_meta_fname(
                    self.data_source.data_source_meta.name, 0,
                    meta.data_block_index))
            self.assertTrue(gfile.Exists(data_block_fpath))
            self.assertTrue(gfile.Exists(data_block_meta_fpath))
            fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
            remote_meta = text_format.Parse(
                next(fiter).decode(), dj_pb.DataBlockMeta())
            self.assertEqual(meta, remote_meta)
            # Count records explicitly: `j` is also the index of the fill
            # loop above, so it would already be 1023 even if this file
            # yielded no records at all.
            record_count = 0
            for (j, record) in enumerate(
                    tf.io.tf_record_iterator(data_block_fpath)):
                example = tf.train.Example()
                example.ParseFromString(record)
                stored_data = data_block_datas[i][j]
                self.assertEqual(example, stored_data[0])
                feat = example.features.feature
                stored_feat = stored_data[1]
                self.assertIn('example_id', feat)
                self.assertIn('example_id', stored_feat)
                self.assertEqual(stored_feat['example_id'],
                                 '{}'.format(i * 1024 + j).encode())
                self.assertEqual(stored_feat['example_id'],
                                 feat['example_id'].bytes_list.value[0])
                self.assertIn('event_time', feat)
                self.assertIn('event_time', stored_feat)
                self.assertEqual(stored_feat['event_time'],
                                 feat['event_time'].int64_list.value[0])
                self.assertIn('leader_index', feat)
                self.assertIn('leader_index', stored_feat)
                self.assertEqual(stored_feat['leader_index'],
                                 feat['leader_index'].int64_list.value[0])
                self.assertIn('follower_index', feat)
                self.assertIn('follower_index', stored_feat)
                self.assertEqual(stored_feat['follower_index'],
                                 feat['follower_index'].int64_list.value[0])
                record_count = j + 1
            self.assertEqual(record_count, 1024)

        # A manager constructed afterwards for the same partition must
        # report the same dumped count, so assert on the new instance.
        data_block_manager2 = data_block_manager.DataBlockManager(
            self.data_source, 0)
        self.assertEqual(data_block_manager2.get_dumped_data_block_count(),
                         5)
Beispiel #6
0
 def test_data_block_dumper(self):
     """Dump the synced data block metas on the leader and verify the files.

     After generating follower data blocks and leader raw data, every meta
     in ``self.dumped_metas`` is fed to a ``DataBlockDumperManager`` for
     partition 0 and the dumper is run. The test then checks that leader
     and follower managers report the same dumped count and metas, and
     that the meta and data files on both sides match each meta.
     """
     self.generate_follower_data_block()
     self.generate_leader_raw_data()
     dbd = data_block_dumper.DataBlockDumperManager(
         self.etcd,
         self.data_source_l,
         0,
         dj_pb.RawDataOptions(raw_data_iter='TF_RECORD'),
         dj_pb.DataBlockBuilderOptions(
             data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
     )
     self.assertEqual(dbd.get_next_data_block_index(), 0)
     for (idx, meta) in enumerate(self.dumped_metas):
         success, next_index = dbd.add_synced_data_block_meta(meta)
         self.assertTrue(success)
         self.assertEqual(next_index, idx + 1)
     self.assertTrue(dbd.need_dump())
     self.assertEqual(dbd.get_next_data_block_index(),
                      len(self.dumped_metas))
     with dbd.make_data_block_dumper() as dumper:
         dumper()
     dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
     dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      len(self.dumped_metas))
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      dbm_l.get_dumped_data_block_count())
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(meta.data_block_index, idx)
         self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
         self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
         meta_fpth_l = os.path.join(
             self.data_source_l.data_block_dir, common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_l.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_l)
         meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_l, meta)
         meta_fpth_f = os.path.join(
             self.data_source_f.data_block_dir, common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_f.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_f)
         meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_f, meta)
         data_fpth_l = os.path.join(
             self.data_source_l.data_block_dir, common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_l))
         # Track the record count per file instead of reusing the loop
         # index afterwards: the index is unbound for an empty file on the
         # first pass and stale (from an earlier loop) on later ones.
         record_count_l = 0
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
             record_count_l = iidx + 1
         self.assertEqual(len(meta.example_ids), record_count_l)
         # Use the follower's own data source name, matching how
         # meta_fpth_f is built above.
         data_fpth_f = os.path.join(
             self.data_source_f.data_block_dir, common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_f.data_source_meta.name, meta_f))
         record_count_f = 0
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
             record_count_f = iidx + 1
         self.assertEqual(len(meta.example_ids), record_count_f)