Example #1
0
 def test_data_block_dumper(self):
     """Dump data blocks on the leader side and verify they match the
     follower's blocks file-for-file and record-for-record.

     Flow: generate fixture data for both sides, feed the follower's
     dumped metas into a leader DataBlockDumperManager, dump, then
     compare on-disk meta files and data files under 'partition_0'.
     """
     self.generate_follower_data_block()
     self.generate_leader_raw_data()
     options = customized_options.CustomizedOptions()
     options.set_raw_data_iter('TF_RECORD')
     dbd = data_block_dumper.DataBlockDumperManager(
             self.etcd, self.data_source_l, 0, options
         )
     # A fresh manager for partition 0 starts with no blocks dumped.
     self.assertEqual(dbd.get_partition_id(), 0)
     self.assertEqual(dbd.get_next_data_block_index(), 0)
     # Appending each synced meta must advance the next index by one.
     for (idx, meta) in enumerate(self.dumped_metas):
         success, next_index = dbd.append_synced_data_block_meta(meta)
         self.assertTrue(success)
         self.assertEqual(next_index, idx + 1)
     self.assertTrue(dbd.need_dump())
     self.assertEqual(dbd.get_next_data_block_index(), len(self.dumped_metas))
     dbd.dump_data_blocks()
     # After dumping, both sides should report the same block count.
     dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
     dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
     self.assertEqual(dbm_f.get_dumped_data_block_num(), len(self.dumped_metas))
     self.assertEqual(dbm_f.get_dumped_data_block_num(),
                         dbm_l.get_dumped_data_block_num())
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(meta.data_block_index, idx)
         self.assertEqual(dbm_l.get_data_block_meta_by_index(idx)[0], meta)
         self.assertEqual(dbm_f.get_data_block_meta_by_index(idx)[0], meta)
         block_id = meta.block_id
         # The leader's on-disk meta file must parse back to the same meta.
         meta_fpth_l = os.path.join(self.data_source_l.data_block_dir, 'partition_0',
                                    block_id + common.DataBlockMetaSuffix)
         mitr = tf.io.tf_record_iterator(meta_fpth_l)
         meta_l = dj_pb.DataBlockMeta()
         meta_l.ParseFromString(next(mitr))
         self.assertEqual(meta_l, meta)
         # Same check for the follower's copy of the meta file.
         meta_fpth_f = os.path.join(self.data_source_f.data_block_dir, 'partition_0',
                                    block_id + common.DataBlockMetaSuffix)
         mitr = tf.io.tf_record_iterator(meta_fpth_f)
         meta_f = dj_pb.DataBlockMeta()
         meta_f.ParseFromString(next(mitr)) 
         self.assertEqual(meta_f, meta)
         # Every record in the leader's data file must carry the
         # example_id listed in the meta, in order, with exact count.
         data_fpth_l = os.path.join(self.data_source_l.data_block_dir, 'partition_0',
                                    block_id + common.DataBlockSuffix)
         for (iidx, record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
         # And likewise for the follower's data file.
         data_fpth_f = os.path.join(self.data_source_f.data_block_dir, 'partition_0',
                                    block_id + common.DataBlockSuffix)
         for (iidx, record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
Example #2
0
def load_data_block_meta(meta_fpath):
    """Read a DataBlockMeta proto from a meta file.

    Returns None when the file does not exist; otherwise parses the
    first record of the file as text-format DataBlockMeta (unknown
    fields tolerated for forward compatibility).
    """
    assert meta_fpath.endswith(DataBlockMetaSuffix)
    if not gfile.Exists(meta_fpath):
        return None
    with make_tf_record_iter(meta_fpath) as record_iter:
        raw_meta = next(record_iter).decode()
        parsed = text_format.Parse(raw_meta, dj_pb.DataBlockMeta(),
                                   allow_unknown_field=True)
    return parsed
Example #3
0
 def _sync_dumped_data_block_meta(self):
     """Rescan the partition's data block directory and rebuild the
     in-memory list of dumped DataBlockMeta.

     Files that are neither data blocks nor meta files are deleted;
     a meta file whose matching data block file is missing is treated
     as an orphan and deleted. The surviving metas are sorted by
     data_block_index and sanity-checked (consecutive indices,
     monotonic follower_restart_index / leader_start_index /
     leader_end_index); any violation aborts the process.
     """
     dumped_data_block_path = {}
     dumped_data_block_meta_path = {}
     dumped_data_block_meta = []
     data_block_dir = self._data_block_dir()
     if not gfile.Exists(data_block_dir):
         gfile.MakeDirs(data_block_dir)
     elif not gfile.IsDirectory(data_block_dir):
         logging.fatal("%s must be the directory of data block for "\
                       "partition %d", data_block_dir, self._partition_id)
         os._exit(-1)  # pylint: disable=protected-access
     for fpath in self._list_data_block_dir():
         fname = ntpath.basename(fpath)
         if fname.endswith(DataBlockSuffix):
             ftag = fname[:-len(DataBlockSuffix)]
             dumped_data_block_path[ftag] = fpath
         elif fname.endswith(DataBlockMetaSuffix):
             ftag = fname[:-len(DataBlockMetaSuffix)]
             dumped_data_block_meta_path[ftag] = fpath
         else:
             # Unknown file in the data block dir: clean it up.
             gfile.Remove(fpath)
     for (ftag, fpath) in dumped_data_block_meta_path.items():
         if ftag not in dumped_data_block_path:
             # Orphan meta: its data block file is missing, so drop the
             # meta file only. (The previous code also removed
             # dumped_data_block_path[ftag] here, which always raised
             # KeyError since ftag is known to be absent from that dict.)
             gfile.Remove(fpath)
         else:
             with make_tf_record_iter(fpath) as record_iter:
                 dbm = dj_pb.DataBlockMeta()
                 dbm.ParseFromString(next(record_iter))
                 dumped_data_block_meta.append(dbm)
     dumped_data_block_meta = sorted(dumped_data_block_meta,
                                     key=lambda meta: meta.data_block_index)
     for (idx, meta) in enumerate(dumped_data_block_meta):
         if meta.data_block_index != idx:
             logging.fatal("data_block_index is not consecutive")
             os._exit(-1)  # pylint: disable=protected-access
         if idx == 0:
             continue
         prev_meta = dumped_data_block_meta[idx - 1]
         if prev_meta.follower_restart_index > meta.follower_restart_index:
             logging.fatal("follower_restart_index is not Incremental")
             os._exit(-1)  # pylint: disable=protected-access
         if prev_meta.leader_start_index >= meta.leader_start_index:
             logging.fatal("leader_start_index is not Incremental")
             os._exit(-1)  # pylint: disable=protected-access
         if prev_meta.leader_end_index >= meta.leader_end_index:
             logging.fatal("leader_end_index is not Incremental")
             os._exit(-1)  # pylint: disable=protected-access
     with self._lock:
         # Only adopt the rescanned list if it is strictly longer, so a
         # concurrent update is never rolled back.
         if len(dumped_data_block_meta) > len(self._dumped_data_block_meta):
             self._dumped_data_block_meta = dumped_data_block_meta
 def __init__(self, dirname, data_source_name, partition_id,
              data_block_index, max_example_num=None):
     """Set up a data block writer targeting a temporary path and an
     empty DataBlockMeta for the given partition/index."""
     self._data_source_name = data_source_name
     self._partition_id = partition_id
     self._max_example_num = max_example_num
     self._dirname = dirname
     # _get_tmp_fpath relies on the attributes assigned above; the
     # writer dumps into the temporary path first.
     self._tmp_fpath = self._get_tmp_fpath()
     self._writer = self._make_data_block_writer(self._tmp_fpath)
     meta = dj_pb.DataBlockMeta()
     meta.partition_id = partition_id
     meta.data_block_index = data_block_index
     meta.follower_restart_index = 0
     self._data_block_meta = meta
     self._example_num = 0
     self._data_block_manager = None
Example #5
0
 def __init__(self, dirname, data_source_name, partition_id,
              data_block_index, write_options, max_example_num=None):
     """Set up an output writer (per write_options) targeting a
     temporary path, an empty DataBlockMeta for the given
     partition/index, and metrics tags for reporting."""
     self._data_source_name = data_source_name
     self._partition_id = partition_id
     self._max_example_num = max_example_num
     self._dirname = dirname
     # _get_tmp_fpath relies on the attributes assigned above.
     self._tmp_fpath = self._get_tmp_fpath()
     self._writer = create_output_writer(write_options, self._tmp_fpath)
     meta = dj_pb.DataBlockMeta()
     meta.partition_id = partition_id
     meta.data_block_index = data_block_index
     meta.follower_restart_index = 0
     self._data_block_meta = meta
     self._example_num = 0
     self._data_block_manager = None
     self._example_ids_size = 0
     self._metrics_tags = {'ds_name': data_source_name,
                           'partition': partition_id}
Example #6
0
 def __init__(self,
              dirname,
              partition_id,
              data_block_index,
              max_example_num=None):
     """Open a TFRecordWriter on a temporary path and initialize an
     empty DataBlockMeta for the given partition/index."""
     self._start_time = None
     self._end_time = None
     self._partition_id = partition_id
     self._max_example_num = max_example_num
     self._dirname = dirname
     # _get_tmp_fpath relies on the attributes assigned above.
     self._tmp_fpath = self._get_tmp_fpath()
     self._tf_record_writer = tf.io.TFRecordWriter(self._tmp_fpath)
     meta = dj_pb.DataBlockMeta()
     meta.partition_id = partition_id
     meta.data_block_index = data_block_index
     meta.follower_restart_index = 0
     self._data_block_meta = meta
     self._filled = False
     self._example_num = 0
    def test_data_block_manager(self):
        """Build five data blocks of 1024 examples each through
        DataBlockBuilder, then verify the manager's bookkeeping and the
        files written to disk record-for-record."""
        data_block_datas = []
        data_block_metas = []
        leader_index = 0
        follower_index = 65536
        for i in range(5):
            fill_examples = []
            builder = DataBlockBuilder(
                self.data_source.data_block_dir,
                self.data_source.data_source_meta.name, 0, i,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(self.data_block_manager)
            for j in range(1024):
                # Each example carries a unique example_id plus
                # event_time/leader_index/follower_index features.
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = 150000000 + i * 1024 + j
                feat['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(
                    feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                # Keep the raw example and its feature values so disk
                # contents can be cross-checked later.
                fill_examples.append((example, {
                    'example_id': example_id,
                    'event_time': event_time,
                    'leader_index': leader_index,
                    'follower_index': follower_index
                }))
                leader_index += 1
                follower_index += 1
            meta = builder.finish_data_block()
            data_block_datas.append(fill_examples)
            data_block_metas.append(meta)
        # Manager bookkeeping: 5 dumped blocks, last meta matches.
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
        self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                         data_block_metas[-1])
        for (idx, meta) in enumerate(data_block_metas):
            self.assertEqual(
                self.data_block_manager.get_data_block_meta_by_index(idx),
                meta)
            self.assertEqual(
                meta.block_id,
                common.encode_block_id(self.data_source.data_source_meta.name,
                                       meta))
        # Out-of-range index yields None.
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(5), None)
        data_block_dir = os.path.join(self.data_source.data_block_dir,
                                      common.partition_repr(0))
        for (i, meta) in enumerate(data_block_metas):
            data_block_fpath = os.path.join(
                data_block_dir, meta.block_id) + common.DataBlockSuffix
            data_block_meta_fpath = os.path.join(
                data_block_dir,
                common.encode_data_block_meta_fname(
                    self.data_source.data_source_meta.name, 0,
                    meta.data_block_index))
            self.assertTrue(gfile.Exists(data_block_fpath))
            self.assertTrue(gfile.Exists(data_block_meta_fpath))
            # The meta file on disk must parse back to the same meta.
            fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
            remote_meta = text_format.Parse(
                next(fiter).decode(), dj_pb.DataBlockMeta())
            self.assertEqual(meta, remote_meta)
            # Each stored record must equal what was appended, feature
            # by feature.
            for (j, record) in enumerate(
                    tf.io.tf_record_iterator(data_block_fpath)):
                example = tf.train.Example()
                example.ParseFromString(record)
                stored_data = data_block_datas[i][j]
                self.assertEqual(example, stored_data[0])
                feat = example.features.feature
                stored_feat = stored_data[1]
                self.assertTrue('example_id' in feat)
                self.assertTrue('example_id' in stored_feat)
                self.assertEqual(stored_feat['example_id'],
                                 '{}'.format(i * 1024 + j).encode())
                self.assertEqual(stored_feat['example_id'],
                                 feat['example_id'].bytes_list.value[0])
                self.assertTrue('event_time' in feat)
                self.assertTrue('event_time' in stored_feat)
                self.assertEqual(stored_feat['event_time'],
                                 feat['event_time'].int64_list.value[0])
                self.assertTrue('leader_index' in feat)
                self.assertTrue('leader_index' in stored_feat)
                self.assertEqual(stored_feat['leader_index'],
                                 feat['leader_index'].int64_list.value[0])
                self.assertTrue('follower_index' in feat)
                self.assertTrue('follower_index' in stored_feat)
                self.assertEqual(stored_feat['follower_index'],
                                 feat['follower_index'].int64_list.value[0])
            # Exactly 1024 records per block.
            self.assertEqual(j, 1023)

        # A second manager over the same dir sees the same block count.
        data_block_manager2 = data_block_manager.DataBlockManager(
            self.data_source, 0)
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
Example #8
0
 def test_data_block_dumper(self):
     """Dump data blocks on the leader via a context-managed dumper and
     verify leader and follower files match record-for-record.

     Newer-API variant: options are passed as dj_pb.RawDataOptions /
     dj_pb.DataBlockBuilderOptions and file names are built with the
     common.encode_* helpers.
     """
     self.generate_follower_data_block()
     self.generate_leader_raw_data()
     dbd = data_block_dumper.DataBlockDumperManager(
         self.etcd,
         self.data_source_l,
         0,
         dj_pb.RawDataOptions(raw_data_iter='TF_RECORD'),
         dj_pb.DataBlockBuilderOptions(
             data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
     )
     self.assertEqual(dbd.get_next_data_block_index(), 0)
     # Adding each synced meta must advance the next index by one.
     for (idx, meta) in enumerate(self.dumped_metas):
         success, next_index = dbd.add_synced_data_block_meta(meta)
         self.assertTrue(success)
         self.assertEqual(next_index, idx + 1)
     self.assertTrue(dbd.need_dump())
     self.assertEqual(dbd.get_next_data_block_index(),
                      len(self.dumped_metas))
     # The dumper is obtained as a context manager and invoked once.
     with dbd.make_data_block_dumper() as dumper:
         dumper()
     dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
     dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      len(self.dumped_metas))
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      dbm_l.get_dumped_data_block_count())
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(meta.data_block_index, idx)
         self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
         self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
         # Leader meta file must parse back (text format) to the meta.
         meta_fpth_l = os.path.join(
             self.data_source_l.data_block_dir, common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_l.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_l)
         meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_l, meta)
         # Same check for the follower's meta file.
         meta_fpth_f = os.path.join(
             self.data_source_f.data_block_dir, common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_f.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_f)
         meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_f, meta)
         data_fpth_l = os.path.join(
             self.data_source_l.data_block_dir, common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_l))
         # Leader data file: example_ids must match the meta, in order.
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
         # NOTE(review): the follower data path below is encoded with
         # data_source_l's meta name — presumably both data sources
         # share the same name; confirm, else this should be
         # data_source_f.
         data_fpth_f = os.path.join(
             self.data_source_f.data_block_dir, common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_f))
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
Example #9
0
 def test_data_block_manager(self):
     """Build five data blocks of 1024 examples each with the
     older DataBlockBuilder API (positional append), then verify the
     manager's bookkeeping and the files written to disk.

     Older-API variant: metas are read with get_data_block_meta_by_index
     returning a tuple, block_id is '{start}-{end}_{index}', and the
     meta file is binary-serialized rather than text format.
     """
     data_block_datas = []
     data_block_metas = []
     leader_index = 0
     follower_index = 65536
     for i in range(5):
         fill_examples = []
         builder = data_block_manager.DataBlockBuilder(
             self.data_source.data_block_dir, 0, i, None)
         for j in range(1024):
             # Each example carries a unique example_id plus
             # event_time/leader_index/follower_index features.
             feat = {}
             example_id = '{}'.format(i * 1024 + j).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + i * 1024 + j
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             feat['leader_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[leader_index]))
             feat['follower_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[follower_index]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append(example.SerializeToString(), example_id,
                            event_time, leader_index, follower_index)
             # Keep the raw example and feature values for the later
             # disk cross-check.
             fill_examples.append((example, {
                 'example_id': example_id,
                 'event_time': event_time,
                 'leader_index': leader_index,
                 'follower_index': follower_index
             }))
             leader_index += 1
             follower_index += 1
         builder.finish_data_block()
         data_block_datas.append(fill_examples)
         data_block_metas.append(builder.get_data_block_meta())
     # NOTE(review): without a sync the manager reports 0 dumped blocks
     # and no last meta; passing True below apparently forces a rescan
     # that finds all 5 — confirm against the manager implementation.
     self.assertEqual(self.data_block_manager.get_dumped_data_block_num(),
                      0)
     self.assertEqual(self.data_block_manager.get_last_data_block_meta(),
                      None)
     self.assertEqual(
         self.data_block_manager.get_dumped_data_block_num(True), 5)
     for (idx, meta) in enumerate(data_block_metas):
         self.assertEqual(
             self.data_block_manager.get_data_block_meta_by_index(idx)[0],
             meta)
         self.assertEqual(
             meta.block_id, '{}-{}_{}'.format(meta.start_time,
                                              meta.end_time, idx))
     # Out-of-range index yields (None, ...) — first element is None.
     self.assertEqual(
         self.data_block_manager.get_data_block_meta_by_index(5)[0], None)
     data_block_dir = os.path.join(self.data_source.data_block_dir,
                                   'partition_{}'.format(0))
     for (i, meta) in enumerate(data_block_metas):
         data_block_fpath = os.path.join(
             data_block_dir, meta.block_id) + common.DataBlockSuffix
         data_block_meta_fpath = os.path.join(
             data_block_dir, meta.block_id) + common.DataBlockMetaSuffix
         self.assertTrue(gfile.Exists(data_block_fpath))
         self.assertTrue(gfile.Exists(data_block_meta_fpath))
         # The meta file on disk must deserialize to the same meta.
         fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
         remote_meta = dj_pb.DataBlockMeta()
         remote_meta.ParseFromString(next(fiter))
         self.assertEqual(meta, remote_meta)
         # Each stored record must equal what was appended, feature by
         # feature.
         for (j, record) in enumerate(
                 tf.io.tf_record_iterator(data_block_fpath)):
             example = tf.train.Example()
             example.ParseFromString(record)
             stored_data = data_block_datas[i][j]
             self.assertEqual(example, stored_data[0])
             feat = example.features.feature
             stored_feat = stored_data[1]
             self.assertTrue('example_id' in feat)
             self.assertTrue('example_id' in stored_feat)
             self.assertEqual(stored_feat['example_id'],
                              '{}'.format(i * 1024 + j).encode())
             self.assertEqual(stored_feat['example_id'],
                              feat['example_id'].bytes_list.value[0])
             self.assertTrue('event_time' in feat)
             self.assertTrue('event_time' in stored_feat)
             self.assertEqual(stored_feat['event_time'],
                              feat['event_time'].int64_list.value[0])
             self.assertTrue('leader_index' in feat)
             self.assertTrue('leader_index' in stored_feat)
             self.assertEqual(stored_feat['leader_index'],
                              feat['leader_index'].int64_list.value[0])
             self.assertTrue('follower_index' in feat)
             self.assertTrue('follower_index' in stored_feat)
             self.assertEqual(stored_feat['follower_index'],
                              feat['follower_index'].int64_list.value[0])
         # Exactly 1024 records per block.
         self.assertEqual(j, 1023)