def test_data_block_dumper(self):
    """End-to-end test of DataBlockDumperManager (legacy options API).

    Generates follower data blocks and leader raw data (via test fixtures),
    feeds the follower's dumped metas into the leader-side dumper, then
    verifies both sides dumped identical meta and data block files.
    """
    self.generate_follower_data_block()
    self.generate_leader_raw_data()
    options = customized_options.CustomizedOptions()
    options.set_raw_data_iter('TF_RECORD')
    dbd = data_block_dumper.DataBlockDumperManager(
        self.etcd, self.data_source_l, 0, options
    )
    self.assertEqual(dbd.get_partition_id(), 0)
    self.assertEqual(dbd.get_next_data_block_index(), 0)
    # Append every synced meta; the dumper must accept them in order and
    # advance its next index by one each time.
    for (idx, meta) in enumerate(self.dumped_metas):
        success, next_index = dbd.append_synced_data_block_meta(meta)
        self.assertTrue(success)
        self.assertEqual(next_index, idx + 1)
    self.assertTrue(dbd.need_dump())
    self.assertEqual(dbd.get_next_data_block_index(), len(self.dumped_metas))
    dbd.dump_data_blocks()
    # Fresh managers on both sides must see the same number of dumped blocks.
    dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
    dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
    self.assertEqual(dbm_f.get_dumped_data_block_num(), len(self.dumped_metas))
    self.assertEqual(dbm_f.get_dumped_data_block_num(),
                     dbm_l.get_dumped_data_block_num())
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(meta.data_block_index, idx)
        self.assertEqual(dbm_l.get_data_block_meta_by_index(idx)[0], meta)
        self.assertEqual(dbm_f.get_data_block_meta_by_index(idx)[0], meta)
        block_id = meta.block_id
        # Leader-side meta file on disk must round-trip to the same proto.
        meta_fpth_l = os.path.join(self.data_source_l.data_block_dir,
                                   'partition_0',
                                   block_id + common.DataBlockMetaSuffix)
        mitr = tf.io.tf_record_iterator(meta_fpth_l)
        meta_l = dj_pb.DataBlockMeta()
        meta_l.ParseFromString(next(mitr))
        self.assertEqual(meta_l, meta)
        # Follower-side meta file must match as well.
        meta_fpth_f = os.path.join(self.data_source_f.data_block_dir,
                                   'partition_0',
                                   block_id + common.DataBlockMetaSuffix)
        mitr = tf.io.tf_record_iterator(meta_fpth_f)
        meta_f = dj_pb.DataBlockMeta()
        meta_f.ParseFromString(next(mitr))
        self.assertEqual(meta_f, meta)
        # Leader data block: every record's example_id must match the meta,
        # and the record count must equal len(meta.example_ids).
        data_fpth_l = os.path.join(self.data_source_l.data_block_dir,
                                   'partition_0',
                                   block_id + common.DataBlockSuffix)
        for (iidx, record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
        # Same checks for the follower data block.
        data_fpth_f = os.path.join(self.data_source_f.data_block_dir,
                                   'partition_0',
                                   block_id + common.DataBlockSuffix)
        for (iidx, record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
def load_data_block_meta(meta_fpath):
    """Load a DataBlockMeta proto from a meta file.

    Returns the parsed DataBlockMeta, or None when the file does not exist.
    The path must carry the expected meta-file suffix.
    """
    assert meta_fpath.endswith(DataBlockMetaSuffix)
    if gfile.Exists(meta_fpath):
        with make_tf_record_iter(meta_fpath) as record_iter:
            serialized = next(record_iter).decode()
            return text_format.Parse(serialized,
                                     dj_pb.DataBlockMeta(),
                                     allow_unknown_field=True)
    return None
def _sync_dumped_data_block_meta(self):
    """Rescan the partition's data-block directory and rebuild the in-memory
    list of dumped DataBlockMeta.

    Removes unknown files and orphan meta files (a meta without its data
    block), validates that the surviving metas are consecutive and that
    their indices are monotonic, then atomically swaps in the refreshed
    list under the lock if it is longer than the current one.
    """
    dumped_data_block_path = {}
    dumped_data_block_meta_path = {}
    dumped_data_block_meta = []
    data_block_dir = self._data_block_dir()
    if not gfile.Exists(data_block_dir):
        gfile.MakeDirs(data_block_dir)
    elif not gfile.IsDirectory(data_block_dir):
        logging.fatal("%s must be the directory of data block for "\
                      "partition %d", data_block_dir, self._partition_id)
        os._exit(-1) # pylint: disable=protected-access
    # Index data files and meta files by their shared name tag; anything
    # else in the directory is garbage and gets deleted.
    for fpath in self._list_data_block_dir():
        fname = ntpath.basename(fpath)
        if fname.endswith(DataBlockSuffix):
            ftag = fname[:-len(DataBlockSuffix)]
            dumped_data_block_path[ftag] = fpath
        elif fname.endswith(DataBlockMetaSuffix):
            ftag = fname[:-len(DataBlockMetaSuffix)]
            dumped_data_block_meta_path[ftag] = fpath
        else:
            gfile.Remove(fpath)
    for (ftag, fpath) in dumped_data_block_meta_path.items():
        if ftag not in dumped_data_block_path:
            # Orphan meta without its data block: drop the meta file only.
            # BUG FIX: the original also called
            # gfile.Remove(dumped_data_block_path[ftag]) here, which always
            # raised KeyError since ftag is absent by this branch's condition.
            gfile.Remove(fpath)
        else:
            with make_tf_record_iter(fpath) as record_iter:
                dbm = dj_pb.DataBlockMeta()
                dbm.ParseFromString(next(record_iter))
                dumped_data_block_meta.append(dbm)
    dumped_data_block_meta = sorted(dumped_data_block_meta,
                                    key=lambda meta: meta.data_block_index)
    # Validate invariants over the sorted metas; any violation means the
    # on-disk state is corrupt and the process aborts.
    for (idx, meta) in enumerate(dumped_data_block_meta):
        if meta.data_block_index != idx:
            logging.fatal("data_block_index is not consecutive")
            os._exit(-1) # pylint: disable=protected-access
        if idx == 0:
            continue
        prev_meta = dumped_data_block_meta[idx - 1]
        if prev_meta.follower_restart_index > meta.follower_restart_index:
            logging.fatal("follower_restart_index is not Incremental")
            os._exit(-1) # pylint: disable=protected-access
        if prev_meta.leader_start_index >= meta.leader_start_index:
            logging.fatal("leader_start_index is not Incremental")
            os._exit(-1) # pylint: disable=protected-access
        if prev_meta.leader_end_index >= meta.leader_end_index:
            logging.fatal("leader_end_index is not Incremental")
            os._exit(-1) # pylint: disable=protected-access
    with self._lock:
        # Only grow: never shrink the cached list from a partial rescan.
        if len(dumped_data_block_meta) > len(self._dumped_data_block_meta):
            self._dumped_data_block_meta = dumped_data_block_meta
def __init__(self, dirname, data_source_name, partition_id,
             data_block_index, max_example_num=None):
    """Initialize a data block builder writing into a temporary file.

    Args:
        dirname: target directory for the finished data block.
        data_source_name: name of the owning data source.
        partition_id: partition this block belongs to.
        data_block_index: sequential index of the block.
        max_example_num: optional cap on examples per block.
    """
    self._dirname = dirname
    self._data_source_name = data_source_name
    self._partition_id = partition_id
    self._max_example_num = max_example_num
    # Write to a temp path first; the finished block is presumably renamed
    # into place elsewhere -- TODO confirm against finish_data_block().
    self._tmp_fpath = self._get_tmp_fpath()
    self._writer = self._make_data_block_writer(self._tmp_fpath)
    self._data_block_meta = dj_pb.DataBlockMeta(
        partition_id=partition_id,
        data_block_index=data_block_index,
        follower_restart_index=0,
    )
    self._example_num = 0
    self._data_block_manager = None
def __init__(self, dirname, data_source_name, partition_id,
             data_block_index, write_options, max_example_num=None):
    """Initialize a data block builder with configurable output writer.

    Args:
        dirname: target directory for the finished data block.
        data_source_name: name of the owning data source.
        partition_id: partition this block belongs to.
        data_block_index: sequential index of the block.
        write_options: options forwarded to create_output_writer.
        max_example_num: optional cap on examples per block.
    """
    self._dirname = dirname
    self._data_source_name = data_source_name
    self._partition_id = partition_id
    self._max_example_num = max_example_num
    # Output goes to a temp path first; the writer implementation is chosen
    # from write_options.
    self._tmp_fpath = self._get_tmp_fpath()
    self._writer = create_output_writer(write_options, self._tmp_fpath)
    self._data_block_meta = dj_pb.DataBlockMeta(
        partition_id=partition_id,
        data_block_index=data_block_index,
        follower_restart_index=0,
    )
    self._example_num = 0
    self._example_ids_size = 0
    self._data_block_manager = None
    # Tags attached to emitted metrics for this data source / partition.
    self._metrics_tags = {'ds_name': self._data_source_name,
                          'partition': partition_id}
def __init__(self, dirname, partition_id, data_block_index,
             max_example_num=None):
    """Initialize a TFRecord-backed data block builder.

    Args:
        dirname: target directory for the finished data block.
        partition_id: partition this block belongs to.
        data_block_index: sequential index of the block.
        max_example_num: optional cap on examples per block.
    """
    self._dirname = dirname
    self._partition_id = partition_id
    self._max_example_num = max_example_num
    # Event-time bounds of appended examples; None until the first append
    # -- TODO confirm they are set by the append path.
    self._start_time = None
    self._end_time = None
    # Records are written to a temp file via a TFRecordWriter.
    self._tmp_fpath = self._get_tmp_fpath()
    self._tf_record_writer = tf.io.TFRecordWriter(self._tmp_fpath)
    self._data_block_meta = dj_pb.DataBlockMeta(
        partition_id=partition_id,
        data_block_index=data_block_index,
        follower_restart_index=0,
    )
    self._filled = False
    self._example_num = 0
def test_data_block_manager(self):
    """Build five data blocks via DataBlockBuilder and verify that the
    manager, the on-disk meta files, and the stored records all agree.
    """
    data_block_datas = []
    data_block_metas = []
    leader_index = 0
    follower_index = 65536
    for i in range(5):
        fill_examples = []
        builder = DataBlockBuilder(
            self.data_source.data_block_dir,
            self.data_source.data_source_meta.name,
            0, i, dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(self.data_block_manager)
        # Fill each block with 1024 examples carrying a unique example_id
        # and monotonically increasing leader/follower indices.
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            fill_examples.append((example, {
                'example_id': example_id,
                'event_time': event_time,
                'leader_index': leader_index,
                'follower_index': follower_index
            }))
            leader_index += 1
            follower_index += 1
        meta = builder.finish_data_block()
        data_block_datas.append(fill_examples)
        data_block_metas.append(meta)
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
    self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                     data_block_metas[-1])
    for (idx, meta) in enumerate(data_block_metas):
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(idx),
            meta)
        self.assertEqual(
            meta.block_id,
            common.encode_block_id(self.data_source.data_source_meta.name,
                                   meta))
    # Out-of-range index must yield None.
    self.assertEqual(
        self.data_block_manager.get_data_block_meta_by_index(5), None)
    data_block_dir = os.path.join(self.data_source.data_block_dir,
                                  common.partition_repr(0))
    for (i, meta) in enumerate(data_block_metas):
        data_block_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockSuffix
        data_block_meta_fpath = os.path.join(
            data_block_dir,
            common.encode_data_block_meta_fname(
                self.data_source.data_source_meta.name,
                0, meta.data_block_index))
        self.assertTrue(gfile.Exists(data_block_fpath))
        self.assertTrue(gfile.Exists(data_block_meta_fpath))
        # The persisted meta must round-trip to the builder's meta.
        fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
        remote_meta = text_format.Parse(
            next(fiter).decode(), dj_pb.DataBlockMeta())
        self.assertEqual(meta, remote_meta)
        for (j, record) in enumerate(
                tf.io.tf_record_iterator(data_block_fpath)):
            example = tf.train.Example()
            example.ParseFromString(record)
            stored_data = data_block_datas[i][j]
            self.assertEqual(example, stored_data[0])
            feat = example.features.feature
            stored_feat = stored_data[1]
            self.assertTrue('example_id' in feat)
            self.assertTrue('example_id' in stored_feat)
            self.assertEqual(stored_feat['example_id'],
                             '{}'.format(i * 1024 + j).encode())
            self.assertEqual(stored_feat['example_id'],
                             feat['example_id'].bytes_list.value[0])
            self.assertTrue('event_time' in feat)
            self.assertTrue('event_time' in stored_feat)
            self.assertEqual(stored_feat['event_time'],
                             feat['event_time'].int64_list.value[0])
            self.assertTrue('leader_index' in feat)
            self.assertTrue('leader_index' in stored_feat)
            self.assertEqual(stored_feat['leader_index'],
                             feat['leader_index'].int64_list.value[0])
            self.assertTrue('follower_index' in feat)
            self.assertTrue('follower_index' in stored_feat)
            self.assertEqual(stored_feat['follower_index'],
                             feat['follower_index'].int64_list.value[0])
        self.assertEqual(j, 1023)
    data_block_manager2 = data_block_manager.DataBlockManager(
        self.data_source, 0)
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                     5)
    # BUG FIX: data_block_manager2 was constructed but never checked.
    # A fresh manager must recover the same dumped state from disk.
    self.assertEqual(data_block_manager2.get_dumped_data_block_count(), 5)
    self.assertEqual(data_block_manager2.get_lastest_data_block_meta(),
                     data_block_metas[-1])
def test_data_block_dumper(self):
    """End-to-end test of DataBlockDumperManager (options-proto API).

    Syncs the follower's dumped metas into the leader-side dumper, runs the
    dump, then verifies both sides persisted identical meta and data files.
    """
    self.generate_follower_data_block()
    self.generate_leader_raw_data()
    dbd = data_block_dumper.DataBlockDumperManager(
        self.etcd, self.data_source_l, 0,
        dj_pb.RawDataOptions(raw_data_iter='TF_RECORD'),
        dj_pb.DataBlockBuilderOptions(
            data_block_builder='TF_RECORD_DATABLOCK_BUILDER'),
    )
    self.assertEqual(dbd.get_next_data_block_index(), 0)
    # The dumper must accept synced metas in order, advancing by one each.
    for (idx, meta) in enumerate(self.dumped_metas):
        success, next_index = dbd.add_synced_data_block_meta(meta)
        self.assertTrue(success)
        self.assertEqual(next_index, idx + 1)
    self.assertTrue(dbd.need_dump())
    self.assertEqual(dbd.get_next_data_block_index(),
                     len(self.dumped_metas))
    with dbd.make_data_block_dumper() as dumper:
        dumper()
    dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
    dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     len(self.dumped_metas))
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     dbm_l.get_dumped_data_block_count())
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(meta.data_block_index, idx)
        self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
        self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
        # Leader-side meta file on disk must round-trip to the same proto.
        meta_fpth_l = os.path.join(
            self.data_source_l.data_block_dir, common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_l.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_l)
        # Decode the record bytes before text parsing, consistent with the
        # other tests in this file.
        meta_l = text_format.Parse(next(mitr).decode(),
                                   dj_pb.DataBlockMeta())
        self.assertEqual(meta_l, meta)
        meta_fpth_f = os.path.join(
            self.data_source_f.data_block_dir, common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_f.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_f)
        meta_f = text_format.Parse(next(mitr).decode(),
                                   dj_pb.DataBlockMeta())
        self.assertEqual(meta_f, meta)
        # Leader data block: record example_ids must match the meta, and the
        # record count must equal len(meta.example_ids).
        data_fpth_l = os.path.join(
            self.data_source_l.data_block_dir,
            common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_l.data_source_meta.name, meta_l))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_l)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
        # BUG FIX: the follower path was encoded with the *leader* data
        # source name (copy-paste); use the follower's name here.
        data_fpth_f = os.path.join(
            self.data_source_f.data_block_dir,
            common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_f.data_source_meta.name, meta_f))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_f)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
def test_data_block_manager(self):
    """Build five data blocks with the legacy DataBlockBuilder API and
    verify manager lookups, block ids, and the persisted records.
    """
    data_block_datas = []
    data_block_metas = []
    leader_index = 0
    follower_index = 65536
    for i in range(5):
        fill_examples = []
        builder = data_block_manager.DataBlockBuilder(
            self.data_source.data_block_dir, 0, i, None)
        # Fill each block with 1024 examples carrying a unique example_id
        # and monotonically increasing leader/follower indices.
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(features=tf.train.Features(
                feature=feat))
            builder.append(example.SerializeToString(), example_id,
                           event_time, leader_index, follower_index)
            fill_examples.append((example, {
                'example_id': example_id,
                'event_time': event_time,
                'leader_index': leader_index,
                'follower_index': follower_index
            }))
            leader_index += 1
            follower_index += 1
        builder.finish_data_block()
        data_block_datas.append(fill_examples)
        data_block_metas.append(builder.get_data_block_meta())
    # Without a sync the manager reports nothing dumped; passing True
    # presumably triggers a rescan that picks up all 5 blocks -- NOTE(review):
    # confirm the boolean argument's semantics against the manager API.
    self.assertEqual(self.data_block_manager.get_dumped_data_block_num(), 0)
    self.assertEqual(self.data_block_manager.get_last_data_block_meta(),
                     None)
    self.assertEqual(
        self.data_block_manager.get_dumped_data_block_num(True), 5)
    for (idx, meta) in enumerate(data_block_metas):
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(idx)[0],
            meta)
        # Legacy block id format: "<start>-<end>_<index>".
        self.assertEqual(
            meta.block_id,
            '{}-{}_{}'.format(meta.start_time, meta.end_time, idx))
    # Out-of-range index must yield None.
    self.assertEqual(
        self.data_block_manager.get_data_block_meta_by_index(5)[0], None)
    data_block_dir = os.path.join(self.data_source.data_block_dir,
                                  'partition_{}'.format(0))
    for (i, meta) in enumerate(data_block_metas):
        data_block_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockSuffix
        data_block_meta_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockMetaSuffix
        self.assertTrue(gfile.Exists(data_block_fpath))
        self.assertTrue(gfile.Exists(data_block_meta_fpath))
        # The persisted meta must round-trip to the builder's meta.
        fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
        remote_meta = dj_pb.DataBlockMeta()
        remote_meta.ParseFromString(next(fiter))
        self.assertEqual(meta, remote_meta)
        # Every stored record must equal the example we appended, field by
        # field.
        for (j, record) in enumerate(
                tf.io.tf_record_iterator(data_block_fpath)):
            example = tf.train.Example()
            example.ParseFromString(record)
            stored_data = data_block_datas[i][j]
            self.assertEqual(example, stored_data[0])
            feat = example.features.feature
            stored_feat = stored_data[1]
            self.assertTrue('example_id' in feat)
            self.assertTrue('example_id' in stored_feat)
            self.assertEqual(stored_feat['example_id'],
                             '{}'.format(i * 1024 + j).encode())
            self.assertEqual(stored_feat['example_id'],
                             feat['example_id'].bytes_list.value[0])
            self.assertTrue('event_time' in feat)
            self.assertTrue('event_time' in stored_feat)
            self.assertEqual(stored_feat['event_time'],
                             feat['event_time'].int64_list.value[0])
            self.assertTrue('leader_index' in feat)
            self.assertTrue('leader_index' in stored_feat)
            self.assertEqual(stored_feat['leader_index'],
                             feat['leader_index'].int64_list.value[0])
            self.assertTrue('follower_index' in feat)
            self.assertTrue('follower_index' in stored_feat)
            self.assertEqual(stored_feat['follower_index'],
                             feat['follower_index'].int64_list.value[0])
        self.assertEqual(j, 1023)