def generate_leader_raw_data(self):
    dbm = data_block_manager.DataBlockManager(self.data_source_l, 0)
    raw_data_dir = os.path.join(self.data_source_l.raw_data_dir,
                                common.partition_repr(0))
    if gfile.Exists(raw_data_dir):
        gfile.DeleteRecursively(raw_data_dir)
    gfile.MakeDirs(raw_data_dir)
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source_l, 0)
    block_index = 0
    builder = DataBlockBuilder(
        self.data_source_l.raw_data_dir,
        self.data_source_l.data_source_meta.name,
        0, block_index,
        dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
    process_index = 0
    start_index = 0
    for i in range(0, self.leader_end_index + 3):
        if (i > 0 and i % 2048 == 0) or (i == self.leader_end_index + 2):
            meta = builder.finish_data_block()
            if meta is not None:
                ofname = common.encode_data_block_fname(
                    self.data_source_l.data_source_meta.name, meta)
                fpath = os.path.join(raw_data_dir, ofname)
                self.manifest_manager.add_raw_data(0, [
                    dj_pb.RawDataMeta(
                        file_path=fpath,
                        timestamp=timestamp_pb2.Timestamp(seconds=3))
                ], False)
                process_index += 1
                start_index += len(meta.example_ids)
            block_index += 1
            builder = DataBlockBuilder(
                self.data_source_l.raw_data_dir,
                self.data_source_l.data_source_meta.name,
                0, block_index,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        feat = {}
        pt = (i + 1) << 30
        if i % 3 == 0:
            pt = i // 3
        example_id = '{}'.format(pt).encode()
        feat['example_id'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[example_id]))
        event_time = 150000000 + pt
        feat['event_time'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[event_time]))
        example = tf.train.Example(features=tf.train.Features(feature=feat))
        builder.append_item(TfExampleItem(example.SerializeToString()), i, i)
    fpaths = [os.path.join(raw_data_dir, f)
              for f in gfile.ListDirectory(raw_data_dir)
              if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    for fpath in fpaths:
        if not fpath.endswith(common.DataBlockSuffix):
            gfile.Remove(fpath)
def _preprocess_rsa_psi_follower(self):
    processors = []
    rsa_key_pem = None
    with gfile.GFile(self._rsa_public_key_path, 'rb') as f:
        rsa_key_pem = f.read()
    for partition_id in range(
            self._data_source_f.data_source_meta.partition_num):
        options = dj_pb.RsaPsiPreProcessorOptions(
            preprocessor_name='follower-rsa-psi-processor',
            role=common_pb.FLRole.Follower,
            rsa_key_pem=rsa_key_pem,
            input_file_paths=[self._psi_raw_data_fpaths_f[partition_id]],
            output_file_dir=self._pre_processor_ouput_dir_f,
            raw_data_publish_dir=self._raw_data_pub_dir_f,
            partition_id=partition_id,
            leader_rsa_psi_signer_addr=self._rsa_psi_signer_addr,
            offload_processor_number=1,
            max_flying_sign_batch=128,
            max_flying_sign_rpc=64,
            sign_rpc_timeout_ms=100000,
            stub_fanout=2,
            slow_sign_threshold=8,
            sort_run_merger_read_ahead_buffer=1 << 20,
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=1024, max_flying_item=1 << 14),
            input_raw_data=dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                                read_ahead_size=1 << 20),
            writer_options=dj_pb.WriterOptions(output_writer='TF_RECORD'))
        processor = rsa_psi_preprocessor.RsaPsiPreProcessor(
            options, self._etcd_name, self._etcd_addrs,
            self._etcd_base_dir_f, True)
        processor.start_process()
        processors.append(processor)
    for processor in processors:
        processor.wait_for_finished()
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    for partition_id in range(partition_num):
        dirpath = os.path.join(base_dir, common.partition_repr(partition_id))
        if not gfile.Exists(dirpath):
            gfile.MakeDirs(dirpath)
        assert gfile.IsDirectory(dirpath)
    writer_options = dj_pb.WriterOptions(output_writer='CSV_DICT')
    csv_writers = [SortRunMergerWriter(base_dir, 0, partition_id,
                                       writer_options)
                   for partition_id in range(partition_num)]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        partition_id = CityHash32(str(idx)) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = str(idx)
        raw['feat_0'] = str((partition_id << 30) + 0) + str(idx)
        raw['feat_1'] = str((partition_id << 30) + 1) + str(idx)
        raw['feat_2'] = str((partition_id << 30) + 2) + str(idx)
        csv_writers[partition_id].append(CsvItem(raw))
    for partition_id, csv_writer in enumerate(csv_writers):
        fpaths = csv_writer.finish()
        logging.info("partition %d dump %d files", partition_id, len(fpaths))
        for seq_id, fpath in enumerate(fpaths):
            logging.info("  %d. %s", seq_id, fpath)
        logging.info("---------------")
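# Illustrative only: a minimal driver for generate_input_csv above. The output
# directory and row count below are hypothetical examples, not values taken
# from the test suite; the helper hashes each id to a partition and writes it
# as one CSV row per id.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    generate_input_csv('./test_input_csv', 0, 1 << 14, 4)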
def _create_data_block(self, partition_id):
    dbm = data_block_manager.DataBlockManager(self.data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    leader_index = 0
    follower_index = 65536
    for i in range(64):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self.data_source),
            self.data_source.data_source_meta.name,
            partition_id, i,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(dbm)
        for j in range(4):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = random.randint(0, 10)
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(
                features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        self.data_block_matas.append(builder.finish_data_block())
def add_data_block(self, partition_id, x, y):
    dbm = self._dbms[partition_id]
    builder = DataBlockBuilder(
        common.data_source_data_block_dir(self._data_source),
        self._data_source.data_source_meta.name,
        partition_id, dbm.get_dumped_data_block_count(),
        dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
    builder.set_data_block_manager(dbm)
    for i in range(x.shape[0]):
        feat = {}
        exam_id = '{}'.format(i).encode()
        feat['example_id'] = Feature(bytes_list=BytesList(value=[exam_id]))
        feat['event_time'] = Feature(int64_list=Int64List(value=[i]))
        feat['x'] = Feature(float_list=FloatList(value=list(x[i])))
        if y is not None:
            feat['y'] = Feature(int64_list=Int64List(value=[y[i]]))
        example = Example(features=Features(feature=feat))
        builder.append_item(TfExampleItem(example.SerializeToString()), i, 0)
    return builder.finish_data_block()
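# Illustrative only: how add_data_block above is typically driven. The array
# shapes and label values here are hypothetical; x is expected to be a 2-D
# float array and y an optional 1-D integer label array of the same length.
#
#   import numpy as np
#   x = np.random.rand(256, 10).astype(np.float32)
#   y = np.random.randint(0, 2, size=256)
#   meta = helper.add_data_block(partition_id=0, x=x, y=y)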
def _preprocess_rsa_psi_leader(self):
    processors = []
    rsa_key_pem = None
    with gfile.GFile(self._rsa_private_key_path, 'rb') as f:
        rsa_key_pem = f.read()
    for partition_id in range(
            self._data_source_l.data_source_meta.partition_num):
        options = dj_pb.RsaPsiPreProcessorOptions(
            preprocessor_name='leader-rsa-psi-processor',
            role=common_pb.FLRole.Leader,
            rsa_key_pem=rsa_key_pem,
            input_file_paths=[self._psi_raw_data_fpaths_l[partition_id]],
            output_file_dir=self._pre_processor_ouput_dir_l,
            raw_data_publish_dir=self._raw_data_pub_dir_l,
            partition_id=partition_id,
            offload_processor_number=1,
            max_flying_sign_batch=128,
            stub_fanout=2,
            slow_sign_threshold=8,
            sort_run_merger_read_ahead_buffer=1 << 20,
            sort_run_merger_read_batch_size=128,
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=1024, max_flying_item=1 << 14),
            input_raw_data=dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                                read_ahead_size=1 << 20),
            writer_options=dj_pb.WriterOptions(output_writer='TF_RECORD'))
        os.environ['ETCD_BASE_DIR'] = self.leader_base_dir
        processor = rsa_psi_preprocessor.RsaPsiPreProcessor(
            options, self.kvstore_type, True)
        processor.start_process()
        processors.append(processor)
    for processor in processors:
        processor.wait_for_finished()
def test_example_joiner(self):
    sei = joiner_impl.create_example_joiner(
        self.example_joiner_options, self.raw_data_options,
        dj_pb.WriterOptions(output_writer='TF_RECORD'),
        self.kvstore, self.data_source, 0)
    metas = []
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    self.assertEqual(len(metas), 0)
    self.generate_raw_data(0, 2 * 2048)
    dumper = example_id_dumper.ExampleIdDumperManager(
        self.kvstore, self.data_source, 0, self.example_id_dump_options)
    self.generate_example_id(dumper, 0, 3 * 2048)
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    self.generate_raw_data(2 * 2048, 2048)
    self.generate_example_id(dumper, 3 * 2048, 3 * 2048)
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    self.generate_raw_data(3 * 2048, 5 * 2048)
    self.generate_example_id(dumper, 6 * 2048, 2048)
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    self.generate_raw_data(8 * 2048, 2 * 2048)
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    self.generate_example_id(dumper, 7 * 2048, 3 * 2048)
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    sei.set_sync_example_id_finished()
    sei.set_raw_data_finished()
    with sei.make_example_joiner() as joiner:
        for meta in joiner:
            metas.append(meta)
    dbm = data_block_manager.DataBlockManager(self.data_source, 0)
    data_block_num = dbm.get_dumped_data_block_count()
    self.assertEqual(len(metas), data_block_num)
    join_count = 0
    for data_block_index in range(data_block_num):
        meta = dbm.get_data_block_meta_by_index(data_block_index)
        self.assertEqual(meta, metas[data_block_index])
        join_count += len(meta.example_ids)
    print("join rate {}/{}({}), min_matching_window {}, "
          "max_matching_window {}".format(
              join_count, 20480, (join_count + .0) / (10 * 2048),
              self.example_joiner_options.min_matching_window,
              self.example_joiner_options.max_matching_window))
def test_universal_join_key_mapper_error(self):
    mapper_code = """
from fedlearner.data_join.key_mapper.key_mapping import BaseKeyMapper

class KeyMapperMock(BaseKeyMapper):
    def leader_mapping(self, item) -> dict:
        res = item.click_id.decode().split("_")
        raise ValueError
        return dict({"req_id": res[0], "cid": res[1]})

    def follower_mapping(self, item) -> dict:
        return dict()

    @classmethod
    def name(cls):
        return "TEST_MAPPER"
"""
    abspath = os.path.dirname(os.path.abspath(__file__))
    fname = os.path.realpath(os.path.join(
        abspath,
        "../../fedlearner/data_join/key_mapper/impl/keymapper_mock.py"))
    with open(fname, "w") as f:
        f.write(mapper_code)
    reload(key_mapper)
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner='UNIVERSAL_JOINER',
        min_matching_window=32,
        max_matching_window=51200,
        max_conversion_delay=interval_to_timestamp("258"),
        enable_negative_example_generator=True,
        data_block_dump_interval=32,
        data_block_dump_threshold=1024,
        negative_sampling_rate=0.8,
        join_expr="(cid,req_id) or (example_id)",
        join_key_mapper="TEST_MAPPER",
        negative_sampling_filter_expr='',
    )
    self.version = dsp.Version.V2
    sei = joiner_impl.create_example_joiner(
        self.example_joiner_options, self.raw_data_options,
        # dj_pb.WriterOptions(output_writer='TF_RECORD'),
        dj_pb.WriterOptions(output_writer='CSV_DICT'),
        self.kvstore, self.data_source, 0)
    self.run_join(sei, 0)
    os.remove(fname)
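# For reference, a mapper that would load cleanly looks like the mock above
# without the deliberate `raise ValueError`; the string returned by name() is
# what join_key_mapper in ExampleJoinerOptions must match. Sketch only, not
# part of the test module:
#
#   from fedlearner.data_join.key_mapper.key_mapping import BaseKeyMapper
#
#   class KeyMapperExample(BaseKeyMapper):
#       def leader_mapping(self, item) -> dict:
#           req_id, cid = item.click_id.decode().split("_")
#           return dict(req_id=req_id, cid=cid)
#
#       def follower_mapping(self, item) -> dict:
#           return dict()
#
#       @classmethod
#       def name(cls):
#           return "TEST_MAPPER"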
def _make_portal_worker(self):
    portal_worker_options = dp_pb.DataPortalWorkerOptions(
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter="TF_RECORD",
                                              compressed_type=''),
        writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=128, max_flying_item=300000),
        merge_buffer_size=4096,
        merger_read_ahead_size=1000000)
    self._portal_worker = DataPortalWorker(
        portal_worker_options, "localhost:5005", 0,
        "test_portal_worker_0", "portal_worker_0",
        "localhost:2379", True)
def _make_portal_worker(self):
    portal_worker_options = dp_pb.DataPortalWorkerOptions(
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter="TF_RECORD",
                                              read_ahead_size=1 << 20,
                                              read_batch_size=128),
        writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=128, max_flying_item=300000),
        merger_read_ahead_size=1000000,
        merger_read_batch_size=128)
    self._portal_worker = DataPortalWorker(
        portal_worker_options, "localhost:5005", 0,
        "test_portal_worker_0", "portal_worker_0",
        "localhost:2379", "test_user", "test_password", True)
def _create_data_block(self, data_source, partition_id, x, y):
    data_block_metas = []
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    N = 200
    chunk_size = x.shape[0] // N
    leader_index = 0
    follower_index = N * chunk_size * 10
    for i in range(N):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(data_source),
            data_source.data_source_meta.name,
            partition_id, i,
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
        builder.set_data_block_manager(dbm)
        for j in range(chunk_size):
            feat = {}
            idx = i * chunk_size + j
            exam_id = '{}'.format(idx).encode()
            feat['example_id'] = Feature(
                bytes_list=BytesList(value=[exam_id]))
            evt_time = random.randint(1, 1000)
            feat['event_time'] = Feature(
                int64_list=Int64List(value=[evt_time]))
            feat['x'] = Feature(float_list=FloatList(value=list(x[idx])))
            if y is not None:
                feat['y'] = Feature(int64_list=Int64List(value=[y[idx]]))
            feat['leader_index'] = Feature(
                int64_list=Int64List(value=[leader_index]))
            feat['follower_index'] = Feature(
                int64_list=Int64List(value=[follower_index]))
            example = Example(features=Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 1
            follower_index += 1
        data_block_metas.append(builder.finish_data_block())
    self.max_index = follower_index
    return data_block_metas
def _launch_workers(self):
    worker_options = dj_pb.DataJoinWorkerOptions(
        use_mock_etcd=True,
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                              compressed_type=''),
        example_id_dump_options=dj_pb.ExampleIdDumpOptions(
            example_id_dump_interval=1, example_id_dump_threshold=1024),
        example_joiner_options=dj_pb.ExampleJoinerOptions(
            example_joiner='SORT_RUN_JOINER',
            min_matching_window=64,
            max_matching_window=256,
            data_block_dump_interval=30,
            data_block_dump_threshold=1000),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=1024, max_flying_item=4096),
        data_block_builder_options=dj_pb.WriterOptions(
            output_writer='TF_RECORD'))
    self._worker_addrs_l = ['localhost:4161', 'localhost:4162',
                            'localhost:4163', 'localhost:4164']
    self._worker_addrs_f = ['localhost:5161', 'localhost:5162',
                            'localhost:5163', 'localhost:5164']
    self._workers_l = []
    self._workers_f = []
    for rank_id in range(4):
        worker_addr_l = self._worker_addrs_l[rank_id]
        worker_addr_f = self._worker_addrs_f[rank_id]
        self._workers_l.append(data_join_worker.DataJoinWorkerService(
            int(worker_addr_l.split(':')[1]), worker_addr_f,
            self._master_addr_l, rank_id, self._etcd_name,
            self._etcd_base_dir_l, self._etcd_addrs, worker_options))
        self._workers_f.append(data_join_worker.DataJoinWorkerService(
            int(worker_addr_f.split(':')[1]), worker_addr_l,
            self._master_addr_f, rank_id, self._etcd_name,
            self._etcd_base_dir_f, self._etcd_addrs, worker_options))
    for w in self._workers_l:
        w.start()
    for w in self._workers_f:
        w.start()
def _make_portal_worker(self, raw_data_iter, validation_ratio):
    portal_worker_options = dp_pb.DataPortalWorkerOptions(
        raw_data_options=dj_pb.RawDataOptions(
            raw_data_iter=raw_data_iter,
            read_ahead_size=1 << 20,
            read_batch_size=128,
            optional_fields=['label'],
            validation_ratio=validation_ratio,
        ),
        writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=128, max_flying_item=300000),
        merger_read_ahead_size=1000000,
        merger_read_batch_size=128)
    os.environ['ETCD_BASE_DIR'] = "portal_worker_0"
    self._portal_worker = DataPortalWorker(
        portal_worker_options, "localhost:5005", 0, "etcd", True)
def test_universal_join_small_follower(self):
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner='UNIVERSAL_JOINER',
        min_matching_window=32,
        max_matching_window=20240,
        max_conversion_delay=interval_to_timestamp("128"),
        enable_negative_example_generator=False,
        data_block_dump_interval=32,
        data_block_dump_threshold=1024,
        negative_sampling_rate=0.8,
        join_expr="(id_type, example_id, trunc(event_time,1))",
        join_key_mapper="DEFAULT",
        negative_sampling_filter_expr='',
    )
    self.version = dsp.Version.V2
    sei = joiner_impl.create_example_joiner(
        self.example_joiner_options, self.raw_data_options,
        dj_pb.WriterOptions(output_writer='TF_RECORD'),
        self.kvstore, self.data_source, 0)
    self.run_join_small_follower(sei, 0.15)
def generate_follower_data_block(self):
    dbm = data_block_manager.DataBlockManager(self.data_source_f, 0)
    self.assertEqual(dbm.get_dumped_data_block_count(), 0)
    self.assertEqual(dbm.get_lastest_data_block_meta(), None)
    leader_index = 0
    follower_index = 65536
    self.dumped_metas = []
    for i in range(5):
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self.data_source_f),
            self.data_source_f.data_source_meta.name,
            0, i, dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(dbm)
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(
                features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            leader_index += 3
            follower_index += 1
        meta = builder.finish_data_block()
        self.dumped_metas.append(meta)
    self.leader_start_index = 0
    self.leader_end_index = leader_index
    self.assertEqual(dbm.get_dumped_data_block_count(), 5)
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(dbm.get_data_block_meta_by_index(idx), meta)
def generate_raw_data(self, begin_index, item_count):
    raw_data_dir = os.path.join(self.raw_data_dir, common.partition_repr(0))
    if not gfile.Exists(raw_data_dir):
        gfile.MakeDirs(raw_data_dir)
    self.total_raw_data_count += item_count
    useless_index = 0
    rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source, 0)
    fpaths = []
    for block_index in range(0, item_count // 2048):
        builder = DataBlockBuilder(
            self.raw_data_dir, self.data_source.data_source_meta.name,
            0, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        cands = list(range(begin_index + block_index * 2048,
                           begin_index + (block_index + 1) * 2048))
        start_index = cands[0]
        for i in range(len(cands)):
            if random.randint(1, 4) > 2:
                continue
            a = random.randint(i - 32, i + 32)
            b = random.randint(i - 32, i + 32)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= 32 and
                    abs(cands[b] - i - start_index) <= 32):
                cands[a], cands[b] = cands[b], cands[a]
        for example_idx in cands:
            feat = {}
            example_id = '{}'.format(example_idx).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + example_idx
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            label = random.choice([1, 0])
            if random.random() < 0.8:
                feat['label'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label]))
            example = tf.train.Example(
                features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                useless_index, useless_index)
            useless_index += 1
        meta = builder.finish_data_block()
        fname = common.encode_data_block_fname(
            self.data_source.data_source_meta.name, meta)
        fpath = os.path.join(raw_data_dir, fname)
        fpaths.append(dj_pb.RawDataMeta(
            file_path=fpath,
            timestamp=timestamp_pb2.Timestamp(seconds=3)))
        self.g_data_block_index += 1
    all_files = [os.path.join(raw_data_dir, f)
                 for f in gfile.ListDirectory(raw_data_dir)
                 if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    for fpath in all_files:
        if not fpath.endswith(common.DataBlockSuffix):
            gfile.Remove(fpath)
    self.manifest_manager.add_raw_data(0, fpaths, False)
set_logger()
if args.input_data_file_iter == 'TF_RECORD' or \
        args.output_builder == 'TF_RECORD':
    import tensorflow
    tensorflow.compat.v1.enable_eager_execution()
optional_fields = list(
    field for field in map(str.strip, args.optional_fields.split(','))
    if field != '')
portal_worker_options = dp_pb.DataPortalWorkerOptions(
    raw_data_options=dj_pb.RawDataOptions(
        raw_data_iter=args.input_data_file_iter,
        compressed_type=args.compressed_type,
        read_ahead_size=args.read_ahead_size,
        read_batch_size=args.read_batch_size,
        optional_fields=optional_fields),
    writer_options=dj_pb.WriterOptions(
        output_writer=args.output_builder,
        compressed_type=args.builder_compressed_type),
    batch_processor_options=dj_pb.BatchProcessorOptions(
        batch_size=args.batch_size,
        max_flying_item=-1),
    merger_read_ahead_size=args.merger_read_ahead_size,
    merger_read_batch_size=args.merger_read_batch_size,
    memory_limit_ratio=args.memory_limit_ratio / 100)
data_portal_worker = DataPortalWorker(
    portal_worker_options, args.master_addr, args.rank_id,
    args.kvstore_type, (args.kvstore_type == 'mock'))
data_portal_worker.start()
def test_data_block_dumper(self):
    self.generate_follower_data_block()
    self.generate_leader_raw_data()
    dbd = data_block_dumper.DataBlockDumperManager(
        self.etcd, self.data_source_l, 0,
        dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                             read_ahead_size=1 << 20,
                             read_batch_size=128),
        dj_pb.WriterOptions(output_writer='TF_RECORD'))
    self.assertEqual(dbd.get_next_data_block_index(), 0)
    for (idx, meta) in enumerate(self.dumped_metas):
        success, next_index = dbd.add_synced_data_block_meta(meta)
        self.assertTrue(success)
        self.assertEqual(next_index, idx + 1)
    self.assertTrue(dbd.need_dump())
    self.assertEqual(dbd.get_next_data_block_index(), len(self.dumped_metas))
    with dbd.make_data_block_dumper() as dumper:
        dumper()
    dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
    dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     len(self.dumped_metas))
    self.assertEqual(dbm_f.get_dumped_data_block_count(),
                     dbm_l.get_dumped_data_block_count())
    for (idx, meta) in enumerate(self.dumped_metas):
        self.assertEqual(meta.data_block_index, idx)
        self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
        self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
        meta_fpth_l = os.path.join(
            common.data_source_data_block_dir(self.data_source_l),
            common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_l.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_l)
        meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
        self.assertEqual(meta_l, meta)
        meta_fpth_f = os.path.join(
            common.data_source_data_block_dir(self.data_source_f),
            common.partition_repr(0),
            common.encode_data_block_meta_fname(
                self.data_source_f.data_source_meta.name,
                0, meta.data_block_index))
        mitr = tf.io.tf_record_iterator(meta_fpth_f)
        meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
        self.assertEqual(meta_f, meta)
        data_fpth_l = os.path.join(
            common.data_source_data_block_dir(self.data_source_l),
            common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_l.data_source_meta.name, meta_l))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_l)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
        data_fpth_f = os.path.join(
            common.data_source_data_block_dir(self.data_source_f),
            common.partition_repr(0),
            common.encode_data_block_fname(
                self.data_source_l.data_source_meta.name, meta_f))
        for (iidx, record) in enumerate(
                tf.io.tf_record_iterator(data_fpth_f)):
            example = tf.train.Example()
            example.ParseFromString(record)
            feat = example.features.feature
            self.assertEqual(feat['example_id'].bytes_list.value[0],
                             meta.example_ids[iidx])
        self.assertEqual(len(meta.example_ids), iidx + 1)
def setUp(self):
    etcd_name = 'test_etcd'
    etcd_addrs = 'localhost:2379'
    etcd_base_dir_l = 'byefl_l'
    etcd_base_dir_f = 'byefl_f'
    data_source_name = 'test_data_source'
    etcd_l = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_l, True)
    etcd_f = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_f, True)
    etcd_l.delete_prefix(common.data_source_etcd_base_dir(data_source_name))
    etcd_f.delete_prefix(common.data_source_etcd_base_dir(data_source_name))
    data_source_l = common_pb.DataSource()
    self.raw_data_pub_dir_l = './raw_data_pub_dir_l'
    data_source_l.raw_data_sub_dir = self.raw_data_pub_dir_l
    data_source_l.role = common_pb.FLRole.Leader
    data_source_l.state = common_pb.DataSourceState.Init
    data_source_l.data_block_dir = "./data_block_l"
    data_source_l.raw_data_dir = "./raw_data_l"
    data_source_l.example_dumped_dir = "./example_dumped_l"
    data_source_f = common_pb.DataSource()
    self.raw_data_pub_dir_f = './raw_data_pub_dir_f'
    data_source_f.role = common_pb.FLRole.Follower
    data_source_f.raw_data_sub_dir = self.raw_data_pub_dir_f
    data_source_f.state = common_pb.DataSourceState.Init
    data_source_f.data_block_dir = "./data_block_f"
    data_source_f.raw_data_dir = "./raw_data_f"
    data_source_f.example_dumped_dir = "./example_dumped_f"
    data_source_meta = common_pb.DataSourceMeta()
    data_source_meta.name = data_source_name
    data_source_meta.partition_num = 2
    data_source_meta.start_time = 0
    data_source_meta.end_time = 100000000
    data_source_l.data_source_meta.MergeFrom(data_source_meta)
    common.commit_data_source(etcd_l, data_source_l)
    data_source_f.data_source_meta.MergeFrom(data_source_meta)
    common.commit_data_source(etcd_f, data_source_f)
    master_options = dj_pb.DataJoinMasterOptions(use_mock_etcd=True)
    master_addr_l = 'localhost:4061'
    master_addr_f = 'localhost:4062'
    master_l = data_join_master.DataJoinMasterService(
        int(master_addr_l.split(':')[1]), master_addr_f,
        data_source_name, etcd_name, etcd_base_dir_l,
        etcd_addrs, master_options)
    master_l.start()
    master_f = data_join_master.DataJoinMasterService(
        int(master_addr_f.split(':')[1]), master_addr_l,
        data_source_name, etcd_name, etcd_base_dir_f,
        etcd_addrs, master_options)
    master_f.start()
    channel_l = make_insecure_channel(master_addr_l, ChannelType.INTERNAL)
    master_client_l = dj_grpc.DataJoinMasterServiceStub(channel_l)
    channel_f = make_insecure_channel(master_addr_f, ChannelType.INTERNAL)
    master_client_f = dj_grpc.DataJoinMasterServiceStub(channel_f)
    while True:
        req_l = dj_pb.DataSourceRequest(
            data_source_meta=data_source_l.data_source_meta)
        req_f = dj_pb.DataSourceRequest(
            data_source_meta=data_source_f.data_source_meta)
        dss_l = master_client_l.GetDataSourceStatus(req_l)
        dss_f = master_client_f.GetDataSourceStatus(req_f)
        self.assertEqual(dss_l.role, common_pb.FLRole.Leader)
        self.assertEqual(dss_f.role, common_pb.FLRole.Follower)
        if dss_l.state == common_pb.DataSourceState.Processing and \
                dss_f.state == common_pb.DataSourceState.Processing:
            break
        else:
            time.sleep(2)
    self.master_client_l = master_client_l
    self.master_client_f = master_client_f
    self.master_addr_l = master_addr_l
    self.master_addr_f = master_addr_f
    self.etcd_l = etcd_l
    self.etcd_f = etcd_f
    self.data_source_l = data_source_l
    self.data_source_f = data_source_f
    self.master_l = master_l
    self.master_f = master_f
    self.data_source_name = data_source_name
    self.etcd_name = etcd_name
    self.etcd_addrs = etcd_addrs
    self.etcd_base_dir_l = etcd_base_dir_l
    self.etcd_base_dir_f = etcd_base_dir_f
    self.raw_data_publisher_l = raw_data_publisher.RawDataPublisher(
        self.etcd_l, self.raw_data_pub_dir_l)
    self.raw_data_publisher_f = raw_data_publisher.RawDataPublisher(
        self.etcd_f, self.raw_data_pub_dir_f)
    if gfile.Exists(data_source_l.data_block_dir):
        gfile.DeleteRecursively(data_source_l.data_block_dir)
    if gfile.Exists(data_source_l.example_dumped_dir):
        gfile.DeleteRecursively(data_source_l.example_dumped_dir)
    if gfile.Exists(data_source_l.raw_data_dir):
        gfile.DeleteRecursively(data_source_l.raw_data_dir)
    if gfile.Exists(data_source_f.data_block_dir):
        gfile.DeleteRecursively(data_source_f.data_block_dir)
    if gfile.Exists(data_source_f.example_dumped_dir):
        gfile.DeleteRecursively(data_source_f.example_dumped_dir)
    if gfile.Exists(data_source_f.raw_data_dir):
        gfile.DeleteRecursively(data_source_f.raw_data_dir)
    self.worker_options = dj_pb.DataJoinWorkerOptions(
        use_mock_etcd=True,
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                              compressed_type=''),
        example_id_dump_options=dj_pb.ExampleIdDumpOptions(
            example_id_dump_interval=1, example_id_dump_threshold=1024),
        example_joiner_options=dj_pb.ExampleJoinerOptions(
            example_joiner='STREAM_JOINER',
            min_matching_window=64,
            max_matching_window=256,
            data_block_dump_interval=30,
            data_block_dump_threshold=1000),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=512, max_flying_item=2048),
        data_block_builder_options=dj_pb.WriterOptions(
            output_writer='TF_RECORD'))
    self.total_index = 1 << 13
def generate_raw_data(self, etcd, rdp, data_source, partition_id,
                      block_size, shuffle_win_size, feat_key_fmt,
                      feat_val_fmt):
    dbm = data_block_manager.DataBlockManager(data_source, partition_id)
    raw_data_dir = os.path.join(data_source.raw_data_dir,
                                common.partition_repr(partition_id))
    if gfile.Exists(raw_data_dir):
        gfile.DeleteRecursively(raw_data_dir)
    gfile.MakeDirs(raw_data_dir)
    useless_index = 0
    new_raw_data_fnames = []
    for block_index in range(self.total_index // block_size):
        builder = DataBlockBuilder(
            data_source.raw_data_dir, data_source.data_source_meta.name,
            partition_id, block_index,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        cands = list(range(block_index * block_size,
                           (block_index + 1) * block_size))
        start_index = cands[0]
        for i in range(len(cands)):
            if random.randint(1, 4) > 2:
                continue
            a = random.randint(i - shuffle_win_size, i + shuffle_win_size)
            b = random.randint(i - shuffle_win_size, i + shuffle_win_size)
            if a < 0:
                a = 0
            if a >= len(cands):
                a = len(cands) - 1
            if b < 0:
                b = 0
            if b >= len(cands):
                b = len(cands) - 1
            if (abs(cands[a] - i - start_index) <= shuffle_win_size and
                    abs(cands[b] - i - start_index) <= shuffle_win_size):
                cands[a], cands[b] = cands[b], cands[a]
        for example_idx in cands:
            feat = {}
            example_id = '{}'.format(example_idx).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + example_idx
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat[feat_key_fmt.format(example_idx)] = tf.train.Feature(
                bytes_list=tf.train.BytesList(
                    value=[feat_val_fmt.format(example_idx).encode()]))
            example = tf.train.Example(
                features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                useless_index, useless_index)
            useless_index += 1
        meta = builder.finish_data_block()
        fname = common.encode_data_block_fname(
            data_source.data_source_meta.name, meta)
        new_raw_data_fnames.append(os.path.join(raw_data_dir, fname))
    fpaths = [os.path.join(raw_data_dir, f)
              for f in gfile.ListDirectory(raw_data_dir)
              if not gfile.IsDirectory(os.path.join(raw_data_dir, f))]
    for fpath in fpaths:
        if fpath.endswith(common.DataBlockMetaSuffix):
            gfile.Remove(fpath)
    rdp.publish_raw_data(partition_id, new_raw_data_fnames)
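# The swap loop above only perturbs example order within roughly
# shuffle_win_size positions, so ids stay approximately sorted for the joiner.
# A standalone sketch of the same idea (the helper name is hypothetical and
# not part of the module):
def _bounded_shuffle_sketch(n, win_size):
    # Returns a permutation of range(n) in which every element ends up within
    # about 2 * win_size of its original position, mirroring the swap rule in
    # generate_raw_data above.
    import random  # already imported at module level; repeated so the sketch stands alone
    out = list(range(n))
    for i in range(n):
        a = min(max(random.randint(i - win_size, i + win_size), 0), n - 1)
        b = min(max(random.randint(i - win_size, i + win_size), 0), n - 1)
        if abs(out[a] - i) <= win_size and abs(out[b] - i) <= win_size:
            out[a], out[b] = out[b], out[a]
    return out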
def test_data_block_manager(self):
    data_block_datas = []
    data_block_metas = []
    leader_index = 0
    follower_index = 65536
    for i in range(5):
        fill_examples = []
        builder = DataBlockBuilder(
            self.data_source.data_block_dir,
            self.data_source.data_source_meta.name,
            0, i, dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
        builder.set_data_block_manager(self.data_block_manager)
        for j in range(1024):
            feat = {}
            example_id = '{}'.format(i * 1024 + j).encode()
            feat['example_id'] = tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[example_id]))
            event_time = 150000000 + i * 1024 + j
            feat['event_time'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[event_time]))
            feat['leader_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[leader_index]))
            feat['follower_index'] = tf.train.Feature(
                int64_list=tf.train.Int64List(value=[follower_index]))
            example = tf.train.Example(
                features=tf.train.Features(feature=feat))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                leader_index, follower_index)
            fill_examples.append((example, {
                'example_id': example_id,
                'event_time': event_time,
                'leader_index': leader_index,
                'follower_index': follower_index
            }))
            leader_index += 1
            follower_index += 1
        meta = builder.finish_data_block()
        data_block_datas.append(fill_examples)
        data_block_metas.append(meta)
    self.assertEqual(self.data_block_manager.get_dumped_data_block_count(), 5)
    self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                     data_block_metas[-1])
    for (idx, meta) in enumerate(data_block_metas):
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(idx), meta)
        self.assertEqual(
            meta.block_id,
            common.encode_block_id(self.data_source.data_source_meta.name,
                                   meta))
    self.assertEqual(
        self.data_block_manager.get_data_block_meta_by_index(5), None)
    data_block_dir = os.path.join(self.data_source.data_block_dir,
                                  common.partition_repr(0))
    for (i, meta) in enumerate(data_block_metas):
        data_block_fpath = os.path.join(
            data_block_dir, meta.block_id) + common.DataBlockSuffix
        data_block_meta_fpath = os.path.join(
            data_block_dir,
            common.encode_data_block_meta_fname(
                self.data_source.data_source_meta.name,
                0, meta.data_block_index))
        self.assertTrue(gfile.Exists(data_block_fpath))
        self.assertTrue(gfile.Exists(data_block_meta_fpath))
        fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
        remote_meta = text_format.Parse(next(fiter).decode(),
                                        dj_pb.DataBlockMeta())
        self.assertEqual(meta, remote_meta)
        for (j, record) in enumerate(
                tf.io.tf_record_iterator(data_block_fpath)):
            example = tf.train.Example()
            example.ParseFromString(record)
            stored_data = data_block_datas[i][j]
            self.assertEqual(example, stored_data[0])
            feat = example.features.feature
            stored_feat = stored_data[1]
            self.assertTrue('example_id' in feat)
            self.assertTrue('example_id' in stored_feat)
            self.assertEqual(stored_feat['example_id'],
                             '{}'.format(i * 1024 + j).encode())
            self.assertEqual(stored_feat['example_id'],
                             feat['example_id'].bytes_list.value[0])
            self.assertTrue('event_time' in feat)
            self.assertTrue('event_time' in stored_feat)
            self.assertEqual(stored_feat['event_time'],
                             feat['event_time'].int64_list.value[0])
            self.assertTrue('leader_index' in feat)
            self.assertTrue('leader_index' in stored_feat)
            self.assertEqual(stored_feat['leader_index'],
                             feat['leader_index'].int64_list.value[0])
            self.assertTrue('follower_index' in feat)
            self.assertTrue('follower_index' in stored_feat)
            self.assertEqual(stored_feat['follower_index'],
                             feat['follower_index'].int64_list.value[0])
        self.assertEqual(j, 1023)
    # Re-open the manager so the count check reads back the persisted state.
    data_block_manager2 = data_block_manager.DataBlockManager(
        self.data_source, 0)
    self.assertEqual(data_block_manager2.get_dumped_data_block_count(), 5)
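# The tf.train.Example construction above repeats across several tests; a
# small helper like the following (hypothetical, not part of the module)
# captures the pattern of one bytes example_id plus any number of int64
# features:
def _make_tf_example_sketch(example_id, **int64_feats):
    feat = {'example_id': tf.train.Feature(
        bytes_list=tf.train.BytesList(value=[example_id]))}
    for name, value in int64_feats.items():
        feat[name] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[value]))
    return tf.train.Example(features=tf.train.Features(feature=feat))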
def setUp(self):
    etcd_name = 'test_etcd'
    etcd_addrs = 'localhost:2379'
    etcd_base_dir_l = 'byefl_l'
    etcd_base_dir_f = 'byefl_f'
    data_source_name = 'test_data_source'
    etcd_l = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_l, True)
    etcd_f = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_f, True)
    etcd_l.delete_prefix(common.data_source_etcd_base_dir(data_source_name))
    etcd_f.delete_prefix(common.data_source_etcd_base_dir(data_source_name))
    data_source_l = common_pb.DataSource()
    self.raw_data_pub_dir_l = './raw_data_pub_dir_l'
    data_source_l.raw_data_sub_dir = self.raw_data_pub_dir_l
    data_source_l.role = common_pb.FLRole.Leader
    data_source_l.state = common_pb.DataSourceState.Init
    data_source_l.output_base_dir = "./ds_output_l"
    self.raw_data_dir_l = "./raw_data_l"
    data_source_f = common_pb.DataSource()
    self.raw_data_pub_dir_f = './raw_data_pub_dir_f'
    data_source_f.role = common_pb.FLRole.Follower
    data_source_f.raw_data_sub_dir = self.raw_data_pub_dir_f
    data_source_f.state = common_pb.DataSourceState.Init
    data_source_f.output_base_dir = "./ds_output_f"
    self.raw_data_dir_f = "./raw_data_f"
    data_source_meta = common_pb.DataSourceMeta()
    data_source_meta.name = data_source_name
    data_source_meta.partition_num = 2
    data_source_meta.start_time = 0
    data_source_meta.end_time = 100000000
    data_source_l.data_source_meta.MergeFrom(data_source_meta)
    common.commit_data_source(etcd_l, data_source_l)
    data_source_f.data_source_meta.MergeFrom(data_source_meta)
    common.commit_data_source(etcd_f, data_source_f)
    self.etcd_l = etcd_l
    self.etcd_f = etcd_f
    self.data_source_l = data_source_l
    self.data_source_f = data_source_f
    self.data_source_name = data_source_name
    self.etcd_name = etcd_name
    self.etcd_addrs = etcd_addrs
    self.etcd_base_dir_l = etcd_base_dir_l
    self.etcd_base_dir_f = etcd_base_dir_f
    self.raw_data_publisher_l = raw_data_publisher.RawDataPublisher(
        self.etcd_l, self.raw_data_pub_dir_l)
    self.raw_data_publisher_f = raw_data_publisher.RawDataPublisher(
        self.etcd_f, self.raw_data_pub_dir_f)
    if gfile.Exists(data_source_l.output_base_dir):
        gfile.DeleteRecursively(data_source_l.output_base_dir)
    if gfile.Exists(self.raw_data_dir_l):
        gfile.DeleteRecursively(self.raw_data_dir_l)
    if gfile.Exists(data_source_f.output_base_dir):
        gfile.DeleteRecursively(data_source_f.output_base_dir)
    if gfile.Exists(self.raw_data_dir_f):
        gfile.DeleteRecursively(self.raw_data_dir_f)
    self.worker_options = dj_pb.DataJoinWorkerOptions(
        use_mock_etcd=True,
        raw_data_options=dj_pb.RawDataOptions(
            raw_data_iter='TF_RECORD',
            read_ahead_size=1 << 20,
            read_batch_size=128),
        example_id_dump_options=dj_pb.ExampleIdDumpOptions(
            example_id_dump_interval=1,
            example_id_dump_threshold=1024),
        example_joiner_options=dj_pb.ExampleJoinerOptions(
            example_joiner='STREAM_JOINER',
            min_matching_window=64,
            max_matching_window=256,
            data_block_dump_interval=30,
            data_block_dump_threshold=1000),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=512,
            max_flying_item=2048),
        data_block_builder_options=dj_pb.WriterOptions(
            output_writer='TF_RECORD'))
    self.total_index = 1 << 12
        min_matching_window=args.min_matching_window,
        max_matching_window=args.max_matching_window,
        data_block_dump_interval=args.data_block_dump_interval,
        data_block_dump_threshold=args.data_block_dump_threshold,
        max_conversion_delay=interval_to_timestamp(
            args.max_conversion_delay),
        enable_negative_example_generator=
            args.enable_negative_example_generator,
    ),
    example_id_dump_options=dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=args.example_id_dump_interval,
        example_id_dump_threshold=args.example_id_dump_threshold),
    batch_processor_options=dj_pb.BatchProcessorOptions(
        batch_size=4096, max_flying_item=-1),
    data_block_builder_options=dj_pb.WriterOptions(
        output_writer=args.data_block_builder,
        compressed_type=args.data_block_compressed_type))
db_database, db_addr, db_username, db_password, db_base_dir = \
    get_kvstore_config(args.kvstore_type)
worker_srv = DataJoinWorkerService(args.listen_port, args.peer_addr,
                                   args.master_addr, args.rank_id,
                                   db_database, db_base_dir, db_addr,
                                   db_username, db_password,
                                   worker_options)
worker_srv.run()
def _launch_workers(self):
    worker_options_l = dj_pb.DataJoinWorkerOptions(
        use_mock_etcd=True,
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                              read_ahead_size=1 << 20,
                                              read_batch_size=128),
        example_id_dump_options=dj_pb.ExampleIdDumpOptions(
            example_id_dump_interval=1, example_id_dump_threshold=1024),
        example_joiner_options=dj_pb.ExampleJoinerOptions(
            example_joiner='SORT_RUN_JOINER',
            min_matching_window=64,
            max_matching_window=256,
            data_block_dump_interval=30,
            data_block_dump_threshold=1000),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=1024, max_flying_item=4096),
        data_block_builder_options=dj_pb.WriterOptions(
            output_writer='CSV_DICT'))
    worker_options_f = dj_pb.DataJoinWorkerOptions(
        use_mock_etcd=True,
        raw_data_options=dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                              read_ahead_size=1 << 20,
                                              read_batch_size=128),
        example_id_dump_options=dj_pb.ExampleIdDumpOptions(
            example_id_dump_interval=1, example_id_dump_threshold=1024),
        example_joiner_options=dj_pb.ExampleJoinerOptions(
            example_joiner='SORT_RUN_JOINER',
            min_matching_window=64,
            max_matching_window=256,
            data_block_dump_interval=30,
            data_block_dump_threshold=1000),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=1024, max_flying_item=4096),
        data_block_builder_options=dj_pb.WriterOptions(
            output_writer='TF_RECORD'))
    self._worker_addrs_l = ['localhost:4161', 'localhost:4162',
                            'localhost:4163', 'localhost:4164']
    self._worker_addrs_f = ['localhost:5161', 'localhost:5162',
                            'localhost:5163', 'localhost:5164']
    self._workers_l = []
    self._workers_f = []
    for rank_id in range(4):
        worker_addr_l = self._worker_addrs_l[rank_id]
        worker_addr_f = self._worker_addrs_f[rank_id]
        os.environ['ETCD_BASE_DIR'] = self.leader_base_dir
        self._workers_l.append(data_join_worker.DataJoinWorkerService(
            int(worker_addr_l.split(':')[1]), worker_addr_f,
            self._master_addr_l, rank_id, self.kvstore_type,
            worker_options_l))
        os.environ['ETCD_BASE_DIR'] = self.follower_base_dir
        self._workers_f.append(data_join_worker.DataJoinWorkerService(
            int(worker_addr_f.split(':')[1]), worker_addr_l,
            self._master_addr_f, rank_id, self.kvstore_type,
            worker_options_f))
    for w in self._workers_l:
        w.start()
    for w in self._workers_f:
        w.start()