def generate_leader_raw_data(self):
     """Write leader-side raw data for partition 0 as TF_RECORD data blocks.

     Rebuilds the partition directory from scratch, emits one data block per
     2048 items, registers each dumped block with the manifest manager, and
     finally deletes every non data-block file from the directory.
     """
     # NOTE(review): dbm/rdm are never referenced again; their constructors
     # may still have side effects (state sync) — confirm before removing.
     dbm = data_block_manager.DataBlockManager(self.data_source_l, 0)
     raw_data_dir = os.path.join(self.data_source_l.raw_data_dir,
                                 common.partition_repr(0))
     # start from an empty partition directory
     if gfile.Exists(raw_data_dir):
         gfile.DeleteRecursively(raw_data_dir)
     gfile.MakeDirs(raw_data_dir)
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source_l, 0)
     block_index = 0
     builder = DataBlockBuilder(
         self.data_source_l.raw_data_dir,
         self.data_source_l.data_source_meta.name, 0, block_index,
         dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
     process_index = 0
     start_index = 0
     for i in range(0, self.leader_end_index + 3):
         # flush the current block every 2048 items and once near the end;
         # the item appended on the very last iteration lands in a builder
         # that is never finished — presumably intentional, TODO confirm
         if (i > 0 and i % 2048 == 0) or (i == self.leader_end_index + 2):
             meta = builder.finish_data_block()
             if meta is not None:
                 ofname = common.encode_data_block_fname(
                     self.data_source_l.data_source_meta.name, meta)
                 fpath = os.path.join(raw_data_dir, ofname)
                 self.manifest_manager.add_raw_data(0, [
                     dj_pb.RawDataMeta(
                         file_path=fpath,
                         timestamp=timestamp_pb2.Timestamp(seconds=3))
                 ], False)
                 # NOTE(review): process_index/start_index are updated but
                 # never read — confirm they are dead before removing.
                 process_index += 1
                 start_index += len(meta.example_ids)
             block_index += 1
             builder = DataBlockBuilder(
                 self.data_source_l.raw_data_dir,
                 self.data_source_l.data_source_meta.name, 0, block_index,
                 dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         feat = {}
         # operator precedence: this evaluates as (i + 1) << 30
         pt = i + 1 << 30
         # every third index maps to a small id (i // 3) — presumably to
         # align with the follower side's stride-3 leader indices; confirm
         if i % 3 == 0:
             pt = i // 3
         example_id = '{}'.format(pt).encode()
         feat['example_id'] = tf.train.Feature(
             bytes_list=tf.train.BytesList(value=[example_id]))
         event_time = 150000000 + pt
         feat['event_time'] = tf.train.Feature(
             int64_list=tf.train.Int64List(value=[event_time]))
         example = tf.train.Example(features=tf.train.Features(
             feature=feat))
         builder.append_item(TfExampleItem(example.SerializeToString()), i,
                             i)
     # keep only finished data-block files; drop temp/auxiliary leftovers
     fpaths = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in fpaths:
         if not fpath.endswith(common.DataBlockSuffix):
             gfile.Remove(fpath)
# Example #2 (non-code scrape separator)
 def _preprocess_rsa_psi_follower(self):
     """Kick off one RSA-PSI preprocessor per follower partition and wait.

     Reads the RSA public key once, starts a preprocessor for every
     partition of the follower data source, then blocks until all of them
     report finished.
     """
     with gfile.GFile(self._rsa_public_key_path, 'rb') as f:
         rsa_key_pem = f.read()
     partition_num = self._data_source_f.data_source_meta.partition_num
     launched = []
     for partition_id in range(partition_num):
         # one preprocessor per partition; options mirror the production
         # follower configuration with test-sized buffers
         options = dj_pb.RsaPsiPreProcessorOptions(
             preprocessor_name='follower-rsa-psi-processor',
             role=common_pb.FLRole.Follower,
             rsa_key_pem=rsa_key_pem,
             input_file_paths=[self._psi_raw_data_fpaths_f[partition_id]],
             output_file_dir=self._pre_processor_ouput_dir_f,
             raw_data_publish_dir=self._raw_data_pub_dir_f,
             partition_id=partition_id,
             leader_rsa_psi_signer_addr=self._rsa_psi_signer_addr,
             offload_processor_number=1,
             max_flying_sign_batch=128,
             max_flying_sign_rpc=64,
             sign_rpc_timeout_ms=100000,
             stub_fanout=2,
             slow_sign_threshold=8,
             sort_run_merger_read_ahead_buffer=1 << 20,
             batch_processor_options=dj_pb.BatchProcessorOptions(
                 batch_size=1024, max_flying_item=1 << 14),
             input_raw_data=dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                                 read_ahead_size=1 << 20),
             writer_options=dj_pb.WriterOptions(output_writer='TF_RECORD'))
         preprocessor = rsa_psi_preprocessor.RsaPsiPreProcessor(
             options, self._etcd_name, self._etcd_addrs,
             self._etcd_base_dir_f, True)
         preprocessor.start_process()
         launched.append(preprocessor)
     for preprocessor in launched:
         preprocessor.wait_for_finished()
# Example #3 (non-code scrape separator)
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    """Write synthetic CSV raw data for ids [start_id, end_id).

    Each id is hashed into one of `partition_num` partitions and written as
    a row with `raw_id` plus three derived feature columns; one
    SortRunMergerWriter per partition dumps the rows under `base_dir`.
    """
    for pid in range(partition_num):
        part_dir = os.path.join(base_dir, common.partition_repr(pid))
        if not gfile.Exists(part_dir):
            gfile.MakeDirs(part_dir)
        assert gfile.IsDirectory(part_dir)
    writer_options = dj_pb.WriterOptions(output_writer='CSV_DICT')
    writers = [SortRunMergerWriter(base_dir, 0, pid, writer_options)
               for pid in range(partition_num)]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        pid = CityHash32(str(idx)) % partition_num
        row = OrderedDict()
        row['raw_id'] = str(idx)
        # three features derived from the partition id and the raw id
        for feat_idx in range(3):
            row['feat_{}'.format(feat_idx)] = \
                str((pid << 30) + feat_idx) + str(idx)
        writers[pid].append(CsvItem(row))
    for pid, writer in enumerate(writers):
        dumped = writer.finish()
        logging.info("partition %d dump %d files", pid, len(dumped))
        for seq_id, fpath in enumerate(dumped):
            logging.info("  %d. %s", seq_id, fpath)
        logging.info("---------------")
# Example #4 (non-code scrape separator)
    def _create_data_block(self, partition_id):
        """Fill `partition_id` with 64 data blocks of 4 examples each.

        Leader indices start at 0 and follower indices at 65536; both step
        by 1 per example. Finished metas are appended to
        ``self.data_block_matas``.
        """
        dbm = data_block_manager.DataBlockManager(self.data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)

        leader_index, follower_index = 0, 65536
        for block_idx in range(64):
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(self.data_source),
                self.data_source.data_source_meta.name,
                partition_id, block_idx,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(dbm)
            for item_idx in range(4):
                example_id = '{}'.format(block_idx * 1024 + item_idx).encode()
                features = {}
                features['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                features['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(
                        value=[random.randint(0, 10)]))
                features['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                features['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(
                    features=tf.train.Features(feature=features))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            self.data_block_matas.append(builder.finish_data_block())
# Example #5 (non-code scrape separator)
    def add_data_block(self, partition_id, x, y):
        """Dump one data block built from arrays `x` (and optional `y`).

        Appends one example per row of `x` to a fresh block in
        `partition_id` and returns the finished data block meta.
        """
        dbm = self._dbms[partition_id]
        builder = DataBlockBuilder(
            common.data_source_data_block_dir(self._data_source),
            self._data_source.data_source_meta.name, partition_id,
            dbm.get_dumped_data_block_count(),
            dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
        builder.set_data_block_manager(dbm)
        for row in range(x.shape[0]):
            example_id = '{}'.format(row).encode()
            features = {}
            features['example_id'] = Feature(
                bytes_list=BytesList(value=[example_id]))
            features['event_time'] = Feature(
                int64_list=Int64List(value=[row]))
            features['x'] = Feature(float_list=FloatList(value=list(x[row])))
            if y is not None:
                features['y'] = Feature(int64_list=Int64List(value=[y[row]]))
            example = Example(features=Features(feature=features))
            builder.append_item(TfExampleItem(example.SerializeToString()),
                                row, 0)
        return builder.finish_data_block()
# Example #6 (non-code scrape separator)
 def _preprocess_rsa_psi_leader(self):
     """Kick off one RSA-PSI preprocessor per leader partition and wait.

     Reads the RSA private key once, starts a preprocessor for every
     partition of the leader data source, then blocks until all of them
     report finished.
     """
     with gfile.GFile(self._rsa_private_key_path, 'rb') as f:
         rsa_key_pem = f.read()
     partition_num = self._data_source_l.data_source_meta.partition_num
     launched = []
     for partition_id in range(partition_num):
         options = dj_pb.RsaPsiPreProcessorOptions(
             preprocessor_name='leader-rsa-psi-processor',
             role=common_pb.FLRole.Leader,
             rsa_key_pem=rsa_key_pem,
             input_file_paths=[self._psi_raw_data_fpaths_l[partition_id]],
             output_file_dir=self._pre_processor_ouput_dir_l,
             raw_data_publish_dir=self._raw_data_pub_dir_l,
             partition_id=partition_id,
             offload_processor_number=1,
             max_flying_sign_batch=128,
             stub_fanout=2,
             slow_sign_threshold=8,
             sort_run_merger_read_ahead_buffer=1 << 20,
             sort_run_merger_read_batch_size=128,
             batch_processor_options=dj_pb.BatchProcessorOptions(
                 batch_size=1024, max_flying_item=1 << 14),
             input_raw_data=dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                                 read_ahead_size=1 << 20),
             writer_options=dj_pb.WriterOptions(output_writer='TF_RECORD'))
         # the preprocessor resolves its kvstore root from the environment
         os.environ['ETCD_BASE_DIR'] = self.leader_base_dir
         preprocessor = rsa_psi_preprocessor.RsaPsiPreProcessor(
             options, self.kvstore_type, True)
         preprocessor.start_process()
         launched.append(preprocessor)
     for preprocessor in launched:
         preprocessor.wait_for_finished()
    def test_example_joiner(self):
        """End-to-end check of the configured example joiner.

        Interleaves raw-data generation and example-id dumping over several
        rounds, draining the joiner after each round, then verifies every
        dumped data block meta matches what the joiner returned.
        """
        sei = joiner_impl.create_example_joiner(
            self.example_joiner_options, self.raw_data_options,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), self.kvstore,
            self.data_source, 0)
        metas = []
        # nothing generated yet: the joiner must not emit any data block
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        self.assertEqual(len(metas), 0)
        # round 1: raw data [0, 4096), example ids [0, 6144)
        self.generate_raw_data(0, 2 * 2048)
        dumper = example_id_dumper.ExampleIdDumperManager(
            self.kvstore, self.data_source, 0, self.example_id_dump_options)
        self.generate_example_id(dumper, 0, 3 * 2048)
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        # subsequent rounds extend both inputs, draining the joiner each time
        self.generate_raw_data(2 * 2048, 2048)
        self.generate_example_id(dumper, 3 * 2048, 3 * 2048)
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        self.generate_raw_data(3 * 2048, 5 * 2048)
        self.generate_example_id(dumper, 6 * 2048, 2048)
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        self.generate_raw_data(8 * 2048, 2 * 2048)
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        self.generate_example_id(dumper, 7 * 2048, 3 * 2048)
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)
        # signal both inputs finished so the joiner flushes its tail
        sei.set_sync_example_id_finished()
        sei.set_raw_data_finished()
        with sei.make_example_joiner() as joiner:
            for meta in joiner:
                metas.append(meta)

        # every dumped block must match the metas returned by the joiner
        dbm = data_block_manager.DataBlockManager(self.data_source, 0)
        data_block_num = dbm.get_dumped_data_block_count()
        self.assertEqual(len(metas), data_block_num)
        join_count = 0
        for data_block_index in range(data_block_num):
            meta = dbm.get_data_block_meta_by_index(data_block_index)
            self.assertEqual(meta, metas[data_block_index])
            join_count += len(meta.example_ids)

        print("join rate {}/{}({}), min_matching_window {}, "\
              "max_matching_window {}".format(
              join_count, 20480,
              (join_count+.0)/(10 * 2048),
              self.example_joiner_options.min_matching_window,
              self.example_joiner_options.max_matching_window))
    def test_universal_join_key_mapper_error(self):
        mapper_code = """
from fedlearner.data_join.key_mapper.key_mapping import BaseKeyMapper
class KeyMapperMock(BaseKeyMapper):
    def leader_mapping(self, item) -> dict:
        res = item.click_id.decode().split("_")
        raise ValueError
        return dict({"req_id":res[0], "cid":res[1]})

    def follower_mapping(self, item) -> dict:
        return dict()

    @classmethod
    def name(cls):
        return "TEST_MAPPER"
"""
        abspath = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.realpath(
            os.path.join(
                abspath,
                "../../fedlearner/data_join/key_mapper/impl/keymapper_mock.py")
        )
        with open(fname, "w") as f:
            f.write(mapper_code)
        reload(key_mapper)

        self.example_joiner_options = dj_pb.ExampleJoinerOptions(
            example_joiner='UNIVERSAL_JOINER',
            min_matching_window=32,
            max_matching_window=51200,
            max_conversion_delay=interval_to_timestamp("258"),
            enable_negative_example_generator=True,
            data_block_dump_interval=32,
            data_block_dump_threshold=1024,
            negative_sampling_rate=0.8,
            join_expr="(cid,req_id) or (example_id)",
            join_key_mapper="TEST_MAPPER",
            negative_sampling_filter_expr='',
        )
        self.version = dsp.Version.V2

        sei = joiner_impl.create_example_joiner(
            self.example_joiner_options,
            self.raw_data_options,
            #dj_pb.WriterOptions(output_writer='TF_RECORD'),
            dj_pb.WriterOptions(output_writer='CSV_DICT'),
            self.kvstore,
            self.data_source,
            0)
        self.run_join(sei, 0)
        os.remove(fname)
    def _make_portal_worker(self):
        """Construct the DataPortalWorker instance used by this test."""
        raw_data_options = dj_pb.RawDataOptions(raw_data_iter="TF_RECORD",
                                                compressed_type='')
        batch_options = dj_pb.BatchProcessorOptions(
            batch_size=128, max_flying_item=300000)
        worker_options = dp_pb.DataPortalWorkerOptions(
            raw_data_options=raw_data_options,
            writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
            batch_processor_options=batch_options,
            merge_buffer_size=4096,
            merger_read_ahead_size=1000000)
        self._portal_worker = DataPortalWorker(
            worker_options, "localhost:5005", 0,
            "test_portal_worker_0", "portal_worker_0",
            "localhost:2379", True)
# Example #10 (non-code scrape separator)
    def _make_portal_worker(self):
        """Construct the DataPortalWorker instance used by this test."""
        raw_data_options = dj_pb.RawDataOptions(raw_data_iter="TF_RECORD",
                                                read_ahead_size=1 << 20,
                                                read_batch_size=128)
        batch_options = dj_pb.BatchProcessorOptions(
            batch_size=128, max_flying_item=300000)
        worker_options = dp_pb.DataPortalWorkerOptions(
            raw_data_options=raw_data_options,
            writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
            batch_processor_options=batch_options,
            merger_read_ahead_size=1000000,
            merger_read_batch_size=128)
        self._portal_worker = DataPortalWorker(
            worker_options, "localhost:5005", 0,
            "test_portal_worker_0", "portal_worker_0",
            "localhost:2379", "test_user", "test_password", True)
# Example #11 (non-code scrape separator)
    def _create_data_block(self, data_source, partition_id, x, y):
        """Split `x`/`y` into 200 data blocks and dump them into the partition.

        Leader indices start at 0; follower indices start well above the
        data size (N * chunk_size * 10) so the two ranges never overlap.
        Records ``self.max_index`` and returns the finished block metas.
        """
        dbm = data_block_manager.DataBlockManager(data_source, partition_id)
        self.assertEqual(dbm.get_dumped_data_block_count(), 0)
        self.assertEqual(dbm.get_lastest_data_block_meta(), None)
        N = 200
        chunk_size = x.shape[0] // N

        leader_index = 0
        follower_index = N * chunk_size * 10
        block_metas = []
        for block_idx in range(N):
            builder = DataBlockBuilder(
                common.data_source_data_block_dir(data_source),
                data_source.data_source_meta.name,
                partition_id, block_idx,
                dj_pb.WriterOptions(output_writer="TF_RECORD"), None)
            builder.set_data_block_manager(dbm)
            for offset in range(chunk_size):
                row = block_idx * chunk_size + offset
                exam_id = '{}'.format(row).encode()
                features = {}
                features['example_id'] = Feature(
                    bytes_list=BytesList(value=[exam_id]))
                features['event_time'] = Feature(
                    int64_list=Int64List(value=[random.randint(1, 1000)]))
                features['x'] = Feature(
                    float_list=FloatList(value=list(x[row])))
                if y is not None:
                    features['y'] = Feature(
                        int64_list=Int64List(value=[y[row]]))
                features['leader_index'] = Feature(
                    int64_list=Int64List(value=[leader_index]))
                features['follower_index'] = Feature(
                    int64_list=Int64List(value=[follower_index]))
                example = Example(features=Features(feature=features))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                leader_index += 1
                follower_index += 1
            block_metas.append(builder.finish_data_block())
        self.max_index = follower_index
        return block_metas
# Example #12 (non-code scrape separator)
 def _launch_workers(self):
     """Start four paired leader/follower data-join worker services.

     Each rank gets one leader and one follower worker wired to the peer's
     address; all leader workers are started before the follower workers.
     """
     worker_options = dj_pb.DataJoinWorkerOptions(
         use_mock_etcd=True,
         raw_data_options=dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                               compressed_type=''),
         example_id_dump_options=dj_pb.ExampleIdDumpOptions(
             example_id_dump_interval=1, example_id_dump_threshold=1024),
         example_joiner_options=dj_pb.ExampleJoinerOptions(
             example_joiner='SORT_RUN_JOINER',
             min_matching_window=64,
             max_matching_window=256,
             data_block_dump_interval=30,
             data_block_dump_threshold=1000),
         batch_processor_options=dj_pb.BatchProcessorOptions(
             batch_size=1024, max_flying_item=4096),
         data_block_builder_options=dj_pb.WriterOptions(
             output_writer='TF_RECORD'))
     self._worker_addrs_l = [
         'localhost:4161', 'localhost:4162', 'localhost:4163',
         'localhost:4164'
     ]
     self._worker_addrs_f = [
         'localhost:5161', 'localhost:5162', 'localhost:5163',
         'localhost:5164'
     ]
     self._workers_l = []
     self._workers_f = []
     addr_pairs = zip(self._worker_addrs_l, self._worker_addrs_f)
     for rank_id, (addr_l, addr_f) in enumerate(addr_pairs):
         # each service listens on its own port and peers with the other role
         self._workers_l.append(
             data_join_worker.DataJoinWorkerService(
                 int(addr_l.split(':')[1]), addr_f,
                 self._master_addr_l, rank_id, self._etcd_name,
                 self._etcd_base_dir_l, self._etcd_addrs, worker_options))
         self._workers_f.append(
             data_join_worker.DataJoinWorkerService(
                 int(addr_f.split(':')[1]), addr_l,
                 self._master_addr_f, rank_id, self._etcd_name,
                 self._etcd_base_dir_f, self._etcd_addrs, worker_options))
     for worker in self._workers_l + self._workers_f:
         worker.start()
# Example #13 (non-code scrape separator)
    def _make_portal_worker(self, raw_data_iter, validation_ratio):
        """Construct the DataPortalWorker under test.

        `raw_data_iter` selects the input format and `validation_ratio`
        controls the train/validation split of the raw data options.
        """
        raw_data_options = dj_pb.RawDataOptions(
            raw_data_iter=raw_data_iter,
            read_ahead_size=1 << 20,
            read_batch_size=128,
            optional_fields=['label'],
            validation_ratio=validation_ratio,
        )
        worker_options = dp_pb.DataPortalWorkerOptions(
            raw_data_options=raw_data_options,
            writer_options=dj_pb.WriterOptions(output_writer="TF_RECORD"),
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=128, max_flying_item=300000),
            merger_read_ahead_size=1000000,
            merger_read_batch_size=128)

        # the worker resolves its kvstore base dir from the environment
        os.environ['ETCD_BASE_DIR'] = "portal_worker_0"
        self._portal_worker = DataPortalWorker(worker_options,
                                               "localhost:5005", 0, "etcd",
                                               True)
    def test_universal_join_small_follower(self):
        """Universal joiner run where the follower side is much smaller.

        Joins on ``(id_type, example_id, trunc(event_time,1))`` with the
        default key mapper and no negative example generation, then delegates
        verification to ``run_join_small_follower``.
        """
        joiner_options = dj_pb.ExampleJoinerOptions(
            example_joiner='UNIVERSAL_JOINER',
            min_matching_window=32,
            max_matching_window=20240,
            max_conversion_delay=interval_to_timestamp("128"),
            enable_negative_example_generator=False,
            data_block_dump_interval=32,
            data_block_dump_threshold=1024,
            negative_sampling_rate=0.8,
            join_expr="(id_type, example_id, trunc(event_time,1))",
            join_key_mapper="DEFAULT",
            negative_sampling_filter_expr='',
        )
        self.example_joiner_options = joiner_options
        self.version = dsp.Version.V2

        joiner = joiner_impl.create_example_joiner(
            self.example_joiner_options, self.raw_data_options,
            dj_pb.WriterOptions(output_writer='TF_RECORD'), self.kvstore,
            self.data_source, 0)
        self.run_join_small_follower(joiner, 0.15)
# Example #15 (non-code scrape separator)
 def generate_follower_data_block(self):
     """Dump 5 follower data blocks (1024 examples each) into partition 0.

     Leader indices advance by 3 per example while follower indices advance
     by 1, so the leader id space is three times sparser. Saves the dumped
     metas and the final leader index range for the other test helpers.
     """
     dbm = data_block_manager.DataBlockManager(self.data_source_f, 0)
     self.assertEqual(dbm.get_dumped_data_block_count(), 0)
     self.assertEqual(dbm.get_lastest_data_block_meta(), None)
     leader_index = 0
     follower_index = 65536
     self.dumped_metas = []
     for i in range(5):
         builder = DataBlockBuilder(
             common.data_source_data_block_dir(self.data_source_f),
             self.data_source_f.data_source_meta.name, 0, i,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         builder.set_data_block_manager(dbm)
         for j in range(1024):
             feat = {}
             example_id = '{}'.format(i * 1024 + j).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + i * 1024 + j
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             feat['leader_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[leader_index]))
             feat['follower_index'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[follower_index]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 leader_index, follower_index)
             # leader ids step by 3, follower ids by 1
             leader_index += 3
             follower_index += 1
         meta = builder.finish_data_block()
         self.dumped_metas.append(meta)
     # exported for generate_leader_raw_data and the dumper tests
     self.leader_start_index = 0
     self.leader_end_index = leader_index
     self.assertEqual(dbm.get_dumped_data_block_count(), 5)
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(dbm.get_data_block_meta_by_index(idx), meta)
 def generate_raw_data(self, begin_index, item_count):
     """Generate ``item_count`` raw-data examples starting at ``begin_index``.

     Examples are written as 2048-record TF_RECORD blocks into partition 0,
     locally shuffled within a +/-32 window, and registered with the
     manifest manager at the end.
     """
     raw_data_dir = os.path.join(self.raw_data_dir,
                                 common.partition_repr(0))
     if not gfile.Exists(raw_data_dir):
         gfile.MakeDirs(raw_data_dir)
     self.total_raw_data_count += item_count
     useless_index = 0
     # NOTE(review): rdm is never used below — confirm the constructor has
     # no required side effect before removing it.
     rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source,
                                           0)
     fpaths = []
     for block_index in range(0, item_count // 2048):
         builder = DataBlockBuilder(
             self.raw_data_dir,
             self.data_source.data_source_meta.name, 0, block_index,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         cands = list(
             range(begin_index + block_index * 2048,
                   begin_index + (block_index + 1) * 2048))
         start_index = cands[0]
         # randomly swap nearby candidates so example ids are only locally
         # ordered; every swap keeps an id within 32 slots of its position
         for i in range(len(cands)):
             if random.randint(1, 4) > 2:
                 continue
             a = random.randint(i - 32, i + 32)
             b = random.randint(i - 32, i + 32)
             if a < 0:
                 a = 0
             if a >= len(cands):
                 a = len(cands) - 1
             if b < 0:
                 b = 0
             if b >= len(cands):
                 b = len(cands) - 1
             if (abs(cands[a] - i - start_index) <= 32
                     and abs(cands[b] - i - start_index) <= 32):
                 cands[a], cands[b] = cands[b], cands[a]
         for example_idx in cands:
             feat = {}
             example_id = '{}'.format(example_idx).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + example_idx
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             # roughly 80% of the examples carry a label feature
             label = random.choice([1, 0])
             if random.random() < 0.8:
                 feat['label'] = tf.train.Feature(
                     int64_list=tf.train.Int64List(value=[label]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 useless_index, useless_index)
             useless_index += 1
         meta = builder.finish_data_block()
         fname = common.encode_data_block_fname(
             self.data_source.data_source_meta.name, meta)
         fpath = os.path.join(raw_data_dir, fname)
         fpaths.append(
             dj_pb.RawDataMeta(
                 file_path=fpath,
                 timestamp=timestamp_pb2.Timestamp(seconds=3)))
         self.g_data_block_index += 1
     # remove auxiliary (non data-block) files left beside the data blocks
     all_files = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in all_files:
         if not fpath.endswith(common.DataBlockSuffix):
             gfile.Remove(fpath)
     self.manifest_manager.add_raw_data(0, fpaths, False)
    # --- tail of a CLI entry point; the enclosing function header is not
    # --- visible in this chunk (scrape artifact)
    set_logger()
    if args.input_data_file_iter == 'TF_RECORD' or \
            args.output_builder == 'TF_RECORD':
        # eager mode is required by the TF_RECORD reader/writer code paths
        import tensorflow
        tensorflow.compat.v1.enable_eager_execution()

    # normalize the comma-separated optional-field list, dropping empties
    optional_fields = list(
        field for field in map(str.strip, args.optional_fields.split(','))
        if field != '')

    portal_worker_options = dp_pb.DataPortalWorkerOptions(
        raw_data_options=dj_pb.RawDataOptions(
            raw_data_iter=args.input_data_file_iter,
            compressed_type=args.compressed_type,
            read_ahead_size=args.read_ahead_size,
            read_batch_size=args.read_batch_size,
            optional_fields=optional_fields),
        writer_options=dj_pb.WriterOptions(
            output_writer=args.output_builder,
            compressed_type=args.builder_compressed_type),
        batch_processor_options=dj_pb.BatchProcessorOptions(
            batch_size=args.batch_size, max_flying_item=-1),
        merger_read_ahead_size=args.merger_read_ahead_size,
        merger_read_batch_size=args.merger_read_batch_size,
        # CLI passes a percentage; the option takes a ratio
        memory_limit_ratio=args.memory_limit_ratio / 100)
    data_portal_worker = DataPortalWorker(portal_worker_options,
                                          args.master_addr, args.rank_id,
                                          args.kvstore_type,
                                          (args.kvstore_type == 'mock'))
    data_portal_worker.start()
# Example #18 (non-code scrape separator)
 def test_data_block_dumper(self):
     """Verify DataBlockDumperManager mirrors follower blocks to the leader.

     After generating follower data blocks and leader raw data, syncs the
     follower metas into the leader-side dumper, dumps, and checks that
     both sides hold identical metas and matching example ids on disk.
     """
     self.generate_follower_data_block()
     self.generate_leader_raw_data()
     dbd = data_block_dumper.DataBlockDumperManager(
         self.etcd, self.data_source_l, 0,
         dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                              read_ahead_size=1 << 20,
                              read_batch_size=128),
         dj_pb.WriterOptions(output_writer='TF_RECORD'))
     self.assertEqual(dbd.get_next_data_block_index(), 0)
     # sync every follower meta; the dumper must accept them in order
     for (idx, meta) in enumerate(self.dumped_metas):
         success, next_index = dbd.add_synced_data_block_meta(meta)
         self.assertTrue(success)
         self.assertEqual(next_index, idx + 1)
     self.assertTrue(dbd.need_dump())
     self.assertEqual(dbd.get_next_data_block_index(),
                      len(self.dumped_metas))
     with dbd.make_data_block_dumper() as dumper:
         dumper()
     dbm_f = data_block_manager.DataBlockManager(self.data_source_f, 0)
     dbm_l = data_block_manager.DataBlockManager(self.data_source_l, 0)
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      len(self.dumped_metas))
     self.assertEqual(dbm_f.get_dumped_data_block_count(),
                      dbm_l.get_dumped_data_block_count())
     for (idx, meta) in enumerate(self.dumped_metas):
         self.assertEqual(meta.data_block_index, idx)
         self.assertEqual(dbm_l.get_data_block_meta_by_index(idx), meta)
         self.assertEqual(dbm_f.get_data_block_meta_by_index(idx), meta)
         # the on-disk meta file on each side must parse back to the meta
         meta_fpth_l = os.path.join(
             common.data_source_data_block_dir(self.data_source_l),
             common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_l.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_l)
         meta_l = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_l, meta)
         meta_fpth_f = os.path.join(
             common.data_source_data_block_dir(self.data_source_f),
             common.partition_repr(0),
             common.encode_data_block_meta_fname(
                 self.data_source_f.data_source_meta.name, 0,
                 meta.data_block_index))
         mitr = tf.io.tf_record_iterator(meta_fpth_f)
         meta_f = text_format.Parse(next(mitr), dj_pb.DataBlockMeta())
         self.assertEqual(meta_f, meta)
         # every dumped record's example_id must match the meta, in order
         data_fpth_l = os.path.join(
             common.data_source_data_block_dir(self.data_source_l),
             common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_l))
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_l)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
         # NOTE(review): the fname below uses data_source_l's name for the
         # follower-side path — correct only if both data sources share a
         # name; confirm this is intentional.
         data_fpth_f = os.path.join(
             common.data_source_data_block_dir(self.data_source_f),
             common.partition_repr(0),
             common.encode_data_block_fname(
                 self.data_source_l.data_source_meta.name, meta_f))
         for (iidx,
              record) in enumerate(tf.io.tf_record_iterator(data_fpth_f)):
             example = tf.train.Example()
             example.ParseFromString(record)
             feat = example.features.feature
             self.assertEqual(feat['example_id'].bytes_list.value[0],
                              meta.example_ids[iidx])
         self.assertEqual(len(meta.example_ids), iidx + 1)
    def setUp(self):
        """Bootstrap a leader/follower data-join pair for the tests.

        Commits both data sources to etcd, starts the two
        DataJoinMaster services, waits until both masters report the
        Processing state, wipes the on-disk output directories and
        prepares the worker options shared by the test cases.
        """
        etcd_name = 'test_etcd'
        etcd_addrs = 'localhost:2379'
        etcd_base_dir_l = 'byefl_l'
        etcd_base_dir_f = 'byefl_f'
        data_source_name = 'test_data_source'
        etcd_l = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_l, True)
        etcd_f = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_f, True)
        # Drop any state a previous run may have left in etcd.
        etcd_l.delete_prefix(
            common.data_source_etcd_base_dir(data_source_name))
        etcd_f.delete_prefix(
            common.data_source_etcd_base_dir(data_source_name))
        # Leader-side data source description.
        data_source_l = common_pb.DataSource()
        self.raw_data_pub_dir_l = './raw_data_pub_dir_l'
        data_source_l.raw_data_sub_dir = self.raw_data_pub_dir_l
        data_source_l.role = common_pb.FLRole.Leader
        data_source_l.state = common_pb.DataSourceState.Init
        data_source_l.data_block_dir = "./data_block_l"
        data_source_l.raw_data_dir = "./raw_data_l"
        data_source_l.example_dumped_dir = "./example_dumped_l"
        # Follower-side data source description.
        data_source_f = common_pb.DataSource()
        self.raw_data_pub_dir_f = './raw_data_pub_dir_f'
        data_source_f.role = common_pb.FLRole.Follower
        data_source_f.raw_data_sub_dir = self.raw_data_pub_dir_f
        data_source_f.state = common_pb.DataSourceState.Init
        data_source_f.data_block_dir = "./data_block_f"
        data_source_f.raw_data_dir = "./raw_data_f"
        data_source_f.example_dumped_dir = "./example_dumped_f"
        # Both sides share the same meta (name, 2 partitions, time span).
        data_source_meta = common_pb.DataSourceMeta()
        data_source_meta.name = data_source_name
        data_source_meta.partition_num = 2
        data_source_meta.start_time = 0
        data_source_meta.end_time = 100000000
        data_source_l.data_source_meta.MergeFrom(data_source_meta)
        common.commit_data_source(etcd_l, data_source_l)
        data_source_f.data_source_meta.MergeFrom(data_source_meta)
        common.commit_data_source(etcd_f, data_source_f)
        master_options = dj_pb.DataJoinMasterOptions(use_mock_etcd=True)

        master_addr_l = 'localhost:4061'
        master_addr_f = 'localhost:4062'
        master_l = data_join_master.DataJoinMasterService(
            int(master_addr_l.split(':')[1]),
            master_addr_f,
            data_source_name,
            etcd_name,
            etcd_base_dir_l,
            etcd_addrs,
            master_options,
        )
        master_l.start()
        master_f = data_join_master.DataJoinMasterService(
            int(master_addr_f.split(':')[1]), master_addr_l, data_source_name,
            etcd_name, etcd_base_dir_f, etcd_addrs, master_options)
        master_f.start()
        channel_l = make_insecure_channel(master_addr_l, ChannelType.INTERNAL)
        master_client_l = dj_grpc.DataJoinMasterServiceStub(channel_l)
        channel_f = make_insecure_channel(master_addr_f, ChannelType.INTERNAL)
        master_client_f = dj_grpc.DataJoinMasterServiceStub(channel_f)

        # Poll both masters until they have negotiated their way from
        # Init to Processing; only then are the workers allowed to run.
        while True:
            req_l = dj_pb.DataSourceRequest(
                data_source_meta=data_source_l.data_source_meta)
            req_f = dj_pb.DataSourceRequest(
                data_source_meta=data_source_f.data_source_meta)
            dss_l = master_client_l.GetDataSourceStatus(req_l)
            dss_f = master_client_f.GetDataSourceStatus(req_f)
            self.assertEqual(dss_l.role, common_pb.FLRole.Leader)
            self.assertEqual(dss_f.role, common_pb.FLRole.Follower)
            if dss_l.state == common_pb.DataSourceState.Processing and \
                    dss_f.state == common_pb.DataSourceState.Processing:
                break
            else:
                time.sleep(2)

        self.master_client_l = master_client_l
        self.master_client_f = master_client_f
        self.master_addr_l = master_addr_l
        self.master_addr_f = master_addr_f
        self.etcd_l = etcd_l
        self.etcd_f = etcd_f
        self.data_source_l = data_source_l
        self.data_source_f = data_source_f
        self.master_l = master_l
        self.master_f = master_f
        # BUG FIX: the original line ended with a stray comma, which stored
        # the 1-tuple ('test_data_source',) instead of the string.
        self.data_source_name = data_source_name
        self.etcd_name = etcd_name
        self.etcd_addrs = etcd_addrs
        self.etcd_base_dir_l = etcd_base_dir_l
        self.etcd_base_dir_f = etcd_base_dir_f
        self.raw_data_publisher_l = raw_data_publisher.RawDataPublisher(
            self.etcd_l, self.raw_data_pub_dir_l)
        self.raw_data_publisher_f = raw_data_publisher.RawDataPublisher(
            self.etcd_f, self.raw_data_pub_dir_f)
        # Start from an empty on-disk footprint for both roles.
        if gfile.Exists(data_source_l.data_block_dir):
            gfile.DeleteRecursively(data_source_l.data_block_dir)
        if gfile.Exists(data_source_l.example_dumped_dir):
            gfile.DeleteRecursively(data_source_l.example_dumped_dir)
        if gfile.Exists(data_source_l.raw_data_dir):
            gfile.DeleteRecursively(data_source_l.raw_data_dir)
        if gfile.Exists(data_source_f.data_block_dir):
            gfile.DeleteRecursively(data_source_f.data_block_dir)
        if gfile.Exists(data_source_f.example_dumped_dir):
            gfile.DeleteRecursively(data_source_f.example_dumped_dir)
        if gfile.Exists(data_source_f.raw_data_dir):
            gfile.DeleteRecursively(data_source_f.raw_data_dir)

        self.worker_options = dj_pb.DataJoinWorkerOptions(
            use_mock_etcd=True,
            raw_data_options=dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type=''),
            example_id_dump_options=dj_pb.ExampleIdDumpOptions(
                example_id_dump_interval=1, example_id_dump_threshold=1024),
            example_joiner_options=dj_pb.ExampleJoinerOptions(
                example_joiner='STREAM_JOINER',
                min_matching_window=64,
                max_matching_window=256,
                data_block_dump_interval=30,
                data_block_dump_threshold=1000),
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=512, max_flying_item=2048),
            data_block_builder_options=dj_pb.WriterOptions(
                output_writer='TF_RECORD'))

        # Number of raw-data items generated per test run.
        self.total_index = 1 << 13
 def generate_raw_data(self, etcd, rdp, data_source, partition_id,
                       block_size, shuffle_win_size, feat_key_fmt,
                       feat_val_fmt):
     """Generate locally-shuffled TF_RECORD raw data for one partition.

     Writes self.total_index // block_size data blocks under the
     partition's raw data directory, randomly drops about half of the
     candidate example indices, shuffles the survivors within a
     +/- shuffle_win_size window, deletes the side-car meta files and
     publishes the produced file paths through rdp.

     NOTE(review): the `etcd` parameter and the `dbm` manager below are
     never used afterwards — possibly kept for constructor side
     effects; confirm before removing.
     """
     dbm = data_block_manager.DataBlockManager(data_source, partition_id)
     raw_data_dir = os.path.join(data_source.raw_data_dir,
                                 common.partition_repr(partition_id))
     # Recreate the partition directory from scratch.
     if gfile.Exists(raw_data_dir):
         gfile.DeleteRecursively(raw_data_dir)
     gfile.MakeDirs(raw_data_dir)
     useless_index = 0
     new_raw_data_fnames = []
     for block_index in range(self.total_index // block_size):
         builder = DataBlockBuilder(
             data_source.raw_data_dir, data_source.data_source_meta.name,
             partition_id, block_index,
             dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
         # Candidate example indices for this block, in ascending order.
         cands = list(
             range(block_index * block_size,
                   (block_index + 1) * block_size))
         start_index = cands[0]
         for i in range(len(cands)):
             # Skip roughly half of the positions at random.
             if random.randint(1, 4) > 2:
                 continue
             # Pick two positions near i and clamp them into range.
             a = random.randint(i - shuffle_win_size, i + shuffle_win_size)
             b = random.randint(i - shuffle_win_size, i + shuffle_win_size)
             if a < 0:
                 a = 0
             if a >= len(cands):
                 a = len(cands) - 1
             if b < 0:
                 b = 0
             if b >= len(cands):
                 b = len(cands) - 1
             # Swap only while both displaced values stay within the
             # shuffle window of absolute position i + start_index, so the
             # resulting stream is only locally out of order.
             if (abs(cands[a] - i - start_index) <= shuffle_win_size and
                     abs(cands[b] - i - start_index) <= shuffle_win_size):
                 cands[a], cands[b] = cands[b], cands[a]
         for example_idx in cands:
             feat = {}
             example_id = '{}'.format(example_idx).encode()
             feat['example_id'] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(value=[example_id]))
             event_time = 150000000 + example_idx
             feat['event_time'] = tf.train.Feature(
                 int64_list=tf.train.Int64List(value=[event_time]))
             feat[feat_key_fmt.format(example_idx)] = tf.train.Feature(
                 bytes_list=tf.train.BytesList(
                     value=[feat_val_fmt.format(example_idx).encode()]))
             example = tf.train.Example(features=tf.train.Features(
                 feature=feat))
             # leader/follower indices are irrelevant for raw data; a
             # monotonically increasing dummy value is used for both.
             builder.append_item(TfExampleItem(example.SerializeToString()),
                                 useless_index, useless_index)
             useless_index += 1
         meta = builder.finish_data_block()
         fname = common.encode_data_block_fname(
             data_source.data_source_meta.name, meta)
         new_raw_data_fnames.append(os.path.join(raw_data_dir, fname))
     # Remove the meta side-car files: raw data consumers expect only the
     # data block payload files in this directory.
     fpaths = [
         os.path.join(raw_data_dir, f)
         for f in gfile.ListDirectory(raw_data_dir)
         if not gfile.IsDirectory(os.path.join(raw_data_dir, f))
     ]
     for fpath in fpaths:
         if fpath.endswith(common.DataBlockMetaSuffix):
             gfile.Remove(fpath)
     rdp.publish_raw_data(partition_id, new_raw_data_fnames)
    def test_data_block_manager(self):
        """End-to-end check of DataBlockManager dump and recovery.

        Builds 5 data blocks of 1024 examples each, verifies the
        manager's meta bookkeeping, re-reads the dumped meta and data
        files from disk, and finally checks that a freshly constructed
        manager recovers the same dumped state.
        """
        data_block_datas = []
        data_block_metas = []
        leader_index = 0
        follower_index = 65536
        for i in range(5):
            fill_examples = []
            builder = DataBlockBuilder(
                self.data_source.data_block_dir,
                self.data_source.data_source_meta.name, 0, i,
                dj_pb.WriterOptions(output_writer='TF_RECORD'), None)
            builder.set_data_block_manager(self.data_block_manager)
            for j in range(1024):
                feat = {}
                example_id = '{}'.format(i * 1024 + j).encode()
                feat['example_id'] = tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[example_id]))
                event_time = 150000000 + i * 1024 + j
                feat['event_time'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[event_time]))
                feat['leader_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[leader_index]))
                feat['follower_index'] = tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[follower_index]))
                example = tf.train.Example(features=tf.train.Features(
                    feature=feat))
                builder.append_item(TfExampleItem(example.SerializeToString()),
                                    leader_index, follower_index)
                # Remember both the proto and the raw field values so the
                # on-disk records can be verified independently below.
                fill_examples.append((example, {
                    'example_id': example_id,
                    'event_time': event_time,
                    'leader_index': leader_index,
                    'follower_index': follower_index
                }))
                leader_index += 1
                follower_index += 1
            meta = builder.finish_data_block()
            data_block_datas.append(fill_examples)
            data_block_metas.append(meta)
        # The manager must have tracked every finished block.
        self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                         5)
        self.assertEqual(self.data_block_manager.get_lastest_data_block_meta(),
                         data_block_metas[-1])
        for (idx, meta) in enumerate(data_block_metas):
            self.assertEqual(
                self.data_block_manager.get_data_block_meta_by_index(idx),
                meta)
            self.assertEqual(
                meta.block_id,
                common.encode_block_id(self.data_source.data_source_meta.name,
                                       meta))
        # Indexing past the last block yields None rather than raising.
        self.assertEqual(
            self.data_block_manager.get_data_block_meta_by_index(5), None)
        data_block_dir = os.path.join(self.data_source.data_block_dir,
                                      common.partition_repr(0))
        for (i, meta) in enumerate(data_block_metas):
            data_block_fpath = os.path.join(
                data_block_dir, meta.block_id) + common.DataBlockSuffix
            data_block_meta_fpath = os.path.join(
                data_block_dir,
                common.encode_data_block_meta_fname(
                    self.data_source.data_source_meta.name, 0,
                    meta.data_block_index))
            self.assertTrue(gfile.Exists(data_block_fpath))
            self.assertTrue(gfile.Exists(data_block_meta_fpath))
            # The dumped meta file must round-trip to the in-memory meta.
            fiter = tf.io.tf_record_iterator(data_block_meta_fpath)
            remote_meta = text_format.Parse(
                next(fiter).decode(), dj_pb.DataBlockMeta())
            self.assertEqual(meta, remote_meta)
            # Every dumped record must match what was appended.
            for (j, record) in enumerate(
                    tf.io.tf_record_iterator(data_block_fpath)):
                example = tf.train.Example()
                example.ParseFromString(record)
                stored_data = data_block_datas[i][j]
                self.assertEqual(example, stored_data[0])
                feat = example.features.feature
                stored_feat = stored_data[1]
                self.assertTrue('example_id' in feat)
                self.assertTrue('example_id' in stored_feat)
                self.assertEqual(stored_feat['example_id'],
                                 '{}'.format(i * 1024 + j).encode())
                self.assertEqual(stored_feat['example_id'],
                                 feat['example_id'].bytes_list.value[0])
                self.assertTrue('event_time' in feat)
                self.assertTrue('event_time' in stored_feat)
                self.assertEqual(stored_feat['event_time'],
                                 feat['event_time'].int64_list.value[0])
                self.assertTrue('leader_index' in feat)
                self.assertTrue('leader_index' in stored_feat)
                self.assertEqual(stored_feat['leader_index'],
                                 feat['leader_index'].int64_list.value[0])
                self.assertTrue('follower_index' in feat)
                self.assertTrue('follower_index' in stored_feat)
                self.assertEqual(stored_feat['follower_index'],
                                 feat['follower_index'].int64_list.value[0])
            self.assertEqual(j, 1023)

        # BUG FIX: a fresh manager is constructed to prove that the dumped
        # state is recoverable from disk, but the original assertion
        # re-checked self.data_block_manager and never touched it.
        data_block_manager2 = data_block_manager.DataBlockManager(
            self.data_source, 0)
        self.assertEqual(data_block_manager2.get_dumped_data_block_count(),
                         5)
        self.assertEqual(data_block_manager2.get_lastest_data_block_meta(),
                         data_block_metas[-1])
# Example #22
    def setUp(self):
        """Prepare leader and follower data sources for a data-join run.

        Commits both data source protos to their etcd namespaces,
        creates the raw-data publishers, clears the output directories
        from previous runs and builds the shared worker options.
        """
        etcd_name = 'test_etcd'
        etcd_addrs = 'localhost:2379'
        etcd_base_dir_l = 'byefl_l'
        etcd_base_dir_f = 'byefl_f'
        data_source_name = 'test_data_source'
        etcd_l = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_l, True)
        etcd_f = EtcdClient(etcd_name, etcd_addrs, etcd_base_dir_f, True)
        # Wipe any etcd state left behind by an earlier run.
        for client in (etcd_l, etcd_f):
            client.delete_prefix(
                common.data_source_etcd_base_dir(data_source_name))

        self.raw_data_pub_dir_l = './raw_data_pub_dir_l'
        self.raw_data_pub_dir_f = './raw_data_pub_dir_f'
        self.raw_data_dir_l = "./raw_data_l"
        self.raw_data_dir_f = "./raw_data_f"

        # Leader-side data source.
        data_source_l = common_pb.DataSource()
        data_source_l.role = common_pb.FLRole.Leader
        data_source_l.raw_data_sub_dir = self.raw_data_pub_dir_l
        data_source_l.state = common_pb.DataSourceState.Init
        data_source_l.output_base_dir = "./ds_output_l"

        # Follower-side data source.
        data_source_f = common_pb.DataSource()
        data_source_f.role = common_pb.FLRole.Follower
        data_source_f.raw_data_sub_dir = self.raw_data_pub_dir_f
        data_source_f.state = common_pb.DataSourceState.Init
        data_source_f.output_base_dir = "./ds_output_f"

        # Both sides share one meta: same name, 2 partitions, same span.
        meta = common_pb.DataSourceMeta()
        meta.name = data_source_name
        meta.partition_num = 2
        meta.start_time = 0
        meta.end_time = 100000000
        data_source_l.data_source_meta.MergeFrom(meta)
        common.commit_data_source(etcd_l, data_source_l)
        data_source_f.data_source_meta.MergeFrom(meta)
        common.commit_data_source(etcd_f, data_source_f)

        self.etcd_l = etcd_l
        self.etcd_f = etcd_f
        self.data_source_l = data_source_l
        self.data_source_f = data_source_f
        self.data_source_name = data_source_name
        self.etcd_name = etcd_name
        self.etcd_addrs = etcd_addrs
        self.etcd_base_dir_l = etcd_base_dir_l
        self.etcd_base_dir_f = etcd_base_dir_f
        self.raw_data_publisher_l = raw_data_publisher.RawDataPublisher(
            etcd_l, self.raw_data_pub_dir_l)
        self.raw_data_publisher_f = raw_data_publisher.RawDataPublisher(
            etcd_f, self.raw_data_pub_dir_f)

        # Start each run from an empty on-disk footprint.
        for dirname in (data_source_l.output_base_dir, self.raw_data_dir_l,
                        data_source_f.output_base_dir, self.raw_data_dir_f):
            if gfile.Exists(dirname):
                gfile.DeleteRecursively(dirname)

        self.worker_options = dj_pb.DataJoinWorkerOptions(
            use_mock_etcd=True,
            raw_data_options=dj_pb.RawDataOptions(
                raw_data_iter='TF_RECORD',
                read_ahead_size=1 << 20,
                read_batch_size=128),
            example_id_dump_options=dj_pb.ExampleIdDumpOptions(
                example_id_dump_interval=1,
                example_id_dump_threshold=1024),
            example_joiner_options=dj_pb.ExampleJoinerOptions(
                example_joiner='STREAM_JOINER',
                min_matching_window=64,
                max_matching_window=256,
                data_block_dump_interval=30,
                data_block_dump_threshold=1000),
            batch_processor_options=dj_pb.BatchProcessorOptions(
                batch_size=512,
                max_flying_item=2048),
            data_block_builder_options=dj_pb.WriterOptions(
                output_writer='TF_RECORD'))

        # Number of raw-data items generated per test run.
        self.total_index = 1 << 12
# Example #23
                    min_matching_window=args.min_matching_window,
                    max_matching_window=args.max_matching_window,
                    data_block_dump_interval=args.data_block_dump_interval,
                    data_block_dump_threshold=args.data_block_dump_threshold,
                    max_conversion_delay=interval_to_timestamp(\
                                            args.max_conversion_delay),
                    enable_negative_example_generator=\
                        args.enable_negative_example_generator,
                ),
            example_id_dump_options=dj_pb.ExampleIdDumpOptions(
                    example_id_dump_interval=args.example_id_dump_interval,
                    example_id_dump_threshold=args.example_id_dump_threshold
                ),
            batch_processor_options=dj_pb.BatchProcessorOptions(
                    batch_size=4096,
                    max_flying_item=-1
                ),
            data_block_builder_options=dj_pb.WriterOptions(
                    output_writer=args.data_block_builder,
                    compressed_type=args.data_block_compressed_type
                )
        )
    db_database, db_addr, db_username, db_password, db_base_dir = \
        get_kvstore_config(args.kvstore_type)
    worker_srv = DataJoinWorkerService(args.listen_port, args.peer_addr,
                                       args.master_addr, args.rank_id,
                                       db_database, db_base_dir,
                                       db_addr, db_username,
                                       db_password, worker_options)
    worker_srv.run()
# Example #24
    def _launch_workers(self):
        """Start four leader and four follower data-join worker services.

        The leader workers read TF_RECORD raw data and write CSV_DICT
        data blocks; the follower workers do exactly the reverse.
        """

        def _make_options(raw_data_iter, output_writer):
            # The two option sets differ only in the raw-data reader
            # format and the data-block writer format.
            return dj_pb.DataJoinWorkerOptions(
                use_mock_etcd=True,
                raw_data_options=dj_pb.RawDataOptions(
                    raw_data_iter=raw_data_iter,
                    read_ahead_size=1 << 20,
                    read_batch_size=128),
                example_id_dump_options=dj_pb.ExampleIdDumpOptions(
                    example_id_dump_interval=1,
                    example_id_dump_threshold=1024),
                example_joiner_options=dj_pb.ExampleJoinerOptions(
                    example_joiner='SORT_RUN_JOINER',
                    min_matching_window=64,
                    max_matching_window=256,
                    data_block_dump_interval=30,
                    data_block_dump_threshold=1000),
                batch_processor_options=dj_pb.BatchProcessorOptions(
                    batch_size=1024,
                    max_flying_item=4096),
                data_block_builder_options=dj_pb.WriterOptions(
                    output_writer=output_writer))

        worker_options_l = _make_options('TF_RECORD', 'CSV_DICT')
        worker_options_f = _make_options('CSV_DICT', 'TF_RECORD')

        self._worker_addrs_l = ['localhost:4161', 'localhost:4162',
                                'localhost:4163', 'localhost:4164']
        self._worker_addrs_f = ['localhost:5161', 'localhost:5162',
                                'localhost:5163', 'localhost:5164']
        self._workers_l = []
        self._workers_f = []
        addr_pairs = zip(self._worker_addrs_l, self._worker_addrs_f)
        for rank_id, (addr_l, addr_f) in enumerate(addr_pairs):
            # ETCD_BASE_DIR is switched to the matching namespace before
            # each service object is built — presumably the constructor
            # reads it; keep the ordering intact.
            os.environ['ETCD_BASE_DIR'] = self.leader_base_dir
            self._workers_l.append(
                data_join_worker.DataJoinWorkerService(
                    int(addr_l.split(':')[1]), addr_f,
                    self._master_addr_l, rank_id, self.kvstore_type,
                    worker_options_l))
            os.environ['ETCD_BASE_DIR'] = self.follower_base_dir
            self._workers_f.append(
                data_join_worker.DataJoinWorkerService(
                    int(addr_f.split(':')[1]), addr_l,
                    self._master_addr_f, rank_id, self.kvstore_type,
                    worker_options_f))
        # Start all leader workers first, then all follower workers, as
        # the original code did.
        for worker in self._workers_l + self._workers_f:
            worker.start()