Example #1
0
 def _setUpDataSource(self):
     """Build and commit a leader/follower DataSource pair for tests.

     Clears any stale state stored under the test data source name on
     both etcd handles, builds one DataSource proto per role sharing the
     same meta, then commits each via common.commit_data_source.
     """
     self._data_source_name = 'test_data_source'
     # Drop leftovers from previous runs on both etcd handles.
     for etcd in (self._etcd_l, self._etcd_f):
         etcd.delete_prefix(self._data_source_name)
     # Both roles share one meta: 4 partitions over [0, 100000000).
     meta = common_pb.DataSourceMeta()
     meta.name = self._data_source_name
     meta.partition_num = 4
     meta.start_time = 0
     meta.end_time = 100000000
     leader = common_pb.DataSource()
     leader.role = common_pb.FLRole.Leader
     leader.state = common_pb.DataSourceState.Init
     leader.data_block_dir = "./data_block_l"
     leader.raw_data_dir = "./raw_data_l"
     leader.example_dumped_dir = "./example_dumped_l"
     leader.raw_data_sub_dir = "./raw_data_sub_dir_l"
     leader.data_source_meta.MergeFrom(meta)
     follower = common_pb.DataSource()
     follower.role = common_pb.FLRole.Follower
     follower.state = common_pb.DataSourceState.Init
     follower.data_block_dir = "./data_block_f"
     follower.raw_data_dir = "./raw_data_f"
     follower.example_dumped_dir = "./example_dumped_f"
     follower.raw_data_sub_dir = "./raw_data_sub_dir_f"
     follower.data_source_meta.MergeFrom(meta)
     self._data_source_l = leader
     self._data_source_f = follower
     common.commit_data_source(self._etcd_l, self._data_source_l)
     common.commit_data_source(self._etcd_f, self._data_source_f)
Example #2
0
 def setUp(self):
     """Prepare follower/leader "milestone" sources and a manifest manager.

     Removes output directories left over from earlier runs, clears the
     etcd base dir for the shared data source name, then builds the
     RawDataManifestManager over the leader side.
     """
     follower = common_pb.DataSource()
     follower.data_source_meta.name = "milestone"
     follower.data_source_meta.partition_num = 1
     follower.output_base_dir = "./output-f"
     self.data_source_f = follower
     if gfile.Exists(follower.output_base_dir):
         gfile.DeleteRecursively(follower.output_base_dir)
     leader = common_pb.DataSource()
     leader.data_source_meta.name = "milestone"
     leader.data_source_meta.partition_num = 1
     leader.output_base_dir = "./output-l"
     self.raw_data_dir_l = "./raw_data-l"
     self.data_source_l = leader
     if gfile.Exists(leader.output_base_dir):
         gfile.DeleteRecursively(leader.output_base_dir)
     if gfile.Exists(self.raw_data_dir_l):
         gfile.DeleteRecursively(self.raw_data_dir_l)
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(
         common.data_source_etcd_base_dir(
             self.data_source_l.data_source_meta.name))
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source_l)
Example #3
0
 def _setUpDataSource(self):
     """Create and commit a leader/follower DataSource pair via kvstore.

     Wipes any state left under the data source's kvstore base dir on
     both kvstore handles, builds one DataSource proto per role sharing
     the same meta, and commits each with common.commit_data_source.
     """
     self._data_source_name = 'test_data_source'
     # Clear leftovers from previous test runs on both kvstore handles.
     self._kvstore_l.delete_prefix(
         common.data_source_kvstore_base_dir(self._data_source_name))
     self._kvstore_f.delete_prefix(
         common.data_source_kvstore_base_dir(self._data_source_name))
     self._data_source_l = common_pb.DataSource()
     self._data_source_l.role = common_pb.FLRole.Leader
     self._data_source_l.state = common_pb.DataSourceState.Init
     self._data_source_l.output_base_dir = "./ds_output_l"
     self._raw_data_dir_l = "./raw_data_l"
     self._data_source_l.raw_data_sub_dir = "./raw_data_sub_dir_l"
     self._data_source_f = common_pb.DataSource()
     self._data_source_f.role = common_pb.FLRole.Follower
     self._data_source_f.state = common_pb.DataSourceState.Init
     self._data_source_f.output_base_dir = "./ds_output_f"
     self._raw_data_dir_f = "./raw_data_f"
     self._data_source_f.raw_data_sub_dir = "./raw_data_sub_dir_f"
     # Both sides share one meta: 4 partitions over [0, 100000000).
     data_source_meta = common_pb.DataSourceMeta()
     data_source_meta.name = self._data_source_name
     data_source_meta.partition_num = 4
     data_source_meta.start_time = 0
     data_source_meta.end_time = 100000000
     self._data_source_l.data_source_meta.MergeFrom(data_source_meta)
     self._data_source_f.data_source_meta.MergeFrom(data_source_meta)
     common.commit_data_source(self._kvstore_l, self._data_source_l)
     common.commit_data_source(self._kvstore_f, self._data_source_f)
Example #4
0
 def test_raw_data_visitor(self):
     """Visit a pre-existing compressed raw-data partition end to end.

     Expects ./test/compressed_raw_data/partition_0 to already exist;
     iterates it with a TF_DATASET/GZIP visitor and asserts indices are
     contiguous from zero with at least one record present.
     """
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.data_source.raw_data_dir = "./test/compressed_raw_data"
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner')
     # Drop any state left under the data source name from earlier runs.
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = os.path.join(self.data_source.raw_data_dir,
                                  'partition_0')
     self.assertTrue(gfile.Exists(partition_dir))
     # Constructed for its side effects — presumably initializes the
     # partition manifests in etcd; confirm against its constructor.
     manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     options = customized_options.CustomizedOptions()
     options.set_raw_data_iter('TF_DATASET')
     options.set_compressed_type('GZIP')
     rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                           options)
     expected_index = 0
     for (index, item) in rdv:
         # Progress trace only; printed once every 1024 records.
         if index > 0 and index % 1024 == 0:
             print("{} {} {}".format(index, item.example_id,
                                     item.event_time))
         self.assertEqual(index, expected_index)
         expected_index += 1
     self.assertGreater(expected_index, 0)
Example #5
0
 def setUp(self):
     """Dump five batches of 2**15 synthetic example ids via the dumper.

     Builds SyncExamplesRequest batches with monotonically increasing
     example ids / event times, feeds them to an ExampleIdDumperManager
     and dumps the result so the tests can read the dumped files.
     self.end_index is left at the last index written.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 1
     data_source.example_dumped_dir = "./example_ids"
     self.data_source = data_source
     # Start from a clean dump directory.
     if gfile.Exists(self.data_source.example_dumped_dir):
         gfile.DeleteRecursively(self.data_source.example_dumped_dir)
     self.partition_dir = os.path.join(self.data_source.example_dumped_dir,
                                       'partition_0')
     gfile.MakeDirs(self.partition_dir)
     self._example_id_dumper = example_id_dumper.ExampleIdDumperManager(
         self.data_source, 0)
     self.assertEqual(self._example_id_dumper.get_next_index(), 0)
     index = 0
     # Loop counters are unused; `_` makes that explicit (was `i`/`j`).
     for _ in range(5):
         req = dj_pb.SyncExamplesRequest(
             data_source_meta=data_source.data_source_meta,
             partition_id=0,
             begin_index=index)
         for _ in range(1 << 15):  # 32768 examples per request batch
             req.example_id.append('{}'.format(index).encode())
             req.event_time.append(150000000 + index)
             self.end_index = index
             index += 1
         self._example_id_dumper.append_synced_example_req(req)
         self.assertEqual(self._example_id_dumper.get_next_index(), index)
     self._example_id_dumper.finish_sync_example()
     self.assertTrue(self._example_id_dumper.need_dump())
     self._example_id_dumper.dump_example_ids()
Example #6
0
 def setUp(self):
     """Fixture for follower stream-joiner tests over a MySQL kvstore.

     Configures raw-data, example-id-dump and STREAM_JOINER options,
     wipes previous output/raw-data directories and kvstore state, then
     builds the RawDataManifestManager.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "./ds_output"
     self.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='',
                                                  optional_fields=['label'])
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='STREAM_JOINER',
         min_matching_window=32,
         max_matching_window=128,
         data_block_dump_interval=30,
         data_block_dump_threshold=128)
     # Start from clean output and raw-data directories.
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore = mysql_client.DBClient('test_cluster', 'localhost:2379',
                                          'test_user', 'test_password',
                                          'fedlearner', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     # Counters updated by the tests themselves.
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     self.g_data_block_index = 0
 def setUp(self):
     """Commit a 4-partition follower DataSource and create data blocks.

     Commits the data source to a MySQL-backed kvstore, resets the
     output directory, then creates one data block per partition via
     self._create_data_block (defined elsewhere in the test case).
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 4
     data_source.data_source_meta.start_time = 0
     data_source.data_source_meta.end_time = 10000
     data_source.output_base_dir = "./ds_output"
     data_source.role = common_pb.FLRole.Follower
     self.data_source = data_source
     self.db_database = 'test_cluster'
     self.db_addr = 'localhost:2379'
     self.db_base_dir = 'fedlearner'
     # Masked placeholder credentials — real values must be supplied by
     # the environment this test runs in.
     self.db_username = '******'
     self.db_password = '******'
     self.kvstore = mysql_client.DBClient(self.db_database, self.db_addr,
                                          self.db_username,
                                          self.db_password,
                                          self.db_base_dir, True)
     common.commit_data_source(self.kvstore, self.data_source)
     if gfile.Exists(data_source.output_base_dir):
         gfile.DeleteRecursively(data_source.output_base_dir)
     # NOTE(review): "matas" reads like a typo for "metas"; kept as-is
     # because sibling tests may reference this attribute name.
     self.data_block_matas = []
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     partition_num = self.data_source.data_source_meta.partition_num
     for i in range(partition_num):
         self._create_data_block(i)
Example #8
0
 def setUp(self):
     """Fixture for follower stream-joiner tests backed by etcd.

     Configures raw-data / example-id-dump / STREAM_JOINER options,
     wipes the data_block, example_dumped and raw_data directories,
     clears the etcd prefix and builds a RawDataManifestManager.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.data_block_dir = "./data_block"
     data_source.example_dumped_dir = "./example_id"
     data_source.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='')
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='STREAM_JOINER',
         min_matching_window=32,
         max_matching_window=128,
         data_block_dump_interval=30,
         data_block_dump_threshold=128)
     # Remove all output directories left by previous runs.
     if gfile.Exists(self.data_source.data_block_dir):
         gfile.DeleteRecursively(self.data_source.data_block_dir)
     if gfile.Exists(self.data_source.example_dumped_dir):
         gfile.DeleteRecursively(self.data_source.example_dumped_dir)
     if gfile.Exists(self.data_source.raw_data_dir):
         gfile.DeleteRecursively(self.data_source.raw_data_dir)
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     # Counters updated by the tests themselves.
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     self.g_data_block_index = 0
 def test_compressed_raw_data_visitor(self):
     """Iterate a gzip-compressed raw-data partition and check indexing.

     Registers the pre-built "0-0.idx" file as finished raw data, then
     visits partition 0 with a TF_RECORD/GZIP visitor and asserts the
     visited indices are contiguous from zero.
     """
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.raw_data_dir = path.join(
         path.dirname(path.abspath(__file__)), "../compressed_raw_data")
     self.kvstore = DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     part_dir = path.join(self.raw_data_dir, common.partition_repr(0))
     self.assertTrue(gfile.Exists(part_dir))
     manifest_mgr = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     manifest_mgr.add_raw_data(
         0,
         [dj_pb.RawDataMeta(
             file_path=path.join(part_dir, "0-0.idx"),
             timestamp=timestamp_pb2.Timestamp(seconds=3))],
         True)
     rd_options = dj_pb.RawDataOptions(
         raw_data_iter='TF_RECORD',
         compressed_type='GZIP',
         read_ahead_size=1 << 20,
         read_batch_size=128)
     manager = raw_data_visitor.RawDataManager(self.kvstore,
                                               self.data_source, 0)
     self.assertTrue(manager.check_index_meta_by_process_index(0))
     visitor = raw_data_visitor.RawDataVisitor(self.kvstore,
                                               self.data_source, 0,
                                               rd_options)
     expect = 0
     for index, item in visitor:
         # Progress trace every 32 records.
         if index > 0 and index % 32 == 0:
             print("{} {}".format(index, item.example_id))
         self.assertEqual(index, expect)
         expect += 1
     self.assertGreater(expect, 0)
Example #10
0
 def test_csv_raw_data_visitor(self):
     """Visit a CSV raw-data partition and verify the exact row count.

     Registers ../csv_raw_data/partition_0/test_raw_data.csv as finished
     raw data and iterates it with a CSV_DICT visitor; the fixture file
     is expected to contain exactly 4999 rows.
     """
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.raw_data_dir = path.join(path.dirname(path.abspath(__file__)),
                                   "../csv_raw_data")
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     # Clear any etcd state from previous runs.
     self.etcd.delete_prefix(
         common.data_source_etcd_base_dir(
             self.data_source.data_source_meta.name))
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = path.join(self.raw_data_dir, common.partition_repr(0))
     self.assertTrue(gfile.Exists(partition_dir))
     manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     manifest_manager.add_raw_data(0, [
         dj_pb.RawDataMeta(file_path=path.join(partition_dir,
                                               "test_raw_data.csv"),
                           timestamp=timestamp_pb2.Timestamp(seconds=3))
     ], True)
     raw_data_options = dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                             read_ahead_size=1 << 20)
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
     self.assertTrue(rdm.check_index_meta_by_process_index(0))
     rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                           raw_data_options)
     expected_index = 0
     for (index, item) in rdv:
         # Progress trace every 1024 rows.
         if index > 0 and index % 1024 == 0:
             print("{} {}".format(index, item.raw_id))
         self.assertEqual(index, expected_index)
         expected_index += 1
     self.assertEqual(expected_index, 4999)
Example #11
0
def retrieve_data_source(kvstore, data_source_name):
    """Load and parse the DataSource proto stored for *data_source_name*.

    The value is stored as text-format protobuf under the data source's
    kvstore base dir.

    Raises:
        ValueError: if nothing is stored under that key.
    """
    serialized = kvstore.get_data(
        data_source_kvstore_base_dir(data_source_name))
    if serialized is None:
        raise ValueError(
            "kvstore master key is None for {}".format(data_source_name))
    return text_format.Parse(serialized, common_pb.DataSource())
Example #12
0
def retrieve_data_source(etcd, data_source_name):
    """Fetch and parse the DataSource proto stored for *data_source_name*.

    Raises:
        ValueError: if no data is stored under the etcd base dir.
    """
    etcd_key = data_source_etcd_base_dir(data_source_name)
    raw_data = etcd.get_data(etcd_key)
    if raw_data is None:
        raise ValueError(
            "etcd master key is None for {}".format(data_source_name))
    # Stored value is text-format protobuf, not binary serialization.
    return text_format.Parse(raw_data, common_pb.DataSource())
 def test_raw_data_visitor(self):
     """Visit a gzip-compressed TF_DATASET partition and check indexing.

     Registers the pre-built "0-0.idx" file as finished raw data, then
     iterates partition 0 asserting indices are contiguous from zero.
     """
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.data_source.raw_data_dir = "./test/compressed_raw_data"
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     # Clear any etcd state from previous runs.
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = os.path.join(self.data_source.raw_data_dir, common.partition_repr(0))
     self.assertTrue(gfile.Exists(partition_dir))
     manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     # Register the pre-built index file as finished raw data.
     manifest_manager.add_raw_data(
             0, [dj_pb.RawDataMeta(file_path=os.path.join(partition_dir, "0-0.idx"),
                                   timestamp=timestamp_pb2.Timestamp(seconds=3))],
             True)
     raw_data_options = dj_pb.RawDataOptions(
             raw_data_iter='TF_DATASET',
             compressed_type='GZIP'
         )
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source,0)
     self.assertTrue(rdm.check_index_meta_by_process_index(0))
     rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                           raw_data_options)
     expected_index = 0
     for (index, item) in rdv:
         # Progress trace every 32 records.
         if index > 0 and index % 32 == 0:
             print("{} {}".format(index, item.example_id))
         self.assertEqual(index, expected_index)
         expected_index += 1
     # At least one record must have been visited.
     self.assertGreater(expected_index, 0)
Example #14
0
 def setUp(self):
     """Fixture for attribution-joiner tests over the etcd DBClient.

     Configures ATTRIBUTION_JOINER options (negative example generation
     enabled at a 0.8 sampling rate), wipes output/raw-data directories
     and kvstore state, then builds the manifest manager.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "./ds_output"
     self.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='')
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='ATTRIBUTION_JOINER',
         min_matching_window=32,
         max_matching_window=51200,
         max_conversion_delay=interval_to_timestamp("124"),
         enable_negative_example_generator=True,
         data_block_dump_interval=32,
         data_block_dump_threshold=128,
         negative_sampling_rate=0.8,
     )
     # Start from clean output and raw-data directories.
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     # Counters updated by the tests themselves.
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     self.g_data_block_index = 0
Example #15
0
 def setUp(self):
     """Write two TFRecord files of 100 examples each into partition_0.

     Recreates ./raw_data/partition_0 from scratch and fills it with
     sequentially numbered example_id features for the tests to visit.
     """
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.data_source.raw_data_dir = "./raw_data"
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     # Clear any etcd state from previous runs.
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = os.path.join(self.data_source.raw_data_dir,
                                  'partition_0')
     if gfile.Exists(partition_dir):
         gfile.DeleteRecursively(partition_dir)
     gfile.MakeDirs(partition_dir)
     for i in range(2):
         fname = 'raw_data_{}'.format(i)
         fpath = os.path.join(partition_dir, fname)
         # Context manager guarantees the writer is flushed and closed
         # even if serialization raises part-way through (the original
         # bare close() leaked the handle on error).
         with tf.io.TFRecordWriter(fpath) as writer:
             for j in range(100):
                 feat = {}
                 # Ids continue across files: 0..99 then 100..199.
                 example_id = '{}'.format(i * 100 + j).encode()
                 feat['example_id'] = tf.train.Feature(
                     bytes_list=tf.train.BytesList(value=[example_id]))
                 example = tf.train.Example(features=tf.train.Features(
                     feature=feat))
                 writer.write(example.SerializeToString())
 def _sync_data_source(self):
     """Lazily load and cache the DataSource proto from etcd.

     Populates self._data_source on the first call; subsequent calls are
     no-ops while the cache is set.

     Raises:
         ValueError: if the master etcd key holds no data.
     """
     if self._data_source is None:
         raw_data = self._etcd.get_data(self._master_etcd_key)
         if raw_data is None:
             raise ValueError("etcd master key is None for {}".format(
                 self._data_source_name))
         # Stored value is text-format protobuf.
         self._data_source = text_format.Parse(raw_data,
                                               common_pb.DataSource())
Example #17
0
 def _gen_ds_meta(self, role):
     """Build a single-partition DataSource proto for *role*.

     The output dir is derived from the module-level output_path, the
     app id and the role value.
     """
     ds = common_pb.DataSource()
     meta = ds.data_source_meta
     meta.name = self.app_id
     meta.partition_num = 1
     meta.start_time = 0
     meta.end_time = 100000
     ds.output_base_dir = "{}/{}_{}/data_source/".format(
         output_path, meta.name, role)
     ds.role = role
     return ds
Example #18
0
 def setUp(self):
     """Prepare follower/leader sources with fresh data_block dirs.

     NOTE(review): the etcd endpoint below is a hard-coded internal
     address ('10.8.163.165:4578'); this fixture only works inside that
     network. Consider parameterizing it like the sibling fixtures.
     """
     data_source_f = common_pb.DataSource()
     data_source_f.data_source_meta.name = "milestone-f"
     data_source_f.data_source_meta.partition_num = 1
     data_source_f.data_block_dir = "./data_block-f"
     self.data_source_f = data_source_f
     if gfile.Exists(self.data_source_f.data_block_dir):
         gfile.DeleteRecursively(self.data_source_f.data_block_dir)
     data_source_l = common_pb.DataSource()
     data_source_l.data_source_meta.name = "milestone-l"
     data_source_l.data_source_meta.partition_num = 1
     data_source_l.data_block_dir = "./data_block-l"
     data_source_l.raw_data_dir = "./raw_data-l"
     self.data_source_l = data_source_l
     if gfile.Exists(self.data_source_l.data_block_dir):
         gfile.DeleteRecursively(self.data_source_l.data_block_dir)
     if gfile.Exists(self.data_source_l.raw_data_dir):
         gfile.DeleteRecursively(self.data_source_l.raw_data_dir)
     self.etcd = etcd_client.EtcdClient('test_cluster', '10.8.163.165:4578', 'byte_fl')
     self.etcd.delete_prefix(self.data_source_l.data_source_meta.name)
Example #19
0
 def __init__(self, etcd, raw_data_options, mock_data_source_name,
              raw_data_sub_dir):
     """Raw-data visitor backed by a throwaway single-partition source.

     Builds a mock Processing-state DataSource (in this proto version
     raw_data_sub_dir lives on DataSourceMeta) plus a manifest manager
     over it, then initializes the base visitor for partition 0.
     """
     mock_data_source = common_pb.DataSource(
         state=common_pb.DataSourceState.Processing,
         data_source_meta=common_pb.DataSourceMeta(
             name=mock_data_source_name,
             partition_num=1,
             raw_data_sub_dir=raw_data_sub_dir))
     self._mock_rd_manifest_manager = RawDataManifestManager(
         etcd, mock_data_source)
     super(EtcdBasedMockRawDataVisitor,
           self).__init__(etcd, mock_data_source, 0, raw_data_options)
Example #20
0
 def __init__(self, kvstore, raw_data_options, mock_data_source_name,
              raw_data_sub_dir, partition_id):
     """Raw-data visitor for one partition of a throwaway mock source.

     partition_num is set to partition_id + 1 so the requested partition
     index is always valid within the mock source. The trailing False
     passed to RawDataManifestManager presumably disables some init
     behavior — confirm against its signature.
     """
     mock_data_source = common_pb.DataSource(
         state=common_pb.DataSourceState.Processing,
         raw_data_sub_dir=raw_data_sub_dir,
         data_source_meta=common_pb.DataSourceMeta(
             name=mock_data_source_name, partition_num=partition_id + 1))
     self._mock_rd_manifest_manager = RawDataManifestManager(
         kvstore, mock_data_source, False)
     self._partition_id = partition_id
     super(DBBasedMockRawDataVisitor,
           self).__init__(kvstore, mock_data_source, partition_id,
                          raw_data_options)
 def setUp(self):
     """Create an empty DataBlockManager over a fresh data_block dir.

     Asserts the manager starts with zero dumped blocks and no latest
     block meta.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 1
     data_source.data_block_dir = "./data_block"
     self.data_source = data_source
     # Start from a clean data-block directory.
     if gfile.Exists(data_source.data_block_dir):
         gfile.DeleteRecursively(data_source.data_block_dir)
     self.data_block_manager = data_block_manager.DataBlockManager(
         data_source, 0)
     self.assertEqual(self.data_block_manager.get_dumped_data_block_count(),
                      0)
     # assertIsNone gives a clearer failure message than comparing to
     # None with assertEqual.
     self.assertIsNone(self.data_block_manager.get_lastest_data_block_meta())
 def setUp(self):
     """Reset kvstore state and recreate an empty partition_0 directory."""
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.raw_data_dir = "./raw_data"
     self.kvstore = db_client.DBClient('etcd', True)
     # Drop state from earlier runs under this data source's base dir.
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     part_dir = os.path.join(self.raw_data_dir, common.partition_repr(0))
     if gfile.Exists(part_dir):
         gfile.DeleteRecursively(part_dir)
     gfile.MakeDirs(part_dir)
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
Example #23
0
 def setUp(self):
     """Prepare follower/leader "milestone" sources over the etcd DBClient.

     Deletes stale output directories, clears the kvstore base dir for
     the shared name, and creates the manifest manager for the leader.
     """
     data_source_f = common_pb.DataSource()
     data_source_f.data_source_meta.name = "milestone"
     data_source_f.data_source_meta.partition_num = 1
     data_source_f.output_base_dir = "./output-f"
     self.data_source_f = data_source_f
     if gfile.Exists(self.data_source_f.output_base_dir):
         gfile.DeleteRecursively(self.data_source_f.output_base_dir)
     data_source_l = common_pb.DataSource()
     data_source_l.data_source_meta.name = "milestone"
     data_source_l.data_source_meta.partition_num = 1
     data_source_l.output_base_dir = "./output-l"
     self.raw_data_dir_l = "./raw_data-l"
     self.data_source_l = data_source_l
     if gfile.Exists(self.data_source_l.output_base_dir):
         gfile.DeleteRecursively(self.data_source_l.output_base_dir)
     if gfile.Exists(self.raw_data_dir_l):
         gfile.DeleteRecursively(self.raw_data_dir_l)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source_l.data_source_meta.name))
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source_l)
 def setUp(self):
     """Create an empty partition_0 dump dir for example-id dumper tests.

     NOTE(review): example_id_dump_interval=-1 presumably disables
     interval-based dumping so only the 1024-count threshold applies —
     confirm against ExampleIdDumpOptions.
     """
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 1
     data_source.example_dumped_dir = "./example_ids"
     # Clear etcd state from previous runs.
     self.etcd.delete_prefix(data_source.data_source_meta.name)
     self.data_source = data_source
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=-1, example_id_dump_threshold=1024)
     if gfile.Exists(self.data_source.example_dumped_dir):
         gfile.DeleteRecursively(self.data_source.example_dumped_dir)
     self.partition_dir = os.path.join(self.data_source.example_dumped_dir,
                                       common.partition_repr(0))
     gfile.MakeDirs(self.partition_dir)
Example #25
0
 def setUp(self):
     """Clear etcd state and recreate an empty partition_0 under raw_data."""
     ds = common_pb.DataSource()
     ds.data_source_meta.name = 'fclh_test'
     ds.data_source_meta.partition_num = 1
     ds.raw_data_dir = "./raw_data"
     self.data_source = ds
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     # Drop state from earlier runs under the data source name.
     self.etcd.delete_prefix(ds.data_source_meta.name)
     self.assertEqual(ds.data_source_meta.partition_num, 1)
     part_dir = os.path.join(ds.raw_data_dir, common.partition_repr(0))
     if gfile.Exists(part_dir):
         gfile.DeleteRecursively(part_dir)
     gfile.MakeDirs(part_dir)
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
Example #26
0
 def init(self,
          dsname,
          joiner_name,
          version=Version.V1,
          cache_type="memory"):
     """Configure the joiner test fixture for one data source.

     Args:
         dsname: data source name; also used to derive the output and
             raw-data directory names.
         joiner_name: value for ExampleJoinerOptions.example_joiner.
         version: fixture version tag, stored on self.version.
         cache_type: RawDataOptions.raw_data_cache_type value.
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = dsname
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "%s_ds_output" % dsname
     self.raw_data_dir = "%s_raw_data" % dsname
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(
         raw_data_iter='TF_RECORD',
         compressed_type='',
         raw_data_cache_type=cache_type,
     )
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner=joiner_name,
         min_matching_window=32,
         max_matching_window=51200,
         max_conversion_delay=interval_to_timestamp("124"),
         enable_negative_example_generator=True,
         data_block_dump_interval=32,
         data_block_dump_threshold=128,
         negative_sampling_rate=0.8,
         join_expr="example_id",
         join_key_mapper="DEFAULT",
         negative_sampling_filter_expr='',
     )
     # Start from clean output and raw-data directories.
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     # Counters updated by the tests themselves.
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     self.g_data_block_index = 0
     self.version = version
 def setUp(self):
     """Reset kvstore/output state and create an empty partition_0 dir.

     NOTE(review): example_id_dump_interval=-1 presumably disables
     interval-based dumping, leaving only the count threshold — confirm
     against ExampleIdDumpOptions.
     """
     self.kvstore = db_client.DBClient('etcd', True)
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "./ds_output"
     # Clear kvstore state from previous runs.
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             data_source.data_source_meta.name))
     self.data_source = data_source
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=-1, example_id_dump_threshold=1024)
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     self.partition_dir = os.path.join(
         common.data_source_example_dumped_dir(self.data_source),
         common.partition_repr(0))
     gfile.MakeDirs(self.partition_dir)
 def __init__(self, etcd, raw_data_options, mock_data_source_name,
              input_fpaths):
     """Visitor over an explicit list of raw-data files via a mock source.

     Registers *input_fpaths* with a single-partition mock manifest the
     first time it runs; once the manifest is finished, later
     constructions skip registration.
     """
     mock_data_source = common_pb.DataSource(
         state=common_pb.DataSourceState.Processing,
         data_source_meta=common_pb.DataSourceMeta(
             name=mock_data_source_name, partition_num=1))
     manifest_manager = RawDataManifestManager(etcd, mock_data_source)
     if not manifest_manager.get_manifest(0).finished:
         metas = [dj_pb.RawDataMeta(file_path=fpath, start_index=-1)
                  for fpath in input_fpaths]
         manifest_manager.add_raw_data(0, metas, True)
         manifest_manager.finish_raw_data(0)
     super(MockRawDataVisitor, self).__init__(etcd, mock_data_source, 0,
                                              raw_data_options)
    def __init__(self,
                 base_path,
                 name,
                 role,
                 partition_num=1,
                 start_time=0,
                 end_time=100000):
        """Create a committed DataSource plus per-partition block managers.

        Args:
            base_path: root under which output_base_dir is created.
            name: data source name.
            role: 'leader' or 'follower' (mapped to 0 / 1).
            partition_num: number of partitions to manage.
            start_time: meta start of the time range.
            end_time: meta end of the time range.

        Raises:
            ValueError: for any other role string.
        """
        if role == 'leader':
            role = 0
        elif role == 'follower':
            role = 1
        else:
            raise ValueError("Unknown role %s" % role)
        data_source = common_pb.DataSource()
        data_source.data_source_meta.name = name
        data_source.data_source_meta.partition_num = partition_num
        data_source.data_source_meta.start_time = start_time
        data_source.data_source_meta.end_time = end_time
        data_source.output_base_dir = "{}/{}_{}/data_source/".format(
            base_path, data_source.data_source_meta.name, role)
        data_source.role = role
        # Start from a clean output directory.
        if gfile.Exists(data_source.output_base_dir):
            gfile.DeleteRecursively(data_source.output_base_dir)

        self._data_source = data_source

        db_database, db_addr, db_username, db_password, db_base_dir = \
            get_kvstore_config("etcd")
        self._kv_store = mysql_client.DBClient(db_database, db_addr,
                                               db_username, db_password,
                                               db_base_dir, True)

        common.commit_data_source(self._kv_store, self._data_source)
        self._dbms = []
        for i in range(partition_num):
            # NOTE(review): reaches into a private manifest-manager API
            # to move each partition from UnJoined to Joined before
            # creating its DataBlockManager.
            manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
                self._kv_store, self._data_source)
            manifest_manager._finish_partition('join_example_rep',
                                               dj_pb.JoinExampleState.UnJoined,
                                               dj_pb.JoinExampleState.Joined,
                                               -1, i)
            self._dbms.append(
                data_block_manager.DataBlockManager(self._data_source, i))
 def setUp(self):
     """Commit a 4-partition follower DataSource and build data blocks.

     Uses the etcd-backed DBClient; one data block is created per
     partition via self._create_data_block (defined elsewhere in the
     test case).
     """
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 4
     data_source.data_source_meta.start_time = 0
     data_source.data_source_meta.end_time = 10000
     data_source.output_base_dir = "./ds_output"
     data_source.role = common_pb.FLRole.Follower
     self.data_source = data_source
     self.kvstore = db_client.DBClient('etcd', True)
     common.commit_data_source(self.kvstore, self.data_source)
     # Start from a clean output directory.
     if gfile.Exists(data_source.output_base_dir):
         gfile.DeleteRecursively(data_source.output_base_dir)
     # NOTE(review): "matas" reads like a typo for "metas"; kept as-is
     # because sibling tests may reference this attribute name.
     self.data_block_matas = []
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     partition_num = self.data_source.data_source_meta.partition_num
     for i in range(partition_num):
         self._create_data_block(i)