def _setUpDataSource(self):
    # Build a Leader/Follower DataSource pair that share one meta
    # (name, 4 partitions, [0, 100000000) time range) and commit each
    # to its side's etcd client (created by the caller's setUp).
    self._data_source_name = 'test_data_source'
    # Wipe leftovers from previous runs. NOTE(review): keys are deleted
    # under the raw data-source name; other variants in this codebase
    # delete a computed base dir instead -- confirm this prefix matches
    # how commit_data_source keys the data below.
    self._etcd_l.delete_prefix(self._data_source_name)
    self._etcd_f.delete_prefix(self._data_source_name)
    self._data_source_l = common_pb.DataSource()
    self._data_source_l.role = common_pb.FLRole.Leader
    self._data_source_l.state = common_pb.DataSourceState.Init
    self._data_source_l.data_block_dir = "./data_block_l"
    self._data_source_l.raw_data_dir = "./raw_data_l"
    self._data_source_l.example_dumped_dir = "./example_dumped_l"
    self._data_source_l.raw_data_sub_dir = "./raw_data_sub_dir_l"
    self._data_source_f = common_pb.DataSource()
    self._data_source_f.role = common_pb.FLRole.Follower
    self._data_source_f.state = common_pb.DataSourceState.Init
    self._data_source_f.data_block_dir = "./data_block_f"
    self._data_source_f.raw_data_dir = "./raw_data_f"
    self._data_source_f.example_dumped_dir = "./example_dumped_f"
    self._data_source_f.raw_data_sub_dir = "./raw_data_sub_dir_f"
    # Both roles must agree on the meta; MergeFrom copies it into each.
    data_source_meta = common_pb.DataSourceMeta()
    data_source_meta.name = self._data_source_name
    data_source_meta.partition_num = 4
    data_source_meta.start_time = 0
    data_source_meta.end_time = 100000000
    self._data_source_l.data_source_meta.MergeFrom(data_source_meta)
    self._data_source_f.data_source_meta.MergeFrom(data_source_meta)
    common.commit_data_source(self._etcd_l, self._data_source_l)
    common.commit_data_source(self._etcd_f, self._data_source_f)
def setUp(self):
    """Prepare follower/leader data sources with clean output dirs,
    reset their etcd state, and build a manifest manager for the leader."""
    follower = common_pb.DataSource()
    follower.data_source_meta.name = "milestone"
    follower.data_source_meta.partition_num = 1
    follower.output_base_dir = "./output-f"
    self.data_source_f = follower
    if gfile.Exists(follower.output_base_dir):
        gfile.DeleteRecursively(follower.output_base_dir)
    leader = common_pb.DataSource()
    leader.data_source_meta.name = "milestone"
    leader.data_source_meta.partition_num = 1
    leader.output_base_dir = "./output-l"
    self.raw_data_dir_l = "./raw_data-l"
    self.data_source_l = leader
    # Start every run from empty leader directories.
    if gfile.Exists(leader.output_base_dir):
        gfile.DeleteRecursively(leader.output_base_dir)
    if gfile.Exists(self.raw_data_dir_l):
        gfile.DeleteRecursively(self.raw_data_dir_l)
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(
        common.data_source_etcd_base_dir(leader.data_source_meta.name))
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, leader)
def _setUpDataSource(self):
    """Create, reset, and commit a leader/follower DataSource pair that
    share one meta (4 partitions, [0, 100000000) time range)."""
    ds_name = 'test_data_source'
    self._data_source_name = ds_name
    # Drop stale kvstore state from previous runs on both sides.
    base_dir = common.data_source_kvstore_base_dir(ds_name)
    self._kvstore_l.delete_prefix(base_dir)
    self._kvstore_f.delete_prefix(base_dir)
    leader = common_pb.DataSource()
    leader.role = common_pb.FLRole.Leader
    leader.state = common_pb.DataSourceState.Init
    leader.output_base_dir = "./ds_output_l"
    self._raw_data_dir_l = "./raw_data_l"
    leader.raw_data_sub_dir = "./raw_data_sub_dir_l"
    follower = common_pb.DataSource()
    follower.role = common_pb.FLRole.Follower
    follower.state = common_pb.DataSourceState.Init
    follower.output_base_dir = "./ds_output_f"
    self._raw_data_dir_f = "./raw_data_f"
    follower.raw_data_sub_dir = "./raw_data_sub_dir_f"
    meta = common_pb.DataSourceMeta(name=ds_name,
                                    partition_num=4,
                                    start_time=0,
                                    end_time=100000000)
    leader.data_source_meta.MergeFrom(meta)
    follower.data_source_meta.MergeFrom(meta)
    self._data_source_l = leader
    self._data_source_f = follower
    common.commit_data_source(self._kvstore_l, leader)
    common.commit_data_source(self._kvstore_f, follower)
def test_raw_data_visitor(self):
    # End-to-end pass of RawDataVisitor over pre-generated compressed
    # raw data: every yielded index must be consecutive from 0 and at
    # least one record must be produced.
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    self.data_source.raw_data_dir = "./test/compressed_raw_data"
    # NOTE(review): sibling tests pass a 4th `True` argument here
    # (apparently a mock-etcd flag) -- confirm the 3-arg form is intended.
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner')
    self.etcd.delete_prefix(self.data_source.data_source_meta.name)
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    # Fixture directory must already exist on disk (checked-in test data).
    partition_dir = os.path.join(self.data_source.raw_data_dir,
                                 'partition_0')
    self.assertTrue(gfile.Exists(partition_dir))
    # Constructed for its side effect of initializing partition manifests.
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source)
    options = customized_options.CustomizedOptions()
    options.set_raw_data_iter('TF_DATASET')
    options.set_compressed_type('GZIP')
    rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source,
                                          0, options)
    expected_index = 0
    for (index, item) in rdv:
        # Periodic progress print only; correctness is the assert below.
        if index > 0 and index % 1024 == 0:
            print("{} {} {}".format(index, item.example_id,
                                    item.event_time))
        self.assertEqual(index, expected_index)
        expected_index += 1
    self.assertGreater(expected_index, 0)
def setUp(self):
    # Feed 5 synced-example batches of 2**15 ids each into the dumper
    # manager, then flush them to ./example_ids/partition_0.
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = "milestone-x"
    data_source.data_source_meta.partition_num = 1
    data_source.example_dumped_dir = "./example_ids"
    self.data_source = data_source
    if gfile.Exists(self.data_source.example_dumped_dir):
        gfile.DeleteRecursively(self.data_source.example_dumped_dir)
    self.partition_dir = os.path.join(self.data_source.example_dumped_dir,
                                      'partition_0')
    gfile.MakeDirs(self.partition_dir)
    self._example_id_dumper = example_id_dumper.ExampleIdDumperManager(
        self.data_source, 0)
    self.assertEqual(self._example_id_dumper.get_next_index(), 0)
    # `index` is the global example counter across all batches; each
    # request's begin_index is the counter value when the batch starts.
    index = 0
    for i in range(5):
        req = dj_pb.SyncExamplesRequest(
            data_source_meta=data_source.data_source_meta,
            partition_id=0,
            begin_index=index)
        for j in range(1 << 15):
            req.example_id.append('{}'.format(index).encode())
            req.event_time.append(150000000 + index)
            # end_index tracks the last appended id (final value:
            # 5 * 2**15 - 1).
            self.end_index = index
            index += 1
        self._example_id_dumper.append_synced_example_req(req)
        # After each batch the dumper's next index must equal the counter.
        self.assertEqual(self._example_id_dumper.get_next_index(), index)
    self._example_id_dumper.finish_sync_example()
    self.assertTrue(self._example_id_dumper.need_dump())
    self._example_id_dumper.dump_example_ids()
def setUp(self):
    """Build STREAM_JOINER options, reset local dirs and the MySQL-backed
    kvstore state, and create a manifest manager for the test."""
    ds = common_pb.DataSource()
    ds.data_source_meta.name = "milestone-f"
    ds.data_source_meta.partition_num = 1
    ds.output_base_dir = "./ds_output"
    self.raw_data_dir = "./raw_data"
    self.data_source = ds
    self.raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_RECORD',
        compressed_type='',
        optional_fields=['label'])
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=1,
        example_id_dump_threshold=1024)
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner='STREAM_JOINER',
        min_matching_window=32,
        max_matching_window=128,
        data_block_dump_interval=30,
        data_block_dump_threshold=128)
    # Remove any directories left over from a previous run.
    for stale_dir in (ds.output_base_dir, self.raw_data_dir):
        if gfile.Exists(stale_dir):
            gfile.DeleteRecursively(stale_dir)
    self.kvstore = mysql_client.DBClient('test_cluster', 'localhost:2379',
                                         'test_user', 'test_password',
                                         'fedlearner', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(ds.data_source_meta.name))
    self.total_raw_data_count = 0
    self.total_example_id_count = 0
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, ds)
    self.g_data_block_index = 0
def setUp(self):
    # Commit a 4-partition follower data source via a MySQL-backed
    # kvstore client, then pre-create one data block per partition.
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = "milestone-x"
    data_source.data_source_meta.partition_num = 4
    data_source.data_source_meta.start_time = 0
    data_source.data_source_meta.end_time = 10000
    data_source.output_base_dir = "./ds_output"
    data_source.role = common_pb.FLRole.Follower
    self.data_source = data_source
    self.db_database = 'test_cluster'
    self.db_addr = 'localhost:2379'
    self.db_base_dir = 'fedlearner'
    # NOTE(review): credentials look redacted ('******'); presumably the
    # mock client (final `True` argument) ignores them -- confirm.
    self.db_username = '******'
    self.db_password = '******'
    self.kvstore = mysql_client.DBClient(self.db_database, self.db_addr,
                                         self.db_username,
                                         self.db_password,
                                         self.db_base_dir, True)
    common.commit_data_source(self.kvstore, self.data_source)
    if gfile.Exists(data_source.output_base_dir):
        gfile.DeleteRecursively(data_source.output_base_dir)
    self.data_block_matas = []
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, self.data_source)
    partition_num = self.data_source.data_source_meta.partition_num
    # _create_data_block is defined elsewhere in this test class.
    for i in range(partition_num):
        self._create_data_block(i)
def setUp(self):
    """Build STREAM_JOINER options, wipe the three working directories,
    reset etcd state, and create a manifest manager."""
    ds = common_pb.DataSource()
    ds.data_source_meta.name = "milestone-f"
    ds.data_source_meta.partition_num = 1
    ds.data_block_dir = "./data_block"
    ds.example_dumped_dir = "./example_id"
    ds.raw_data_dir = "./raw_data"
    self.data_source = ds
    self.raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_RECORD',
        compressed_type='')
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=1,
        example_id_dump_threshold=1024)
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner='STREAM_JOINER',
        min_matching_window=32,
        max_matching_window=128,
        data_block_dump_interval=30,
        data_block_dump_threshold=128)
    # Remove any output left by a previous run.
    for stale_dir in (ds.data_block_dir, ds.example_dumped_dir,
                      ds.raw_data_dir):
        if gfile.Exists(stale_dir):
            gfile.DeleteRecursively(stale_dir)
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(ds.data_source_meta.name)
    self.total_raw_data_count = 0
    self.total_example_id_count = 0
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, ds)
    self.g_data_block_index = 0
def test_compressed_raw_data_visitor(self):
    # Iterate GZIP-compressed TF_RECORD fixtures through RawDataVisitor
    # and check the yielded indices are consecutive from 0.
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    # Fixture data lives next to this test file.
    self.raw_data_dir = path.join(
        path.dirname(path.abspath(__file__)),
        "../compressed_raw_data"
    )
    self.kvstore = DBClient('etcd', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(
            self.data_source.data_source_meta.name))
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = path.join(self.raw_data_dir, common.partition_repr(0))
    self.assertTrue(gfile.Exists(partition_dir))
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, self.data_source)
    # Register the fixture index file as finished raw data (dedup=True).
    manifest_manager.add_raw_data(
        0,
        [dj_pb.RawDataMeta(file_path=path.join(partition_dir, "0-0.idx"),
                           timestamp=timestamp_pb2.Timestamp(seconds=3))],
        True)
    raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_RECORD',
        compressed_type='GZIP',
        read_ahead_size=1<<20,
        read_batch_size=128
    )
    rdm = raw_data_visitor.RawDataManager(self.kvstore, self.data_source, 0)
    self.assertTrue(rdm.check_index_meta_by_process_index(0))
    rdv = raw_data_visitor.RawDataVisitor(self.kvstore, self.data_source,
                                          0, raw_data_options)
    expected_index = 0
    for (index, item) in rdv:
        # Periodic progress print only.
        if index > 0 and index % 32 == 0:
            print("{} {}".format(index, item.example_id))
        self.assertEqual(index, expected_index)
        expected_index += 1
    # At least one record must have been visited.
    self.assertGreater(expected_index, 0)
def test_csv_raw_data_visitor(self):
    # Iterate a CSV fixture through RawDataVisitor with CSV_DICT items
    # and check indices are consecutive and the row count matches.
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    # Fixture data lives next to this test file.
    self.raw_data_dir = path.join(path.dirname(path.abspath(__file__)),
                                  "../csv_raw_data")
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(
        common.data_source_etcd_base_dir(
            self.data_source.data_source_meta.name))
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = path.join(self.raw_data_dir, common.partition_repr(0))
    self.assertTrue(gfile.Exists(partition_dir))
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source)
    # Register the fixture CSV as finished raw data for partition 0.
    manifest_manager.add_raw_data(0, [
        dj_pb.RawDataMeta(file_path=path.join(partition_dir,
                                              "test_raw_data.csv"),
                          timestamp=timestamp_pb2.Timestamp(seconds=3))
    ], True)
    raw_data_options = dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                            read_ahead_size=1 << 20)
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
    self.assertTrue(rdm.check_index_meta_by_process_index(0))
    rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source,
                                          0, raw_data_options)
    expected_index = 0
    for (index, item) in rdv:
        # Periodic progress print only.
        if index > 0 and index % 1024 == 0:
            print("{} {}".format(index, item.raw_id))
        self.assertEqual(index, expected_index)
        expected_index += 1
    # The checked-in CSV fixture contains exactly 4999 data rows.
    self.assertEqual(expected_index, 4999)
def retrieve_data_source(kvstore, data_source_name):
    """Load and parse the DataSource committed under *data_source_name*.

    Raises:
        ValueError: no master key exists in the kvstore for this name.
    """
    serialized = kvstore.get_data(
        data_source_kvstore_base_dir(data_source_name))
    if serialized is None:
        raise ValueError(
            "kvstore master key is None for {}".format(data_source_name))
    return text_format.Parse(serialized, common_pb.DataSource())
def retrieve_data_source(etcd, data_source_name):
    """Load and parse the DataSource committed under *data_source_name*.

    Raises:
        ValueError: no master key exists in etcd for this name.
    """
    serialized = etcd.get_data(data_source_etcd_base_dir(data_source_name))
    if serialized is None:
        raise ValueError(
            "etcd master key is None for {}".format(data_source_name))
    return text_format.Parse(serialized, common_pb.DataSource())
def test_raw_data_visitor(self):
    # Iterate GZIP-compressed fixtures via the TF_DATASET iterator and
    # check the yielded indices are consecutive from 0.
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    self.data_source.raw_data_dir = "./test/compressed_raw_data"
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(self.data_source.data_source_meta.name)
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = os.path.join(self.data_source.raw_data_dir,
                                 common.partition_repr(0))
    # The checked-in fixture directory must exist.
    self.assertTrue(gfile.Exists(partition_dir))
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source)
    # Register the fixture index file as finished raw data (dedup=True).
    manifest_manager.add_raw_data(
        0,
        [dj_pb.RawDataMeta(file_path=os.path.join(partition_dir, "0-0.idx"),
                           timestamp=timestamp_pb2.Timestamp(seconds=3))],
        True)
    raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_DATASET',
        compressed_type='GZIP'
    )
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
    self.assertTrue(rdm.check_index_meta_by_process_index(0))
    rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source,
                                          0, raw_data_options)
    expected_index = 0
    for (index, item) in rdv:
        # Periodic progress print only.
        if index > 0 and index % 32 == 0:
            print("{} {}".format(index, item.example_id))
        self.assertEqual(index, expected_index)
        expected_index += 1
    # At least one record must have been visited.
    self.assertGreater(expected_index, 0)
def setUp(self):
    """Build ATTRIBUTION_JOINER options with negative-example generation,
    wipe working dirs, and reset the mock-etcd kvstore state."""
    ds = common_pb.DataSource()
    ds.data_source_meta.name = "milestone-f"
    ds.data_source_meta.partition_num = 1
    ds.output_base_dir = "./ds_output"
    self.raw_data_dir = "./raw_data"
    self.data_source = ds
    self.raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_RECORD',
        compressed_type='')
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=1,
        example_id_dump_threshold=1024)
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner='ATTRIBUTION_JOINER',
        min_matching_window=32,
        max_matching_window=51200,
        max_conversion_delay=interval_to_timestamp("124"),
        enable_negative_example_generator=True,
        data_block_dump_interval=32,
        data_block_dump_threshold=128,
        negative_sampling_rate=0.8,
    )
    # Remove any directories left over from a previous run.
    for stale_dir in (ds.output_base_dir, self.raw_data_dir):
        if gfile.Exists(stale_dir):
            gfile.DeleteRecursively(stale_dir)
    self.kvstore = db_client.DBClient('etcd', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(ds.data_source_meta.name))
    self.total_raw_data_count = 0
    self.total_example_id_count = 0
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, ds)
    self.g_data_block_index = 0
def setUp(self):
    """Write two TFRecord raw-data files (100 records each, sequential
    example ids 0..199) into a fresh ./raw_data/partition_0."""
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    self.data_source.raw_data_dir = "./raw_data"
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(self.data_source.data_source_meta.name)
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = os.path.join(self.data_source.raw_data_dir,
                                 'partition_0')
    if gfile.Exists(partition_dir):
        gfile.DeleteRecursively(partition_dir)
    gfile.MakeDirs(partition_dir)
    for file_idx in range(2):
        fpath = os.path.join(partition_dir,
                             'raw_data_{}'.format(file_idx))
        # Context manager guarantees the writer is closed per file.
        with tf.io.TFRecordWriter(fpath) as writer:
            for rec_idx in range(100):
                example_id = '{}'.format(file_idx * 100 + rec_idx).encode()
                feat = {
                    'example_id': tf.train.Feature(
                        bytes_list=tf.train.BytesList(value=[example_id])),
                }
                example = tf.train.Example(
                    features=tf.train.Features(feature=feat))
                writer.write(example.SerializeToString())
def _sync_data_source(self):
    """Lazily fetch and cache the DataSource proto from etcd.

    Raises:
        ValueError: the master key holds no data.
    """
    if self._data_source is not None:
        return  # already cached; nothing to do
    raw_data = self._etcd.get_data(self._master_etcd_key)
    if raw_data is None:
        raise ValueError("etcd master key is None for {}".format(
            self._data_source_name))
    self._data_source = text_format.Parse(raw_data, common_pb.DataSource())
def _gen_ds_meta(self, role):
    """Build a single-partition DataSource for *role*, rooted under
    output_path (a module-level name) and named after self.app_id."""
    ds = common_pb.DataSource()
    meta = ds.data_source_meta
    meta.name = self.app_id
    meta.partition_num = 1
    meta.start_time = 0
    meta.end_time = 100000
    ds.output_base_dir = "{}/{}_{}/data_source/".format(
        output_path, meta.name, role)
    ds.role = role
    return ds
def setUp(self):
    """Reset follower/leader data-block dirs and etcd state for tests.

    Fix: the etcd client previously targeted a hard-coded internal
    address ('10.8.163.165:4578' with base dir 'byte_fl'), so the test
    could only run inside one specific network. Use the same local
    mock-etcd configuration as the sibling tests in this suite
    ('localhost:2379', base dir 'fedlearner', mock flag True) so the
    suite runs hermetically.
    """
    data_source_f = common_pb.DataSource()
    data_source_f.data_source_meta.name = "milestone-f"
    data_source_f.data_source_meta.partition_num = 1
    data_source_f.data_block_dir = "./data_block-f"
    self.data_source_f = data_source_f
    if gfile.Exists(self.data_source_f.data_block_dir):
        gfile.DeleteRecursively(self.data_source_f.data_block_dir)
    data_source_l = common_pb.DataSource()
    data_source_l.data_source_meta.name = "milestone-l"
    data_source_l.data_source_meta.partition_num = 1
    data_source_l.data_block_dir = "./data_block-l"
    data_source_l.raw_data_dir = "./raw_data-l"
    self.data_source_l = data_source_l
    # Start every run from empty leader directories.
    if gfile.Exists(self.data_source_l.data_block_dir):
        gfile.DeleteRecursively(self.data_source_l.data_block_dir)
    if gfile.Exists(self.data_source_l.raw_data_dir):
        gfile.DeleteRecursively(self.data_source_l.raw_data_dir)
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(self.data_source_l.data_source_meta.name)
def __init__(self, etcd, raw_data_options, mock_data_source_name,
             raw_data_sub_dir):
    """Visitor over a synthetic one-partition data source whose manifest
    state is tracked in etcd under *mock_data_source_name*."""
    meta = common_pb.DataSourceMeta(name=mock_data_source_name,
                                    partition_num=1,
                                    raw_data_sub_dir=raw_data_sub_dir)
    mock_data_source = common_pb.DataSource(
        state=common_pb.DataSourceState.Processing,
        data_source_meta=meta)
    self._mock_rd_manifest_manager = RawDataManifestManager(
        etcd, mock_data_source)
    super(EtcdBasedMockRawDataVisitor, self).__init__(
        etcd, mock_data_source, 0, raw_data_options)
def __init__(self, kvstore, raw_data_options, mock_data_source_name,
             raw_data_sub_dir, partition_id):
    """Visitor over a synthetic data source for a single *partition_id*;
    partition_num is sized so that partition_id is the last partition."""
    meta = common_pb.DataSourceMeta(name=mock_data_source_name,
                                    partition_num=partition_id + 1)
    mock_data_source = common_pb.DataSource(
        state=common_pb.DataSourceState.Processing,
        raw_data_sub_dir=raw_data_sub_dir,
        data_source_meta=meta)
    self._mock_rd_manifest_manager = RawDataManifestManager(
        kvstore, mock_data_source, False)
    self._partition_id = partition_id
    super(DBBasedMockRawDataVisitor, self).__init__(
        kvstore, mock_data_source, partition_id, raw_data_options)
def setUp(self):
    """Create a fresh DataBlockManager over an empty ./data_block dir
    and verify it starts with no dumped blocks and no latest meta.

    Fix: use assertIsNone instead of assertEqual(..., None) -- the
    unittest-idiomatic form with a clearer failure message.
    """
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = "milestone-x"
    data_source.data_source_meta.partition_num = 1
    data_source.data_block_dir = "./data_block"
    self.data_source = data_source
    # Start from a clean directory so the dumped-block count is 0.
    if gfile.Exists(data_source.data_block_dir):
        gfile.DeleteRecursively(data_source.data_block_dir)
    self.data_block_manager = data_block_manager.DataBlockManager(
        data_source, 0)
    self.assertEqual(
        self.data_block_manager.get_dumped_data_block_count(), 0)
    self.assertIsNone(
        self.data_block_manager.get_lastest_data_block_meta())
def setUp(self):
    """Reset kvstore state, create an empty partition_0 raw-data dir,
    and build a manifest manager for the test data source."""
    ds = common_pb.DataSource()
    ds.data_source_meta.name = 'fclh_test'
    ds.data_source_meta.partition_num = 1
    self.data_source = ds
    self.raw_data_dir = "./raw_data"
    self.kvstore = db_client.DBClient('etcd', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(ds.data_source_meta.name))
    self.assertEqual(ds.data_source_meta.partition_num, 1)
    partition_dir = os.path.join(self.raw_data_dir,
                                 common.partition_repr(0))
    if gfile.Exists(partition_dir):
        gfile.DeleteRecursively(partition_dir)
    gfile.MakeDirs(partition_dir)
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, ds)
def setUp(self):
    """Prepare follower/leader data sources with clean output dirs,
    reset the mock-etcd kvstore, and build a leader manifest manager."""
    follower = common_pb.DataSource()
    follower.data_source_meta.name = "milestone"
    follower.data_source_meta.partition_num = 1
    follower.output_base_dir = "./output-f"
    self.data_source_f = follower
    if gfile.Exists(follower.output_base_dir):
        gfile.DeleteRecursively(follower.output_base_dir)
    leader = common_pb.DataSource()
    leader.data_source_meta.name = "milestone"
    leader.data_source_meta.partition_num = 1
    leader.output_base_dir = "./output-l"
    self.raw_data_dir_l = "./raw_data-l"
    self.data_source_l = leader
    # Start every run from empty leader directories.
    for stale_dir in (leader.output_base_dir, self.raw_data_dir_l):
        if gfile.Exists(stale_dir):
            gfile.DeleteRecursively(stale_dir)
    self.kvstore = db_client.DBClient('etcd', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(leader.data_source_meta.name))
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, leader)
def setUp(self):
    """Clean etcd state and the example-id dump dir, then pre-create
    the partition_0 directory for the dumper under test."""
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    ds = common_pb.DataSource()
    ds.data_source_meta.name = "milestone-x"
    ds.data_source_meta.partition_num = 1
    ds.example_dumped_dir = "./example_ids"
    self.etcd.delete_prefix(ds.data_source_meta.name)
    self.data_source = ds
    # Negative interval disables time-based dumping; threshold drives it.
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=-1,
        example_id_dump_threshold=1024)
    if gfile.Exists(ds.example_dumped_dir):
        gfile.DeleteRecursively(ds.example_dumped_dir)
    self.partition_dir = os.path.join(ds.example_dumped_dir,
                                      common.partition_repr(0))
    gfile.MakeDirs(self.partition_dir)
def setUp(self):
    """Reset etcd keys, create an empty partition_0 raw-data dir, and
    build a manifest manager for the test data source."""
    ds = common_pb.DataSource()
    ds.data_source_meta.name = 'fclh_test'
    ds.data_source_meta.partition_num = 1
    ds.raw_data_dir = "./raw_data"
    self.data_source = ds
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(ds.data_source_meta.name)
    self.assertEqual(ds.data_source_meta.partition_num, 1)
    partition_dir = os.path.join(ds.raw_data_dir, common.partition_repr(0))
    if gfile.Exists(partition_dir):
        gfile.DeleteRecursively(partition_dir)
    gfile.MakeDirs(partition_dir)
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, ds)
def init(self, dsname, joiner_name, version=Version.V1, cache_type="memory"):
    # Shared fixture builder: create a one-partition data source named
    # *dsname*, configure *joiner_name* with attribution-style options,
    # wipe its output/raw-data dirs, and reset mock-etcd state.
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = dsname
    data_source.data_source_meta.partition_num = 1
    # Working dirs are derived from the data source name so concurrent
    # fixtures do not collide.
    data_source.output_base_dir = "%s_ds_output" % dsname
    self.raw_data_dir = "%s_raw_data" % dsname
    self.data_source = data_source
    self.raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_RECORD',
        compressed_type='',
        raw_data_cache_type=cache_type,
    )
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=1, example_id_dump_threshold=1024)
    self.example_joiner_options = dj_pb.ExampleJoinerOptions(
        example_joiner=joiner_name,
        min_matching_window=32,
        max_matching_window=51200,
        max_conversion_delay=interval_to_timestamp("124"),
        enable_negative_example_generator=True,
        data_block_dump_interval=32,
        data_block_dump_threshold=128,
        negative_sampling_rate=0.8,
        join_expr="example_id",
        join_key_mapper="DEFAULT",
        negative_sampling_filter_expr='',
    )
    # Remove output left by a previous run of the same dsname.
    if gfile.Exists(self.data_source.output_base_dir):
        gfile.DeleteRecursively(self.data_source.output_base_dir)
    if gfile.Exists(self.raw_data_dir):
        gfile.DeleteRecursively(self.raw_data_dir)
    self.kvstore = db_client.DBClient('etcd', True)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(
            self.data_source.data_source_meta.name))
    self.total_raw_data_count = 0
    self.total_example_id_count = 0
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, self.data_source)
    self.g_data_block_index = 0
    self.version = version
def setUp(self):
    """Clean kvstore/output state and pre-create the partition_0
    example-dump directory for the dumper under test."""
    self.kvstore = db_client.DBClient('etcd', True)
    ds = common_pb.DataSource()
    ds.data_source_meta.name = "milestone-x"
    ds.data_source_meta.partition_num = 1
    ds.output_base_dir = "./ds_output"
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(ds.data_source_meta.name))
    self.data_source = ds
    # Negative interval disables time-based dumping; threshold drives it.
    self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
        example_id_dump_interval=-1,
        example_id_dump_threshold=1024)
    if gfile.Exists(ds.output_base_dir):
        gfile.DeleteRecursively(ds.output_base_dir)
    self.partition_dir = os.path.join(
        common.data_source_example_dumped_dir(ds),
        common.partition_repr(0))
    gfile.MakeDirs(self.partition_dir)
def __init__(self, etcd, raw_data_options, mock_data_source_name,
             input_fpaths):
    """Visitor over a synthetic one-partition data source fed directly
    from *input_fpaths*; files are registered once, then replays skip
    registration because the manifest is already finished."""
    mock_data_source = common_pb.DataSource(
        state=common_pb.DataSourceState.Processing,
        data_source_meta=common_pb.DataSourceMeta(
            name=mock_data_source_name,
            partition_num=1))
    manifest_manager = RawDataManifestManager(etcd, mock_data_source)
    if not manifest_manager.get_manifest(0).finished:
        metas = [dj_pb.RawDataMeta(file_path=fpath, start_index=-1)
                 for fpath in input_fpaths]
        manifest_manager.add_raw_data(0, metas, True)
        manifest_manager.finish_raw_data(0)
    super(MockRawDataVisitor, self).__init__(etcd, mock_data_source, 0,
                                             raw_data_options)
def __init__(self, base_path, name, role, partition_num=1,
             start_time=0, end_time=100000):
    # Commit a data source for *role* ('leader' -> 0, 'follower' -> 1),
    # force every partition's join state to Joined, and create one
    # DataBlockManager per partition.
    if role == 'leader':
        role = 0
    elif role == 'follower':
        role = 1
    else:
        raise ValueError("Unknown role %s" % role)
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = name
    data_source.data_source_meta.partition_num = partition_num
    data_source.data_source_meta.start_time = start_time
    data_source.data_source_meta.end_time = end_time
    data_source.output_base_dir = "{}/{}_{}/data_source/".format(
        base_path, data_source.data_source_meta.name, role)
    data_source.role = role
    # Always start from an empty output tree.
    if gfile.Exists(data_source.output_base_dir):
        gfile.DeleteRecursively(data_source.output_base_dir)
    self._data_source = data_source
    db_database, db_addr, db_username, db_password, db_base_dir = \
        get_kvstore_config("etcd")
    self._kv_store = mysql_client.DBClient(db_database, db_addr,
                                           db_username, db_password,
                                           db_base_dir, True)
    common.commit_data_source(self._kv_store, self._data_source)
    self._dbms = []
    for i in range(partition_num):
        manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
            self._kv_store, self._data_source)
        # NOTE(review): relies on the manager's private _finish_partition
        # to flip JoinExampleState UnJoined -> Joined for partition i --
        # fragile against manager refactors; confirm no public API exists.
        manifest_manager._finish_partition('join_example_rep',
                                           dj_pb.JoinExampleState.UnJoined,
                                           dj_pb.JoinExampleState.Joined,
                                           -1, i)
        self._dbms.append(
            data_block_manager.DataBlockManager(self._data_source, i))
def setUp(self):
    """Commit a 4-partition follower data source through the mock-etcd
    DB client and pre-create one data block per partition."""
    ds = common_pb.DataSource()
    meta = ds.data_source_meta
    meta.name = "milestone-x"
    meta.partition_num = 4
    meta.start_time = 0
    meta.end_time = 10000
    ds.output_base_dir = "./ds_output"
    ds.role = common_pb.FLRole.Follower
    self.data_source = ds
    self.kvstore = db_client.DBClient('etcd', True)
    common.commit_data_source(self.kvstore, ds)
    if gfile.Exists(ds.output_base_dir):
        gfile.DeleteRecursively(ds.output_base_dir)
    self.data_block_matas = []
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.kvstore, ds)
    # _create_data_block is defined elsewhere in this test class.
    for partition_id in range(meta.partition_num):
        self._create_data_block(partition_id)