def get_existing_subvolume_path(segmentation_dir, corner, allow_cpoint=False):
    """Returns the path to an existing FFN subvolume.

    This is like `get_subvolume_path`, but returns paths to existing data only.

    Args:
        segmentation_dir: directory containing FFN subvolumes
        corner: lower corner of the FFN subvolume as a (z, y, x) tuple
        allow_cpoint: whether to return a checkpoint path in case the final
            segmentation is not ready

    Returns:
        path to an existing FFN subvolume (string) or None if no such
        subvolume is found
    """
    target_path = segmentation_path(segmentation_dir, corner)
    if gfile.Exists(target_path):
        return target_path

    target_path = legacy_segmentation_path(segmentation_dir, corner)
    if gfile.Exists(target_path):
        return target_path

    if allow_cpoint:
        target_path = checkpoint_path(segmentation_dir, corner)
        if gfile.Exists(target_path):
            return target_path

    return None
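# A minimal usage sketch for the helper above, assuming it is exposed by FFN's
# inference storage module; the directory and corner values are hypothetical:
#
#   from ffn.inference import storage
#
#   seg_path = storage.get_existing_subvolume_path(
#       '/data/ffn/seg', (0, 512, 512), allow_cpoint=True)
#   if seg_path is None:
#       raise RuntimeError('no finished segmentation or checkpoint found')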
def setUp(self): data_source = common_pb.DataSource() data_source.data_source_meta.name = "milestone-f" data_source.data_source_meta.partition_num = 1 data_source.output_base_dir = "./ds_output" self.raw_data_dir = "./raw_data" self.data_source = data_source self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD', compressed_type='', optional_fields=['label']) self.example_id_dump_options = dj_pb.ExampleIdDumpOptions( example_id_dump_interval=1, example_id_dump_threshold=1024) self.example_joiner_options = dj_pb.ExampleJoinerOptions( example_joiner='STREAM_JOINER', min_matching_window=32, max_matching_window=128, data_block_dump_interval=30, data_block_dump_threshold=128) if gfile.Exists(self.data_source.output_base_dir): gfile.DeleteRecursively(self.data_source.output_base_dir) if gfile.Exists(self.raw_data_dir): gfile.DeleteRecursively(self.raw_data_dir) self.kvstore = mysql_client.DBClient('test_cluster', 'localhost:2379', 'test_user', 'test_password', 'fedlearner', True) self.kvstore.delete_prefix( common.data_source_kvstore_base_dir( self.data_source.data_source_meta.name)) self.total_raw_data_count = 0 self.total_example_id_count = 0 self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager( self.kvstore, self.data_source) self.g_data_block_index = 0
def copy_to_gcs(src, dst):
    assert gfile.Exists(src), src
    assert not gfile.Exists(dst), dst
    print("Saving to", dst)
    with gfile.GFile(src, "rb") as src_f, gfile.GFile(dst, "wb") as dst_f:
        shutil.copyfileobj(src_f, dst_f)
def setUp(self): data_source = common_pb.DataSource() data_source.data_source_meta.name = "milestone-f" data_source.data_source_meta.partition_num = 1 data_source.data_block_dir = "./data_block" data_source.example_dumped_dir = "./example_id" data_source.raw_data_dir = "./raw_data" self.data_source = data_source self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD', compressed_type='') self.example_id_dump_options = dj_pb.ExampleIdDumpOptions( example_id_dump_interval=1, example_id_dump_threshold=1024) self.example_joiner_options = dj_pb.ExampleJoinerOptions( example_joiner='STREAM_JOINER', min_matching_window=32, max_matching_window=128, data_block_dump_interval=30, data_block_dump_threshold=128) if gfile.Exists(self.data_source.data_block_dir): gfile.DeleteRecursively(self.data_source.data_block_dir) if gfile.Exists(self.data_source.example_dumped_dir): gfile.DeleteRecursively(self.data_source.example_dumped_dir) if gfile.Exists(self.data_source.raw_data_dir): gfile.DeleteRecursively(self.data_source.raw_data_dir) self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379', 'fedlearner', True) self.etcd.delete_prefix(self.data_source.data_source_meta.name) self.total_raw_data_count = 0 self.total_example_id_count = 0 self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager( self.etcd, self.data_source) self.g_data_block_index = 0
def _sync_saved_data_block_index(self):
    if self._saved_data_block_index is None:
        assert self._saving_data_block_index is None, \
            "no data block index is saving when no saved index"
        low_index = 0
        high_index = data_block_index_threshold
        while low_index <= high_index:
            data_index = (low_index + high_index) // 2
            file_name = self._acquire_data_block_meta_path(data_index)
            if gfile.Exists(file_name):
                low_index = data_index + 1
            else:
                high_index = data_index - 1
        self._saved_data_block_index = high_index
    elif self._saving_data_block_index is not None:
        assert self._saving_data_block_index == \
                self._saved_data_block_index + 1, \
            "the saving index should be next of saved index " \
            "{} != {} + 1".format(self._saving_data_block_index,
                                  self._saved_data_block_index)
        file_name = self._acquire_data_block_meta_path(
            self._saving_data_block_index)
        if not gfile.Exists(file_name):
            self._saving_data_block_index = None
        else:
            self._saved_data_block_index = self._saving_data_block_index
            self._saving_data_block_index = None
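# The method above recovers the last persisted index by binary-searching for
# the largest index whose meta file exists; this assumes indices are written
# consecutively with no gaps. A standalone sketch of the same idea using
# os.path and a hypothetical path_for_index callback:

import os


def find_last_saved_index(path_for_index, upper_bound):
    """Returns the largest index whose file exists, or -1 if none do."""
    low, high = 0, upper_bound
    while low <= high:
        mid = (low + high) // 2
        if os.path.exists(path_for_index(mid)):
            low = mid + 1   # mid exists, so the answer is at or above mid
        else:
            high = mid - 1  # mid missing, so the answer is below mid
    return high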
def _clean_up(self):
    if gfile.Exists(self._input_dir):
        gfile.DeleteRecursively(self._input_dir)
    if gfile.Exists(self._partition_output_dir):
        gfile.DeleteRecursively(self._partition_output_dir)
    if gfile.Exists(self._merge_output_dir):
        gfile.DeleteRecursively(self._merge_output_dir)
def _remove_existed_dir(self):
    if gfile.Exists(self._portal_manifest_l.input_data_base_dir):
        gfile.DeleteRecursively(
            self._portal_manifest_l.input_data_base_dir)
    if gfile.Exists(self._portal_manifest_l.output_data_base_dir):
        gfile.DeleteRecursively(
            self._portal_manifest_l.output_data_base_dir)
    if gfile.Exists(self._portal_manifest_f.input_data_base_dir):
        gfile.DeleteRecursively(
            self._portal_manifest_f.input_data_base_dir)
    if gfile.Exists(self._portal_manifest_f.output_data_base_dir):
        gfile.DeleteRecursively(
            self._portal_manifest_f.output_data_base_dir)
    if gfile.Exists(self._data_source_l.data_block_dir):
        gfile.DeleteRecursively(self._data_source_l.data_block_dir)
    if gfile.Exists(self._data_source_l.raw_data_dir):
        gfile.DeleteRecursively(self._data_source_l.raw_data_dir)
    if gfile.Exists(self._data_source_l.example_dumped_dir):
        gfile.DeleteRecursively(self._data_source_l.example_dumped_dir)
    if gfile.Exists(self._data_source_f.data_block_dir):
        gfile.DeleteRecursively(self._data_source_f.data_block_dir)
    if gfile.Exists(self._data_source_f.raw_data_dir):
        gfile.DeleteRecursively(self._data_source_f.raw_data_dir)
    if gfile.Exists(self._data_source_f.example_dumped_dir):
        gfile.DeleteRecursively(self._data_source_f.example_dumped_dir)
def setUp(self):
    data_source_f = common_pb.DataSource()
    data_source_f.data_source_meta.name = "milestone"
    data_source_f.data_source_meta.partition_num = 1
    data_source_f.output_base_dir = "./output-f"
    self.data_source_f = data_source_f
    if gfile.Exists(self.data_source_f.output_base_dir):
        gfile.DeleteRecursively(self.data_source_f.output_base_dir)
    data_source_l = common_pb.DataSource()
    data_source_l.data_source_meta.name = "milestone"
    data_source_l.data_source_meta.partition_num = 1
    data_source_l.output_base_dir = "./output-l"
    self.raw_data_dir_l = "./raw_data-l"
    self.data_source_l = data_source_l
    if gfile.Exists(self.data_source_l.output_base_dir):
        gfile.DeleteRecursively(self.data_source_l.output_base_dir)
    if gfile.Exists(self.raw_data_dir_l):
        gfile.DeleteRecursively(self.raw_data_dir_l)
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(
        common.data_source_etcd_base_dir(
            self.data_source_l.data_source_meta.name))
    self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source_l)
def _sync_manager_state(self, init):
    if self._double_check_dump_finished() and not init:
        return
    if self._fly_sort_run_dumper is not None:
        if gfile.Exists(self._fly_sort_run_dumper.tmp_fpath):
            gfile.Remove(self._fly_sort_run_dumper.tmp_fpath)
        fpath = self._fly_sort_run_dumper.fpath
        # the in-flight sort run counts as dumped only if its final file exists
        if fpath is not None and gfile.Exists(fpath):
            fname = path.basename(fpath)
            meta = SortRunMeta.decode_sort_run_meta_from_fname(fname)
            self._dumped_sort_run_metas.append(meta)
            self._dumped_process_index = meta.process_index
    self._fly_sort_run_dumper = None
    if self._dumped_process_index is None:
        self._dumped_sort_run_metas = \
            [SortRunMeta.decode_sort_run_meta_from_fname(fname)
             for fname in self._list_dumper_output_dir()]
        self._dumped_sort_run_metas.sort()
        if len(self._dumped_sort_run_metas) == 0:
            self._dumped_process_index = -1
        else:
            self._dumped_process_index = \
                self._dumped_sort_run_metas[-1].process_index
    with self._lock:
        self._next_index_to_dump = \
            0 if len(self._dumped_sort_run_metas) == 0 \
            else self._dumped_sort_run_metas[-1].end_index + 1
def setUp(self): data_source = common_pb.DataSource() data_source.data_source_meta.name = "milestone-f" data_source.data_source_meta.partition_num = 1 data_source.output_base_dir = "./ds_output" self.raw_data_dir = "./raw_data" self.data_source = data_source self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD', compressed_type='') self.example_id_dump_options = dj_pb.ExampleIdDumpOptions( example_id_dump_interval=1, example_id_dump_threshold=1024) self.example_joiner_options = dj_pb.ExampleJoinerOptions( example_joiner='ATTRIBUTION_JOINER', min_matching_window=32, max_matching_window=51200, max_conversion_delay=interval_to_timestamp("124"), enable_negative_example_generator=True, data_block_dump_interval=32, data_block_dump_threshold=128, negative_sampling_rate=0.8, ) if gfile.Exists(self.data_source.output_base_dir): gfile.DeleteRecursively(self.data_source.output_base_dir) if gfile.Exists(self.raw_data_dir): gfile.DeleteRecursively(self.raw_data_dir) self.kvstore = db_client.DBClient('etcd', True) self.kvstore.delete_prefix( common.data_source_kvstore_base_dir( self.data_source.data_source_meta.name)) self.total_raw_data_count = 0 self.total_example_id_count = 0 self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager( self.kvstore, self.data_source) self.g_data_block_index = 0
def tearDown(self):
    if gfile.Exists(self.data_source_f.output_base_dir):
        gfile.DeleteRecursively(self.data_source_f.output_base_dir)
    if gfile.Exists(self.data_source_l.output_base_dir):
        gfile.DeleteRecursively(self.data_source_l.output_base_dir)
    if gfile.Exists(self.raw_data_dir_l):
        gfile.DeleteRecursively(self.raw_data_dir_l)
    self.etcd.delete_prefix(
        common.data_source_etcd_base_dir(
            self.data_source_l.data_source_meta.name))
def tearDown(self):
    if gfile.Exists(self.data_source.output_base_dir):
        gfile.DeleteRecursively(self.data_source.output_base_dir)
    if gfile.Exists(self.raw_data_dir):
        gfile.DeleteRecursively(self.raw_data_dir)
    self.kvstore.delete_prefix(
        common.data_source_kvstore_base_dir(
            self.data_source.data_source_meta.name))
def tearDown(self):
    if gfile.Exists(self.data_source.data_block_dir):
        gfile.DeleteRecursively(self.data_source.data_block_dir)
    if gfile.Exists(self.data_source.example_dumped_dir):
        gfile.DeleteRecursively(self.data_source.example_dumped_dir)
    if gfile.Exists(self.data_source.raw_data_dir):
        gfile.DeleteRecursively(self.data_source.raw_data_dir)
    self.etcd.delete_prefix(self.data_source.data_source_meta.name)
def tearDown(self):
    if gfile.Exists(self.data_source_l.output_base_dir):
        gfile.DeleteRecursively(self.data_source_l.output_base_dir)
    if gfile.Exists(self.raw_data_dir_l):
        gfile.DeleteRecursively(self.raw_data_dir_l)
    if gfile.Exists(self.data_source_f.output_base_dir):
        gfile.DeleteRecursively(self.data_source_f.output_base_dir)
    if gfile.Exists(self.raw_data_dir_f):
        gfile.DeleteRecursively(self.raw_data_dir_f)
    self.kvstore_f.delete_prefix(
        common.data_source_kvstore_base_dir(self.db_base_dir_f))
    self.kvstore_l.delete_prefix(
        common.data_source_kvstore_base_dir(self.db_base_dir_l))
def threshold_segmentation(segmentation_dir, corner, labels, threshold):
    prob_path = object_prob_path(segmentation_dir, corner)
    if not gfile.Exists(prob_path):
        prob_path = legacy_object_prob_path(segmentation_dir, corner)
        if not gfile.Exists(prob_path):
            raise ValueError('Cannot find probability map %s' % prob_path)

    with gfile.Open(prob_path, 'rb') as f:
        data = np.load(f)
        if 'qprob' not in data:
            raise ValueError('Invalid FFN probability map.')

        prob = dequantize_probability(data['qprob'])
        labels[prob < threshold] = 0
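# A hedged usage sketch for the thresholding helper above; the directory,
# corner, and cutoff are hypothetical, and labels is a numpy array of segment
# ids with the same shape as the probability map (it is modified in place):
#
#   labels = np.load('subvolume_labels.npy')
#   threshold_segmentation('/data/ffn/seg', (0, 512, 512), labels,
#                          threshold=0.5)
#   # voxels whose object probability is below 0.5 are now background (0)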
def test_remove(self):
    """Test remove."""
    # Setup and check preconditions.
    file_name = "igfs:///test_remove/1"
    self.assertFalse(gfile.Exists(file_name))
    with gfile.Open(file_name, mode="w") as w:
        w.write("")
    self.assertTrue(gfile.Exists(file_name))
    # Remove file.
    gfile.Remove(file_name)
    # Check that file was removed.
    self.assertFalse(gfile.Exists(file_name))
def test_copy(self):
    """Test copy."""
    # Setup and check preconditions.
    src_file_name = "igfs:///test_copy/1"
    dst_file_name = "igfs:///test_copy/2"
    self.assertFalse(gfile.Exists(src_file_name))
    self.assertFalse(gfile.Exists(dst_file_name))
    with gfile.Open(src_file_name, mode="w") as w:
        w.write("42")
    self.assertTrue(gfile.Exists(src_file_name))
    self.assertFalse(gfile.Exists(dst_file_name))
    # Copy file.
    gfile.Copy(src_file_name, dst_file_name)
    # Check that files are identical.
    self.assertTrue(gfile.Exists(src_file_name))
    self.assertTrue(gfile.Exists(dst_file_name))
    with gfile.Open(dst_file_name, mode="r") as r:
        data_v = r.read()
    self.assertEqual("42", data_v)
    # Remove file.
    gfile.Remove(src_file_name)
    gfile.Remove(dst_file_name)
    # Check that file was removed.
    self.assertFalse(gfile.Exists(src_file_name))
    self.assertFalse(gfile.Exists(dst_file_name))
def test_make_dirs(self):
    """Test make dirs."""
    # Setup and check preconditions.
    dir_name = "igfs:///test_make_dirs/"
    self.assertFalse(gfile.Exists(dir_name))
    # Make directory.
    gfile.MkDir(dir_name)
    # Check that directory was created.
    self.assertTrue(gfile.Exists(dir_name))
    # Remove directory.
    gfile.Remove(dir_name)
    # Check that directory was removed.
    self.assertFalse(gfile.Exists(dir_name))
def test_raw_data_visitor(self):
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    self.data_source.raw_data_dir = "./test/compressed_raw_data"
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(self.data_source.data_source_meta.name)
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = os.path.join(self.data_source.raw_data_dir,
                                 common.partition_repr(0))
    self.assertTrue(gfile.Exists(partition_dir))
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source)
    manifest_manager.add_raw_data(
        0,
        [dj_pb.RawDataMeta(file_path=os.path.join(partition_dir, "0-0.idx"),
                           timestamp=timestamp_pb2.Timestamp(seconds=3))],
        True)
    raw_data_options = dj_pb.RawDataOptions(
        raw_data_iter='TF_DATASET',
        compressed_type='GZIP'
    )
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
    self.assertTrue(rdm.check_index_meta_by_process_index(0))
    rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                          raw_data_options)
    expected_index = 0
    for (index, item) in rdv:
        if index > 0 and index % 32 == 0:
            print("{} {}".format(index, item.example_id))
        self.assertEqual(index, expected_index)
        expected_index += 1
    self.assertGreater(expected_index, 0)
def _create_merged_dir_if_need(self):
    merge_dir = os.path.join(self._options.output_file_dir,
                             common.partition_repr(self._partition_id))
    if gfile.Exists(merge_dir):
        assert gfile.IsDirectory(merge_dir)
    else:
        gfile.MakeDirs(merge_dir)
def check_glob_prefix(prefix):
    """Verifies that there is at least one match for a glob prefix.

    Args:
        prefix: Glob prefix to check.

    Returns:
        None

    Raises:
        RuntimeError: If there are no matches or the parent path doesn't exist.
    """
    if prefix is None:
        raise RuntimeError("Got None instead of a valid glob prefix.")
    path = pathlib.Path(prefix)
    if not gfile.Exists(path.parent):
        raise RuntimeError(f"The parent of the glob prefix didn't exist:\n"
                           f" - Glob prefix: {path}\n"
                           f" - Glob parent: {path.parent}")
    # Check that the prefix path FLAGS.source_embeddings_prefix has at least
    # one match. This method stays fast even if there are a trillion matches.
    # Definitely unnecessary. (len(list(matches)) > 0 felt ugly.)
    matches = path.parent.glob(path.name + "*")
    at_least_one = len(list(itertools.islice(matches, 0, 1))) > 0  # pylint: disable=g-explicit-length-test
    if not at_least_one:
        raise RuntimeError(f"No matches to the globbing prefix:\n{prefix}")
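# The islice check above tests "at least one glob match" without materializing
# the full match list. A standalone, runnable sketch of the same idea using
# only the standard library (has_any_match is a hypothetical helper name):

import itertools
import pathlib


def has_any_match(prefix):
    """Returns True if at least one path starts with the given glob prefix."""
    p = pathlib.Path(prefix)
    matches = p.parent.glob(p.name + "*")
    # Pull at most one element so the cost stays constant in the number
    # of matches.
    return next(itertools.islice(matches, 1), None) is not None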
def test_csv_raw_data_visitor(self):
    self.data_source = common_pb.DataSource()
    self.data_source.data_source_meta.name = 'fclh_test'
    self.data_source.data_source_meta.partition_num = 1
    self.raw_data_dir = path.join(path.dirname(path.abspath(__file__)),
                                  "../csv_raw_data")
    self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                       'fedlearner', True)
    self.etcd.delete_prefix(
        common.data_source_etcd_base_dir(
            self.data_source.data_source_meta.name))
    self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
    partition_dir = path.join(self.raw_data_dir, common.partition_repr(0))
    self.assertTrue(gfile.Exists(partition_dir))
    manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
        self.etcd, self.data_source)
    manifest_manager.add_raw_data(0, [
        dj_pb.RawDataMeta(file_path=path.join(partition_dir,
                                              "test_raw_data.csv"),
                          timestamp=timestamp_pb2.Timestamp(seconds=3))
    ], True)
    raw_data_options = dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                            read_ahead_size=1 << 20)
    rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
    self.assertTrue(rdm.check_index_meta_by_process_index(0))
    rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                          raw_data_options)
    expected_index = 0
    for (index, item) in rdv:
        if index > 0 and index % 1024 == 0:
            print("{} {}".format(index, item.raw_id))
        self.assertEqual(index, expected_index)
        expected_index += 1
    self.assertEqual(expected_index, 4999)
def _make_directory_if_nessary(self):
    example_dumped_dir = self._example_dumped_dir()
    if not gfile.Exists(example_dumped_dir):
        gfile.MakeDirs(example_dumped_dir)
    if not gfile.IsDirectory(example_dumped_dir):
        logging.fatal("%s should be directory", example_dumped_dir)
        os._exit(-1)  # pylint: disable=protected-access
def add_raw_data(self, partition_id, fpaths, dedup, timestamps=None):
    self._check_partition_id(partition_id)
    if not fpaths:
        raise RuntimeError("no files input")
    if timestamps is not None and len(fpaths) != len(timestamps):
        raise RuntimeError("the number of raw data files "
                           "and timestamps mismatch")
    rdreq = dj_pb.RawDataRequest(
        data_source_meta=self._data_source.data_source_meta,
        partition_id=partition_id,
        added_raw_data_metas=dj_pb.AddedRawDataMetas(
            dedup=dedup
        )
    )
    for index, fpath in enumerate(fpaths):
        if not gfile.Exists(fpath):
            raise ValueError('{} does not exist'.format(fpath))
        raw_data_meta = dj_pb.RawDataMeta(
            file_path=fpath,
            start_index=-1
        )
        if timestamps is not None:
            raw_data_meta.timestamp.MergeFrom(timestamps[index])
        rdreq.added_raw_data_metas.raw_data_metas.append(raw_data_meta)
    return self._master_client.AddRawData(rdreq)
def _generate_input_tf_record(self, cands, base_dir):
    if not gfile.Exists(base_dir):
        gfile.MakeDirs(base_dir)
    fpaths = []
    random.shuffle(cands)
    tfr_writers = []
    partition_num = self._data_source_l.data_source_meta.partition_num
    for partition_id in range(partition_num):
        fpath = os.path.join(base_dir,
                             str(partition_id) + common.RawDataFileSuffix)
        fpaths.append(fpath)
        tfr_writers.append(tf.io.TFRecordWriter(fpath))
    for item in cands:
        partition_id = CityHash32(item) % partition_num
        feat = {}
        feat['raw_id'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[item.encode()]))
        f0 = 'follower' + str((partition_id << 30) + 0) + item
        f1 = 'follower' + str((partition_id << 30) + 1) + item
        f2 = 'follower' + str((partition_id << 30) + 2) + item
        feat['feat_0'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f0.encode()]))
        feat['feat_1'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f1.encode()]))
        feat['feat_2'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f2.encode()]))
        example = tf.train.Example(
            features=tf.train.Features(feature=feat))
        tfr_writers[partition_id].write(example.SerializeToString())
    for tfr_writer in tfr_writers:
        tfr_writer.close()
    return fpaths
def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
    dt = notify_ctx.get_raw_data_updated_datetime(ds_pid) + \
            timedelta(hours=1)
    begin_dt = common.convert_timestamp_to_datetime(
        common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
    if dt < begin_dt:
        dt = begin_dt
    committed_dt = common.convert_timestamp_to_datetime(
        portal_manifest.committed_timestamp)
    fpaths = []
    timestamps = []
    ds_ptnum = notify_ctx.data_source.data_source_meta.partition_num
    while dt <= committed_dt:
        for pt_pid in range(ds_pid, portal_manifest.output_partition_num,
                            ds_ptnum):
            fpath = common.encode_portal_hourly_fpath(
                portal_manifest.output_data_base_dir, dt, pt_pid)
            if gfile.Exists(fpath):
                fpaths.append(fpath)
                timestamps.append(common.convert_datetime_to_timestamp(dt))
        if len(fpaths) > 32 or dt == committed_dt:
            break
        dt += timedelta(hours=1)
    notify_ctx.add_raw_data(ds_pid, fpaths, timestamps, dt)
    logging.info("add %d raw data file for partition %d of data "
                 "source %s. latest updated datetime %s",
                 len(fpaths), ds_pid,
                 notify_ctx.data_source.data_source_meta.name, dt)
    return dt >= committed_dt
def _publish_raw_data(self, job_id):
    portal_manifest = self._sync_portal_manifest()
    output_dir = None
    if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
        output_dir = common.portal_map_output_dir(
            portal_manifest.output_base_dir, job_id)
    else:
        output_dir = common.portal_reduce_output_dir(
            portal_manifest.output_base_dir, job_id)
    for partition_id in range(self._output_partition_num):
        dpath = path.join(output_dir, common.partition_repr(partition_id))
        fnames = []
        if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
            fnames = [f for f in gfile.ListDirectory(dpath)
                      if f.endswith(common.RawDataFileSuffix)]
        publish_fpaths = []
        if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
            publish_fpaths = self._publish_psi_raw_data(
                partition_id, dpath, fnames)
        else:
            publish_fpaths = self._publish_streaming_raw_data(
                partition_id, dpath, fnames)
        logging.info("Data Portal Master publish %d file for partition "
                     "%d of streaming job %d\n----------\n",
                     len(publish_fpaths), partition_id, job_id)
        for seq, fpath in enumerate(publish_fpaths):
            logging.info("%d. %s", seq, fpath)
        logging.info("------------------------------------------\n")
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    for partition_id in range(partition_num):
        dirpath = os.path.join(base_dir, common.partition_repr(partition_id))
        if not gfile.Exists(dirpath):
            gfile.MakeDirs(dirpath)
        assert gfile.IsDirectory(dirpath)
    csv_writers = [SortRunMergerWriter(base_dir, 0, partition_id, 'CSV_DICT')
                   for partition_id in range(partition_num)]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        partition_id = CityHash32(str(idx)) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = str(idx)
        raw['feat_0'] = str((partition_id << 30) + 0) + str(idx)
        raw['feat_1'] = str((partition_id << 30) + 1) + str(idx)
        raw['feat_2'] = str((partition_id << 30) + 2) + str(idx)
        csv_writers[partition_id].append(raw)
    for partition_id, csv_writer in enumerate(csv_writers):
        fpaths = csv_writer.finish()
        logging.info("partition %d dump %d files", partition_id, len(fpaths))
        for seq_id, fpath in enumerate(fpaths):
            logging.info("    %d. %s", seq_id, fpath)
        logging.info("---------------")
def setUp(self): data_source = common_pb.DataSource() data_source.data_source_meta.name = "milestone-x" data_source.data_source_meta.partition_num = 4 data_source.data_source_meta.start_time = 0 data_source.data_source_meta.end_time = 10000 data_source.output_base_dir = "./ds_output" data_source.role = common_pb.FLRole.Follower self.data_source = data_source self.db_database = 'test_cluster' self.db_addr = 'localhost:2379' self.db_base_dir = 'fedlearner' self.db_username = '******' self.db_password = '******' self.kvstore = mysql_client.DBClient(self.db_database, self.db_addr, self.db_username, self.db_password, self.db_base_dir, True) common.commit_data_source(self.kvstore, self.data_source) if gfile.Exists(data_source.output_base_dir): gfile.DeleteRecursively(data_source.output_base_dir) self.data_block_matas = [] self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager( self.kvstore, self.data_source) partition_num = self.data_source.data_source_meta.partition_num for i in range(partition_num): self._create_data_block(i)
def _make_directory_if_nessary(self):
    data_block_dir = self._data_block_dir()
    if not gfile.Exists(data_block_dir):
        gfile.MakeDirs(data_block_dir)
    if not gfile.IsDirectory(data_block_dir):
        logging.fatal("%s should be directory", data_block_dir)
        os._exit(-1)  # pylint: disable=protected-access