Example 1
def get_existing_subvolume_path(segmentation_dir, corner, allow_cpoint=False):
    """Returns the path to an existing FFN subvolume.

  This like `get_subvolume_path`, but returns paths to existing data only.

  Args:
    segmentation_dir: directory containing FFN subvolumes
    corner: lower corner of the FFN subvolume as a (z, y, x) tuple
    allow_cpoint: whether to return a checkpoint path in case the final
        segmentation is not ready

  Returns:
    path to an existing FFN subvolume (string) or None if no such subvolume
    is found
  """
    target_path = segmentation_path(segmentation_dir, corner)
    if gfile.Exists(target_path):
        return target_path

    target_path = legacy_segmentation_path(segmentation_dir, corner)
    if gfile.Exists(target_path):
        return target_path

    if allow_cpoint:
        target_path = checkpoint_path(segmentation_dir, corner)
        if gfile.Exists(target_path):
            return target_path

    return None
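A minimal usage sketch for the lookup above; the directory, the corner tuple, and the error handling are assumptions for illustration, not part of the original module.

seg_dir = "/tmp/ffn_segmentation"   # hypothetical segmentation directory
corner = (0, 0, 0)                  # (z, y, x) lower corner of the subvolume
subvol_path = get_existing_subvolume_path(seg_dir, corner, allow_cpoint=True)
if subvol_path is None:
    raise ValueError("no segmentation or checkpoint found for corner %r" % (corner,))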
Example 2
 def setUp(self):
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "./ds_output"
     self.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='',
                                                  optional_fields=['label'])
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='STREAM_JOINER',
         min_matching_window=32,
         max_matching_window=128,
         data_block_dump_interval=30,
         data_block_dump_threshold=128)
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore = mysql_client.DBClient('test_cluster', 'localhost:2379',
                                          'test_user', 'test_password',
                                          'fedlearner', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     self.g_data_block_index = 0
Example 3
def copy_to_gcs(src, dst):
    assert gfile.Exists(src), src
    assert not gfile.Exists(dst), dst

    print("Saving to", dst)
    with gfile.GFile(src, "rb") as src_f, gfile.GFile(dst, "wb") as dst_f:
        shutil.copyfileobj(src_f, dst_f)
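A short usage sketch, assuming a hypothetical local source file and GCS destination; gfile lets the same streaming copy work across local paths and gs:// URLs.

# Illustrative call only; both paths are placeholders.
copy_to_gcs("/tmp/model.ckpt", "gs://example-bucket/checkpoints/model.ckpt")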
Example 4
 def setUp(self):
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.data_block_dir = "./data_block"
     data_source.example_dumped_dir = "./example_id"
     data_source.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='')
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='STREAM_JOINER',
         min_matching_window=32,
         max_matching_window=128,
         data_block_dump_interval=30,
         data_block_dump_threshold=128)
     if gfile.Exists(self.data_source.data_block_dir):
         gfile.DeleteRecursively(self.data_source.data_block_dir)
     if gfile.Exists(self.data_source.example_dumped_dir):
         gfile.DeleteRecursively(self.data_source.example_dumped_dir)
     if gfile.Exists(self.data_source.raw_data_dir):
         gfile.DeleteRecursively(self.data_source.raw_data_dir)
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     self.g_data_block_index = 0
Example 5
 def _sync_saved_data_block_index(self):
     if self._saved_data_block_index is None:
         assert self._saving_data_block_index is None, \
             "no data block index is saving when no saved index"
         low_index = 0
         high_index = data_block_index_threshold
         while low_index <= high_index:
             data_index = (low_index + high_index) // 2
             file_name = self._acquire_data_block_meta_path(data_index)
             if gfile.Exists(file_name):
                 low_index = data_index + 1
             else:
                 high_index = data_index - 1
         self._saved_data_block_index = high_index
     elif self._saving_data_block_index is not None:
         assert self._saving_data_block_index == self._saved_data_block_index + 1, \
             "the saving index should be next of saved index " \
             "{} != {} + 1".format(self._saving_data_block_index, self._saved_data_block_index)
         file_name = self._acquire_data_block_meta_path(
             self._saving_data_block_index)
         if not gfile.Exists(file_name):
             self._saving_data_block_index = None
         else:
             self._saved_data_block_index = self._saving_data_block_index
             self._saving_data_block_index = None
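The binary search above relies on meta files being dumped with consecutive indices, so gfile.Exists behaves as a monotone predicate over the index. A self-contained sketch of the same idea, with a hypothetical meta_path helper standing in for _acquire_data_block_meta_path and an assumed import:

from tensorflow.compat.v1 import gfile  # assumed import; the snippets here use the same API

def find_last_saved_index(meta_path, max_index):
    # meta_path(i) -> file name for data block i; files 0..k exist, k+1.. do not.
    low, high = 0, max_index
    while low <= high:
        mid = (low + high) // 2
        if gfile.Exists(meta_path(mid)):
            low = mid + 1
        else:
            high = mid - 1
    return high  # -1 if no meta file has been saved yet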
Example 6
 def _clean_up(self):
     if gfile.Exists(self._input_dir):
         gfile.DeleteRecursively(self._input_dir)
     if gfile.Exists(self._partition_output_dir):
         gfile.DeleteRecursively(self._partition_output_dir)
     if gfile.Exists(self._merge_output_dir):
         gfile.DeleteRecursively(self._merge_output_dir)
Example 7
 def _remove_existed_dir(self):
     if gfile.Exists(self._portal_manifest_l.input_data_base_dir):
         gfile.DeleteRecursively(
             self._portal_manifest_l.input_data_base_dir)
     if gfile.Exists(self._portal_manifest_l.output_data_base_dir):
         gfile.DeleteRecursively(
             self._portal_manifest_l.output_data_base_dir)
     if gfile.Exists(self._portal_manifest_f.input_data_base_dir):
         gfile.DeleteRecursively(
             self._portal_manifest_f.input_data_base_dir)
     if gfile.Exists(self._portal_manifest_f.output_data_base_dir):
         gfile.DeleteRecursively(
             self._portal_manifest_f.output_data_base_dir)
     if gfile.Exists(self._data_source_l.data_block_dir):
         gfile.DeleteRecursively(self._data_source_l.data_block_dir)
     if gfile.Exists(self._data_source_l.raw_data_dir):
         gfile.DeleteRecursively(self._data_source_l.raw_data_dir)
     if gfile.Exists(self._data_source_l.example_dumped_dir):
         gfile.DeleteRecursively(self._data_source_l.example_dumped_dir)
     if gfile.Exists(self._data_source_f.data_block_dir):
         gfile.DeleteRecursively(self._data_source_f.data_block_dir)
     if gfile.Exists(self._data_source_f.raw_data_dir):
         gfile.DeleteRecursively(self._data_source_f.raw_data_dir)
     if gfile.Exists(self._data_source_f.example_dumped_dir):
         gfile.DeleteRecursively(self._data_source_f.example_dumped_dir)
Example 8
 def setUp(self):
     data_source_f = common_pb.DataSource()
     data_source_f.data_source_meta.name = "milestone"
     data_source_f.data_source_meta.partition_num = 1
     data_source_f.output_base_dir = "./output-f"
     self.data_source_f = data_source_f
     if gfile.Exists(self.data_source_f.output_base_dir):
         gfile.DeleteRecursively(self.data_source_f.output_base_dir)
     data_source_l = common_pb.DataSource()
     data_source_l.data_source_meta.name = "milestone"
     data_source_l.data_source_meta.partition_num = 1
     data_source_l.output_base_dir = "./output-l"
     self.raw_data_dir_l = "./raw_data-l"
     self.data_source_l = data_source_l
     if gfile.Exists(self.data_source_l.output_base_dir):
         gfile.DeleteRecursively(self.data_source_l.output_base_dir)
     if gfile.Exists(self.raw_data_dir_l):
         gfile.DeleteRecursively(self.raw_data_dir_l)
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(
         common.data_source_etcd_base_dir(
             self.data_source_l.data_source_meta.name))
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source_l)
Example 9
 def _sync_manager_state(self, init):
     if self._double_check_dump_finished() and not init:
         return
     if self._fly_sort_run_dumper is not None:
         if gfile.Exists(self._fly_sort_run_dumper.tmp_fpath):
             gfile.Remove(self._fly_sort_run_dumper.tmp_fpath)
         fpath = self._fly_sort_run_dumper.fpath
         if fpath is not None and gfile.Exists(fpath):
             fname = path.basename(fpath)
             meta = SortRunMeta.decode_sort_run_meta_from_fname(fname)
             self._dumped_sort_run_metas.append(meta)
             self._dumped_process_index = meta.process_index
     self._fly_sort_run_dumper = None
     if self._dumped_process_index is None:
         self._dumped_sort_run_metas = \
                 [SortRunMeta.decode_sort_run_meta_from_fname(fname)
                  for fname in self._list_dumper_output_dir()]
         self._dumped_sort_run_metas.sort()
         if len(self._dumped_sort_run_metas) == 0:
             self._dumped_process_index = -1
         else:
             self._dumped_process_index = \
                     self._dumped_sort_run_metas[-1].process_index
     with self._lock:
         self._next_index_to_dump = \
                 0 if len(self._dumped_sort_run_metas) == 0 \
                 else self._dumped_sort_run_metas[-1].end_index + 1
Example 10
 def setUp(self):
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-f"
     data_source.data_source_meta.partition_num = 1
     data_source.output_base_dir = "./ds_output"
     self.raw_data_dir = "./raw_data"
     self.data_source = data_source
     self.raw_data_options = dj_pb.RawDataOptions(raw_data_iter='TF_RECORD',
                                                  compressed_type='')
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1, example_id_dump_threshold=1024)
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='ATTRIBUTION_JOINER',
         min_matching_window=32,
         max_matching_window=51200,
         max_conversion_delay=interval_to_timestamp("124"),
         enable_negative_example_generator=True,
         data_block_dump_interval=32,
         data_block_dump_threshold=128,
         negative_sampling_rate=0.8,
     )
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     self.g_data_block_index = 0
Example 11
 def tearDown(self):
     if gfile.Exists(self.data_source_f.output_base_dir):
         gfile.DeleteRecursively(self.data_source_f.output_base_dir)
     if gfile.Exists(self.data_source_l.output_base_dir):
         gfile.DeleteRecursively(self.data_source_l.output_base_dir)
     if gfile.Exists(self.raw_data_dir_l):
         gfile.DeleteRecursively(self.raw_data_dir_l)
     self.etcd.delete_prefix(
         common.data_source_etcd_base_dir(
             self.data_source_l.data_source_meta.name))
Example 12
 def tearDown(self):
     if gfile.Exists(self.data_source.output_base_dir):
         gfile.DeleteRecursively(self.data_source.output_base_dir)
     if gfile.Exists(self.raw_data_dir):
         gfile.DeleteRecursively(self.raw_data_dir)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(
             self.data_source.data_source_meta.name))
Example 13
 def tearDown(self):
     if gfile.Exists(self.data_source.data_block_dir):
         gfile.DeleteRecursively(self.data_source.data_block_dir)
     if gfile.Exists(self.data_source.example_dumped_dir):
         gfile.DeleteRecursively(self.data_source.example_dumped_dir)
     if gfile.Exists(self.data_source.raw_data_dir):
         gfile.DeleteRecursively(self.data_source.raw_data_dir)
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
Example 14
 def tearDown(self):
     if gfile.Exists(self.data_source_l.output_base_dir):
         gfile.DeleteRecursively(self.data_source_l.output_base_dir)
     if gfile.Exists(self.raw_data_dir_l):
         gfile.DeleteRecursively(self.raw_data_dir_l)
     if gfile.Exists(self.data_source_f.output_base_dir):
         gfile.DeleteRecursively(self.data_source_f.output_base_dir)
     if gfile.Exists(self.raw_data_dir_f):
         gfile.DeleteRecursively(self.raw_data_dir_f)
     self.kvstore_f.delete_prefix(
         common.data_source_kvstore_base_dir(self.db_base_dir_f))
     self.kvstore_l.delete_prefix(
         common.data_source_kvstore_base_dir(self.db_base_dir_l))
Example 15
def threshold_segmentation(segmentation_dir, corner, labels, threshold):
    prob_path = object_prob_path(segmentation_dir, corner)
    if not gfile.Exists(prob_path):
        prob_path = legacy_object_prob_path(segmentation_dir, corner)
        if not gfile.Exists(prob_path):
            raise ValueError('Cannot find probability map %s' % prob_path)

    with gfile.Open(prob_path, 'rb') as f:
        data = np.load(f)
        if 'qprob' not in data:
            raise ValueError('Invalid FFN probability map.')

        prob = dequantize_probability(data['qprob'])
        labels[prob < threshold] = 0
Example 16
    def test_remove(self):
        """Test remove.

    """
        # Setup and check preconditions.
        file_name = "igfs:///test_remove/1"
        self.assertFalse(gfile.Exists(file_name))
        with gfile.Open(file_name, mode="w") as w:
            w.write("")
        self.assertTrue(gfile.Exists(file_name))
        # Remove file.
        gfile.Remove(file_name)
        # Check that file was removed.
        self.assertFalse(gfile.Exists(file_name))
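The same Exists / Open / Remove round trip works for any filesystem registered with TensorFlow, not just igfs://. A minimal sketch against a local temporary path (the path and the import style are assumptions, not part of the test suite):

import os
import tempfile

from tensorflow.compat.v1 import gfile  # assumed import

probe = os.path.join(tempfile.mkdtemp(), "probe.txt")
assert not gfile.Exists(probe)
with gfile.Open(probe, mode="w") as w:
    w.write("")
assert gfile.Exists(probe)
gfile.Remove(probe)
assert not gfile.Exists(probe)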
Example 17
    def test_copy(self):
        """Test copy.

    """
        # Setup and check preconditions.
        src_file_name = "igfs:///test_copy/1"
        dst_file_name = "igfs:///test_copy/2"
        self.assertFalse(gfile.Exists(src_file_name))
        self.assertFalse(gfile.Exists(dst_file_name))
        with gfile.Open(src_file_name, mode="w") as w:
            w.write("42")
        self.assertTrue(gfile.Exists(src_file_name))
        self.assertFalse(gfile.Exists(dst_file_name))
        # Copy file.
        gfile.Copy(src_file_name, dst_file_name)
        # Check that files are identical.
        self.assertTrue(gfile.Exists(src_file_name))
        self.assertTrue(gfile.Exists(dst_file_name))
        with gfile.Open(dst_file_name, mode="r") as r:
            data_v = r.read()
        self.assertEqual("42", data_v)
        # Remove file.
        gfile.Remove(src_file_name)
        gfile.Remove(dst_file_name)
        # Check that file was removed.
        self.assertFalse(gfile.Exists(src_file_name))
        self.assertFalse(gfile.Exists(dst_file_name))
Example 18
    def test_make_dirs(self):
        """Test make dirs.

    """
        # Setup and check preconditions.
        dir_name = "igfs:///test_make_dirs/"
        self.assertFalse(gfile.Exists(dir_name))
        # Make directory.
        gfile.MkDir(dir_name)
        # Check that directory was created.
        self.assertTrue(gfile.Exists(dir_name))
        # Remove directory.
        gfile.Remove(dir_name)
        # Check that directory was removed.
        self.assertFalse(gfile.Exists(dir_name))
Example 19
 def test_raw_data_visitor(self):
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.data_source.raw_data_dir = "./test/compressed_raw_data"
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(self.data_source.data_source_meta.name)
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = os.path.join(self.data_source.raw_data_dir, common.partition_repr(0))
     self.assertTrue(gfile.Exists(partition_dir))
     manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     manifest_manager.add_raw_data(
             0, [dj_pb.RawDataMeta(file_path=os.path.join(partition_dir, "0-0.idx"),
                                   timestamp=timestamp_pb2.Timestamp(seconds=3))],
             True)
     raw_data_options = dj_pb.RawDataOptions(
             raw_data_iter='TF_DATASET',
             compressed_type='GZIP'
         )
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
     self.assertTrue(rdm.check_index_meta_by_process_index(0))
     rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                           raw_data_options)
     expected_index = 0
     for (index, item) in rdv:
         if index > 0 and index % 32 == 0:
             print("{} {}".format(index, item.example_id))
         self.assertEqual(index, expected_index)
         expected_index += 1
     self.assertGreater(expected_index, 0)
Example 20
 def _create_merged_dir_if_need(self):
     merge_dir = os.path.join(self._options.output_file_dir,
                              common.partition_repr(self._partition_id))
     if gfile.Exists(merge_dir):
         assert gfile.IsDirectory(merge_dir)
     else:
         gfile.MakeDirs(merge_dir)
Example 21
def check_glob_prefix(prefix):
  """Verifies that there is at least one match for a glob prefix.

  Args:
    prefix: Glob prefix to check.

  Returns:
    None

  Raises:
    RuntimeError: If there are no matches or the parent path doesn't exist.
  """
  if prefix is None:
    raise RuntimeError("Got None instead of a valid glob prefix.")

  path = pathlib.Path(prefix)
  # Check that the prefix path (e.g. FLAGS.source_embeddings_prefix) has at
  # least one match. itertools.islice keeps this fast even if there are a
  # trillion matches, whereas len(list(matches)) > 0 would enumerate them all.
  if not gfile.Exists(path.parent):
    raise RuntimeError(f"The parent of the glob prefix didn't exist:\n"
                       f" - Glob prefix: {path}\n"
                       f" - Glob parent: {path.parent}")
  matches = path.parent.glob(path.name + "*")
  at_least_one = len(list(itertools.islice(matches, 0, 1))) > 0  # pylint: disable=g-explicit-length-test
  if not at_least_one:
    raise RuntimeError("No matches to the globbing prefix:\n{prefix}")
Example 22
 def test_csv_raw_data_visitor(self):
     self.data_source = common_pb.DataSource()
     self.data_source.data_source_meta.name = 'fclh_test'
     self.data_source.data_source_meta.partition_num = 1
     self.raw_data_dir = path.join(path.dirname(path.abspath(__file__)),
                                   "../csv_raw_data")
     self.etcd = etcd_client.EtcdClient('test_cluster', 'localhost:2379',
                                        'fedlearner', True)
     self.etcd.delete_prefix(
         common.data_source_etcd_base_dir(
             self.data_source.data_source_meta.name))
     self.assertEqual(self.data_source.data_source_meta.partition_num, 1)
     partition_dir = path.join(self.raw_data_dir, common.partition_repr(0))
     self.assertTrue(gfile.Exists(partition_dir))
     manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.etcd, self.data_source)
     manifest_manager.add_raw_data(0, [
         dj_pb.RawDataMeta(file_path=path.join(partition_dir,
                                               "test_raw_data.csv"),
                           timestamp=timestamp_pb2.Timestamp(seconds=3))
     ], True)
     raw_data_options = dj_pb.RawDataOptions(raw_data_iter='CSV_DICT',
                                             read_ahead_size=1 << 20)
     rdm = raw_data_visitor.RawDataManager(self.etcd, self.data_source, 0)
     self.assertTrue(rdm.check_index_meta_by_process_index(0))
     rdv = raw_data_visitor.RawDataVisitor(self.etcd, self.data_source, 0,
                                           raw_data_options)
     expected_index = 0
     for (index, item) in rdv:
         if index > 0 and index % 1024 == 0:
             print("{} {}".format(index, item.raw_id))
         self.assertEqual(index, expected_index)
         expected_index += 1
     self.assertEqual(expected_index, 4999)
Example 23
 def _make_directory_if_nessary(self):
     example_dumped_dir = self._example_dumped_dir()
     if not gfile.Exists(example_dumped_dir):
         gfile.MakeDirs(example_dumped_dir)
     if not gfile.IsDirectory(example_dumped_dir):
         logging.fatal("%s should be directory", example_dumped_dir)
         os._exit(-1)  # pylint: disable=protected-access
Example 24
 def add_raw_data(self, partition_id, fpaths, dedup, timestamps=None):
     self._check_partition_id(partition_id)
     if not fpaths:
         raise RuntimeError("no files input")
     if timestamps is not None and len(fpaths) != len(timestamps):
         raise RuntimeError("the number of raw data file "\
                            "and timestamp mismatch")
     rdreq = dj_pb.RawDataRequest(
                 data_source_meta=self._data_source.data_source_meta,
                 partition_id=partition_id,
                 added_raw_data_metas=dj_pb.AddedRawDataMetas(
                     dedup=dedup
                 )
             )
     for index, fpath in enumerate(fpaths):
         if not gfile.Exists(fpath):
             raise ValueError('{} does not exist'.format(fpath))
         raw_data_meta = dj_pb.RawDataMeta(
                 file_path=fpath,
                 start_index=-1
             )
         if timestamps is not None:
             raw_data_meta.timestamp.MergeFrom(timestamps[index])
         rdreq.added_raw_data_metas.raw_data_metas.append(raw_data_meta)
     return self._master_client.AddRawData(rdreq)
Example 25
 def _generate_input_tf_record(self, cands, base_dir):
     if not gfile.Exists(base_dir):
         gfile.MakeDirs(base_dir)
     fpaths = []
     random.shuffle(cands)
     tfr_writers = []
     partition_num = self._data_source_l.data_source_meta.partition_num
     for partition_id in range(partition_num):
         fpath = os.path.join(base_dir,
                              str(partition_id) + common.RawDataFileSuffix)
         fpaths.append(fpath)
         tfr_writers.append(tf.io.TFRecordWriter(fpath))
     for item in cands:
         partition_id = CityHash32(item) % partition_num
         feat = {}
         feat['raw_id'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[item.encode()]))
         f0 = 'follower' + str((partition_id << 30) + 0) + item
         f1 = 'follower' + str((partition_id << 30) + 1) + item
         f2 = 'follower' + str((partition_id << 30) + 2) + item
         feat['feat_0'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f0.encode()]))
         feat['feat_1'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f1.encode()]))
         feat['feat_2'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f2.encode()]))
         example = tf.train.Example(features=tf.train.Features(
             feature=feat))
         tfr_writers[partition_id].write(example.SerializeToString())
     for tfr_writer in tfr_writers:
         tfr_writer.close()
     return fpaths
Example 26
 def _add_raw_data_impl(self, notify_ctx, portal_manifest, ds_pid):
     dt = notify_ctx.get_raw_data_updated_datetime(ds_pid) + \
             timedelta(hours=1)
     begin_dt = common.convert_timestamp_to_datetime(
         common.trim_timestamp_by_hourly(portal_manifest.begin_timestamp))
     if dt < begin_dt:
         dt = begin_dt
     committed_dt = common.convert_timestamp_to_datetime(
         portal_manifest.committed_timestamp)
     fpaths = []
     timestamps = []
     ds_ptnum = notify_ctx.data_source.data_source_meta.partition_num
     while dt <= committed_dt:
         for pt_pid in range(ds_pid, portal_manifest.output_partition_num,
                             ds_ptnum):
             fpath = common.encode_portal_hourly_fpath(
                 portal_manifest.output_data_base_dir, dt, pt_pid)
             if gfile.Exists(fpath):
                 fpaths.append(fpath)
                 timestamps.append(common.convert_datetime_to_timestamp(dt))
         if len(fpaths) > 32 or dt == committed_dt:
             break
         dt += timedelta(hours=1)
     notify_ctx.add_raw_data(ds_pid, fpaths, timestamps, dt)
     logging.info("add %d raw data file for partition %d of data "\
                  "source %s. latest updated datetime %s",
                   len(fpaths), ds_pid,
                   notify_ctx.data_source.data_source_meta.name, dt)
     return dt >= committed_dt
Example 27
 def _publish_raw_data(self, job_id):
     portal_manifest = self._sync_portal_manifest()
     output_dir = None
     if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
         output_dir = common.portal_map_output_dir(
             portal_manifest.output_base_dir, job_id)
     else:
         output_dir = common.portal_reduce_output_dir(
             portal_manifest.output_base_dir, job_id)
     for partition_id in range(self._output_partition_num):
         dpath = path.join(output_dir, common.partition_repr(partition_id))
         fnames = []
         if gfile.Exists(dpath) and gfile.IsDirectory(dpath):
             fnames = [
                 f for f in gfile.ListDirectory(dpath)
                 if f.endswith(common.RawDataFileSuffix)
             ]
         publish_fpaths = []
         if portal_manifest.data_portal_type == dp_pb.DataPortalType.PSI:
             publish_fpaths = self._publish_psi_raw_data(
                 partition_id, dpath, fnames)
         else:
             publish_fpaths = self._publish_streaming_raw_data(
                 partition_id, dpath, fnames)
         logging.info("Data Portal Master publish %d file for partition "\
                      "%d of streaming job %d\n----------\n",
                      len(publish_fpaths), partition_id, job_id)
         for seq, fpath in enumerate(publish_fpaths):
             logging.info("%d. %s", seq, fpath)
         logging.info("------------------------------------------\n")
Example 28
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    for partition_id in range(partition_num):
        dirpath = os.path.join(base_dir, common.partition_repr(partition_id))
        if not gfile.Exists(dirpath):
            gfile.MakeDirs(dirpath)
        assert gfile.IsDirectory(dirpath)
    csv_writers = [
        SortRunMergerWriter(base_dir, 0, partition_id, 'CSV_DICT')
        for partition_id in range(partition_num)
    ]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        partition_id = CityHash32(str(idx)) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = str(idx)
        raw['feat_0'] = str((partition_id << 30) + 0) + str(idx)
        raw['feat_1'] = str((partition_id << 30) + 1) + str(idx)
        raw['feat_2'] = str((partition_id << 30) + 2) + str(idx)
        csv_writers[partition_id].append(raw)
    for partition_id, csv_writer in enumerate(csv_writers):
        fpaths = csv_writer.finish()
        logging.info("partition %d dump %d files", partition_id, len(fpaths))
        for seq_id, fpath in enumerate(fpaths):
            logging.info("  %d. %s", seq_id, fpath)
        logging.info("---------------")
Example 29
 def setUp(self):
     data_source = common_pb.DataSource()
     data_source.data_source_meta.name = "milestone-x"
     data_source.data_source_meta.partition_num = 4
     data_source.data_source_meta.start_time = 0
     data_source.data_source_meta.end_time = 10000
     data_source.output_base_dir = "./ds_output"
     data_source.role = common_pb.FLRole.Follower
     self.data_source = data_source
     self.db_database = 'test_cluster'
     self.db_addr = 'localhost:2379'
     self.db_base_dir = 'fedlearner'
     self.db_username = '******'
     self.db_password = '******'
     self.kvstore = mysql_client.DBClient(self.db_database, self.db_addr,
                                          self.db_username,
                                          self.db_password,
                                          self.db_base_dir, True)
     common.commit_data_source(self.kvstore, self.data_source)
     if gfile.Exists(data_source.output_base_dir):
         gfile.DeleteRecursively(data_source.output_base_dir)
     self.data_block_matas = []
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, self.data_source)
     partition_num = self.data_source.data_source_meta.partition_num
     for i in range(partition_num):
         self._create_data_block(i)
Example 30
 def _make_directory_if_nessary(self):
     data_block_dir = self._data_block_dir()
     if not gfile.Exists(data_block_dir):
         gfile.MakeDirs(data_block_dir)
     if not gfile.IsDirectory(data_block_dir):
         logging.fatal("%s should be directory", data_block_dir)
         os._exit(-1)  # pylint: disable=protected-access