Example #1
0
    def _build_split_filenames(self, split_info_list):
        """Construct the split filenames associated with the split info.

        The filenames correspond to the pre-processed datasets files present in
        the root directory of the dataset.

        Args:
          split_info_list: (list[SplitInfo]) List of split from which generate
            the filenames.

        Returns:
          filenames: (list[str]) The list of filenames path corresponding to the
            split info objects.
        """
        # Flatten the per-split shard filepaths into one list; a nested
        # comprehension replaces the previous append/extend loop.
        return [
            filepath
            for split_info in split_info_list
            for filepath in naming.filepaths_for_dataset_split(
                dataset_name=self.name,
                split=split_info.name,
                num_shards=split_info.num_shards,
                data_dir=self._data_dir,
                filetype_suffix=self._file_format_adapter.filetype_suffix,
            )
        ]
Example #2
0
def _get_dataset_files(name, path, instruction, name2shard_lengths):
    """Returns a list of files (+skip/take) corresponding to given instruction.

  Args:
    name: Name of the dataset.
    path: path to tfrecords.
    instruction: _AbsoluteInstruction instance.
    name2shard_lengths: dict associating split names to shard lengths.

  Returns:
    list of dict(filename, skip, take).
  """
    shard_lengths = name2shard_lengths[instruction.splitname]
    if not shard_lengths:
        raise AssertionError(
            '`DatasetInfo.SplitInfo.num_shards` is empty. S3 tfrecords_reader '
            'cannot be used. Make sure the data you are trying to read was '
            'generated using tfrecords_writer module (S3).')
    filenames = naming.filepaths_for_dataset_split(
        dataset_name=name,
        split=instruction.splitname,
        num_shards=len(shard_lengths),
        data_dir=path,
        filetype_suffix='tfrecord')
    # Absent boundaries default to the full split: [0, total number of records).
    start = instruction.from_ if instruction.from_ is not None else 0
    stop = instruction.to if instruction.to is not None else sum(shard_lengths)
    return _sharded_files.get_read_instructions(
        start, stop, filenames, shard_lengths)
 def filepaths(self):
     """Returns the list of file paths that hold this split's records."""
     split_kwargs = dict(
         dataset_name=self.dataset_name,
         split=self.split,
         num_shards=self.num_shards,
         data_dir=self.data_dir,
         filetype_suffix=self.filetype_suffix,
     )
     return naming.filepaths_for_dataset_split(**split_kwargs)
Example #4
0
 def test_filepaths_for_dataset_split(self):
     """Shard paths follow the `<name>-<split>-XXXXX-of-NNNNN` pattern."""
     actual = naming.filepaths_for_dataset_split(
         dataset_name='foo',
         split=splits.Split.TRAIN,
         num_shards=2,
         data_dir='/tmp/bar/')
     expected = [
         '/tmp/bar/foo-train-00000-of-00002',
         '/tmp/bar/foo-train-00001-of-00002',
     ]
     self.assertEqual(expected, actual)
 def test_filepaths_for_dataset_split(self):
     """Checks the generated shard paths for a two-shard train split."""
     expected = [
         "/tmp/bar/foo-train-00000-of-00002",
         "/tmp/bar/foo-train-00001-of-00002",
     ]
     self.assertEqual(
         expected,
         naming.filepaths_for_dataset_split(
             dataset_name="foo",
             split=splits.Split.TRAIN,
             num_shards=2,
             data_dir="/tmp/bar/"))
Example #6
0
 def test_filepaths_for_dataset_split(self):
     """The filetype suffix is inserted after the split name in each path."""
     expected = [
         '/tmp/bar/foo-train.bar-00000-of-00002',
         '/tmp/bar/foo-train.bar-00001-of-00002'
     ]
     result = naming.filepaths_for_dataset_split(
         dataset_name='foo',
         split=splits.Split.TRAIN,
         num_shards=2,
         data_dir='/tmp/bar/',
         filetype_suffix='bar')
     self.assertEqual(result, expected)
Example #7
0
 def test_filepaths_for_dataset_split_with_suffix(self):
     """Suffixed shard paths look like `<name>-<split>.<suffix>-XXXXX-of-NNNNN`."""
     actual = naming.filepaths_for_dataset_split(
         dataset_name="foo",
         split=dataset_builder.Split.TRAIN,
         num_shards=2,
         data_dir="/tmp/bar/",
         filetype_suffix="bar")
     self.assertEqual([
         "/tmp/bar/foo-train.bar-00000-of-00002",
         "/tmp/bar/foo-train.bar-00001-of-00002",
     ], actual)
Example #8
0
def _get_dataset_files(name, path, instruction, name2shard_lengths):
    """Returns a list of files (+skip/take) corresponding to given instruction.

  This is the core of the reading logic, to translate from absolute instructions
  (split + left/right boundaries) to files + skip/take.

  Args:
    name: Name of the dataset.
    path: path to tfrecords.
    instruction: _AbsoluteInstruction instance.
    name2shard_lengths: dict associating split names to shard lengths.

  Returns:
    list of dict(filename, skip, take).
  """
    shard_lengths = name2shard_lengths[instruction.splitname]
    # Empty/missing shard lengths means the data was not written by the
    # S3 tfrecords_writer, so per-shard slicing is impossible.
    if not shard_lengths:
        msg = (
            '`DatasetInfo.SplitInfo.num_shards` is empty. S3 tfrecords_reader '
            'cannot be used. Make sure the data you are trying to read was '
            'generated using tfrecords_writer module (S3).')
        raise AssertionError(msg)
    filenames = naming.filepaths_for_dataset_split(
        dataset_name=name,
        split=instruction.splitname,
        num_shards=len(shard_lengths),
        data_dir=path,
        filetype_suffix='tfrecord')
    # Missing boundaries default to the whole split: [0, total records).
    from_ = 0 if instruction.from_ is None else instruction.from_
    to = sum(shard_lengths) if instruction.to is None else instruction.to
    # Slide a window [index_start, index_end) over the global record index
    # space, one shard at a time, and intersect it with [from_, to).
    index_start = 0  # Beginning (included) of moving window.
    index_end = 0  # End (excluded) of moving window.
    files = []
    for filename, length in zip(filenames, shard_lengths):
        index_end += length
        if from_ < index_end and to > index_start:  # There is something to take.
            # Records to skip at the head of this shard before reading.
            skip = from_ - index_start if from_ > index_start else 0
            # Number of records to read after skipping; -1 means "read to the
            # end of the shard".
            take = to - index_start - skip if to < index_end else -1
            files.append(dict(filename=filename, skip=skip, take=take))
        index_start += length
    return files
Example #9
0
    def _build_split_filenames(self, split_info):
        """Construct the split filenames associated with the split info.

        The filenames correspond to the pre-processed datasets files present in
        the root directory of the dataset.

        Args:
          split_info: (SplitInfo) needed split.

        Returns:
          filenames: (list[str]) The list of filenames path corresponding to the
            split info object.
        """
        suffix = self._file_format_adapter.filetype_suffix
        return naming.filepaths_for_dataset_split(
            dataset_name=self.name,
            split=split_info.name,
            num_shards=split_info.num_shards,
            data_dir=self._data_dir,
            filetype_suffix=suffix,
        )