Beispiel #1
0
def _make_file_instructions_from_absolutes(
    name,
    name2shard_lengths,
    absolute_instructions,
):
    """Resolves absolute instructions into per-shard skip/take instructions.

    Args:
        name: Dataset name, used to derive the shard filenames.
        name2shard_lengths: Mapping from split name to the list of example
            counts per shard for that split.
        absolute_instructions: Iterable of absolute instructions, each with
            `splitname`, `from_` and `to` attributes (either bound may be
            `None`, meaning start/end of the split).

    Returns:
        A `FileInstructions` holding the example count of each instruction
        and the flattened list of per-file read instructions.

    Raises:
        ValueError: If a requested split has no recorded shard lengths.
    """
    instructions = []
    examples_per_instruction = []
    for instr in absolute_instructions:
        lengths = name2shard_lengths[instr.splitname]
        # Empty shard lengths means split info was never generated/restored.
        if not lengths:
            raise ValueError(
                'Shard empty. This might means that dataset hasn\'t been generated '
                'yet and info not restored from GCS, or that legacy dataset is used.'
            )
        shard_files = naming.filenames_for_dataset_split(
            dataset_name=name,
            split=instr.splitname,
            num_shards=len(lengths),
            filetype_suffix='tfrecord')
        # None bounds default to the full split: [0, total number of examples).
        start = instr.from_ if instr.from_ is not None else 0
        stop = instr.to if instr.to is not None else sum(lengths)
        examples_per_instruction.append(stop - start)
        instructions.extend(
            _sharded_files.get_read_instructions(start, stop, shard_files, lengths))
    return FileInstructions(
        num_examples_per_shard=examples_per_instruction,
        file_instructions=instructions,
    )
def _make_file_instructions_from_absolutes(
    name: str,
    name2shard_lengths: Dict[str, List[int]],
    absolute_instructions: 'ReadInstruction',
    file_format: file_adapters.FileFormat = file_adapters.DEFAULT_FILE_FORMAT,
) -> List[shard_utils.FileInstruction]:
    """Resolves absolute instructions into per-file skip/take instructions.

    Args:
        name: Dataset name, used to derive the shard filenames.
        name2shard_lengths: Maps each split name to its per-shard example counts.
        absolute_instructions: Absolute instructions to resolve; each carries
            `splitname`, `from_` and `to` (either bound may be `None`, meaning
            the start/end of the split).
        file_format: File format whose adapter supplies the filename suffix.

    Returns:
        The flattened list of `shard_utils.FileInstruction` for all
        instructions, in input order.

    Raises:
        ValueError: If a requested split has no recorded shard lengths.
    """
    # The suffix only depends on the format, so compute it once up front.
    suffix = file_adapters.ADAPTER_FOR_FORMAT[file_format].FILE_SUFFIX
    all_instructions = []
    for instr in absolute_instructions:
        lengths = name2shard_lengths[instr.splitname]
        # Empty shard lengths means split info was never generated/restored.
        if not lengths:
            raise ValueError(
                'Shard empty. This might means that dataset hasn\'t been generated '
                'yet and info not restored from GCS, or that legacy dataset is used.'
            )
        shard_files = naming.filenames_for_dataset_split(
            dataset_name=name,
            split=instr.splitname,
            num_shards=len(lengths),
            filetype_suffix=suffix)
        # None bounds default to the full split: [0, total number of examples).
        start = instr.from_ if instr.from_ is not None else 0
        stop = instr.to if instr.to is not None else sum(lengths)
        all_instructions += shard_utils.get_file_instructions(
            start, stop, shard_files, lengths)
    return all_instructions
Beispiel #3
0
 def test_filenames_for_dataset_split(self):
   """Default suffix-less filenames follow the name-split-NNNNN-of-NNNNN pattern."""
   expected = [
       'foo-train-00000-of-00002',
       'foo-train-00001-of-00002',
   ]
   actual = naming.filenames_for_dataset_split(
       dataset_name='foo', split=splits.Split.TRAIN, num_shards=2)
   self.assertEqual(expected, actual)
Beispiel #4
0
 def test_filenames_for_dataset_split(self):
     """Filetype suffix is embedded as name-split.suffix-NNNNN-of-NNNNN."""
     filenames = naming.filenames_for_dataset_split(
         dataset_name='foo',
         split=splits.Split.TRAIN,
         num_shards=2,
         filetype_suffix='bar',
         data_dir='/path')
     expected = ['foo-train.bar-00000-of-00002', 'foo-train.bar-00001-of-00002']
     self.assertEqual(filenames, expected)