def _make_file_instructions_from_absolutes(
    name,
    name2shard_lengths,
    absolute_instructions,
):
  """Builds the file read instructions for a list of absolute instructions.

  Args:
    name: Dataset name, forwarded to `naming.filenames_for_dataset_split`.
    name2shard_lengths: Mapping from split name to the list of shard lengths
      of that split.
    absolute_instructions: Iterable of instructions exposing `splitname`,
      `from_` and `to`. A `from_` of `None` is read as 0 and a `to` of `None`
      is read as the total number of examples in the split.

  Returns:
    A `FileInstructions` built from the concatenated per-file instructions and
    a list with one `to - from_` count per absolute instruction.

  Raises:
    ValueError: If the shard lengths of a requested split are empty.
  """
  collected_instructions = []
  # One entry per absolute instruction (not per physical shard file).
  example_counts = []

  for instruction in absolute_instructions:
    split_name = instruction.splitname
    lengths = name2shard_lengths[split_name]
    if not lengths:
      raise ValueError(
          'Shard empty. This might means that dataset hasn\'t been generated '
          'yet and info not restored from GCS, or that legacy dataset is used.'
      )

    shard_filenames = naming.filenames_for_dataset_split(
        dataset_name=name,
        split=split_name,
        num_shards=len(lengths),
        filetype_suffix='tfrecord',
    )

    # Resolve open-ended boundaries to concrete example indices.
    start = instruction.from_
    if start is None:
      start = 0
    stop = instruction.to
    if stop is None:
      stop = sum(lengths)

    example_counts.append(stop - start)
    collected_instructions.extend(
        _sharded_files.get_read_instructions(
            start, stop, shard_filenames, lengths))

  return FileInstructions(
      num_examples_per_shard=example_counts,
      file_instructions=collected_instructions,
  )
def _make_file_instructions_from_absolutes(
    name: str,
    name2shard_lengths: Dict[str, List[int]],
    absolute_instructions: 'ReadInstruction',
    file_format: file_adapters.FileFormat = file_adapters.DEFAULT_FILE_FORMAT,
) -> List[shard_utils.FileInstruction]:
  """Builds the file instructions for a list of absolute instructions.

  Args:
    name: Dataset name, forwarded to `naming.filenames_for_dataset_split`.
    name2shard_lengths: Mapping from split name to the list of shard lengths
      of that split.
    absolute_instructions: Iterable of instructions exposing `splitname`,
      `from_` and `to`. A `from_` of `None` is read as 0 and a `to` of `None`
      is read as the total number of examples in the split.
    file_format: Key into `file_adapters.ADAPTER_FOR_FORMAT`; the matching
      adapter's `FILE_SUFFIX` is used as the filetype suffix of the filenames.

  Returns:
    The concatenation, in input order, of the file instructions computed for
    every absolute instruction.

  Raises:
    ValueError: If the shard lengths of a requested split are empty.
  """
  collected_instructions = []

  for instruction in absolute_instructions:
    split_name = instruction.splitname
    lengths = name2shard_lengths[split_name]
    if not lengths:
      raise ValueError(
          'Shard empty. This might means that dataset hasn\'t been generated '
          'yet and info not restored from GCS, or that legacy dataset is used.'
      )

    # Looked up per instruction, after the empty-shard check, so the order in
    # which errors can surface stays the same.
    suffix = file_adapters.ADAPTER_FOR_FORMAT[file_format].FILE_SUFFIX
    shard_filenames = naming.filenames_for_dataset_split(
        dataset_name=name,
        split=split_name,
        num_shards=len(lengths),
        filetype_suffix=suffix,
    )

    # Resolve open-ended boundaries to concrete example indices.
    start = instruction.from_
    if start is None:
      start = 0
    stop = instruction.to
    if stop is None:
      stop = sum(lengths)

    collected_instructions.extend(
        shard_utils.get_file_instructions(
            start, stop, shard_filenames, lengths))

  return collected_instructions
def test_filenames_for_dataset_split(self):
  """Checks the filenames produced for a two-shard train split of 'foo'."""
  expected_filenames = [
      'foo-train-00000-of-00002',
      'foo-train-00001-of-00002',
  ]
  generated_filenames = naming.filenames_for_dataset_split(
      dataset_name='foo',
      split=splits.Split.TRAIN,
      num_shards=2,
  )
  # Expected value goes first, matching the original argument order.
  self.assertEqual(expected_filenames, generated_filenames)
def test_filenames_for_dataset_split(self):
  """Checks the filenames produced when a suffix and data dir are passed."""
  expected_filenames = [
      'foo-train.bar-00000-of-00002',
      'foo-train.bar-00001-of-00002',
  ]
  # Actual value goes first, matching the original argument order.
  self.assertEqual(
      naming.filenames_for_dataset_split(
          dataset_name='foo',
          split=splits.Split.TRAIN,
          num_shards=2,
          filetype_suffix='bar',
          data_dir='/path',
      ),
      expected_filenames,
  )