def next_file(self):
    """Open the next input file and advance the stream position.

    When the next input carries a range, the first format of the file
    format root must report that it can be split; the range is then stored
    on that format's `_range` attribute before the file is opened.

    Returns:
      Whatever `files.open` returns for the next input's filename, opened
      with mode 'r'.

    Raises:
      StopIteration: every input has already been handed out.
      ValueError: the next input has a range but the first format's
        can_split() returns a false value.
    """
    position = self.__input_index
    if position == len(self.__inputs):
        raise StopIteration()

    current = self.__inputs[position]
    if current.range:
        head_format = self.__file_format_root._formats[0]
        if not head_format.can_split():
            raise ValueError(
                'Input range specified for a non splitable format %s'
                % head_format.NAME)
        head_format._range = current.range

    # Remember which input is being handed out, then step past it. Both
    # updates happen before the open call, matching the original ordering.
    self.__previous_input_index = position
    self.__input_index = position + 1
    return files.open(current.filename, 'r')
def _deep_split(filenames, size_per_shard, parsed_formats): """Split files into roots using the first FileFormat. Deep split can split within a file. It tells the first format how big a split it wants and the first format will do the actually splitting because only the first format knows how to operate on this particular format. Args: filenames: a list of input filenames. size_per_shard: size per shard. parsed_format: the parsed FileFormats. Returns: A list of FileFormatRoot. """ roots = [] inputs = [] size_left = size_per_shard for filename in filenames: index = 0 with files.open(filename) as f: cache_for_split = {} # Split a single file. while True: if size_left <= 0: # Shard has been filled. roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs)) size_left = size_per_shard inputs = [] start_index = index size_left, index = parsed_formats[0].split(size_left, start_index, f, cache_for_split) # File has been entirely covered. if start_index == index: break inputs.append(_FileRange(filename, (start_index, index))) if inputs: roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs)) return roots