Ejemplo n.º 1
0
 def next_file(self):
     """Open the next configured input and step the cursor forward.

     If the input carries a range, that range is stored on the first format
     of the file format root, which must report that it can split.

     Returns:
       Whatever files.open() returns for the input's filename, opened with
       mode "r" and buffering=-1.

     Raises:
       StopIteration: the cursor equals the number of inputs, i.e. there is
         nothing left to open.
       ValueError: the input has a range but the first format cannot split.
     """
     position = self.__input_index
     if position == len(self.__inputs):
         raise StopIteration()

     current = self.__inputs[position]
     requested_range = current.range
     if requested_range:
         # Only the first format is asked to honour a partial-file range.
         head_format = self.__file_format_root._formats[0]
         if not head_format.can_split():
             message = "Input range specified for a non splitable format %s"
             raise ValueError(message % head_format.NAME)
         head_format._range = requested_range

     # Record the index being opened before moving on to the next one.
     self.__previous_input_index = position
     self.__input_index = position + 1
     return files.open(current.filename, "r", buffering=-1)
Ejemplo n.º 2
0
 def next_file(self):
   """Open the next configured input and step the cursor forward.

   If the input carries a range, that range is stored on the first format
   of the file format root, which must report that it can split.

   Returns:
     Whatever files.open() returns for the input's filename, opened with
     mode 'r' and buffering=-1.

   Raises:
     StopIteration: the cursor equals the number of inputs, i.e. there is
       nothing left to open.
     ValueError: the input has a range but the first format cannot split.
   """
   position = self.__input_index
   if position == len(self.__inputs):
     raise StopIteration()

   current = self.__inputs[position]
   requested_range = current.range
   if requested_range:
     # Only the first format is asked to honour a partial-file range.
     head_format = self.__file_format_root._formats[0]
     if not head_format.can_split():
       message = 'Input range specified for a non splitable format %s'
       raise ValueError(message % head_format.NAME)
     head_format._range = requested_range

   # Record the index being opened before moving on to the next one.
   self.__previous_input_index = position
   self.__input_index = position + 1
   return files.open(current.filename, 'r', buffering=-1)
def _deep_split(filenames, size_per_shard, parsed_formats):
  """Split files into roots using the first FileFormat.

  Deep split can split within a file. It tells the first format how big
  a split it wants and the first format will do the actually splitting
  because only the first format knows how to operate on this particular
  format.

  Args:
    filenames: a list of input filenames.
    size_per_shard: size per shard.
    parsed_format: the parsed FileFormats.

  Returns:
    A list of FileFormatRoot.
  """
  roots = []
  inputs = []
  size_left = size_per_shard

  for filename in filenames:
    index = 0
    with files.open(filename) as f:
      cache_for_split = {}
      # Split a single file.
      while True:
        if size_left <= 0:
          # Shard has been filled.
          roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs))
          size_left = size_per_shard
          inputs = []
        start_index = index
        size_left, index = parsed_formats[0].split(size_left,
                                                   start_index,
                                                   f,
                                                   cache_for_split)
        # File has been entirely covered.
        if start_index == index:
          break
        inputs.append(_FileRange(filename, (start_index, index)))

  if inputs:
    roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs))

  return roots
Ejemplo n.º 4
0
def _deep_split(filenames, size_per_shard, parsed_formats):
  """Split files into roots using the first FileFormat.

  Deep split can split within a file. It tells the first format how big
  a split it wants and the first format will do the actually splitting
  because only the first format knows how to operate on this particular
  format.

  Args:
    filenames: a list of input filenames.
    size_per_shard: size per shard.
    parsed_format: the parsed FileFormats.

  Returns:
    A list of FileFormatRoot.
  """
  roots = []
  inputs = []
  size_left = size_per_shard

  for filename in filenames:
    index = 0
    with files.open(filename) as f:
      cache_for_split = {}

      while True:
        if size_left <= 0:

          roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs))
          size_left = size_per_shard
          inputs = []
        start_index = index
        size_left, index = parsed_formats[0].split(size_left,
                                                   start_index,
                                                   f,
                                                   cache_for_split)

        if start_index == index:
          break
        inputs.append(_FileRange(filename, (start_index, index)))

  if inputs:
    roots.append(FileFormatRoot(copy.deepcopy(parsed_formats), inputs))

  return roots