Example #1
    def split_input(cls, mapper_spec):
        """Returns a list of shard_count input_spec_shards for input_spec.

    Args:
      mapper_spec: The mapper specification to split from. Must contain
          'file_paths' parameter with one or more file paths.

    Returns:
      A list of GoogleStorageLineInputReader corresponding to the
      specified shards.
    """
        params = _get_params(mapper_spec)
        file_paths = params[cls.FILE_PATHS_PARAM]

        if isinstance(file_paths, basestring):
            # This is a mechanism to allow multiple file paths (which do not contain
            # commas) in a single string. It may go away.
            file_paths = file_paths.split(",")

        file_sizes = {}

        for file_path in file_paths:
            fp = files.BufferedFile(file_path)
            # Seek to the end of the file to determine its size in bytes.
            fp.seek(0, 2)
            file_sizes[file_path] = fp.tell()

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        # Distribute shards evenly across files; every file gets at least one.
        shards_per_file = shard_count // len(file_paths)

        if shards_per_file == 0:
            shards_per_file = 1

        chunks = []

        for file_path, file_size in file_sizes.items():
            file_chunk_size = file_size // shards_per_file
            # Every chunk but the last covers exactly file_chunk_size bytes.
            for i in xrange(shards_per_file - 1):
                chunks.append(GoogleStorageLineInputReader.from_json({
                    cls.FILE_PATH_PARAM: file_path,
                    cls.INITIAL_POSITION_PARAM: file_chunk_size * i,
                    cls.END_POSITION_PARAM: file_chunk_size * (i + 1),
                }))
            # The last chunk absorbs any remainder and ends at the file size.
            chunks.append(GoogleStorageLineInputReader.from_json({
                cls.FILE_PATH_PARAM: file_path,
                cls.INITIAL_POSITION_PARAM: file_chunk_size * (shards_per_file - 1),
                cls.END_POSITION_PARAM: file_size,
            }))

        return chunks
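The byte-range arithmetic above is easy to verify in isolation. Below is a minimal, self-contained sketch (plain Python, no App Engine dependencies; split_byte_ranges is an illustrative name, not part of the library) that reproduces the same chunking: every chunk but the last is file_size // shards_per_file bytes, and the last chunk runs to the end of the file.

def split_byte_ranges(file_size, shards_per_file):
    """Splits [0, file_size) into shards_per_file contiguous byte ranges."""
    chunk = file_size // shards_per_file
    ranges = [(chunk * i, chunk * (i + 1)) for i in xrange(shards_per_file - 1)]
    # The final range absorbs the remainder left by integer division.
    ranges.append((chunk * (shards_per_file - 1), file_size))
    return ranges

# A 1000-byte file split across 3 shards:
# split_byte_ranges(1000, 3) == [(0, 333), (333, 666), (666, 1000)]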
Example #2
    def next(self):
        """Returns the next input from as an (offset, line) tuple."""
        self._has_iterated = True

        if not self._filestream:
            self._filestream = files.BufferedFile(self._file_path)
            if self._start_position:
                self._filestream.seek(self._start_position)
                # Skip the (possibly partial) line at the start position; it is
                # handled by the shard that owns the preceding byte range.
                self._filestream.readline()

        start_position = self._filestream.tell()

        if start_position > self._end_position:
            raise StopIteration()

        line = self._filestream.readline()

        if not line:
            raise StopIteration()

        return start_position, line.rstrip("\n")
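The same next() protocol can be sketched against a plain file object, with no dependency on the App Engine files API. LocalLineReader below is an illustrative stand-in (not part of the library); it shows why a reader starting mid-file discards the first line it sees: that line belongs to the shard owning the preceding byte range.

class LocalLineReader(object):
    """Minimal sketch: yields (offset, line) tuples for one byte range."""

    def __init__(self, path, start_position, end_position):
        self._fp = open(path, "rb")
        self._end_position = end_position
        if start_position:
            self._fp.seek(start_position)
            # Skip the (possibly partial) line at the boundary; the previous
            # shard reads through the end of it.
            self._fp.readline()

    def __iter__(self):
        return self

    def next(self):
        start_position = self._fp.tell()
        if start_position > self._end_position:
            raise StopIteration()
        line = self._fp.readline()
        if not line:
            raise StopIteration()
        return start_position, line.rstrip("\n")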
Example #3
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is kept up to date so that stopping and resuming iteration
    neither skips records nor reads the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples.
    readers = []

    # Initialize the heap with one placeholder entry per file. All keys start
    # as None, so the initial list is already a valid heap.
    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.
    current_result = None
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        if current_result and key != current_result[0]:
          # New key encountered; yield the result accumulated for the previous key.
          yield current_result
        if not current_result or key != current_result[0]:
          current_result = (key, [])
        current_result[1].append(value)

      # Read next key/value from reader.
      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()
        # Update the MapReduce I/O counters (bytes read, read latency).
        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        # Put read data back into heap.
        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
      yield current_result
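The loop above is a standard heap-based k-way merge over key-sorted streams. A self-contained sketch of the same pattern over in-memory sorted lists of (key, value) pairs (illustrative only; merge_sorted_streams is not a library function, and the offset bookkeeping and counters are omitted):

import heapq


def merge_sorted_streams(streams):
  """Yields (key, [values]) pairs, grouping equal keys across sorted streams."""
  iters = [iter(s) for s in streams]
  heap = []
  # Prime the heap with the first record of each non-empty stream.
  for index, it in enumerate(iters):
    try:
      key, value = next(it)
      heapq.heappush(heap, (key, value, index))
    except StopIteration:
      pass

  current_result = None
  while heap:
    key, value, index = heap[0]
    if current_result and key != current_result[0]:
      yield current_result
    if not current_result or key != current_result[0]:
      current_result = (key, [])
    current_result[1].append(value)
    # Replace the consumed record with the next one from the same stream.
    try:
      next_key, next_value = next(iters[index])
      heapq.heapreplace(heap, (next_key, next_value, index))
    except StopIteration:
      heapq.heappop(heap)

  if current_result:
    yield current_result


# merge_sorted_streams([[(1, "a"), (3, "c")], [(1, "b"), (2, "d")]])
# yields (1, ["a", "b"]), (2, ["d"]), (3, ["c"])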