Example #1
    def split_input(cls, mapper_spec):
        """Returns a list of shard_count input_spec_shards for input_spec.

    Args:
      mapper_spec: The mapper specification to split from. Must contain
          'file_paths' parameter with one or more file paths.

    Returns:
      A list of GoogleStorageLineInputReader corresponding to the
      specified shards.
    """
        params = _get_params(mapper_spec)
        file_paths = params[cls.FILE_PATHS_PARAM]

        if isinstance(file_paths, basestring):
            # This is a mechanism to allow multiple file paths (which do not contain
            # commas) in a single string. It may go away.
            file_paths = file_paths.split(",")

        file_sizes = {}

        for file_path in file_paths:
            fp = files.BufferedFile(file_path)
            # Seek to the end of the file to determine its size in bytes.
            fp.seek(0, 2)
            file_sizes[file_path] = fp.tell()

        shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
        # Distribute shards evenly across files; every file gets at least one.
        shards_per_file = shard_count // len(file_paths)

        if shards_per_file == 0:
            shards_per_file = 1

        chunks = []

        for file_path, file_size in file_sizes.items():
            file_chunk_size = file_size // shards_per_file
            # Every chunk but the last covers exactly file_chunk_size bytes.
            for i in xrange(shards_per_file - 1):
                chunks.append(GoogleStorageLineInputReader.from_json({
                    cls.FILE_PATH_PARAM: file_path,
                    cls.INITIAL_POSITION_PARAM: file_chunk_size * i,
                    cls.END_POSITION_PARAM: file_chunk_size * (i + 1),
                }))
            # The last chunk absorbs any remainder and ends at the file size.
            chunks.append(GoogleStorageLineInputReader.from_json({
                cls.FILE_PATH_PARAM: file_path,
                cls.INITIAL_POSITION_PARAM: file_chunk_size * (shards_per_file - 1),
                cls.END_POSITION_PARAM: file_size,
            }))

        return chunks
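The byte-range arithmetic above is easy to verify in isolation. Below is a minimal, self-contained sketch (plain Python, no App Engine dependencies; split_byte_ranges is an illustrative name, not part of the library) that reproduces the same chunking: every chunk but the last is file_size // shards_per_file bytes, and the last chunk runs to the end of the file.

def split_byte_ranges(file_size, shards_per_file):
    """Splits [0, file_size) into shards_per_file contiguous byte ranges."""
    chunk = file_size // shards_per_file
    ranges = [(chunk * i, chunk * (i + 1)) for i in xrange(shards_per_file - 1)]
    # The final range absorbs the remainder left by integer division.
    ranges.append((chunk * (shards_per_file - 1), file_size))
    return ranges

# A 1000-byte file split across 3 shards:
# split_byte_ranges(1000, 3) == [(0, 333), (333, 666), (666, 1000)]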
Example #2
    def next(self):
        """Returns the next input from as an (offset, line) tuple."""
        self._has_iterated = True

        if not self._filestream:
            self._filestream = files.BufferedFile(self._file_path)
            if self._start_position:
                self._filestream.seek(self._start_position)
                # Skip the (possibly partial) line at the start position; it is
                # handled by the shard that owns the preceding byte range.
                self._filestream.readline()

        start_position = self._filestream.tell()

        if start_position > self._end_position:
            raise StopIteration()

        line = self._filestream.readline()

        if not line:
            raise StopIteration()

        return start_position, line.rstrip("\n")
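The same next() protocol can be sketched against a plain file object, with no dependency on the App Engine files API. LocalLineReader below is an illustrative stand-in (not part of the library); it shows why a reader starting mid-file discards the first line it sees: that line belongs to the shard owning the preceding byte range.

class LocalLineReader(object):
    """Minimal sketch: yields (offset, line) tuples for one byte range."""

    def __init__(self, path, start_position, end_position):
        self._fp = open(path, "rb")
        self._end_position = end_position
        if start_position:
            self._fp.seek(start_position)
            # Skip the (possibly partial) line at the boundary; the previous
            # shard reads through the end of it.
            self._fp.readline()

    def __iter__(self):
        return self

    def next(self):
        start_position = self._fp.tell()
        if start_position > self._end_position:
            raise StopIteration()
        line = self._fp.readline()
        if not line:
            raise StopIteration()
        return start_position, line.rstrip("\n")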
Example #3
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is kept up to date so that stopping and resuming iteration
    neither skips records nor reads the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples.
    readers = []

    # Initialize the heap with one placeholder entry per file. All keys start
    # as None, so the initial list is already a valid heap.
    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.
    current_result = None
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        if current_result and key != current_result[0]:
          # New key encountered; yield the result accumulated for the previous key.
          yield current_result
        if not current_result or key != current_result[0]:
          current_result = (key, [])
        current_result[1].append(value)

      # Read next key/value from reader.
      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()
        # Update the MapReduce I/O counters (bytes read, read latency).
        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        # Put read data back into heap.
        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
      yield current_result
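The loop above is a standard heap-based k-way merge over key-sorted streams. A self-contained sketch of the same pattern over in-memory sorted lists of (key, value) pairs (illustrative only; merge_sorted_streams is not a library function, and the offset bookkeeping and counters are omitted):

import heapq


def merge_sorted_streams(streams):
  """Yields (key, [values]) pairs, grouping equal keys across sorted streams."""
  iters = [iter(s) for s in streams]
  heap = []
  # Prime the heap with the first record of each non-empty stream.
  for index, it in enumerate(iters):
    try:
      key, value = next(it)
      heapq.heappush(heap, (key, value, index))
    except StopIteration:
      pass

  current_result = None
  while heap:
    key, value, index = heap[0]
    if current_result and key != current_result[0]:
      yield current_result
    if not current_result or key != current_result[0]:
      current_result = (key, [])
    current_result[1].append(value)
    # Replace the consumed record with the next one from the same stream.
    try:
      next_key, next_value = next(iters[index])
      heapq.heapreplace(heap, (next_key, next_value, index))
    except StopIteration:
      heapq.heappop(heap)

  if current_result:
    yield current_result


# merge_sorted_streams([[(1, "a"), (3, "c")], [(1, "b"), (2, "d")]])
# yields (1, ["a", "b"]), (2, ["d"]), (3, ["c"])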