def split_input(cls, mapper_spec):
  """Returns a list of input readers (byte-range shards) for mapper_spec.

  Args:
    mapper_spec: The mapper specification to split from. Must contain a
      'file_paths' parameter with one or more file paths.

  Returns:
    A list of GoogleStorageLineInputReaders corresponding to the specified
    shards.
  """
  params = _get_params(mapper_spec)
  file_paths = params[cls.FILE_PATHS_PARAM]
  if isinstance(file_paths, basestring):
    # This is a mechanism to allow multiple file paths (which do not contain
    # commas) in a single string. It may go away.
    file_paths = file_paths.split(",")

  # Determine each file's size by seeking to its end.
  file_sizes = {}
  for file_path in file_paths:
    fp = files.BufferedFile(file_path)
    fp.seek(0, 2)
    file_sizes[file_path] = fp.tell()

  shard_count = min(cls._MAX_SHARD_COUNT, mapper_spec.shard_count)
  shards_per_file = shard_count // len(file_paths)
  # Give every file at least one shard.
  if shards_per_file == 0:
    shards_per_file = 1

  chunks = []
  for file_path, file_size in file_sizes.items():
    file_chunk_size = file_size // shards_per_file
    for i in xrange(shards_per_file - 1):
      chunks.append(GoogleStorageLineInputReader.from_json(
          {cls.FILE_PATH_PARAM: file_path,
           cls.INITIAL_POSITION_PARAM: file_chunk_size * i,
           cls.END_POSITION_PARAM: file_chunk_size * (i + 1)}))
    # The last chunk for each file runs to the end of the file so that
    # integer division does not drop any trailing bytes.
    chunks.append(GoogleStorageLineInputReader.from_json(
        {cls.FILE_PATH_PARAM: file_path,
         cls.INITIAL_POSITION_PARAM: file_chunk_size * (shards_per_file - 1),
         cls.END_POSITION_PARAM: file_size}))

  return chunks
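
# --- Illustration (not part of the reader class) ---
# A minimal, self-contained sketch of the byte-range math used by
# split_input above: each file is cut into shards_per_file roughly equal
# chunks, and the last chunk absorbs the remainder so no bytes are dropped.
# The helper name and the sample file sizes are made up for this example,
# and the _MAX_SHARD_COUNT cap is omitted; only the chunking arithmetic
# mirrors the method.
def compute_byte_ranges(file_sizes, shard_count):
  """Splits each file into roughly equal [start, end) byte ranges."""
  shards_per_file = max(1, shard_count // len(file_sizes))
  ranges = []
  for file_path, file_size in file_sizes.items():
    chunk = file_size // shards_per_file
    for i in range(shards_per_file - 1):
      ranges.append((file_path, chunk * i, chunk * (i + 1)))
    # The last range runs to the end of the file.
    ranges.append((file_path, chunk * (shards_per_file - 1), file_size))
  return ranges

# compute_byte_ranges({"/gs/bucket/a.txt": 1000, "/gs/bucket/b.txt": 250}, 4)
# -> [("/gs/bucket/a.txt", 0, 500), ("/gs/bucket/a.txt", 500, 1000),
#     ("/gs/bucket/b.txt", 0, 125), ("/gs/bucket/b.txt", 125, 250)]
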
def next(self):
  """Returns the next input from this reader as an (offset, line) tuple."""
  self._has_iterated = True

  if not self._filestream:
    self._filestream = files.BufferedFile(self._file_path)
    if self._start_position:
      self._filestream.seek(self._start_position)
      # Skip the (possibly partial) line straddling the start of the range;
      # it is read by the shard that owns the preceding byte range.
      self._filestream.readline()

  start_position = self._filestream.tell()

  if start_position > self._end_position:
    raise StopIteration()

  line = self._filestream.readline()

  if not line:
    raise StopIteration()

  return start_position, line.rstrip("\n")
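
# --- Illustration (not part of the reader class) ---
# A standalone sketch of the same line-boundary convention as next() above,
# using an ordinary local file instead of files.BufferedFile: a shard that
# starts mid-file skips its first line, because that line belongs to the
# shard owning the byte range in which it starts. The helper name and the
# use of a plain open() file are assumptions made for this example.
def read_lines_in_range(path, start_position, end_position):
  """Yields (offset, line) pairs for lines starting within the byte range."""
  with open(path, "rb") as fp:
    if start_position:
      fp.seek(start_position)
      # Skip the line straddling the range start; the previous shard emits it.
      fp.readline()
    while True:
      offset = fp.tell()
      if offset > end_position:  # Mirrors the start_position > end_position check.
        return
      line = fp.readline()
      if not line:
        return
      yield offset, line.rstrip(b"\n")
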
def __iter__(self):
  """Iterates over records in the input files.

  self._offsets is always correctly updated so that stopping iteration
  doesn't skip records and doesn't read the same record twice.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx.shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, index, reader) tuples.
  readers = []

  # Initialize the heap with one placeholder entry per file.
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(files.BufferedFile(filename))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Read records from the heap and merge values that share the same key.
  current_result = None
  while readers:
    (key, value, index, reader) = readers[0]
    if key is not None:
      if current_result and key != current_result[0]:
        # A new key was encountered. Yield the current key's group.
        yield current_result
      if not current_result or key != current_result[0]:
        current_result = (key, [])
      current_result[1].append(value)

    # Read the next key/value pair from this reader.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # Update I/O counters.
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Put the data just read back into the heap.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  # Yield the leftovers.
  if current_result:
    yield current_result
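
# --- Illustration (not part of the reader class) ---
# A self-contained sketch of the heap-driven merge used by __iter__ above,
# applied to plain in-memory streams of (key, value) pairs instead of record
# files: the smallest current key sits at the top of the heap, heapreplace
# advances the stream it came from, and consecutive equal keys are grouped
# into a single (key, [values]) result. The function name and sample data
# are made up for this example.
import heapq

def merge_and_group(sorted_streams):
  """Merges key-sorted streams and yields (key, [values]) groups."""
  iterators = [iter(stream) for stream in sorted_streams]
  heap = []
  for index, iterator in enumerate(iterators):
    try:
      key, value = next(iterator)
      heap.append((key, value, index))
    except StopIteration:
      pass
  heapq.heapify(heap)

  current_result = None
  while heap:
    key, value, index = heap[0]
    if current_result and key != current_result[0]:
      # A new key was encountered. Yield the finished group.
      yield current_result
      current_result = None
    if not current_result:
      current_result = (key, [])
    current_result[1].append(value)
    # Advance the stream we just consumed from, keeping the heap ordered.
    try:
      next_key, next_value = next(iterators[index])
      heapq.heapreplace(heap, (next_key, next_value, index))
    except StopIteration:
      heapq.heappop(heap)

  if current_result:
    yield current_result

# list(merge_and_group([[("a", 1), ("c", 3)], [("a", 2), ("b", 4)]]))
# -> [("a", [1, 2]), ("b", [4]), ("c", [3])]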