def get(self, filename):
  """Stream a Google Storage file to the client as a binary download.

  Args:
    filename: Google Storage object name (without the '/gs/' prefix);
      it is URL-quoted before being embedded in the Files API path.
  """
  # Upper bound for a single read (~2GB) — effectively "the whole file"
  # for anything the Files API will serve.
  max_read_bytes = 1000 * 1024 * 1024 * 2
  path = '/gs/%s' % urllib.quote(filename)
  fp = files.BufferedFile(path)
  data = fp.read(max_read_bytes)
  self.response.content_type = 'application/octet-stream'
  self.response.write(data)
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.

  Yields:
    (key, [values]) tuples with all values for a key merged across all
    input files. Inputs are merged via a heap, which presumes each file
    is sorted by key — TODO(review): confirm against the writer.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx.shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, file_index, reader) tuples, ordered by key.
  readers = []

  # Initialize: resume each reader at its saved offset. The sentinel key
  # None sorts before any real key (Python 2 comparison), so the list is
  # already heap-ordered without an explicit heapify.
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(files.BufferedFile(filename))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # current_result accumulates all values seen so far for one key:
  # (key, [values]).
  current_result = None
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      if current_result and key != current_result[0]:
        # New key encountered: flush the values of the previous key.
        yield current_result
      if not current_result or key != current_result[0]:
        current_result = (key, [])
      current_result[1].append(value)

    # Advance the reader that supplied the smallest key.
    try:
      # tell() points just past the record consumed above, so resuming
      # from self._offsets neither skips nor re-reads a record.
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # Update I/O counters when running inside a mapreduce context.
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Replace the consumed heap entry with the freshly decoded record.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      # This file is exhausted; drop its reader from the heap.
      heapq.heappop(readers)

  # Flush the values accumulated for the final key.
  if current_result:
    yield current_result
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.

  Yields:
    [key, [values], is_partial] lists. is_partial is True when the value
    list was cut off because self._max_values_count or
    self._max_values_size was reached; remaining values for the same key
    arrive in subsequent yields.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  # NOTE(review): sibling reader code uses ctx.shard_state; confirm which
  # attribute the context object actually exposes.
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap with (key, value, index, reader) tuples, ordered by key.
  readers = []

  # Initialize heap: resume each reader at its saved offset. The sentinel
  # key None sorts before any real key (Python 2 comparison), so the list
  # is already heap-ordered without an explicit heapify.
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(files.BufferedFile(filename))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Read records from the heap and merge values with the same key.
  # current_result is yielded and consumed by _merge_map.
  # current_result = [key, values, is_partial]
  current_result = None
  current_count = 0  # number of values accumulated for the current key
  current_size = 0   # total byte size of those values
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          # New key encountered.
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Maximum number of values encountered; mark result partial.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          # Maximum size of values encountered; mark result partial.
          current_result[2] = True
          should_yield = True

      if should_yield:
        # New key encountered or maximum count/size hit. Yield current key.
        yield current_result

      if not current_result or should_yield:
        # Start a fresh accumulation (new key, or continuation of a key
        # whose previous chunk was just yielded as partial).
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    # Read next key/value from the reader that supplied the smallest key.
    try:
      # tell() points just past the record consumed above, so resuming
      # from self._offsets neither skips nor re-reads a record.
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # Update I/O counters when running inside a mapreduce context.
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Put read data back into heap.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      # This file is exhausted; drop its reader from the heap.
      heapq.heappop(readers)

  # Yield leftovers.
  if current_result:
    yield current_result