def write(self, data):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
  """
  ctx = context.get()
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)
  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  # AppScale: Use a deterministic hash function.
  file_index = zlib.adler32(key) % len(self._filenames)
  pool = self._pools[file_index]
  if pool is None:
    filehandle = self._filehandles[file_index]
    pool = output_writers.GCSRecordsPool(filehandle=filehandle, ctx=ctx)
    self._pools[file_index] = pool

  proto = kv_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  pool.append(proto.Encode())
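# Illustrative sketch (not part of the module): the point of the AppScale
# change above. Python's built-in hash() for strings is not stable across
# interpreter processes (hash randomization), so using it to pick an output
# file could send the same key to different files on different workers.
# zlib.adler32 is a pure function of the key bytes, so the key -> file
# assignment stays the same everywhere. The function name is hypothetical.
def _example_bucket_for_key(key, num_files):
  """Return the output-file index a key would hash to."""
  import zlib
  return zlib.adler32(key) % num_files

# e.g. _example_bucket_for_key(b"user:42", 8) yields the same index in every
# worker process, which is what the shard files written above rely on.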
def _hashing_map(binary_record):
  """A map function used in hash phase.

  Reads KeyValue from binary record.

  Args:
    binary_record: The binary record.

  Yields:
    The (key, value).
  """
  proto = kv_pb.KeyValue()
  proto.ParseFromString(binary_record)
  yield (proto.key(), proto.value())
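# Minimal round-trip sketch (assuming the same py2-style kv_pb proto API used
# above): the hash-phase writer stores each pair as an encoded KeyValue
# record, and _hashing_map recovers the (key, value) tuple from that record.
# The helper name is made up for illustration.
def _example_round_trip():
  proto = kv_pb.KeyValue()
  proto.set_key("k1")
  proto.set_value("v1")
  binary_record = proto.Encode()            # what the writer appends to the pool
  return list(_hashing_map(binary_record))  # -> [("k1", "v1")]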
def write(self, data):
  """Write a (key, value) tuple as a single serialized KeyValue record.

  Args:
    data: a 2-tuple of (key, value) yielded from the handler.
  """
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)
  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  proto = kv_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  GoogleCloudStorageRecordOutputWriter.write(self, proto.Encode())
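# Hedged usage sketch: a map handler feeding this writer is expected to yield
# 2-tuples; each tuple becomes one serialized KeyValue record in the output
# file. The handler name and payload below are made up for illustration.
def _example_emit_word_counts(text):
  for word in text.split():
    yield (word, "1")  # the writer turns this into KeyValue(key=word, value="1")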
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = kv_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
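# _compare_keys is defined elsewhere in this module; the sort call above only
# needs a comparator that orders the (key, serialized_record) tuples by their
# key component. A minimal Python 2 sketch of such a comparator (the real one
# may differ):
def _example_compare_keys(key_record1, key_record2):
  """Order (key, record) tuples by key."""
  return cmp(key_record1[0], key_record2[0])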
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.

  Raises:
    Exception: when Files list and offsets do not match.

  Yields:
    The result.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx._shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap of (key, value, file_index, reader) tuples, ordered by key.
  readers = []
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(
        cloudstorage.open(filename, read_buffer_size=self.GCS_BUFFER_SIZE))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Merge records from all readers, grouping values by key.
  current_result = None
  current_count = 0
  current_size = 0
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      current_count += 1
      current_size += len(value)

      should_yield = False
      if current_result:
        if key != current_result[0]:
          # A new key was encountered.
          should_yield = True
        elif (self._max_values_count != -1 and
              current_count >= self._max_values_count):
          # Maximum number of values per key reached.
          current_result[2] = True
          should_yield = True
        elif (self._max_values_size != -1 and
              current_size >= self._max_values_size):
          # Maximum total size of values per key reached.
          current_result[2] = True
          should_yield = True

      if should_yield:
        # Yield the accumulated values.
        yield current_result
      if not current_result or should_yield:
        # Start accumulating for a new result.
        current_result = [key, [], False]
        current_count = 0
        current_size = 0
      current_result[1].append(value)

    # Read the next record from this reader and update counters.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = kv_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Push the decoded record back into the heap for this reader.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      # This file is exhausted; drop its reader from the heap.
      heapq.heappop(readers)

  # Yield any leftover accumulated result.
  if current_result:
    yield current_result
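# Stripped-down sketch of the merge strategy above, over plain sorted lists
# instead of GCS record readers: keep one "current" (key, value) per stream
# in a heap, always pop the smallest key, and refill from the stream it came
# from. Grouping and size limits are omitted; all names are illustrative.
def _example_merge_sorted_streams(streams):
  """Yield (key, value) pairs in key order from several sorted streams."""
  import heapq  # already imported by this module; repeated to keep the sketch self-contained
  iters = [iter(s) for s in streams]
  heap = []
  for i, it in enumerate(iters):
    try:
      key, value = next(it)
      heap.append((key, value, i))
    except StopIteration:
      pass
  heapq.heapify(heap)
  while heap:
    key, value, i = heap[0]
    yield key, value
    try:
      next_key, next_value = next(iters[i])
      heapq.heapreplace(heap, (next_key, next_value, i))
    except StopIteration:
      heapq.heappop(heap)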