Exemple #1
0
    def write(self, data):
        """Write one (key, value) pair to the file selected by the key's hash.

        Both elements are converted with str(). The target file index is the
        adler32 checksum of the key modulo the number of filenames, and the
        pair is appended to that file's records pool as an encoded KeyValue
        proto. The pool for a file is created on first use and cached in
        self._pools.

        A length other than 2 is only logged; elements 0 and 1 are still used.

        Args:
          data: actual data yielded from handler; expected to be a 2-tuple
            (key, value).

        Raises:
          TypeError: if data cannot be indexed like a tuple.
        """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)
            # Without re-raising, execution would fall through and fail on the
            # unbound name `key`, hiding the real cause of the failure.
            raise

        # AppScale: Use a deterministic hash function.
        file_index = zlib.adler32(key) % len(self._filenames)

        pool = self._pools[file_index]
        if pool is None:
            # Lazily create the pool for this file and remember it.
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = kv_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
Exemple #2
0
def _hashing_map(binary_record):
    """Map function for the hash phase: decode one record into its pair.

    Args:
      binary_record: a serialized KeyValue proto.

    Yields:
      A single (key, value) tuple parsed from the record.
    """
    key_value = kv_pb.KeyValue()
    key_value.ParseFromString(binary_record)
    key = key_value.key()
    value = key_value.value()
    yield key, value
Exemple #3
0
    def write(self, data):
        """Encode a (key, value) pair as a KeyValue proto and write it.

        Both elements are converted with str(), packed into a KeyValue proto
        and the encoded proto is handed to
        GoogleCloudStorageRecordOutputWriter.write.

        A length other than 2 is only logged; elements 0 and 1 are still used.

        Args:
          data: expected to be a 2-tuple (key, value).

        Raises:
          TypeError: if data cannot be indexed like a tuple.
        """
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)
            # Without re-raising, execution would fall through and fail on the
            # unbound name `key`, hiding the real cause of the failure.
            raise

        proto = kv_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        GoogleCloudStorageRecordOutputWriter.write(self, proto.Encode())
Exemple #4
0
def _sort_records_map(records):
    """Sort serialized KeyValue records by key and store them in a GCS file.

    Each record is parsed only to extract its key; the original serialized
    bytes are what get written, in key order, to a new file under the
    configured bucket. An _OutputFile entity keyed by the full file name is
    then stored so the resulting file can be found later.

    Args:
      records: list of records which are serialized KeyValue protos.
    """
    ctx = context.get()

    logging.debug("Parsing")
    keyed = []
    for record in records:
        parsed = kv_pb.KeyValue()
        parsed.ParseFromString(record)
        keyed.append((parsed.key(), record))

    logging.debug("Sorting")
    keyed.sort(cmp=_compare_keys)

    logging.debug("Writing")
    spec = ctx.mapreduce_spec
    reader_params = input_readers._get_params(spec.mapper)
    bucket_name = reader_params.get("bucket_name")
    # The timestamp suffix is the current time in whole seconds.
    filename = "".join([
        ctx.mapreduce_spec.name, "/", ctx.mapreduce_id, "/output-",
        ctx.shard_id, "-", str(int(time.time())),
    ])
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for _, serialized in keyed:
            pool.append(serialized)

    logging.debug("Finalizing")
    filehandle.close()

    root_key = _OutputFile.get_root_key(ctx.mapreduce_id)
    _OutputFile(key_name=full_filename, parent=root_key).put()
Exemple #5
0
    def __iter__(self):
        """Iterate over records in input files, grouping values by key.

        Opens every file listed for the current shard, resumes each one at its
        saved offset, and keeps one (key, value, index, reader) tuple per file
        in a heap so that the entry with the smallest key is processed next.
        Consecutive values with the same key are collected into one result.
        Presumably each input file is already sorted by key -- confirm against
        the code that produces these files.

        self._offsets is always correctly updated so that stopping iterations
        doesn't skip records and doesn't read the same record twice.

        Raises:
          Exception: when Files list and offsets do not match.

        Yields:
          Three-element lists [key, values, flag]. The flag starts as False
          and is set to True when the group is emitted because
          self._max_values_count or self._max_values_size was reached while
          the key was unchanged. A limit of -1 disables that check.
        """
        ctx = context.get()
        mapper_spec = ctx.mapreduce_spec.mapper
        shard_number = ctx._shard_state.shard_number
        filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

        if len(filenames) != len(self._offsets):
            raise Exception("Files list and offsets do not match.")

        # Heap of (key, value, index, reader) tuples, one per input file.
        readers = []

        for (i, filename) in enumerate(filenames):
            offset = self._offsets[i]

            reader = records.RecordsReader(
                cloudstorage.open(filename,
                                  read_buffer_size=self.GCS_BUFFER_SIZE))
            reader.seek(offset)
            # A None key marks a reader from which no record has been read yet.
            readers.append((None, None, i, reader))

        # Group currently being accumulated: [key, values, flag], or None.
        current_result = None
        current_count = 0
        current_size = 0
        while readers:
            (key, value, index, reader) = readers[0]

            if key is not None:
                current_count += 1
                current_size += len(value)

                should_yield = False
                if current_result:
                    if key != current_result[0]:
                        # New key encountered: the previous group is complete.
                        should_yield = True
                    elif (self._max_values_count != -1
                          and current_count >= self._max_values_count):
                        # Value-count limit reached for the same key.
                        current_result[2] = True
                        should_yield = True
                    elif (self._max_values_size != -1
                          and current_size >= self._max_values_size):
                        # Value-size limit reached for the same key.
                        current_result[2] = True
                        should_yield = True

                if should_yield:
                    # Emit the finished group before starting the next one.
                    yield current_result
                if not current_result or should_yield:
                    current_result = [key, [], False]
                    current_count = 0
                    current_size = 0
                current_result[1].append(value)

            try:
                # Save the position before reading, so the stored offset
                # always points at the next record that has not been read.
                self._offsets[index] = reader.tell()
                start_time = time.time()
                binary_record = reader.read()

                # Record bytes read and elapsed milliseconds when a context
                # is available.
                if context.get():
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_BYTES,
                        len(binary_record))(context.get())
                    operation.counters.Increment(
                        input_readers.COUNTER_IO_READ_MSEC,
                        int((time.time() - start_time) * 1000))(context.get())
                proto = kv_pb.KeyValue()
                proto.ParseFromString(binary_record)

                # Replace this reader's heap entry with its newly read record.
                heapq.heapreplace(readers,
                                  (proto.key(), proto.value(), index, reader))
            except EOFError:
                # This file is exhausted; drop its entry from the heap.
                heapq.heappop(readers)

        # Emit the last group accumulated, if any.
        if current_result:
            yield current_result