Example #1
    def write(self, data):
        """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
    """
        ctx = context.get()
        if len(data) != 2:
            logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                          len(data), data)

        try:
            key = str(data[0])
            value = str(data[1])
        except TypeError:
            logging.error("Expecting a tuple, but got %s: %s",
                          data.__class__.__name__, data)

        file_index = key.__hash__() % len(self._filehandles)

        # Work-around: Since we don't have access to the context in the to_json()
        # function, but we need to flush each pool before we serialize the
        # filehandle, we rely on a member variable instead of using context for
        # pool management.
        pool = self._pools[file_index]
        if pool is None:
            filehandle = self._filehandles[file_index]
            pool = output_writers.GCSRecordsPool(filehandle=filehandle,
                                                 ctx=ctx)
            self._pools[file_index] = pool

        proto = kv_pb.KeyValue()
        proto.set_key(key)
        proto.set_value(value)
        pool.append(proto.Encode())
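
The routing rule in write() above is just a hash of the key modulo the number of open file handles, with the matching records pool created lazily on first use. Below is a minimal, self-contained sketch of that same fan-out idea; FanOutWriter and its in-memory buffers are illustrative stand-ins for the GCS file handles and output_writers.GCSRecordsPool used by the real writer, not part of the library.

# Minimal sketch of the hash-modulo fan-out used by the writer above.
# FanOutWriter and its in-memory buffers are illustrative only.
class FanOutWriter(object):
    def __init__(self, num_shards):
        # One buffer per output shard, created lazily like the pools above.
        self._buffers = [None] * num_shards

    def write(self, data):
        key, value = str(data[0]), str(data[1])
        # Same routing rule as above: the key's hash picks the shard.
        shard = hash(key) % len(self._buffers)
        if self._buffers[shard] is None:
            self._buffers[shard] = []
        self._buffers[shard].append((key, value))

writer = FanOutWriter(num_shards=4)
writer.write(("user:42", "payload"))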
Example #2
    def setUp(self):
        super(GCSRecordsPoolTest, self).setUp()
        bucket_name = "testbucket"
        test_filename = "testfile"

        self.filename = "/%s/%s" % (bucket_name, test_filename)
        self.filehandle = cloudstorage.open(self.filename, mode="w")
        self.pool = output_writers.GCSRecordsPool(self.filehandle,
                                                  flush_size_chars=30)
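
setUp() above opens a writable GCS file and wraps it in a GCSRecordsPool with a deliberately small flush_size_chars, so a test can exercise buffering and flushing with tiny records. A hypothetical follow-up test might look like the sketch below; the method name, record size, and assertion are illustrative only, and it assumes the pool exposes append() and flush() as used in the other examples.

    # Hypothetical continuation of the test class above; not from the real suite.
    def testAppendThenFlush(self):
        self.pool.append("a" * 10)   # smaller than flush_size_chars, so it stays buffered
        self.pool.flush()            # force the buffered record out to the file
        self.filehandle.close()
        readback = cloudstorage.open(self.filename)
        self.assertTrue(len(readback.read()) > 0)
        readback.close()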
Example #3
def _sort_records_map(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    key_records = [None] * l

    logging.debug("Parsing")
    for i in range(l):
        proto = kv_pb.KeyValue()
        proto.ParseFromString(records[i])
        key_records[i] = (proto.key(), records[i])

    logging.debug("Sorting")
    key_records.sort(cmp=_compare_keys)

    logging.debug("Writing")
    mapper_spec = ctx.mapreduce_spec.mapper
    params = input_readers._get_params(mapper_spec)
    bucket_name = params.get("bucket_name")
    filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
                ctx.shard_id + "-" + str(int(time.time())))
    full_filename = "/%s/%s" % (bucket_name, filename)
    filehandle = cloudstorage.open(full_filename, mode="w")
    with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
        for key_record in key_records:
            pool.append(key_record[1])

    logging.debug("Finalizing")
    filehandle.close()

    entity = _OutputFile(key_name=full_filename,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
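
The sort step above is a decorate-sort-undecorate pattern: each serialized KeyValue record is parsed only to recover its key, the (key, serialized record) pairs are sorted, and the untouched serialized bytes are then appended to the pool in key order. A minimal sketch of that pattern, with plain byte strings standing in for KeyValue protos and GCS files, looks like this:

# Minimal sketch of the decorate-sort-undecorate step used above.
# parse_key is an illustrative stand-in for KeyValue.ParseFromString().
def sort_serialized_records(records, parse_key):
    # Pair each opaque record with its parsed key, like key_records above.
    key_records = [(parse_key(r), r) for r in records]
    # Sort by key only; the serialized payload is carried along untouched.
    key_records.sort(key=lambda kr: kr[0])
    # Emit the original records, now in key order.
    return [r for _, r in key_records]

records = [b"b:2", b"a:1", b"c:3"]
print(sort_serialized_records(records, parse_key=lambda r: r.split(b":")[0]))
# [b'a:1', b'b:2', b'c:3']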