def _to_map_job_config(cls, mr_spec, queue_name):
  """Converts model.MapreduceSpec back to JobConfig.

  This method allows our internal methods to use JobConfig directly.
  This method also allows us to expose JobConfig as an API during execution,
  even though it is not saved into the datastore.

  Args:
    mr_spec: model.MapreduceSpec.
    queue_name: queue name.

  Returns:
    The JobConfig object for this job.
  """
  mapper_spec = mr_spec.mapper

  # Jobs started through the old API leave "api_version" unset; treat a
  # missing value as the old API.
  api_version = mr_spec.params.get("api_version", 0)
  old_api = api_version == 0

  input_reader_cls = mapper_spec.input_reader_class()
  input_reader_params = input_readers._get_params(mapper_spec)
  # New-API input readers store their params as JSON and must be decoded.
  if issubclass(input_reader_cls, input_reader.InputReader):
    input_reader_params = input_reader_cls.params_from_json(
        input_reader_params)

  output_writer_cls = mapper_spec.output_writer_class()
  output_writer_params = output_writers._get_params(mapper_spec)

  return cls(
      _lenient=old_api,
      job_name=mr_spec.name,
      job_id=mr_spec.mapreduce_id,
      mapper=util.for_name(mapper_spec.handler_spec),
      input_reader_cls=input_reader_cls,
      input_reader_params=input_reader_params,
      output_writer_cls=output_writer_cls,
      output_writer_params=output_writer_params,
      shard_count=mapper_spec.shard_count,
      queue_name=queue_name,
      user_params=mr_spec.params.get("user_params"),
      shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
      done_callback_url=mr_spec.params.get("done_callback"),
      _force_writes=mr_spec.params.get("force_writes"),
      _base_path=mr_spec.params["base_path"],
      _task_max_attempts=mr_spec.params.get("task_max_attempts"),
      _task_max_data_processing_attempts=(
          mr_spec.params.get("task_max_data_processing_attempts")),
      _hooks_cls=util.for_name(mr_spec.hooks_class_name),
      _app=mr_spec.params.get("app_id"),
      _api_version=api_version)
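For orientation, a hedged usage sketch (not from the library source): it assumes, as in the map_job API, that this classmethod is defined on map_job.JobConfig and that model.MapreduceState.get_by_job_id returns the persisted state whose mapreduce_spec can be converted back. The helper name _rebuild_config and the "default" queue are illustrative only.

from mapreduce import model
from mapreduce.api import map_job


def _rebuild_config(job_id, queue_name="default"):
  # Illustrative helper, not part of the library: fetch the persisted state of
  # a job and convert its stored spec back into a user-facing JobConfig.
  state = model.MapreduceState.get_by_job_id(job_id)
  return map_job.JobConfig._to_map_job_config(state.mapreduce_spec, queue_name)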
  def _to_map_job_config(cls, mr_spec, queue_name):
    """Converts model.MapreduceSpec back to JobConfig.

    This method allows our internal methods to use JobConfig directly.
    This method also allows us to expose JobConfig as an API during execution,
    even though it is not saved into the datastore.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper

    api_version = mr_spec.params.get("api_version", 0)
    old_api = api_version == 0

    return cls(_lenient=old_api,
               job_name=mr_spec.name,
               job_id=mr_spec.mapreduce_id,
               mapper=util.for_name(mapper_spec.handler_spec),
               input_reader_cls=mapper_spec.input_reader_class(),
               input_reader_params=input_readers._get_params(mapper_spec),
               output_writer_cls=mapper_spec.output_writer_class(),
               output_writer_params=output_writers._get_params(mapper_spec),
               shard_count=mapper_spec.shard_count,
               queue_name=queue_name,
               user_params=mr_spec.params.get("user_params"),
               shard_max_attempts=mr_spec.params.get("shard_max_attempts"),
               done_callback_url=mr_spec.params.get("done_callback"),
               _force_writes=mr_spec.params.get("force_writes"),
               _base_path=mr_spec.params["base_path"],
               _task_max_attempts=mr_spec.params.get("task_max_attempts"),
               _task_max_data_processing_attempts=(
                   mr_spec.params.get("task_max_data_processing_attempts")),
               _hooks_cls=util.for_name(mr_spec.hooks_class_name),
               _app=mr_spec.params.get("app_id"),
               _api_version=api_version)
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new GCS file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  mapper_spec = ctx.mapreduce_spec.mapper
  params = input_readers._get_params(mapper_spec)
  bucket_name = params.get("bucket_name")
  filename = (ctx.mapreduce_spec.name + "/" + ctx.mapreduce_id + "/output-" +
              ctx.shard_id + "-" + str(int(time.time())))
  full_filename = "/%s/%s" % (bucket_name, filename)
  filehandle = cloudstorage.open(full_filename, mode="w")
  with output_writers.GCSRecordsPool(filehandle, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  filehandle.close()

  entity = _OutputFile(key_name=full_filename,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
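The sort above relies on a _compare_keys comparator that is defined elsewhere in the module and not shown here. A minimal sketch, assuming it simply orders the (key, record) tuples by their serialized key bytes, written as a Python 2 cmp-style comparator to match the sort(cmp=...) call:

def _compare_keys(key_record1, key_record2):
  """Compares two (key, record) tuples by their key element."""
  return cmp(key_record1[0], key_record2[0])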