Example #1
0
    def validate(cls, mapper_spec):
        """Check that the pipeline passed every parameter the OutputWriter needs.

        Args:
          mapper_spec: mapper specification. Its ``output_writer_class()`` must
            return ``cls`` and its writer params must hold string values under
            ``cls.HASHTAG``, ``cls.SESSION_ID`` and ``cls.FIELD``.

        Raises:
          BadWriterParamsError: if the writer class does not match, a required
            parameter is missing, or a parameter value is not a string.
        """
        if mapper_spec.output_writer_class() != cls:
            raise BadWriterParamsError("Mapper output writer class mismatch")

        params = _get_params(mapper_spec)

        # The value itself must be inspected: wrapping it in str() first makes
        # the isinstance() test always succeed, so bad types were never caught.
        try:
            string_types = basestring  # noqa: F821 - Python 2: str and unicode
        except NameError:
            string_types = str  # Python 3

        # Same order as before: hashtag, session id, then field; for each one
        # presence is checked before type.
        for name in (cls.HASHTAG, cls.SESSION_ID, cls.FIELD):
            if name not in params:
                raise BadWriterParamsError("Must specify %s" % name)
            if not isinstance(params[name], string_types):
                raise BadWriterParamsError("%s should be a string" % name)
 def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
   """Build a writer instance from the mapper's output writer params.

   The 'host', 'port', 'database', 'user' and 'password' entries are looked
   up with ``.get`` (so a missing entry is passed as None) and handed to the
   class constructor as keyword arguments. ``shard_number``,
   ``shard_attempt`` and ``_writer_state`` are not used here.
   """
   writer_params = _get_params(mr_spec.mapper)
   connection_keys = ('host', 'port', 'database', 'user', 'password')
   connection_kwargs = dict(
       (key, writer_params.get(key)) for key in connection_keys)
   return cls(**connection_kwargs)
Example #3
0
 def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
     """Instantiate the writer using connection settings from the mapper spec.

     Reads 'host', 'port', 'database', 'user' and 'password' from the
     output writer params (absent entries become None) and forwards them to
     the constructor by keyword. The shard arguments and ``_writer_state``
     are accepted but not used by this method.
     """
     settings = _get_params(mr_spec.mapper)
     kwargs = {}
     for field_name in ('host', 'port', 'database', 'user', 'password'):
         kwargs[field_name] = settings.get(field_name)
     return cls(**kwargs)
  def validate(cls, mapper_spec):
    """Validate the mapper spec for this output writer.

    Args:
      mapper_spec: spec whose output writer class and writer params are
        checked.

    Raises:
      errors.BadWriterParamsError: if the spec's output writer class is not
        ``cls``, if any of host/port/database/user/password is absent from
        the writer params, or if 'port' is not an int.
    """
    if mapper_spec.output_writer_class() != cls:
      raise errors.BadWriterParamsError("Output writer class mismatch")

    expected_keys = ["host", "port", "database", "user", "password"]
    writer_params = _get_params(mapper_spec)

    absent = [key for key in expected_keys if key not in writer_params]
    if absent:
      # The message always lists every required key, not just the absent ones.
      message = "Output writer requires parameters [{}]".format(
          ', '.join(expected_keys))
      raise errors.BadWriterParamsError(message)

    port_value = writer_params.get("port")
    if isinstance(port_value, int):
      return
    raise errors.BadWriterParamsError("Parameter 'port' must be integer.")
Example #5
0
    def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
        """Create a new OutputWriter from the mapper's writer params.

        The values stored under ``cls.HASHTAG``, ``cls.SESSION_ID`` and
        ``cls.FIELD`` are fetched with ``.get`` (None when absent) and passed
        positionally, in that order, to the constructor. The shard arguments
        and ``_writer_state`` are not used here.
        """
        writer_params = _get_params(mr_spec.mapper)
        constructor_args = [
            writer_params.get(param_name)
            for param_name in (cls.HASHTAG, cls.SESSION_ID, cls.FIELD)
        ]
        return cls(*constructor_args)
Example #6
0
    def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
        """Open one cloudstorage file per shard count and wrap them in a writer.

        File names have the form
        ``/<bucket>/<job name>/<mapreduce id>/shard-<shard_number>-bucket-<i>``
        for ``i`` in ``range(mapper_spec.shard_count)``; the bucket comes from
        the ``cls.BUCKET_NAME_PARAM`` writer param. Each file is opened in
        write mode and the list of handles is passed to the constructor.
        ``shard_attempt`` and ``_writer_state`` are not used here.
        """
        mapper_spec = mr_spec.mapper
        writer_params = output_writers._get_params(mapper_spec)
        bucket = writer_params.get(cls.BUCKET_NAME_PARAM)
        bucket_count = mapper_spec.shard_count

        # Everything in the object name except the trailing bucket index.
        prefix = mr_spec.name + "/" + mr_spec.mapreduce_id
        prefix = prefix + "/shard-" + str(shard_number) + "-bucket-"

        return cls([
            cloudstorage.open("/%s/%s%d" % (bucket, prefix, index), mode="w")
            for index in range(bucket_count)
        ])
Example #7
0
  def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Create a writer backed by one open cloudstorage file per bucket index.

    For every ``i`` in ``range(mapper_spec.shard_count)`` a file named
    ``/<bucket>/<job name>/<mapreduce id>/shard-<shard_number>-bucket-<i>``
    is opened for writing, where the bucket is the ``cls.BUCKET_NAME_PARAM``
    writer param. The handles are passed to the constructor as a list.
    ``shard_attempt`` and ``_writer_state`` are not used here.
    """
    spec = mr_spec.mapper
    bucket_name = output_writers._get_params(spec).get(cls.BUCKET_NAME_PARAM)
    total = spec.shard_count

    # Shared part of every object name; only the numeric suffix varies.
    stem = (mr_spec.name + "/" + mr_spec.mapreduce_id +
            "/shard-" + str(shard_number) + "-bucket-")

    handles = [
        cloudstorage.open("/%s/%s%d" % (bucket_name, stem, n), mode="w")
        for n in range(total)
    ]
    return cls(handles)
Example #8
0
    def validate(cls, mapper_spec):
        """Validate the mapper spec for this output writer.

        Args:
          mapper_spec: spec whose output writer class and writer params are
            checked.

        Raises:
          errors.BadWriterParamsError: if the spec's output writer class is
            not ``cls``, if any of host/port/database/user/password is absent
            from the writer params, or if 'port' is not an int.
        """
        if mapper_spec.output_writer_class() != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        needed = ["host", "port", "database", "user", "password"]
        writer_params = _get_params(mapper_spec)

        if any(name not in writer_params for name in needed):
            # The message lists the full required set, not only what is absent.
            listing = ', '.join(needed)
            raise errors.BadWriterParamsError(
                "Output writer requires parameters [{}]".format(listing))

        port = writer_params.get("port")
        if not isinstance(port, int):
            raise errors.BadWriterParamsError(
                "Parameter 'port' must be integer.")
Example #9
0
  def _to_map_job_config(cls,
                         mr_spec,
                         # TODO(user): Remove this parameter after it can be
                         # read from mr_spec.
                         queue_name):
    """Rebuilds a JobConfig from a model.MapreduceSpec.

    Lets internal code work with JobConfig directly and lets JobConfig be
    exposed as an API during execution even though it is not itself saved to
    the datastore.

    A spec produced by an older API (no "api_version" in its params, treated
    as version 0) cannot always be converted faithfully. In that case the
    config is built leniently: the goal is only to avoid blowing up and to be
    as accurate as possible, so callers should treat the result with caution.

    Args:
      mr_spec: model.MapreduceSpec.
      queue_name: queue name.

    Returns:
      The JobConfig object for this job.
    """
    mapper_spec = mr_spec.mapper
    spec_params = mr_spec.params
    param = spec_params.get

    # A missing api_version means the job predates versioning; record it as 0.
    api_version = param("api_version", 0)
    is_legacy_api = api_version == 0

    return cls(
        _lenient=is_legacy_api,
        job_name=mr_spec.name,
        job_id=mr_spec.mapreduce_id,
        # handler_spec from older API may not have map_job.Mapper type.
        mapper=util.for_name(mapper_spec.handler_spec),
        input_reader_cls=mapper_spec.input_reader_class(),
        input_reader_params=input_readers._get_params(mapper_spec),
        output_writer_cls=mapper_spec.output_writer_class(),
        output_writer_params=output_writers._get_params(mapper_spec),
        shard_count=mapper_spec.shard_count,
        queue_name=queue_name,
        user_params=param("user_params"),
        shard_max_attempts=param("shard_max_attempts"),
        done_callback_url=param("done_callback"),
        _force_writes=param("force_writes"),
        # base_path is the only param read with [] rather than .get, so a
        # missing key raises KeyError.
        _base_path=spec_params["base_path"],
        _task_max_attempts=param("task_max_attempts"),
        _task_max_data_processing_attempts=param(
            "task_max_data_processing_attempts"),
        _hooks_cls=util.for_name(mr_spec.hooks_class_name),
        _app=param("app_id"),
        _api_version=api_version)
Example #10
0
    def validate(cls, mapper_spec):
        """Validates mapper specification.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          BadWriterParamsError: when the output writer class does not match
            ``cls`` or the ``cls.BUCKET_NAME_PARAM`` writer param is absent.
        """
        declared_writer = mapper_spec.output_writer_class()
        if declared_writer != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        writer_params = output_writers._get_params(mapper_spec)
        if cls.BUCKET_NAME_PARAM in writer_params:
            return

        # Bucket name is the one required writer param.
        message = ("%s is required for the _HashingGCSOutputWriter" %
                   cls.BUCKET_NAME_PARAM)
        raise errors.BadWriterParamsError(message)
Example #11
0
  def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      BadWriterParamsError: when the output writer class does not match
        ``cls`` or the ``cls.BUCKET_NAME_PARAM`` writer param is absent.
    """
    writer_cls = mapper_spec.output_writer_class()
    if writer_cls != cls:
      raise errors.BadWriterParamsError("Output writer class mismatch")

    bucket_key = cls.BUCKET_NAME_PARAM
    has_bucket = bucket_key in output_writers._get_params(mapper_spec)
    if not has_bucket:
      # Bucket name is the one required writer param.
      raise errors.BadWriterParamsError(
          "%s is required for the _HashingGCSOutputWriter" % bucket_key)