Esempio n. 1
0
    def validate(cls, mapper_spec):
        """Validate a mapper specification for this file output writer.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: if the writer class, sharding mode,
            filesystem, or bucket parameter are inconsistent.
        """
        if mapper_spec.output_writer_class() != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        # Only "none" and "per-input-shard" sharding modes are supported.
        output_sharding = cls._get_output_sharding(mapper_spec=mapper_spec)
        if output_sharding not in (cls.OUTPUT_SHARDING_NONE,
                                   cls.OUTPUT_SHARDING_INPUT_SHARDS):
            raise errors.BadWriterParamsError(
                "Invalid output_sharding value: %s" % output_sharding)

        params = _get_params(mapper_spec)
        filesystem = cls._get_filesystem(mapper_spec)
        if filesystem not in files.FILESYSTEMS:
            raise errors.BadWriterParamsError(
                "Filesystem '%s' is not supported. Should be one of %s" %
                (filesystem, files.FILESYSTEMS))

        # A bucket name is mandatory for GS and forbidden for anything else.
        if filesystem == files.GS_FILESYSTEM:
            if cls.GS_BUCKET_NAME_PARAM not in params:
                raise errors.BadWriterParamsError(
                    "%s is required for Google store filesystem" %
                    cls.GS_BUCKET_NAME_PARAM)
        elif params.get(cls.GS_BUCKET_NAME_PARAM) is not None:
            raise errors.BadWriterParamsError(
                "%s can only be provided for Google store filesystem" %
                cls.GS_BUCKET_NAME_PARAM)
Esempio n. 2
0
    def validate(cls, mapper_spec):
        """Validate a mapper specification for this writer.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: on a writer-class or sharding mismatch.
        """
        if mapper_spec.output_writer_class() != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        # Module-level helper resolves the sharding mode from the spec.
        output_sharding = _get_output_sharding(mapper_spec=mapper_spec)
        valid_modes = (cls.OUTPUT_SHARDING_NONE,
                       cls.OUTPUT_SHARDING_INPUT_SHARDS)
        if output_sharding not in valid_modes:
            raise errors.BadWriterParamsError(
                "Invalid output_sharding value: %s" % output_sharding)
Esempio n. 3
0
    def _generate_filename(cls, writer_spec, name, job_id, num, retry):
        """Generates a filename for a shard / retry count.

    Args:
      writer_spec: specification dictionary for the output writer.
      name: name of the job.
      job_id: the ID number assigned to the job.
      num: shard number.
      retry: the retry number.

    Returns:
      a string containing the filename.

    Raises:
      BadWriterParamsError if the template contains any errors such as invalid
        syntax or contains unknown substitution placeholders.
    """
        naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                        cls.DEFAULT_NAMING_FORMAT)
        template = string.Template(naming_format)
        try:

            return template.substitute(name=name,
                                       id=job_id,
                                       num=num,
                                       retry=retry)
        except ValueError, error:
            raise errors.BadWriterParamsError("Naming template is bad, %s" %
                                              (error))
Esempio n. 4
0
    def validate(cls, mapper_spec):
        """Validate that the spec targets this writer and names a bucket.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          BadWriterParamsError: when Output writer class mismatch or the
            bucket name parameter is absent.
        """
        if mapper_spec.output_writer_class() != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")

        # The bucket name is the only required writer parameter.
        bucket_param = cls.BUCKET_NAME_PARAM
        params = output_writers._get_params(mapper_spec)
        if bucket_param not in params:
            raise errors.BadWriterParamsError(
                "%s is required for the _HashingGCSOutputWriter" %
                bucket_param)
Esempio n. 5
0
  def _generate_filename(cls, writer_spec, name, job_id, num,
                         attempt=None, seg_index=None):
    """Generates a filename for a particular output.

    Args:
      writer_spec: specification dictionary for the output writer.
      name: name of the job.
      job_id: the ID number assigned to the job.
      num: shard number.
      attempt: the shard attempt number.
      seg_index: index of the seg. None means the final output.

    Returns:
      a string containing the filename.

    Raises:
      BadWriterParamsError if the template contains any errors such as invalid
        syntax or contains unknown substitution placeholders.
    """
    naming_format = cls._TMP_FILE_NAMING_FORMAT
    if seg_index is None:
      naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                      cls.DEFAULT_NAMING_FORMAT)

    template = string.Template(naming_format)
    try:

      if seg_index is None:
        return template.substitute(name=name, id=job_id, num=num)
      else:
        return template.substitute(name=name, id=job_id, num=num,
                                   attempt=attempt,
                                   seg=seg_index)
    except ValueError, error:
      raise errors.BadWriterParamsError("Naming template is bad, %s" % (error))
Esempio n. 6
0
    def validate(cls, mapper_spec):
        """Check that *mapper_spec* is configured to use this writer class.

        Args:
          mapper_spec: an instance of model.MapperSpec to validate.

        Raises:
          errors.BadWriterParamsError: if the spec names a different writer.
        """
        configured_writer = mapper_spec.output_writer_class()
        if configured_writer != cls:
            raise errors.BadWriterParamsError("Output writer class mismatch")
Esempio n. 7
0
 def _create_file(cls, filesystem, filename, mime_type, **kwargs):
     """Creates a file on the given filesystem and returns its filename.

     Args:
       filesystem: one of the files.*_FILESYSTEM identifiers.
       filename: requested name for the new file.
       mime_type: content type for the new file.
       **kwargs: extra options forwarded to files.gs.create.

     Returns:
       the filename reported by the underlying filesystem API.

     Raises:
       errors.BadWriterParamsError: for an unsupported filesystem.
     """
     if filesystem == files.BLOBSTORE_FILESYSTEM:
         created = files.blobstore.create(mime_type, filename)
     elif filesystem == files.GS_FILESYSTEM:
         # Google Storage paths are addressed under the "/gs/" prefix.
         created = files.gs.create("/gs/%s" % filename, mime_type, **kwargs)
     else:
         raise errors.BadWriterParamsError(
             "Filesystem '%s' is not supported" % filesystem)
     return created
Esempio n. 8
0
  def validate(cls, mapper_spec):
    """Validates mapper specification.

    Args:
      mapper_spec: an instance of model.MapperSpec to validate.

    Raises:
      errors.BadWriterParamsError: if output_sharding is explicitly set.
    """
    params = _get_params(mapper_spec)
    # A records writer controls its own sharding, so an explicit
    # output_sharding parameter is rejected before delegating upward.
    if cls.OUTPUT_SHARDING_PARAM in params:
      raise errors.BadWriterParamsError(
          "output_sharding should not be specified for %s" % cls.__name__)
    super(FileRecordsOutputWriter, cls).validate(mapper_spec)
Esempio n. 9
0
 def _get_finalized_filename(cls, fs, create_filename, request_filename):
     """Returns the finalized filename for the created filename."""
     if fs == "blobstore":
         return files.blobstore.get_file_name(
             files.blobstore.get_blob_key(create_filename))
     elif fs == "gs":
         return "/gs/" + request_filename
     else:
         raise errors.BadWriterParamsError(
             "Filesystem '%s' is not supported" % fs)
Esempio n. 10
0
    def validate(cls, mapper_spec):
        """Validate mapper specification.

        Args:
          mapper_spec: an instance of model.MapperSpec.

        Raises:
          BadWriterParamsError: if the specification is invalid for any reason
            such as missing the bucket name or providing an invalid bucket
            name.
        """
        writer_spec = _get_params(mapper_spec, allow_old=False)

        if cls.BUCKET_NAME_PARAM not in writer_spec:
            raise errors.BadWriterParamsError(
                "%s is required for Google Cloud Storage" %
                cls.BUCKET_NAME_PARAM)
        try:
            # Delegate bucket-name validation to the cloudstorage library.
            cloudstorage.validate_bucket_name(
                writer_spec[cls.BUCKET_NAME_PARAM])
        except ValueError as error:
            # "except X, e" is Python 2-only syntax; "as" works on 2.6+ and 3.
            raise errors.BadWriterParamsError("Bad bucket name, %s" % (error))
Esempio n. 11
0
def _get_params(mapper_spec, allowed_keys=None, allow_old=True):
    """Obtain output writer parameters.

  Utility function for output writer implementation. Fetches parameters
  from mapreduce specification giving appropriate usage warnings.

  Args:
    mapper_spec: The MapperSpec for the job
    allowed_keys: set of all allowed keys in parameters as strings. If it is not
      None, then parameters are expected to be in a separate "output_writer"
      subdictionary of mapper_spec parameters.
    allow_old: Allow parameters to exist outside of the output_writer
      subdictionary for compatability.

  Returns:
    mapper parameters as dict

  Raises:
    BadWriterParamsError: if parameters are invalid/missing or not allowed.
  """
    if "output_writer" not in mapper_spec.params:
        message = ("Output writer's parameters should be specified in "
                   "output_writer subdictionary.")
        if not allow_old or allowed_keys:
            raise errors.BadWriterParamsError(message)
        params = mapper_spec.params
        params = dict((str(n), v) for n, v in params.iteritems())
    else:
        if not isinstance(mapper_spec.params.get("output_writer"), dict):
            raise errors.BadWriterParamsError(
                "Output writer parameters should be a dictionary")
        params = mapper_spec.params.get("output_writer")
        params = dict((str(n), v) for n, v in params.iteritems())
        if allowed_keys:
            params_diff = set(params.keys()) - allowed_keys
            if params_diff:
                raise errors.BadWriterParamsError(
                    "Invalid output_writer parameters: %s" %
                    ",".join(params_diff))
    return params
Esempio n. 12
0
  def validate(cls, job_config):
    """Validates relevant parameters.

    This method can validate fields which it deems relevant.

    Args:
      job_config: an instance of map_job.JobConfig.

    Raises:
      errors.BadWriterParamsError: required parameters are missing or invalid.
    """
    configured = job_config.output_writer_cls
    if configured != cls:
      raise errors.BadWriterParamsError(
          "Expect output writer class %r, got %r." %
          (cls, configured))
Esempio n. 13
0
class _GoogleCloudStorageOutputWriterBase(_GoogleCloudStorageBase):
    """Base class for GCS writers directly interacting with GCS.

  Base class for both _GoogleCloudStorageOutputWriter and
  GoogleCloudStorageConsistentOutputWriter.

  This class is expected to be subclassed with a writer that applies formatting
  to user-level records.

  Subclasses need to define to_json, from_json, create, finalize and
  _get_write_buffer methods.

  See _GoogleCloudStorageBase for config options.
  """

    _DEFAULT_NAMING_FORMAT = "$name/$id/output-$num"

    # Directory prefix for temporary (per-attempt seg) files.
    _MR_TMP = "gae_mr_tmp"
    _TMP_FILE_NAMING_FORMAT = (
        _MR_TMP + "/$name/$id/attempt-$attempt/output-$num/seg-$seg")

    @classmethod
    def _generate_filename(cls,
                           writer_spec,
                           name,
                           job_id,
                           num,
                           attempt=None,
                           seg_index=None):
        """Generates a filename for a particular output.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          attempt: the shard attempt number.
          seg_index: index of the seg. None means the final output.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such as
            invalid syntax or contains unknown substitution placeholders.
        """
        # NAMING_FORMAT_PARAM is supplied by concrete writer subclasses
        # (see _GoogleCloudStorageOutputWriter).
        naming_format = cls._TMP_FILE_NAMING_FORMAT
        if seg_index is None:
            naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                            cls._DEFAULT_NAMING_FORMAT)

        template = string.Template(naming_format)
        try:
            if seg_index is None:
                return template.substitute(name=name, id=job_id, num=num)
            else:
                return template.substitute(name=name,
                                           id=job_id,
                                           num=num,
                                           attempt=attempt,
                                           seg=seg_index)
        except ValueError as error:
            # "except X, e" is Python 2-only syntax; "as" works on 2.6+ and 3.
            raise errors.BadWriterParamsError("Naming template is bad, %s" %
                                              (error))
        except KeyError as error:
            raise errors.BadWriterParamsError("Naming template '%s' has extra "
                                              "mappings, %s" %
                                              (naming_format, error))
Esempio n. 14
0
    """
    writer_spec = _get_params(mapper_spec, allow_old=False)


    if cls.BUCKET_NAME_PARAM not in writer_spec:
      raise errors.BadWriterParamsError(
          "%s is required for Google Cloud Storage" %
          cls.BUCKET_NAME_PARAM)
    try:
      cloudstorage.validate_bucket_name(
          writer_spec[cls.BUCKET_NAME_PARAM])
    except ValueError, error:
      raise errors.BadWriterParamsError("Bad bucket name, %s" % (error))

    if writer_spec.get(cls._NO_DUPLICATE, False) not in (True, False):
      raise errors.BadWriterParamsError("No duplicate must a boolean.")


    cls._generate_filename(writer_spec, "name", "id", 0)
    cls._generate_filename(writer_spec, "name", "id", 0, 1, 0)

  @classmethod
  def create(cls, mr_spec, shard_number, shard_attempt, _writer_state=None):
    """Inherit docs."""
    writer_spec = _get_params(mr_spec.mapper, allow_old=False)
    seg_index = None
    if writer_spec.get(cls._NO_DUPLICATE, False):
      seg_index = 0


    key = cls._generate_filename(writer_spec, mr_spec.name,
Esempio n. 15
0
class _GoogleCloudStorageOutputWriter(OutputWriter):
    """Output writer to Google Cloud Storage using the cloudstorage library.

  This class is expected to be subclassed with a writer that applies formatting
  to user-level records.

  Required configuration in the mapper_spec.output_writer dictionary.
    BUCKET_NAME_PARAM: name of the bucket to use (with no extra delimiters or
      suffixes such as directories. Directories/prefixes can be specified as
      part of the NAMING_FORMAT_PARAM).

  Optional configuration in the mapper_spec.output_writer dictionary:
    ACL_PARAM: acl to apply to new files, else bucket default used.
    NAMING_FORMAT_PARAM: prefix format string for the new files (there is no
      required starting slash, expected formats would look like
      "directory/basename...", any starting slash will be treated as part of
      the file name) that should use the following substitutions:
        $name - the name of the job
        $id - the id assigned to the job
        $num - the shard number
        $retry - the retry count for this shard
      If there is more than one shard $num must be used. An arbitrary suffix may
      be applied by the writer.
    CONTENT_TYPE_PARAM: mime type to apply on the files. If not provided, Google
      Cloud Storage will apply its default.
  """

    BUCKET_NAME_PARAM = "bucket_name"
    ACL_PARAM = "acl"
    NAMING_FORMAT_PARAM = "naming_format"
    CONTENT_TYPE_PARAM = "content_type"

    DEFAULT_NAMING_FORMAT = "$name-$id-output-$num-retry-$retry"

    _ACCOUNT_ID_PARAM = "account_id"
    _JSON_PICKLE = "pickle"

    def __init__(self, streaming_buffer, writer_spec=None):
        """Initialize a GoogleCloudStorageOutputWriter instance.

        Args:
          streaming_buffer: an instance of writable buffer from
            cloudstorage_api.
          writer_spec: the specification for the writer, useful for subclasses.
        """
        # All writes for this shard go through this buffer.
        self._streaming_buffer = streaming_buffer

    @classmethod
    def _generate_filename(cls, writer_spec, name, job_id, num, retry):
        """Generates a filename for a shard / retry count.

        Args:
          writer_spec: specification dictionary for the output writer.
          name: name of the job.
          job_id: the ID number assigned to the job.
          num: shard number.
          retry: the retry number.

        Returns:
          a string containing the filename.

        Raises:
          BadWriterParamsError: if the template contains any errors such as
            invalid syntax or contains unknown substitution placeholders.
        """
        naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                        cls.DEFAULT_NAMING_FORMAT)
        template = string.Template(naming_format)
        try:
            return template.substitute(name=name,
                                       id=job_id,
                                       num=num,
                                       retry=retry)
        except ValueError as error:
            # "except X, e" is Python 2-only syntax; "as" works on 2.6+ and 3.
            raise errors.BadWriterParamsError("Naming template is bad, %s" %
                                              (error))
        except KeyError as error:
            raise errors.BadWriterParamsError("Naming template '%s' has extra "
                                              "mappings, %s" %
                                              (naming_format, error))
Esempio n. 16
0
 def validate(cls, mapper_spec):
     """Validate the mapper spec for this writer (inherit docs).

     Raises:
       errors.BadWriterParamsError: if _NO_DUPLICATE is not a bool, or the
         base class validation fails.
     """
     writer_spec = cls.get_params(mapper_spec, allow_old=False)
     # Require a real boolean, not merely a truthy/falsy value.
     if writer_spec.get(cls._NO_DUPLICATE, False) not in (True, False):
         # Fixed error-message grammar ("must a" -> "must be a").
         raise errors.BadWriterParamsError("No duplicate must be a boolean.")
     super(_GoogleCloudStorageOutputWriter, cls).validate(mapper_spec)
Esempio n. 17
0
class _GoogleCloudStorageOutputWriter(OutputWriter):
  """Output writer to Google Cloud Storage using the cloudstorage library.

  This class is expected to be subclassed with a writer that applies formatting
  to user-level records.

  Required configuration in the mapper_spec.output_writer dictionary.
    BUCKET_NAME_PARAM: name of the bucket to use (with no extra delimiters or
      suffixes such as directories. Directories/prefixes can be specifed as
      part of the NAMING_FORMAT_PARAM).

  Optional configuration in the mapper_spec.output_writer dictionary:
    ACL_PARAM: acl to apply to new files, else bucket default used.
    NAMING_FORMAT_PARAM: prefix format string for the new files (there is no
      required starting slash, expected formats would look like
      "directory/basename...", any starting slash will be treated as part of
      the file name) that should use the following substitutions:
        $name - the name of the job
        $id - the id assigned to the job
        $num - the shard number
      If there is more than one shard $num must be used. An arbitrary suffix may
      be applied by the writer.
    CONTENT_TYPE_PARAM: mime type to apply on the files. If not provided, Google
      Cloud Storage will apply its default.
    _NO_DUPLICATE: if True, slice recovery logic will be used to ensure
      output files has no duplicates. Every shard should have only one final
      output in user specified location. But it may produce many smaller
      files (named "seg") due to slice recovery. These segs live in a
      tmp directory and should be combined and renamed to the final location.
      In current impl, they are not combined.
  """


  BUCKET_NAME_PARAM = "bucket_name"
  ACL_PARAM = "acl"
  NAMING_FORMAT_PARAM = "naming_format"
  CONTENT_TYPE_PARAM = "content_type"
  _NO_DUPLICATE = "no_duplicate"


  DEFAULT_NAMING_FORMAT = "$name/$id/output-$num"


  _MR_TMP = "gae_mr_tmp"
  _TMP_FILE_NAMING_FORMAT = (
      _MR_TMP + "/$name/$id/attempt-$attempt/output-$num/seg-$seg")
  _ACCOUNT_ID_PARAM = "account_id"
  _SEG_PREFIX = "seg_prefix"
  _LAST_SEG_INDEX = "last_seg_index"
  _JSON_GCS_BUFFER = "buffer"
  _JSON_SEG_INDEX = "seg_index"
  _JSON_NO_DUP = "no_dup"

  _VALID_LENGTH = "x-goog-meta-gae-mr-valid-length"


  def __init__(self, streaming_buffer, writer_spec=None):
    """Initialize a GoogleCloudStorageOutputWriter instance.

    Args:
      streaming_buffer: an instance of writable buffer from cloudstorage_api.
      writer_spec: the specification for the writer.
    """
    self._streaming_buffer = streaming_buffer
    self._no_dup = (writer_spec.get(self._NO_DUPLICATE, False)
                    if writer_spec else False)
    if self._no_dup:
      # Seg filenames end in "-<seg_index>" (see _TMP_FILE_NAMING_FORMAT),
      # so the index can be recovered from the buffer's filename.
      self._seg_index = int(streaming_buffer.name.rsplit("-", 1)[1])
      # No bytes of this seg are known-valid yet; presumably updated as
      # writes complete (cf. the _VALID_LENGTH metadata key) — confirm
      # against the rest of the class.
      self._seg_valid_length = 0

  @classmethod
  def _generate_filename(cls, writer_spec, name, job_id, num,
                         attempt=None, seg_index=None):
    """Generates a filename for a particular output.

    Args:
      writer_spec: specification dictionary for the output writer.
      name: name of the job.
      job_id: the ID number assigned to the job.
      num: shard number.
      attempt: the shard attempt number.
      seg_index: index of the seg. None means the final output.

    Returns:
      a string containing the filename.

    Raises:
      BadWriterParamsError if the template contains any errors such as invalid
        syntax or contains unknown substitution placeholders.
    """
    naming_format = cls._TMP_FILE_NAMING_FORMAT
    if seg_index is None:
      naming_format = writer_spec.get(cls.NAMING_FORMAT_PARAM,
                                      cls.DEFAULT_NAMING_FORMAT)

    template = string.Template(naming_format)
    try:

      if seg_index is None:
        return template.substitute(name=name, id=job_id, num=num)
      else:
        return template.substitute(name=name, id=job_id, num=num,
                                   attempt=attempt,
                                   seg=seg_index)
    except ValueError, error:
      raise errors.BadWriterParamsError("Naming template is bad, %s" % (error))
    except KeyError, error:
      raise errors.BadWriterParamsError("Naming template '%s' has extra "
                                        "mappings, %s" % (naming_format, error))