Example #1
def __init__(self,
               file_pattern,
               min_bundle_size=0,
               compression_type=CompressionTypes.AUTO,
               splittable=True,
               validate=True):
    """Initializes :class:`FileBasedSource`.

    Args:
      file_pattern (str): the file glob to read, a string or a
        :class:`~apache_beam.options.value_provider.ValueProvider`
        (placeholder to inject a runtime value).
      min_bundle_size (int): minimum size of bundles that should be generated
        when performing initial splitting on this source.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
        in which case the final file path's extension will be used to detect
        the compression.
      splittable (bool): whether :class:`FileBasedSource` should try to
        logically split a single file into data ranges so that different parts
        of the same file can be read in parallel. If set to :data:`False`,
        :class:`FileBasedSource` will prevent both initial and dynamic splitting
        of sources for single files. File patterns that represent multiple files
        may still get split into sources for individual files. Even if set to
        :data:`True` by the user, :class:`FileBasedSource` may choose to not
        split the file, for example, for compressed files where currently it is
        not possible to efficiently read a data range without decompressing the
        whole file.
      validate (bool): Boolean flag to verify that the files exist during the
        pipeline creation time.

    Raises:
      ~exceptions.TypeError: when **compression_type** is not valid or if
        **file_pattern** is not a :class:`str` or a
        :class:`~apache_beam.options.value_provider.ValueProvider`.
      ~exceptions.ValueError: when compression and splittable files are
        specified.
      ~exceptions.IOError: when the file pattern specified yields an empty
        result.
    """

    if not isinstance(file_pattern, (basestring, ValueProvider)):
      raise TypeError('%s: file_pattern must be of type string'
                      ' or ValueProvider; got %r instead'
                      % (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, basestring):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_pattern.is_accessible():
      self._validate()
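
A minimal usage sketch for the constructor above. The subclass name and record
logic are assumptions (FileBasedSource is abstract and requires read_records);
only the constructor arguments come from the snippet.

from apache_beam.io.filebasedsource import FileBasedSource
from apache_beam.options.value_provider import StaticValueProvider

class WholeFileSource(FileBasedSource):
  """Hypothetical subclass, for illustration only."""
  def read_records(self, file_name, offset_range_tracker):
    # Sketch: emits the whole file as one record, hence splittable=False below.
    f = self.open_file(file_name)  # honors compression_type
    yield f.read()

# file_pattern may be a plain string or a ValueProvider placeholder:
src = WholeFileSource('gs://my-bucket/logs/*.txt', splittable=False)
deferred = WholeFileSource(
    StaticValueProvider(str, 'gs://my-bucket/logs/*.txt'), splittable=False)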
Example #2
def __init__(self,
               file_pattern,
               min_bundle_size=0,
               compression_type=CompressionTypes.AUTO,
               splittable=True,
               validate=True):
    """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read, a string or a ValueProvider
                    (placeholder to inject a runtime value).
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use. With CompressionTypes.AUTO,
                        the file path's extension is used to detect the
                        compression.
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.

    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string or a ValueProvider.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """

    if not isinstance(file_pattern, (basestring, ValueProvider)):
      raise TypeError('%s: file_pattern must be of type string'
                      ' or ValueProvider; got %r instead'
                      % (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, basestring):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    self._compression_type = compression_type
    if compression_type in (CompressionTypes.UNCOMPRESSED,
                            CompressionTypes.AUTO):
      self._splittable = splittable
    else:
      # We can't split compressed files efficiently so turn off splitting.
      self._splittable = False
    if validate and file_pattern.is_accessible():
      self._validate()
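
Illustrative only, with MySource standing in for a subclass of the variant
above: an explicit compressed compression_type forces splitting off regardless
of the splittable argument (_splittable is a private attribute, checked here
purely for demonstration).

from apache_beam.io.filesystem import CompressionTypes

src = MySource('gs://bucket/data-*.gz',
               compression_type=CompressionTypes.GZIP,
               splittable=True)
assert src._splittable is False  # compressed files cannot be range-read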
Example #3
  def _open_hdfs(self, path, mode, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    res = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      res = CompressedFile(res)
    return res
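
A hypothetical call into the HDFS helper above (the filesystem object, path,
and variable names are assumptions):

f = fs._open_hdfs('hdfs://namenode/logs/part-00000.gz', 'rb',
                  'application/octet-stream', CompressionTypes.AUTO)
header = f.readline()  # CompressedFile decompresses the gzip stream transparently
f.close()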
Example #4
  @staticmethod
  def _add_compression(stream, path, mime_type, compression_type):
    if mime_type != 'application/octet-stream':
      logging.warning('Mime types are not supported. Got non-default mime_type:'
                      ' %s', mime_type)
    if compression_type == CompressionTypes.AUTO:
      compression_type = CompressionTypes.detect_compression_type(path)
    if compression_type != CompressionTypes.UNCOMPRESSED:
      return CompressedFile(stream)

    return stream
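
Illustrative checks of the AUTO-detection branch used above (the bucket and
file names are made up; the extension mapping is apache_beam.io.filesystem's):

from apache_beam.io.filesystem import CompressionTypes

assert (CompressionTypes.detect_compression_type('gs://b/f.gz')
        == CompressionTypes.GZIP)
assert (CompressionTypes.detect_compression_type('gs://b/f.txt')
        == CompressionTypes.UNCOMPRESSED)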
Example #5
  def _path_open(self, path, mode, mime_type='application/octet-stream',
                 compression_type=CompressionTypes.AUTO):
    """Helper function to open a file in the provided mode."""
    compression_type = FileSystem._get_compression_type(path, compression_type)
    mime_type = CompressionTypes.mime_type(compression_type, mime_type)
    raw_file = gcsio.GcsIO().open(path, mode, mime_type=mime_type)
    if compression_type == CompressionTypes.UNCOMPRESSED:
      return raw_file
    return CompressedFile(raw_file, compression_type=compression_type)
Example #6
    def _path_open(self,
                   path,
                   mode,
                   mime_type='application/octet-stream',
                   compression_type=CompressionTypes.AUTO):
        """Helper function to open a file in the provided mode."""
        compression_type = FileSystem._get_compression_type(
            path, compression_type)
        mime_type = CompressionTypes.mime_type(compression_type, mime_type)
        raw_file = gcsio.GcsIO().open(path, mode, mime_type=mime_type)
        if compression_type == CompressionTypes.UNCOMPRESSED:
            return raw_file
        return CompressedFile(raw_file, compression_type=compression_type)
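
A hypothetical use of the helper above (the filesystem object and path are
assumptions):

f = fs._path_open('gs://my-bucket/data.csv.gz', 'rb')
rows = f.read()  # decompressed transparently by CompressedFile
f.close()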
Example #7
    def __init__(self,
                 file_path_prefix,
                 coder,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 mime_type='application/octet-stream',
                 compression_type=CompressionTypes.AUTO):
        """
     Raises:
      TypeError: if file path parameters are not a string or ValueProvider,
                 or if compression_type is not member of CompressionTypes.
      ValueError: if shard_name_template is not of expected format.
    """
        if not isinstance(file_path_prefix, (basestring, ValueProvider)):
            raise TypeError(
                'file_path_prefix must be a string or ValueProvider; '
                'got %r instead' % file_path_prefix)
        if not isinstance(file_name_suffix, (basestring, ValueProvider)):
            raise TypeError(
                'file_name_suffix must be a string or ValueProvider; '
                'got %r instead' % file_name_suffix)

        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        if shard_name_template is None:
            shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
        elif shard_name_template == '':
            num_shards = 1
        if isinstance(file_path_prefix, basestring):
            file_path_prefix = StaticValueProvider(str, file_path_prefix)
        if isinstance(file_name_suffix, basestring):
            file_name_suffix = StaticValueProvider(str, file_name_suffix)
        self.file_path_prefix = file_path_prefix
        self.file_name_suffix = file_name_suffix
        self.num_shards = num_shards
        self.coder = coder
        self.shard_name_format = self._template_to_format(shard_name_template)
        self.compression_type = compression_type
        self.mime_type = mime_type
        if file_path_prefix.is_accessible():
            self._file_system = get_filesystem(file_path_prefix.get())
        else:
            self._file_system = None
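
A sketch of how the default shard template expands. DEFAULT_SHARD_NAME_TEMPLATE
is '-SSSSS-of-NNNNN' in apache_beam.io.filebasedsink; the helper below is
illustrative, not the class's _template_to_format.

def expand_default_template(prefix, suffix, shard, num_shards):
    # '-SSSSS-of-NNNNN': zero-padded shard index and shard count.
    return '%s-%05d-of-%05d%s' % (prefix, shard, num_shards, suffix)

print(expand_default_template('gs://bucket/out', '.txt', 0, 2))
# gs://bucket/out-00000-of-00002.txt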
Example #8
    def __init__(self,
                 file_path_prefix,
                 coder,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 mime_type='application/octet-stream',
                 compression_type=CompressionTypes.AUTO):
        """
     Raises:
      ~exceptions.TypeError: if file path parameters are not a :class:`str` or
        :class:`~apache_beam.options.value_provider.ValueProvider`, or if
        **compression_type** is not member of
        :class:`~apache_beam.io.filesystem.CompressionTypes`.
      ~exceptions.ValueError: if **shard_name_template** is not of expected
        format.
    """
        if not isinstance(file_path_prefix, (str, unicode, ValueProvider)):
            raise TypeError(
                'file_path_prefix must be a string or ValueProvider; '
                'got %r instead' % file_path_prefix)
        if not isinstance(file_name_suffix, (str, unicode, ValueProvider)):
            raise TypeError(
                'file_name_suffix must be a string or ValueProvider; '
                'got %r instead' % file_name_suffix)

        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        if shard_name_template is None:
            shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
        elif shard_name_template == '':
            num_shards = 1
        if isinstance(file_path_prefix, (str, unicode)):
            file_path_prefix = StaticValueProvider(str, file_path_prefix)
        if isinstance(file_name_suffix, (str, unicode)):
            file_name_suffix = StaticValueProvider(str, file_name_suffix)
        self.file_path_prefix = file_path_prefix
        self.file_name_suffix = file_name_suffix
        self.num_shards = num_shards
        self.coder = coder
        self.shard_name_format = self._template_to_format(shard_name_template)
        self.shard_name_glob_format = self._template_to_glob_format(
            shard_name_template)
        self.compression_type = compression_type
        self.mime_type = mime_type
Example #9
  def __init__(self,
               file_path_prefix,
               coder,
               file_name_suffix='',
               num_shards=0,
               shard_name_template=None,
               mime_type='application/octet-stream',
               compression_type=CompressionTypes.AUTO):
    """
     Raises:
      ~exceptions.TypeError: if file path parameters are not a :class:`str` or
        :class:`~apache_beam.options.value_provider.ValueProvider`, or if
        **compression_type** is not member of
        :class:`~apache_beam.io.filesystem.CompressionTypes`.
      ~exceptions.ValueError: if **shard_name_template** is not of expected
        format.
    """
    if not isinstance(file_path_prefix, (string_types, ValueProvider)):
      raise TypeError('file_path_prefix must be a string or ValueProvider; '
                      'got %r instead' % file_path_prefix)
    if not isinstance(file_name_suffix, (string_types, ValueProvider)):
      raise TypeError('file_name_suffix must be a string or ValueProvider; '
                      'got %r instead' % file_name_suffix)

    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    if shard_name_template is None:
      shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
    elif shard_name_template == '':
      num_shards = 1
    if isinstance(file_path_prefix, string_types):
      file_path_prefix = StaticValueProvider(str, file_path_prefix)
    if isinstance(file_name_suffix, string_types):
      file_name_suffix = StaticValueProvider(str, file_name_suffix)
    self.file_path_prefix = file_path_prefix
    self.file_name_suffix = file_name_suffix
    self.num_shards = num_shards
    self.coder = coder
    self.shard_name_format = self._template_to_format(shard_name_template)
    self.shard_name_glob_format = self._template_to_glob_format(
        shard_name_template)
    self.compression_type = compression_type
    self.mime_type = mime_type
Example #10
    def _get_concat_source(self):
        if self._concat_source is None:
            pattern = self._pattern.get()

            single_file_sources = []
            if self._file_system is None:
                self._file_system = get_filesystem(pattern)
            match_result = self._file_system.match([pattern])[0]
            files_metadata = match_result.metadata_list

            # We create a reference for FileBasedSource that will be serialized along
            # with each _SingleFileSource. To prevent this FileBasedSource from having
            # a reference to ConcatSource (resulting in quadratic space complexity)
            # we clone it here.
            file_based_source_ref = pickler.loads(pickler.dumps(self))

            for file_metadata in files_metadata:
                file_name = file_metadata.path
                file_size = file_metadata.size_in_bytes
                if file_size == 0:
                    continue  # Ignoring empty file.

                # We determine splittability of this specific file.
                splittable = self.splittable
                if (splittable
                        and self._compression_type == CompressionTypes.AUTO):
                    compression_type = CompressionTypes.detect_compression_type(
                        file_name)
                    if compression_type != CompressionTypes.UNCOMPRESSED:
                        splittable = False

                single_file_source = _SingleFileSource(
                    file_based_source_ref,
                    file_name,
                    0,
                    file_size,
                    min_bundle_size=self._min_bundle_size,
                    splittable=splittable)
                single_file_sources.append(single_file_source)
            self._concat_source = concat_source.ConcatSource(
                single_file_sources)
        return self._concat_source
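
The clone through pickler above is a dill-based deep copy that snapshots the
source before _concat_source is populated, so each _SingleFileSource serializes
without dragging in the ConcatSource. A minimal sketch of the round-trip idiom,
with the standard library's pickle standing in for apache_beam.internal.pickler:

import pickle

class Box(object):
    def __init__(self, payload):
        self.payload = payload

original = Box([1, 2, 3])
clone = pickle.loads(pickle.dumps(original))  # serialize, then rebuild
assert clone is not original and clone.payload == original.payload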
Example #11
  def _get_concat_source(self):
    if self._concat_source is None:
      pattern = self._pattern.get()

      single_file_sources = []
      if self._file_system is None:
        self._file_system = get_filesystem(pattern)
      match_result = self._file_system.match([pattern])[0]
      files_metadata = match_result.metadata_list

      # We create a reference for FileBasedSource that will be serialized along
      # with each _SingleFileSource. To prevent this FileBasedSource from having
      # a reference to ConcatSource (resulting in quadratic space complexity)
      # we clone it here.
      file_based_source_ref = pickler.loads(pickler.dumps(self))

      for file_metadata in files_metadata:
        file_name = file_metadata.path
        file_size = file_metadata.size_in_bytes
        if file_size == 0:
          continue  # Ignoring empty file.

        # We determine splittability of this specific file.
        splittable = self.splittable
        if (splittable and
            self._compression_type == CompressionTypes.AUTO):
          compression_type = CompressionTypes.detect_compression_type(
              file_name)
          if compression_type != CompressionTypes.UNCOMPRESSED:
            splittable = False

        single_file_source = _SingleFileSource(
            file_based_source_ref, file_name,
            0,
            file_size,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable)
        single_file_sources.append(single_file_source)
      self._concat_source = concat_source.ConcatSource(single_file_sources)
    return self._concat_source
Example #12
  def __init__(self,
               file_path_prefix,
               coder,
               file_name_suffix='',
               num_shards=0,
               shard_name_template=None,
               mime_type='application/octet-stream',
               compression_type=CompressionTypes.AUTO):
    """
     Raises:
      TypeError: if file path parameters are not a string or ValueProvider,
                 or if compression_type is not member of CompressionTypes.
      ValueError: if shard_name_template is not of expected format.
    """
    if not isinstance(file_path_prefix, (basestring, ValueProvider)):
      raise TypeError('file_path_prefix must be a string or ValueProvider; '
                      'got %r instead' % file_path_prefix)
    if not isinstance(file_name_suffix, (basestring, ValueProvider)):
      raise TypeError('file_name_suffix must be a string or ValueProvider; '
                      'got %r instead' % file_name_suffix)

    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    if shard_name_template is None:
      shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
    elif shard_name_template == '':
      num_shards = 1
    if isinstance(file_path_prefix, basestring):
      file_path_prefix = StaticValueProvider(str, file_path_prefix)
    if isinstance(file_name_suffix, basestring):
      file_name_suffix = StaticValueProvider(str, file_name_suffix)
    self.file_path_prefix = file_path_prefix
    self.file_name_suffix = file_name_suffix
    self.num_shards = num_shards
    self.coder = coder
    self.shard_name_format = self._template_to_format(shard_name_template)
    self.compression_type = compression_type
    self.mime_type = mime_type
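
Illustrative constructions against the shard_name_template branches above
(MySink stands in for a concrete subclass; coder is assumed):

sink_default = MySink('gs://bucket/out', coder)  # None -> DEFAULT_SHARD_NAME_TEMPLATE
sink_single = MySink('gs://bucket/out', coder,
                     shard_name_template='')  # '' forces num_shards to 1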
Example #13
def __init__(self,
             reader,
             file_patterns,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
    if not isinstance(file_patterns, ValueProvider):
        file_patterns = StaticValueProvider(list, file_patterns)
    self._patterns = file_patterns
    self._pickle_reader = pickler.dumps(reader)
    self._reader = None
    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
        raise TypeError(
            'compression_type must be CompressionType object but '
            'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_patterns.is_accessible():
        self._validate()
Example #14
def _determine_splittability_from_compression_type(
    file_path, compression_type):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)

  return compression_type == CompressionTypes.UNCOMPRESSED
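
Illustrative calls to the helper above (the paths are made up; CompressionTypes
is assumed imported from apache_beam.io.filesystem):

assert _determine_splittability_from_compression_type(
    'a.txt', CompressionTypes.AUTO)    # detected UNCOMPRESSED -> splittable
assert not _determine_splittability_from_compression_type(
    'a.gz', CompressionTypes.AUTO)     # detected GZIP -> not splittable
assert not _determine_splittability_from_compression_type(
    'a.txt', CompressionTypes.GZIP)    # explicit compression -> not splittable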
Example #15
    def __init__(self,
                 file_pattern,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 splittable=True,
                 validate=True):
        """Initializes :class:`FileBasedSource`.

    Args:
      file_pattern (str): the file glob to read, a string or a
        :class:`~apache_beam.options.value_provider.ValueProvider`
        (placeholder to inject a runtime value).
      min_bundle_size (int): minimum size of bundles that should be generated
        when performing initial splitting on this source.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
        in which case the final file path's extension will be used to detect
        the compression.
      splittable (bool): whether :class:`FileBasedSource` should try to
        logically split a single file into data ranges so that different parts
        of the same file can be read in parallel. If set to :data:`False`,
        :class:`FileBasedSource` will prevent both initial and dynamic splitting
        of sources for single files. File patterns that represent multiple files
        may still get split into sources for individual files. Even if set to
        :data:`True` by the user, :class:`FileBasedSource` may choose to not
        split the file, for example, for compressed files where currently it is
        not possible to efficiently read a data range without decompressing the
        whole file.
      validate (bool): Boolean flag to verify that the files exist during the
        pipeline creation time.

    Raises:
      ~exceptions.TypeError: when **compression_type** is not valid or if
        **file_pattern** is not a :class:`str` or a
        :class:`~apache_beam.options.value_provider.ValueProvider`.
      ~exceptions.ValueError: when compression and splittable files are
        specified.
      ~exceptions.IOError: when the file pattern specified yields an empty
        result.
    """

        if not isinstance(file_pattern, (basestring, ValueProvider)):
            raise TypeError('%s: file_pattern must be of type string'
                            ' or ValueProvider; got %r instead' %
                            (self.__class__.__name__, file_pattern))

        if isinstance(file_pattern, basestring):
            file_pattern = StaticValueProvider(str, file_pattern)
        self._pattern = file_pattern

        self._concat_source = None
        self._min_bundle_size = min_bundle_size
        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        self._compression_type = compression_type
        self._splittable = splittable
        if validate and file_pattern.is_accessible():
            self._validate()
Example #16
def _determine_splittability_from_compression_type(file_path,
                                                   compression_type):
    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(file_path)

    return compression_type == CompressionTypes.UNCOMPRESSED
Example #17
    def __init__(self,
                 file_pattern,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 splittable=True,
                 validate=True):
        """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read, a string or a ValueProvider
                    (placeholder to inject a runtime value).
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use. With CompressionTypes.AUTO,
                        the file path's extension is used to detect the
                        compression.
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.

    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string or a ValueProvider.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """

        if not isinstance(file_pattern, (basestring, ValueProvider)):
            raise TypeError('%s: file_pattern must be of type string'
                            ' or ValueProvider; got %r instead' %
                            (self.__class__.__name__, file_pattern))

        if isinstance(file_pattern, basestring):
            file_pattern = StaticValueProvider(str, file_pattern)
        self._pattern = file_pattern
        if file_pattern.is_accessible():
            self._file_system = get_filesystem(file_pattern.get())
        else:
            self._file_system = None

        self._concat_source = None
        self._min_bundle_size = min_bundle_size
        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        self._compression_type = compression_type
        if compression_type in (CompressionTypes.UNCOMPRESSED,
                                CompressionTypes.AUTO):
            self._splittable = splittable
        else:
            # We can't split compressed files efficiently so turn off splitting.
            self._splittable = False
        if validate and file_pattern.is_accessible():
            self._validate()
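
A sketch of the is_accessible() gate used above: a StaticValueProvider is
readable at pipeline construction time, so validation runs immediately, while a
runtime placeholder is not accessible until its value is supplied.

from apache_beam.options.value_provider import StaticValueProvider

static = StaticValueProvider(str, 'gs://bucket/*.txt')
assert static.is_accessible()
assert static.get() == 'gs://bucket/*.txt'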