def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes :class:`FileBasedSource`.

  Args:
    file_pattern (str): the file glob to read, a string or a
      :class:`~apache_beam.options.value_provider.ValueProvider`
      (placeholder to inject a runtime value).
    min_bundle_size (int): minimum size of bundles that should be generated
      when performing initial splitting on this source.
    compression_type (str): Used to handle compressed input files.
      Typical value is :attr:`CompressionTypes.AUTO
      <apache_beam.io.filesystem.CompressionTypes.AUTO>`, in which case the
      final file path's extension will be used to detect the compression.
    splittable (bool): whether :class:`FileBasedSource` should try to
      logically split a single file into data ranges so that different
      parts of the same file can be read in parallel. If set to
      :data:`False`, :class:`FileBasedSource` will prevent both initial and
      dynamic splitting of sources for single files. File patterns that
      represent multiple files may still get split into sources for
      individual files. Even if set to :data:`True` by the user,
      :class:`FileBasedSource` may choose to not split the file, for
      example, for compressed files where currently it is not possible to
      efficiently read a data range without decompressing the whole file.
    validate (bool): Boolean flag to verify that the files exist during the
      pipeline creation time.

  Raises:
    ~exceptions.TypeError: when **compression_type** is not valid or if
      **file_pattern** is not a :class:`str` or a
      :class:`~apache_beam.options.value_provider.ValueProvider`.
    ~exceptions.ValueError: when compression and splittable files are
      specified.
    ~exceptions.IOError: when the file pattern specified yields an empty
      result.
  """
  if not isinstance(file_pattern, (basestring, ValueProvider)):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead' %
                    (self.__class__.__name__, file_pattern))

  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  self._splittable = splittable
  if validate and file_pattern.is_accessible():
    self._validate()
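
# Hedged usage sketch, not part of the snippet above: FileBasedSource is
# abstract, so a concrete subclass supplies read_records(). 'LineSource' and
# the bucket path are hypothetical names used only for illustration; the
# sketch assumes FileBasedSource.open_file() returns a handle that honors
# compression_type, as its subclasses typically rely on.
class LineSource(FileBasedSource):

  def read_records(self, file_name, offset_range_tracker):
    # A real implementation would claim positions on offset_range_tracker;
    # elided to keep the sketch short.
    with self.open_file(file_name) as f:
      line = f.readline()
      while line:
        yield line
        line = f.readline()

source = LineSource('gs://some-bucket/logs-*.txt',
                    min_bundle_size=1 << 20,  # ask for bundles of >= 1 MiB
                    compression_type=CompressionTypes.AUTO)

# The constructor's type check fires immediately:
#   LineSource(42)  -> TypeError: LineSource: file_pattern must be of type
#                      string or ValueProvider; got 42 instead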
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes ``FileBasedSource``.

  Args:
    file_pattern: the file glob to read, a string or a ValueProvider
      (placeholder to inject a runtime value).
    min_bundle_size: minimum size of bundles that should be generated when
      performing initial splitting on this source.
    compression_type: compression type to use when reading files. Typically
      CompressionTypes.AUTO, in which case the file path's extension is
      used to detect the compression.
    splittable: whether FileBasedSource should try to logically split a
      single file into data ranges so that different parts of the same file
      can be read in parallel. If set to False, FileBasedSource will
      prevent both initial and dynamic splitting of sources for single
      files. File patterns that represent multiple files may still get
      split into sources for individual files. Even if set to True by the
      user, FileBasedSource may choose to not split the file, for example,
      for compressed files where currently it is not possible to
      efficiently read a data range without decompressing the whole file.
    validate: Boolean flag to verify that the files exist during the
      pipeline creation time.

  Raises:
    TypeError: when compression_type is not valid or if file_pattern is not
      a string or a ValueProvider.
    ValueError: when compression and splittable files are specified.
    IOError: when the file pattern specified yields an empty result.
  """
  if not isinstance(file_pattern, (basestring, ValueProvider)):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead' %
                    (self.__class__.__name__, file_pattern))

  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  if compression_type in (CompressionTypes.UNCOMPRESSED,
                          CompressionTypes.AUTO):
    self._splittable = splittable
  else:
    # We can't split compressed files efficiently so turn off splitting.
    self._splittable = False
  if validate and file_pattern.is_accessible():
    self._validate()
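
# Consequence of the branch above, shown with a hypothetical subclass
# 'MySource': an explicit compression type wins over the splittable argument,
# while UNCOMPRESSED and AUTO leave the caller's choice intact.
s = MySource('/data/part-*.gz',
             compression_type=CompressionTypes.GZIP,
             splittable=True)
assert s._splittable is False  # forced off for explicitly compressed input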
def _open_hdfs(self, path, mode, mime_type, compression_type):
  if mime_type != 'application/octet-stream':
    logging.warning('Mime types are not supported. Got non-default mime_type:'
                    ' %s', mime_type)
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(path)
  res = self._hdfs_client.open(path, mode)
  if compression_type != CompressionTypes.UNCOMPRESSED:
    # Pass the detected type through explicitly rather than relying on
    # CompressedFile's default.
    res = CompressedFile(res, compression_type=compression_type)
  return res
def _add_compression(stream, path, mime_type, compression_type):
  if mime_type != 'application/octet-stream':
    logging.warning('Mime types are not supported. Got non-default mime_type:'
                    ' %s', mime_type)
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(path)
  if compression_type != CompressionTypes.UNCOMPRESSED:
    # Pass the detected type through explicitly rather than relying on
    # CompressedFile's default.
    return CompressedFile(stream, compression_type=compression_type)
  return stream
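
# Hedged usage sketch for _add_compression: wrap a locally created gzip file
# and read it back transparently. Paths and data are illustrative only.
import gzip
import tempfile

path = tempfile.mkstemp(suffix='.gz')[1]
with gzip.open(path, 'wb') as f:
  f.write(b'hello\n')

stream = open(path, 'rb')  # raw byte stream; CompressedFile inspects .mode
wrapped = _add_compression(stream, path, 'application/octet-stream',
                           CompressionTypes.AUTO)
print(wrapped.readline())  # b'hello\n', decompressed on the fly
# A path without a recognized extension (e.g. '.txt') would come back as the
# unwrapped stream.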
def _path_open(self, path, mode, mime_type='application/octet-stream',
               compression_type=CompressionTypes.AUTO):
  """Helper function to open a file in the provided mode."""
  compression_type = FileSystem._get_compression_type(path, compression_type)
  mime_type = CompressionTypes.mime_type(compression_type, mime_type)
  raw_file = gcsio.GcsIO().open(path, mode, mime_type=mime_type)
  if compression_type == CompressionTypes.UNCOMPRESSED:
    return raw_file
  return CompressedFile(raw_file, compression_type=compression_type)
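
# Sketch, reusing the body's own helpers, of how a gzip object on GCS is
# resolved before the open call; the bucket path is illustrative.
ct = FileSystem._get_compression_type('gs://bucket/data.gz',
                                      CompressionTypes.AUTO)
mt = CompressionTypes.mime_type(ct, 'application/octet-stream')
# ct is the gzip member of CompressionTypes and mt the matching MIME type
# handed to gcsio.GcsIO().open(); the returned handle is then wrapped in
# CompressedFile for transparent decompression.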
def __init__(self,
             file_path_prefix,
             coder,
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
  """
  Raises:
    TypeError: if file path parameters are not a string or ValueProvider,
      or if compression_type is not a member of CompressionTypes.
    ValueError: if shard_name_template is not of expected format.
  """
  if not isinstance(file_path_prefix, (basestring, ValueProvider)):
    raise TypeError('file_path_prefix must be a string or ValueProvider; '
                    'got %r instead' % file_path_prefix)
  if not isinstance(file_name_suffix, (basestring, ValueProvider)):
    raise TypeError('file_name_suffix must be a string or ValueProvider; '
                    'got %r instead' % file_name_suffix)
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))

  if shard_name_template is None:
    shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
  elif shard_name_template == '':
    # '==' rather than 'is': identity comparison against a string literal is
    # unreliable and raises a SyntaxWarning on newer Pythons.
    num_shards = 1
  if isinstance(file_path_prefix, basestring):
    file_path_prefix = StaticValueProvider(str, file_path_prefix)
  if isinstance(file_name_suffix, basestring):
    file_name_suffix = StaticValueProvider(str, file_name_suffix)
  self.file_path_prefix = file_path_prefix
  self.file_name_suffix = file_name_suffix
  self.num_shards = num_shards
  self.coder = coder
  self.shard_name_format = self._template_to_format(shard_name_template)
  self.compression_type = compression_type
  self.mime_type = mime_type
  if file_path_prefix.is_accessible():
    self._file_system = get_filesystem(file_path_prefix.get())
  else:
    self._file_system = None
def __init__(self,
             file_path_prefix,
             coder,
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
  """
  Raises:
    ~exceptions.TypeError: if file path parameters are not a :class:`str`
      or :class:`~apache_beam.options.value_provider.ValueProvider`, or if
      **compression_type** is not a member of
      :class:`~apache_beam.io.filesystem.CompressionTypes`.
    ~exceptions.ValueError: if **shard_name_template** is not of expected
      format.
  """
  if not isinstance(file_path_prefix, (str, unicode, ValueProvider)):
    raise TypeError('file_path_prefix must be a string or ValueProvider; '
                    'got %r instead' % file_path_prefix)
  if not isinstance(file_name_suffix, (str, unicode, ValueProvider)):
    raise TypeError('file_name_suffix must be a string or ValueProvider; '
                    'got %r instead' % file_name_suffix)
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))

  if shard_name_template is None:
    shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
  elif shard_name_template == '':
    num_shards = 1
  if isinstance(file_path_prefix, (str, unicode)):
    file_path_prefix = StaticValueProvider(str, file_path_prefix)
  if isinstance(file_name_suffix, (str, unicode)):
    file_name_suffix = StaticValueProvider(str, file_name_suffix)
  self.file_path_prefix = file_path_prefix
  self.file_name_suffix = file_name_suffix
  self.num_shards = num_shards
  self.coder = coder
  self.shard_name_format = self._template_to_format(shard_name_template)
  self.shard_name_glob_format = self._template_to_glob_format(
      shard_name_template)
  self.compression_type = compression_type
  self.mime_type = mime_type
def __init__(self,
             file_path_prefix,
             coder,
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
  """
  Raises:
    ~exceptions.TypeError: if file path parameters are not a :class:`str`
      or :class:`~apache_beam.options.value_provider.ValueProvider`, or if
      **compression_type** is not a member of
      :class:`~apache_beam.io.filesystem.CompressionTypes`.
    ~exceptions.ValueError: if **shard_name_template** is not of expected
      format.
  """
  if not isinstance(file_path_prefix, (string_types, ValueProvider)):
    raise TypeError('file_path_prefix must be a string or ValueProvider; '
                    'got %r instead' % file_path_prefix)
  if not isinstance(file_name_suffix, (string_types, ValueProvider)):
    raise TypeError('file_name_suffix must be a string or ValueProvider; '
                    'got %r instead' % file_name_suffix)
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))

  if shard_name_template is None:
    shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
  elif shard_name_template == '':
    num_shards = 1
  if isinstance(file_path_prefix, string_types):
    file_path_prefix = StaticValueProvider(str, file_path_prefix)
  if isinstance(file_name_suffix, string_types):
    file_name_suffix = StaticValueProvider(str, file_name_suffix)
  self.file_path_prefix = file_path_prefix
  self.file_name_suffix = file_name_suffix
  self.num_shards = num_shards
  self.coder = coder
  self.shard_name_format = self._template_to_format(shard_name_template)
  self.shard_name_glob_format = self._template_to_glob_format(
      shard_name_template)
  self.compression_type = compression_type
  self.mime_type = mime_type
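
# Hedged sketch of the sharding knobs above. With Beam's default template
# '-SSSSS-of-NNNNN', a 'gs://bucket/out' prefix, a '.txt' suffix and 10
# shards, _template_to_format() yields names like 'out-00003-of-00010.txt'.
# 'MyFileSink' is a hypothetical concrete subclass.
sink = MyFileSink('gs://bucket/out',
                  coder=coders.ToStringCoder(),
                  file_name_suffix='.txt',
                  num_shards=10)

# An empty template forces num_shards to 1, producing one unsharded file.
single = MyFileSink('gs://bucket/out',
                    coder=coders.ToStringCoder(),
                    shard_name_template='')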
def _get_concat_source(self):
  if self._concat_source is None:
    pattern = self._pattern.get()

    single_file_sources = []
    if self._file_system is None:
      self._file_system = get_filesystem(pattern)
    match_result = self._file_system.match([pattern])[0]
    files_metadata = match_result.metadata_list

    # We create a reference for FileBasedSource that will be serialized
    # along with each _SingleFileSource. To prevent this FileBasedSource
    # from having a reference to ConcatSource (resulting in quadratic space
    # complexity) we clone it here.
    file_based_source_ref = pickler.loads(pickler.dumps(self))

    for file_metadata in files_metadata:
      file_name = file_metadata.path
      file_size = file_metadata.size_in_bytes
      if file_size == 0:
        continue  # Ignoring empty file.

      # We determine splittability of this specific file.
      splittable = self.splittable
      if splittable and self._compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(
            file_name)
        if compression_type != CompressionTypes.UNCOMPRESSED:
          splittable = False

      single_file_source = _SingleFileSource(
          file_based_source_ref, file_name,
          0,
          file_size,
          min_bundle_size=self._min_bundle_size,
          splittable=splittable)
      single_file_sources.append(single_file_source)
    self._concat_source = concat_source.ConcatSource(single_file_sources)
  return self._concat_source
def __init__(self,
             file_path_prefix,
             coder,
             file_name_suffix='',
             num_shards=0,
             shard_name_template=None,
             mime_type='application/octet-stream',
             compression_type=CompressionTypes.AUTO):
  """
  Raises:
    TypeError: if file path parameters are not a string or ValueProvider,
      or if compression_type is not a member of CompressionTypes.
    ValueError: if shard_name_template is not of expected format.
  """
  if not isinstance(file_path_prefix, (basestring, ValueProvider)):
    raise TypeError('file_path_prefix must be a string or ValueProvider; '
                    'got %r instead' % file_path_prefix)
  if not isinstance(file_name_suffix, (basestring, ValueProvider)):
    raise TypeError('file_name_suffix must be a string or ValueProvider; '
                    'got %r instead' % file_name_suffix)
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))

  if shard_name_template is None:
    shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
  elif shard_name_template == '':
    num_shards = 1
  if isinstance(file_path_prefix, basestring):
    file_path_prefix = StaticValueProvider(str, file_path_prefix)
  if isinstance(file_name_suffix, basestring):
    file_name_suffix = StaticValueProvider(str, file_name_suffix)
  self.file_path_prefix = file_path_prefix
  self.file_name_suffix = file_name_suffix
  self.num_shards = num_shards
  self.coder = coder
  self.shard_name_format = self._template_to_format(shard_name_template)
  self.compression_type = compression_type
  self.mime_type = mime_type
def __init__(self,
             reader,
             file_patterns,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  if not isinstance(file_patterns, ValueProvider):
    file_patterns = StaticValueProvider(list, file_patterns)
  self._patterns = file_patterns

  self._pickle_reader = pickler.dumps(reader)
  self._reader = None

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  self._splittable = splittable
  if validate and file_patterns.is_accessible():
    self._validate()
def _determine_splittability_from_compression_type(
    file_path, compression_type):
  if compression_type == CompressionTypes.AUTO:
    compression_type = CompressionTypes.detect_compression_type(file_path)

  return compression_type == CompressionTypes.UNCOMPRESSED
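
# Quick illustration of the helper above; '.gz' detection follows
# CompressionTypes.detect_compression_type.
print(_determine_splittability_from_compression_type(
    'logs/app.txt', CompressionTypes.AUTO))   # True: nothing detected
print(_determine_splittability_from_compression_type(
    'logs/app.gz', CompressionTypes.AUTO))    # False: gzip detected
print(_determine_splittability_from_compression_type(
    'logs/app.txt', CompressionTypes.GZIP))   # False: explicitly compressed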
def __init__(self,
             file_pattern,
             min_bundle_size=0,
             compression_type=CompressionTypes.AUTO,
             splittable=True,
             validate=True):
  """Initializes ``FileBasedSource``.

  Args:
    file_pattern: the file glob to read, a string or a ValueProvider
      (placeholder to inject a runtime value).
    min_bundle_size: minimum size of bundles that should be generated when
      performing initial splitting on this source.
    compression_type: compression type to use when reading files. Typically
      CompressionTypes.AUTO, in which case the file path's extension is
      used to detect the compression.
    splittable: whether FileBasedSource should try to logically split a
      single file into data ranges so that different parts of the same file
      can be read in parallel. If set to False, FileBasedSource will
      prevent both initial and dynamic splitting of sources for single
      files. File patterns that represent multiple files may still get
      split into sources for individual files. Even if set to True by the
      user, FileBasedSource may choose to not split the file, for example,
      for compressed files where currently it is not possible to
      efficiently read a data range without decompressing the whole file.
    validate: Boolean flag to verify that the files exist during the
      pipeline creation time.

  Raises:
    TypeError: when compression_type is not valid or if file_pattern is not
      a string or a ValueProvider.
    ValueError: when compression and splittable files are specified.
    IOError: when the file pattern specified yields an empty result.
  """
  if not isinstance(file_pattern, (basestring, ValueProvider)):
    raise TypeError('%s: file_pattern must be of type string'
                    ' or ValueProvider; got %r instead' %
                    (self.__class__.__name__, file_pattern))

  if isinstance(file_pattern, basestring):
    file_pattern = StaticValueProvider(str, file_pattern)
  self._pattern = file_pattern
  if file_pattern.is_accessible():
    self._file_system = get_filesystem(file_pattern.get())
  else:
    self._file_system = None

  self._concat_source = None
  self._min_bundle_size = min_bundle_size
  if not CompressionTypes.is_valid_compression_type(compression_type):
    raise TypeError('compression_type must be CompressionType object but '
                    'was %s' % type(compression_type))
  self._compression_type = compression_type
  if compression_type in (CompressionTypes.UNCOMPRESSED,
                          CompressionTypes.AUTO):
    self._splittable = splittable
  else:
    # We can't split compressed files efficiently so turn off splitting.
    self._splittable = False
  if validate and file_pattern.is_accessible():
    self._validate()
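
# Sketch of why the is_accessible() guards above matter. A
# StaticValueProvider (as created for plain string patterns) is resolvable at
# construction time, so validation and filesystem lookup run immediately; a
# deferred runtime provider would skip both until pipeline execution.
p = StaticValueProvider(str, 'gs://bucket/input-*.txt')
assert p.is_accessible()
assert p.get() == 'gs://bucket/input-*.txt'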