def _add_compression(stream, path, mime_type, compression_type):
    """Wrap ``stream`` in a ``CompressedFile`` when compression applies.

    Args:
        stream: The already-opened file-like object to wrap.
        path: Path of the file; consulted only to detect the compression
            type when ``compression_type`` is ``CompressionTypes.AUTO``.
        mime_type: Requested MIME type. Any value other than
            ``'application/octet-stream'`` is ignored after logging a warning.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        ``stream`` unchanged when the resolved type is
        ``CompressionTypes.UNCOMPRESSED``, otherwise a ``CompressedFile``
        wrapping ``stream``.
    """
    if mime_type != 'application/octet-stream':
        logging.warning(
            'Mime types are not supported. Got non-default mime_type:'
            ' %s', mime_type)

    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(path)

    if compression_type != CompressionTypes.UNCOMPRESSED:
        # Forward the resolved type: without it the wrapper would use its own
        # default codec regardless of what was requested or detected.
        # NOTE(review): relies on CompressedFile accepting a
        # `compression_type` keyword (as apache_beam.io.filesystem's does).
        return CompressedFile(stream, compression_type=compression_type)
    return stream
def _add_compression(stream, path, mime_type, compression_type):
    """Return ``stream``, wrapped in a ``CompressedFile`` if it is compressed.

    Args:
        stream: The already-opened file-like object.
        path: File path, used only for compression detection when
            ``compression_type`` is ``CompressionTypes.AUTO``.
        mime_type: Requested MIME type; a warning is logged for anything
            other than ``'application/octet-stream'`` and the value is
            otherwise unused.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        The original ``stream`` for ``CompressionTypes.UNCOMPRESSED``,
        otherwise a ``CompressedFile`` around it.
    """
    if mime_type != 'application/octet-stream':
        logging.warning(
            'Mime types are not supported. Got non-default mime_type:'
            ' %s', mime_type)

    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(path)

    if compression_type != CompressionTypes.UNCOMPRESSED:
        # The resolved compression type must reach the wrapper; otherwise it
        # silently falls back to its default codec.
        # NOTE(review): relies on CompressedFile accepting a
        # `compression_type` keyword (as apache_beam.io.filesystem's does).
        return CompressedFile(stream, compression_type=compression_type)
    return stream
def _open_hdfs(self, path, mode, mime_type, compression_type):
    """Open ``path`` via the HDFS client, wrapping it when it is compressed.

    Args:
        path: Path handed to ``self._hdfs_client.open`` and, for
            ``CompressionTypes.AUTO``, to compression detection.
        mode: Open mode, forwarded unchanged to the HDFS client.
        mime_type: Requested MIME type; a warning is logged for anything
            other than ``'application/octet-stream'`` and the value is
            otherwise unused.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        The handle returned by the HDFS client, wrapped in a
        ``CompressedFile`` unless the resolved type is
        ``CompressionTypes.UNCOMPRESSED``.
    """
    if mime_type != 'application/octet-stream':
        logging.warning(
            'Mime types are not supported. Got non-default mime_type:'
            ' %s', mime_type)

    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(path)

    handle = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
        # Forward the resolved type so the wrapper does not fall back to its
        # own default codec for every compressed file.
        # NOTE(review): relies on CompressedFile accepting a
        # `compression_type` keyword (as apache_beam.io.filesystem's does).
        handle = CompressedFile(handle, compression_type=compression_type)
    return handle
def _open_hdfs(self, path, mode, mime_type, compression_type):
    """Open ``path`` with the HDFS client and apply compression handling.

    Args:
        path: Path passed to ``self._hdfs_client.open``; also used to detect
            the compression type when ``compression_type`` is
            ``CompressionTypes.AUTO``.
        mode: Open mode, passed through to the HDFS client.
        mime_type: Requested MIME type. Values other than
            ``'application/octet-stream'`` only trigger a warning.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        The raw client handle for ``CompressionTypes.UNCOMPRESSED``,
        otherwise a ``CompressedFile`` wrapping that handle.
    """
    if mime_type != 'application/octet-stream':
        logging.warning(
            'Mime types are not supported. Got non-default mime_type:'
            ' %s', mime_type)

    if compression_type == CompressionTypes.AUTO:
        compression_type = CompressionTypes.detect_compression_type(path)

    opened = self._hdfs_client.open(path, mode)
    if compression_type != CompressionTypes.UNCOMPRESSED:
        # Without the explicit type the wrapper would use its default codec
        # no matter which compression was requested or detected.
        # NOTE(review): relies on CompressedFile accepting a
        # `compression_type` keyword (as apache_beam.io.filesystem's does).
        opened = CompressedFile(opened, compression_type=compression_type)
    return opened
def _get_concat_source(self):
    """Return the ``ConcatSource`` for this source, building it on first use.

    The first call resolves the file pattern through the file system, creates
    one ``_SingleFileSource`` per non-empty match, and stores the resulting
    ``ConcatSource`` on ``self._concat_source``. Later calls return the stored
    object without touching the file system again.
    """
    if self._concat_source is not None:
        return self._concat_source

    file_pattern = self._pattern.get()
    if self._file_system is None:
        self._file_system = get_filesystem(file_pattern)
    matched_files = self._file_system.match([file_pattern])[0].metadata_list

    # Every _SingleFileSource is serialized together with a reference to this
    # FileBasedSource. Taking a pickled copy now, before a ConcatSource is
    # attached, keeps that reference from pointing back at the ConcatSource,
    # which would otherwise cost quadratic space.
    detached_source = pickler.loads(pickler.dumps(self))

    per_file_sources = []
    for metadata in matched_files:
        path, size = metadata.path, metadata.size_in_bytes
        if size == 0:
            # Empty files are left out.
            continue

        # Splittability is decided per file: with AUTO compression, a file
        # detected as anything other than uncompressed is not split.
        can_split = self.splittable
        if (can_split
                and self._compression_type == CompressionTypes.AUTO
                and CompressionTypes.detect_compression_type(path)
                != CompressionTypes.UNCOMPRESSED):
            can_split = False

        per_file_sources.append(
            _SingleFileSource(
                detached_source, path, 0, size,
                min_bundle_size=self._min_bundle_size,
                splittable=can_split))

    self._concat_source = concat_source.ConcatSource(per_file_sources)
    return self._concat_source
def _get_concat_source(self):
    """Build the ``ConcatSource`` once and return the cached instance after.

    When ``self._concat_source`` is unset, the pattern is matched through the
    file system, each non-empty match becomes a ``_SingleFileSource``, and
    the combined ``ConcatSource`` is stored on ``self._concat_source``.
    """
    if self._concat_source is not None:
        return self._concat_source

    glob = self._pattern.get()
    if self._file_system is None:
        self._file_system = get_filesystem(glob)
    first_match = self._file_system.match([glob])[0]
    metadata_entries = first_match.metadata_list

    # A clone of this FileBasedSource is what gets serialized with each
    # _SingleFileSource. It is made before the ConcatSource exists so it holds
    # no reference to it; such a reference would mean quadratic space.
    source_clone = pickler.loads(pickler.dumps(self))

    sources = []
    for entry in metadata_entries:
        name = entry.path
        num_bytes = entry.size_in_bytes
        if num_bytes == 0:
            continue  # Empty files are skipped.

        # Work out whether this particular file may be split.
        splittable = self.splittable
        if splittable and self._compression_type == CompressionTypes.AUTO:
            detected = CompressionTypes.detect_compression_type(name)
            if detected != CompressionTypes.UNCOMPRESSED:
                splittable = False

        sources.append(_SingleFileSource(
            source_clone,
            name,
            0,
            num_bytes,
            min_bundle_size=self._min_bundle_size,
            splittable=splittable))

    self._concat_source = concat_source.ConcatSource(sources)
    return self._concat_source
def _determine_splittability_from_compression_type(file_path, compression_type):
    """Report whether a file counts as uncompressed.

    Args:
        file_path: Path used to detect the compression type when
            ``compression_type`` is ``CompressionTypes.AUTO``.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        The result of comparing the resolved compression type with
        ``CompressionTypes.UNCOMPRESSED``.
    """
    resolved_type = (
        CompressionTypes.detect_compression_type(file_path)
        if compression_type == CompressionTypes.AUTO
        else compression_type)
    return resolved_type == CompressionTypes.UNCOMPRESSED
def _determine_splittability_from_compression_type(
        file_path, compression_type):
    """Tell whether the resolved compression type is ``UNCOMPRESSED``.

    Args:
        file_path: Path consulted for detection when ``compression_type`` is
            ``CompressionTypes.AUTO``; unused otherwise.
        compression_type: A ``CompressionTypes`` value.

    Returns:
        The comparison of the explicit or detected compression type against
        ``CompressionTypes.UNCOMPRESSED``.
    """
    if compression_type != CompressionTypes.AUTO:
        # An explicit type is compared directly; no detection is needed.
        return compression_type == CompressionTypes.UNCOMPRESSED

    detected_type = CompressionTypes.detect_compression_type(file_path)
    return detected_type == CompressionTypes.UNCOMPRESSED