def _get_concat_source(self): if self._concat_source is None: pattern = self._pattern.get() single_file_sources = [] match_result = FileSystems.match([pattern])[0] files_metadata = match_result.metadata_list # We create a reference for FileBasedSource that will be serialized along # with each _SingleFileSource. To prevent this FileBasedSource from having # a reference to ConcatSource (resulting in quadratic space complexity) # we clone it here. file_based_source_ref = pickler.loads(pickler.dumps(self)) for file_metadata in files_metadata: file_name = file_metadata.path file_size = file_metadata.size_in_bytes if file_size == 0: continue # Ignoring empty file. # We determine splittability of this specific file. splittable = ( self.splittable and _determine_splittability_from_compression_type( file_name, self._compression_type)) single_file_source = _SingleFileSource( file_based_source_ref, file_name, 0, file_size, min_bundle_size=self._min_bundle_size, splittable=splittable) single_file_sources.append(single_file_source) self._concat_source = concat_source.ConcatSource(single_file_sources) return self._concat_source
def _get_concat_source(self): if self._concat_source is None: single_file_sources = [] file_names = [f for f in fileio.ChannelFactory.glob(self._pattern)] sizes = FileBasedSource._estimate_sizes_of_files(file_names, self._pattern) # We create a reference for FileBasedSource that will be serialized along # with each _SingleFileSource. To prevent this FileBasedSource from having # a reference to ConcatSource (resulting in quadratic space complexity) # we clone it here. file_based_source_ref = pickler.loads(pickler.dumps(self)) for index, file_name in enumerate(file_names): if sizes[index] == 0: continue # Ignoring empty file. # We determine splittability of this specific file. splittable = self.splittable if (splittable and self._compression_type == fileio.CompressionTypes.AUTO): compression_type = fileio.CompressionTypes.detect_compression_type( file_name) if compression_type != fileio.CompressionTypes.UNCOMPRESSED: splittable = False single_file_source = _SingleFileSource( file_based_source_ref, file_name, 0, sizes[index], min_bundle_size=self._min_bundle_size, splittable=splittable) single_file_sources.append(single_file_source) self._concat_source = concat_source.ConcatSource(single_file_sources) return self._concat_source