def test_get_splittable_bgzf(self):
  """_get_splittable_bgzf keeps only gs:// BGZF files with an index file.

  Verifies three cases: non-GCS inputs are rejected, gs://*.bgz files are
  returned when their index exists, and dropped when it does not.
  """
  # Non-GCS paths are never considered splittable, regardless of extension.
  non_gs_metadata_list = [
      filesystem.FileMetadata(path, size)
      for path, size in [('1.vcf', 100), ('2.vcf', 100)]]
  non_gs_match = filesystem.MatchResult('non_gs', non_gs_metadata_list)
  with mock.patch.object(FileSystems, 'match', return_value=[non_gs_match]):
    self.assertEqual(pipeline_common._get_splittable_bgzf(['non_gs']), [])

  # GCS BGZF files: splittable only when the companion index file exists.
  gs_metadata_list = [
      filesystem.FileMetadata(path, size)
      for path, size in [('gs://1.vcf.bgz', 100), ('gs://2.vcf.bgz', 100)]]
  gs_match = filesystem.MatchResult('gs', gs_metadata_list)
  with mock.patch.object(FileSystems, 'match', return_value=[gs_match]):
    with mock.patch.object(FileSystems, 'exists', return_value=True):
      self.assertEqual(
          pipeline_common._get_splittable_bgzf(['index file exists']),
          ['gs://1.vcf.bgz', 'gs://2.vcf.bgz'])
    with mock.patch.object(FileSystems, 'exists', return_value=False):
      self.assertEqual(
          pipeline_common._get_splittable_bgzf(['no index file']), [])
def _get_file_metadata_list(self):
  """Return a fixed fixture of FileMetadata entries, all with size 10.

  The list interleaves three 'count_100000' paths with four 'count_1'
  paths, preserving the exact order callers depend on.
  """
  paths = [
      'gs://bucket/count_100000',
      'gs://bucket/count_1',
      'gs://bucket/count_100000',
      'gs://bucket/count_1',
      'gs://bucket/count_100000',
      'gs://bucket/count_1',
      'gs://bucket/count_1',
  ]
  return [filesystem.FileMetadata(path, 10) for path in paths]
def test_get_compression_type(self):
  """get_compression_type resolves AUTO/GZIP and rejects mixed inputs."""
  # Uniformly uncompressed files resolve to AUTO.
  vcf_metadata_list = [
      filesystem.FileMetadata(path, size)
      for path, size in [('gs://1.vcf', 100), ('2.vcf', 100)]]
  vcf_match = filesystem.MatchResult('vcf', vcf_metadata_list)
  with mock.patch.object(FileSystems, 'match', return_value=[vcf_match]):
    self.assertEqual(
        pipeline_common.get_compression_type(['vcf']),
        filesystem.CompressionTypes.AUTO)

  # Uniformly gzipped files resolve to GZIP.
  gzip_metadata_list = [
      filesystem.FileMetadata(path, size)
      for path, size in [('gs://1.vcf.gz', 100), ('2.vcf.gz', 100)]]
  gzip_match = filesystem.MatchResult('gzip', gzip_metadata_list)
  with mock.patch.object(FileSystems, 'match', return_value=[gzip_match]):
    self.assertEqual(
        pipeline_common.get_compression_type('gzip'),
        filesystem.CompressionTypes.GZIP)

  # Mixing compressed and uncompressed files is an error.
  mixed_metadata_list = [
      filesystem.FileMetadata(path, size)
      for path, size in [('gs://1.vcf.gz', 100), ('2.vcf', 100)]]
  mixed_match = filesystem.MatchResult('mixed', mixed_metadata_list)
  with mock.patch.object(FileSystems, 'match', return_value=[mixed_match]):
    with self.assertRaises(ValueError):
      pipeline_common.get_compression_type('mixed')
def process(self, file_metadata):
  """Wrap a matched path or FileMetadata into a ReadableFile.

  Args:
    file_metadata: either a path string or a filesystem.FileMetadata.
      Strings are wrapped into a FileMetadata with size 0.

  Yields:
    A ReadableFile for non-directory entries.

  Raises:
    BeamIOError: if the path is a directory ('/'-terminated) and
      ``_skip_directories`` is False.
  """
  # NOTE(review): `unicode` only exists on Python 2 — this branch appears to
  # be py2-compatibility code; confirm before running under Python 3.
  metadata = (filesystem.FileMetadata(file_metadata, 0)
              if isinstance(file_metadata, (str, unicode))
              else file_metadata)
  if metadata.path.endswith('/'):
    if self._skip_directories:
      return
    # Fixed: the two concatenated literals previously rendered as
    # "...transform.Found %s." (missing separator space).
    raise BeamIOError(
        'Directories are not allowed in ReadMatches transform. '
        'Found %s.' % metadata.path)
  # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
  yield ReadableFile(metadata)
def process(
    self,
    file_metadata: Union[str, filesystem.FileMetadata],
) -> Iterable[ReadableFile]:
  """Wrap a matched path or FileMetadata into a ReadableFile.

  Args:
    file_metadata: either a path string or a filesystem.FileMetadata.
      Strings are wrapped into a FileMetadata with size 0.

  Yields:
    A ReadableFile (carrying ``self._compression``) for non-directory
    entries.

  Raises:
    BeamIOError: if the path is a directory ('/' or '\\' terminated) and
      ``_skip_directories`` is False.
  """
  metadata = (filesystem.FileMetadata(file_metadata, 0)
              if isinstance(file_metadata, str)
              else file_metadata)
  # endswith accepts a tuple, replacing the duplicated `or` checks.
  if metadata.path.endswith(('/', '\\')):
    if self._skip_directories:
      return
    # Fixed: the two concatenated literals previously rendered as
    # "...transform.Found %s." (missing separator space).
    raise BeamIOError(
        'Directories are not allowed in ReadMatches transform. '
        'Found %s.' % metadata.path)
  # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
  yield ReadableFile(metadata, self._compression)