Exemple #1
0
    def test_get_splittable_bgzf(self):
        """Exercise `_get_splittable_bgzf` with mocked `FileSystems` calls.

        Three scenarios are checked:
          * matched paths are plain `.vcf` files without a `gs://` prefix:
            an empty list is expected;
          * matched paths are `gs://...vcf.bgz` and `FileSystems.exists`
            reports True: both paths are expected back;
          * same matches but `FileSystems.exists` reports False: an empty
            list is expected.
        """
        # Scenario 1: non-GCS, non-bgzipped matches.
        local_files = [
            filesystem.FileMetadata('1.vcf', 100),
            filesystem.FileMetadata('2.vcf', 100),
        ]
        local_match = [filesystem.MatchResult('non_gs', local_files)]
        with mock.patch.object(FileSystems, 'match', return_value=local_match):
            splittable = pipeline_common._get_splittable_bgzf(['non_gs'])
            self.assertEqual(splittable, [])

        # Scenarios 2 and 3 share the same mocked match result.
        bgz_paths = ['gs://1.vcf.bgz', 'gs://2.vcf.bgz']
        gs_files = [filesystem.FileMetadata(path, 100) for path in bgz_paths]
        gs_match = [filesystem.MatchResult('gs', gs_files)]
        with mock.patch.object(FileSystems, 'match', return_value=gs_match):
            # `exists` is presumably consulted for the companion index file
            # (judging by the pattern labels used below) -- verify against
            # pipeline_common.
            with mock.patch.object(FileSystems, 'exists', return_value=True):
                splittable = pipeline_common._get_splittable_bgzf(
                    ['index file exists'])
                self.assertEqual(splittable,
                                 ['gs://1.vcf.bgz', 'gs://2.vcf.bgz'])

            with mock.patch.object(FileSystems, 'exists', return_value=False):
                splittable = pipeline_common._get_splittable_bgzf(
                    ['no index file'])
                self.assertEqual(splittable, [])
Exemple #2
0
 def _get_file_metadata_list(self):
   """Return seven `FileMetadata` fixtures (size 10) over two bucket paths.

   The order alternates between the `count_100000` and `count_1` paths three
   times and finishes with one extra `count_1` entry.
   """
   large = 'gs://bucket/count_100000'
   small = 'gs://bucket/count_1'
   ordered_paths = (large, small, large, small, large, small, small)
   return [filesystem.FileMetadata(path, 10) for path in ordered_paths]
Exemple #3
0
    def test_get_compression_type(self):
        """Check `get_compression_type` for plain, gzipped and mixed inputs.

        `FileSystems.match` is mocked in each scenario so that the matched
        files are fully controlled by the test:
          * only `.vcf` files: `CompressionTypes.AUTO` is expected;
          * only `.vcf.gz` files: `CompressionTypes.GZIP` is expected;
          * a mix of `.vcf.gz` and `.vcf`: a `ValueError` is expected.

        Every call passes its pattern as a single-element list so all three
        scenarios use the same argument shape. A bare string would be walked
        character by character by any callee that loops over its patterns,
        which only goes unnoticed because `match` is mocked.
        """
        vcf_metadata_list = [
            filesystem.FileMetadata(path, size)
            for (path, size) in [('gs://1.vcf', 100), ('2.vcf', 100)]
        ]
        vcf_match = [filesystem.MatchResult('vcf', vcf_metadata_list)]
        with mock.patch.object(FileSystems, 'match', return_value=vcf_match):
            self.assertEqual(pipeline_common.get_compression_type(['vcf']),
                             filesystem.CompressionTypes.AUTO)

        gzip_metadata_list = [
            filesystem.FileMetadata(path, size)
            for (path, size) in [('gs://1.vcf.gz', 100), ('2.vcf.gz', 100)]
        ]
        gzip_match = [filesystem.MatchResult('gzip', gzip_metadata_list)]
        with mock.patch.object(FileSystems, 'match', return_value=gzip_match):
            self.assertEqual(pipeline_common.get_compression_type(['gzip']),
                             filesystem.CompressionTypes.GZIP)

        mixed_metadata_list = [
            filesystem.FileMetadata(path, size)
            for (path, size) in [('gs://1.vcf.gz', 100), ('2.vcf', 100)]
        ]
        mixed_match = [filesystem.MatchResult('mixed', mixed_metadata_list)]
        with mock.patch.object(FileSystems, 'match', return_value=mixed_match):
            # Files with differing compression must be rejected.
            with self.assertRaises(ValueError):
                pipeline_common.get_compression_type(['mixed'])
Exemple #4
0
    def process(self, file_metadata):
        """Yield a `ReadableFile` for one matched entry, rejecting directories.

        Args:
            file_metadata: Either a path string or an object exposing a
                ``path`` attribute. A string is wrapped in
                ``filesystem.FileMetadata`` with a size of 0.

        Yields:
            A single ``ReadableFile`` built from the metadata.

        Raises:
            BeamIOError: If the path ends with '/' and
                ``self._skip_directories`` is falsy.
        """
        metadata = (filesystem.FileMetadata(file_metadata, 0) if isinstance(
            file_metadata, (str, unicode)) else file_metadata)

        # A trailing slash marks a directory: either skip it silently or
        # fail, depending on how the transform was configured.
        if metadata.path.endswith('/'):
            if self._skip_directories:
                return
            # The trailing space keeps the two sentences from running
            # together in the rendered message.
            raise BeamIOError(
                'Directories are not allowed in ReadMatches transform. '
                'Found %s.' % metadata.path)

        # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
        yield ReadableFile(metadata)
Exemple #5
0
    def process(
        self,
        file_metadata: Union[str, filesystem.FileMetadata],
    ) -> Iterable[ReadableFile]:
        """Yield a `ReadableFile` for one matched entry, rejecting directories.

        Args:
            file_metadata: Either a path string or a ``FileMetadata``. A
                string is wrapped in ``filesystem.FileMetadata`` with a size
                of 0.

        Yields:
            A single ``ReadableFile`` built from the metadata and
            ``self._compression``.

        Raises:
            BeamIOError: If the path ends with '/' or a backslash and
                ``self._skip_directories`` is falsy.
        """
        metadata = (filesystem.FileMetadata(file_metadata, 0) if isinstance(
            file_metadata, str) else file_metadata)

        # A trailing separator (forward slash or backslash) marks a
        # directory: either skip it silently or fail, depending on how the
        # transform was configured.
        if metadata.path.endswith(('/', '\\')):
            if self._skip_directories:
                return
            # The trailing space keeps the two sentences from running
            # together in the rendered message.
            raise BeamIOError(
                'Directories are not allowed in ReadMatches transform. '
                'Found %s.' % metadata.path)

        # TODO: Mime type? Other arguments? Maybe arguments passed in to transform?
        yield ReadableFile(metadata, self._compression)