def test_read_tbi(self):
  """Splitting an empty bgzf file yields no blocks; Y.vcf.bgz yields 19."""
  empty_path = testdata_util.get_full_file_path('empty.vcf.gz')
  empty_blocks = bgzf_io.split_bgzf(empty_path)
  self.assertEqual(len(list(empty_blocks)), 0)
  y_path = testdata_util.get_full_file_path('Y.vcf.bgz')
  y_blocks = bgzf_io.split_bgzf(y_path)
  self.assertEqual(len(list(y_blocks)), 19)
def test_pipeline_read_all_multiple_files_large(self):
  """Reading three VCF test files through ReadAllFromVcf yields 9900 records."""
  file_names = [
      testdata_util.get_full_file_path(name)
      for name in ('valid-4.0.vcf', 'valid-4.1-large.vcf', 'valid-4.2.vcf')]
  pipeline = TestPipeline()
  pcoll = (pipeline
           | 'Create' >> beam.Create(file_names)
           | 'Read' >> ReadAllFromVcf())
  assert_that(pcoll, asserts.count_equals_to(9900))
  pipeline.run()
def test_read_single_file_large(self):
  """Each test file parses into the expected number of VCF records.

  Covers plain, gzip- and bzip2-compressed inputs.
  """
  # Fixed typo in the local name: 'conifgs' -> 'configs'.
  test_data_configs = [
      {'file': 'valid-4.0.vcf', 'num_records': 5},
      {'file': 'valid-4.0.vcf.gz', 'num_records': 5},
      {'file': 'valid-4.0.vcf.bz2', 'num_records': 5},
      {'file': 'valid-4.1-large.vcf', 'num_records': 9882},
      {'file': 'valid-4.2.vcf', 'num_records': 13},
  ]
  for config in test_data_configs:
    read_data = self._read_records(
        testdata_util.get_full_file_path(config['file']))
    self.assertEqual(config['num_records'], len(read_data))
def test_read_variants_large_mode(self):
  """read_variants in LARGE mode reads all 5 records of valid-4.0.vcf."""
  all_patterns = [testdata_util.get_full_file_path('valid-4.0.vcf')]
  pipeline = test_pipeline.TestPipeline()
  variants = pipeline_common.read_variants(
      pipeline, all_patterns, PipelineModes.LARGE, False)
  assert_that(variants, asserts.count_equals_to(5))
  pipeline.run()
def test_read_single_file_large(self):
  """VcfHeaderSource extracts the expected INFO and FORMAT header counts.

  Covers plain, gzip- and bzip2-compressed inputs.
  """
  # Fixed typo in the local name: 'conifgs' -> 'configs'.
  test_data_configs = [
      {'file': 'valid-4.0.vcf', 'num_infos': 6, 'num_formats': 4},
      {'file': 'valid-4.0.vcf.gz', 'num_infos': 6, 'num_formats': 4},
      {'file': 'valid-4.0.vcf.bz2', 'num_infos': 6, 'num_formats': 4},
      {'file': 'valid-4.1-large.vcf', 'num_infos': 21, 'num_formats': 33},
      {'file': 'valid-4.2.vcf', 'num_infos': 8, 'num_formats': 5},
  ]
  for config in test_data_configs:
    read_data = source_test_utils.read_from_source(
        VcfHeaderSource(
            testdata_util.get_full_file_path(config['file'])))
    self.assertEqual(config['num_infos'], len(read_data[0].infos))
    self.assertEqual(config['num_formats'], len(read_data[0].formats))
def test_read_variants_use_1_based_coordinate(self):
  """read_variants with 1-based coordinates still yields all 5 records."""
  all_patterns = [testdata_util.get_full_file_path('valid-4.0.vcf')]
  pipeline = test_pipeline.TestPipeline()
  variants = pipeline_common.read_variants(
      pipeline,
      all_patterns,
      PipelineModes.SMALL,
      False,
      use_1_based_coordinate=True)
  assert_that(variants, asserts.count_equals_to(5))
  pipeline.run()
def setUp(self):
  """Loads a bgzf test file and serves it through a fake GCS client."""
  with open(testdata_util.get_full_file_path('Y.vcf.bgz'),
            mode='rb') as file_to_read:
    # read() returns the whole file directly; the original
    # readlines() + b''.join() built an equivalent bytes object via a
    # throwaway list of lines.
    self._data = file_to_read.read()
  self.client = gcsio_test.FakeGcsClient()
  self.gcs = gcsio.GcsIO(self.client)
  self._file_name = 'gs://bucket/test'
  bucket, name = gcsio.parse_gcs_path(self._file_name)
  self.client.objects.add_file(
      gcsio_test.FakeFile(bucket, name, self._data, 1))
def test_read_after_splitting(self):
  """Reading every split bundle recovers all 9882 records of the file."""
  source = VcfSource(
      testdata_util.get_full_file_path('valid-4.1-large.vcf'))
  splits = list(source.split(desired_bundle_size=500))
  self.assertGreater(len(splits), 1)
  sources_info = [
      (split.source, split.start_position, split.stop_position)
      for split in splits]
  self.assertGreater(len(sources_info), 1)
  split_records = []
  for source_info in sources_info:
    split_records.extend(source_test_utils.read_from_source(*source_info))
  self.assertEqual(9882, len(split_records))
def test_read_single_file_large(self):
  """VcfEstimateSource reports the expected variant-count and size estimates.

  Covers plain, gzip- and bzip2-compressed inputs.
  """
  # Fixed typo in the local name: 'conifgs' -> 'configs'.
  test_data_configs = [
      {'file': 'valid-4.0.vcf', 'variant_count': 4, 'size': 1500},
      {'file': 'valid-4.0.vcf.gz', 'variant_count': 13, 'size': 1454},
      {'file': 'valid-4.0.vcf.bz2', 'variant_count': 14, 'size': 1562},
      {'file': 'valid-4.1-large.vcf', 'variant_count': 14425, 'size': 832396},
      {'file': 'valid-4.1-large.vcf.gz', 'variant_count': 5498,
       'size': 313430},
      {'file': 'valid-4.2.vcf', 'variant_count': 10, 'size': 3195},
  ]
  for config in test_data_configs:
    read_data = source_test_utils.read_from_source(
        VcfEstimateSource(
            testdata_util.get_full_file_path(config['file'])))
    self.assertEqual(config['variant_count'],
                     int(read_data[0].estimated_variant_count))
    self.assertEqual(config['size'], read_data[0].size_in_bytes)
def test_pipeline_read_all_single_file_large(self):
  """ReadAll path on the large single file yields 9882 records."""
  file_path = testdata_util.get_full_file_path('valid-4.1-large.vcf')
  self._assert_pipeline_read_files_record_count_equal(
      file_path, 9882, use_read_all=True)
def test_gz(self):
  """Tests successfully parsing gz files."""
  header_fields = vcf_header_parser.get_vcf_headers(
      testdata_util.get_full_file_path('valid-4.0.vcf.gz'))
  # A successful parse surfaces more than one INFO and FORMAT entry.
  self.assertGreater(len(header_fields.infos), 1)
  self.assertGreater(len(header_fields.formats), 1)
def test_get_block_offsets(self):
  """The tabix index of Y.vcf.bgz decodes to 108 block offsets."""
  tbi_path = testdata_util.get_full_file_path('Y.vcf.bgz.tbi')
  blocks = bgzf_io._get_block_offsets(tbi_path)
  self.assertEqual(len(blocks), 108)