def test_invalid_file(self):
    """Check that reading invalid VCF content raises ``ValueError``.

    Every content obtained from ``_get_invalid_file_contents`` is first
    written to its own temporary file and read individually. Afterwards all
    of them are written into a single temporary directory and read together
    through a ``*.vcf`` pattern.
    """
    invalid_file_contents = self._get_invalid_file_contents()

    # One file at a time: each read on its own must raise.
    for bad_content in chain.from_iterable(invalid_file_contents):
        with TempDir() as tempdir, self.assertRaises(ValueError):
            bad_file = self._create_temp_vcf_file(bad_content, tempdir)
            self._read_records(bad_file)
            # Only reached when the read above did not raise.
            self.fail('Invalid VCF file must throw an exception')

    # Try with multiple files (any one of them will throw an exception).
    with TempDir() as tempdir, self.assertRaises(ValueError):
        for bad_content in chain.from_iterable(invalid_file_contents):
            self._create_temp_vcf_file(bad_content, tempdir)
        pattern = os.path.join(tempdir.get_path(), '*.vcf')
        self._read_records(pattern)
def test_allow_malformed_records(self):
    """Check reads done with ``allow_malformed_records=True``.

    Contents with invalid records are read without expecting any exception,
    whereas contents with invalid headers are still expected to raise
    ``ValueError``.
    """
    def read_leniently(content, tempdir):
        # Write the content to a temp VCF file and read it back leniently.
        path = self._create_temp_vcf_file(content, tempdir)
        return self._read_records(path, allow_malformed_records=True)

    invalid_records, invalid_headers = self._get_invalid_file_contents()

    # Invalid records should not raise errors
    for record_content in invalid_records:
        with TempDir() as tempdir:
            read_leniently(record_content, tempdir)

    # Invalid headers should still raise errors
    for header_content in invalid_headers:
        with TempDir() as tempdir, self.assertRaises(ValueError):
            read_leniently(header_content, tempdir)
def test_read_reentrant_without_splitting(self):
    """Run the reentrant-read check on an unsplit ``VcfSource``.

    The source is built over a temp file holding the sample header and text
    lines and is passed with ``None`` start and stop positions.
    """
    with TempDir() as tempdir:
        sample_lines = _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES
        vcf_path = self._create_temp_vcf_file(sample_lines, tempdir)
        unsplit_source = VcfSource(vcf_path)
        source_info = (unsplit_source, None, None)
        source_test_utils.assert_reentrant_reads_succeed(source_info)
def _create_temp_file_and_return_records_with_file_name(
        self, lines, representative_header_lines=None,
        sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH):
    """Write ``lines`` to a temp ``.vcf`` file and read it back.

    Args:
        lines: Lines to write into the temporary file.
        representative_header_lines: Forwarded positionally to
            ``_read_records`` as its second argument.
        sample_name_encoding: Forwarded positionally to ``_read_records``
            as its third argument.

    Returns:
        A ``(records, file_name)`` tuple: the result of ``_read_records``
        and the path of the temp file that was read. The tuple is returned
        from inside the ``TempDir`` context, so the path presumably no
        longer exists once the caller receives it -- confirm against
        ``TempDir`` semantics.
    """
    with TempDir() as tempdir:
        vcf_path = tempdir.create_temp_file(suffix='.vcf', lines=lines)
        records = self._read_records(
            vcf_path, representative_header_lines, sample_name_encoding)
        return records, vcf_path
def _create_temp_file_and_read_records(
        self, lines, representative_header_lines=None,
        vcf_parser_type=VcfParserType.PYVCF):
    """Write ``lines`` to a temp ``.vcf`` file and return what is read back.

    Args:
        lines: Lines to write into the temporary file.
        representative_header_lines: Forwarded positionally to
            ``_read_records`` as its second argument.
        vcf_parser_type: Forwarded positionally to ``_read_records`` as its
            third argument.

    Returns:
        Whatever ``_read_records`` returns for the temporary file.
    """
    with TempDir() as tempdir:
        vcf_path = tempdir.create_temp_file(suffix='.vcf', lines=lines)
        records = self._read_records(
            vcf_path, representative_header_lines, vcf_parser_type)
        return records
def test_allow_malformed_records(self):
    """Check that invalid records are readable when malformed ones are allowed.

    Each content from ``_get_invalid_file_contents`` is written to its own
    temp file and read with ``allow_malformed_records=True``; no exception
    is expected.
    """
    invalid_records = self._get_invalid_file_contents()

    # Invalid records should not raise errors
    for record_content in invalid_records:
        with TempDir() as tempdir:
            vcf_path = self._create_temp_vcf_file(record_content, tempdir)
            self._read_records(vcf_path, allow_malformed_records=True)
def test_dynamic_work_rebalancing(self):
    """Exhaustively check split-at-fraction behaviour of a ``VcfSource``.

    A source over the sample header and text lines is split with a desired
    bundle size of 100000. The test requires exactly one resulting split,
    which is then handed to
    ``source_test_utils.assert_split_at_fraction_exhaustive``.
    """
    with TempDir() as tempdir:
        file_name = self._create_temp_vcf_file(
            _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
        source = VcfSource(file_name)
        splits = list(source.split(desired_bundle_size=100000))
        # Use the TestCase assertion rather than a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertEqual(1, len(splits))
        only_split = splits[0]
        source_test_utils.assert_split_at_fraction_exhaustive(
            only_split.source,
            only_split.start_position,
            only_split.stop_position)
def test_read_reentrant_after_splitting(self):
    """Run the reentrant-read check on the single split of a ``VcfSource``.

    A source over the sample header and text lines is split with a desired
    bundle size of 100000. The test requires exactly one resulting split,
    whose source and positions are then passed to
    ``source_test_utils.assert_reentrant_reads_succeed``.
    """
    with TempDir() as tempdir:
        file_name = self._create_temp_vcf_file(
            _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
        source = VcfSource(file_name)
        splits = list(source.split(desired_bundle_size=100000))
        # Use the TestCase assertion rather than a bare ``assert``: it is not
        # stripped under ``python -O`` and reports both values on failure.
        self.assertEqual(1, len(splits))
        only_split = splits[0]
        source_test_utils.assert_reentrant_reads_succeed(
            (only_split.source,
             only_split.start_position,
             only_split.stop_position))
def test_pipeline_read_all_multiple_files(self):
    """Read two identical sample files through ``ReadAllFromVcf``.

    Both files hold the sample header and text lines; the pipeline output
    is asserted to contain twice ``len(_SAMPLE_TEXT_LINES)`` elements.
    """
    with TempDir() as tempdir:
        # Two separate files with the same content.
        file_names = [
            self._create_temp_vcf_file(
                _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES, tempdir)
            for _ in range(2)
        ]
        pipeline = TestPipeline()
        created = pipeline | 'Create' >> beam.Create(file_names)
        pcoll = created | 'Read' >> ReadAllFromVcf()
        expected_count = 2 * len(_SAMPLE_TEXT_LINES)
        assert_that(pcoll, asserts.count_equals_to(expected_count))
        # Run inside the TempDir context, where the files were created.
        pipeline.run()
def test_file_pattern_verify_details(self):
    """Read two temp files via a ``*.vcf`` pattern and compare variants.

    The first file holds ``VCF_LINE_1`` and the second ``VCF_LINE_2`` and
    ``VCF_LINE_3`` (each after the sample header). Three records are
    expected, matching the three sample variants.
    """
    expected_variants = [
        _get_sample_variant_1(),
        _get_sample_variant_2(),
        _get_sample_variant_3(),
    ]
    with TempDir() as tempdir:
        first_file_lines = _SAMPLE_HEADER_LINES + [VCF_LINE_1]
        self._create_temp_vcf_file(first_file_lines, tempdir)
        second_file_lines = _SAMPLE_HEADER_LINES + [VCF_LINE_2, VCF_LINE_3]
        self._create_temp_vcf_file(second_file_lines, tempdir)

        pattern = os.path.join(tempdir.get_path(), '*.vcf')
        read_data = self._read_records(pattern)
        self.assertEqual(3, len(read_data))
        self._assert_variants_equal(expected_variants, read_data)
def test_file_pattern_verify_details_nucleus(self):
    """Read two temp files via a ``*.vcf`` pattern with the Nucleus parser.

    The sample variants and their VCF lines are requested with
    ``is_for_nucleus=True``. The files are written after the Nucleus header
    lines and read with ``vcf_parser_type=VcfParserType.NUCLEUS``. Three
    records are expected, matching the three sample variants.
    """
    variant_1, vcf_line_1 = _get_sample_variant_1(is_for_nucleus=True)
    variant_2, vcf_line_2 = _get_sample_variant_2(is_for_nucleus=True)
    variant_3, vcf_line_3 = _get_sample_variant_3(is_for_nucleus=True)
    expected_variants = [variant_1, variant_2, variant_3]

    with TempDir() as tempdir:
        first_file_lines = _NUCLEUS_HEADER_LINES + [vcf_line_1]
        self._create_temp_vcf_file(first_file_lines, tempdir)
        second_file_lines = _NUCLEUS_HEADER_LINES + [vcf_line_2, vcf_line_3]
        self._create_temp_vcf_file(second_file_lines, tempdir)

        pattern = os.path.join(tempdir.get_path(), '*.vcf')
        read_data = self._read_records(
            pattern, vcf_parser_type=VcfParserType.NUCLEUS)
        self.assertEqual(3, len(read_data))
        self._assert_variants_equal(expected_variants, read_data)
def test_single_file_1_based_verify_details(self):
    """Read one temp file with ``use_1_based_coordinate=True``.

    The file holds the sample header followed by ``VCF_LINE_1``. Exactly one
    record is expected, equal to sample variant 1 built with the same
    1-based flag.
    """
    expected_variant = _get_sample_variant_1(use_1_based_coordinate=True)
    with TempDir() as tempdir:
        vcf_path = tempdir.create_temp_file(
            suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
        source = VcfSource(
            vcf_path,
            representative_header_lines=None,
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
            use_1_based_coordinate=True)
        read_data = source_test_utils.read_from_source(source)

    self.assertEqual(1, len(read_data))
    self.assertEqual(expected_variant, read_data[0])
def test_file_pattern_1_based_verify_details(self):
    """Read two temp files via a ``*.vcf`` pattern with 1-based coordinates.

    The first file holds ``VCF_LINE_1`` and the second ``VCF_LINE_2`` and
    ``VCF_LINE_3`` (each after the sample header). Three records are
    expected, matching the sample variants built with
    ``use_1_based_coordinate=True``.
    """
    expected_variants = [
        _get_sample_variant_1(use_1_based_coordinate=True),
        _get_sample_variant_2(use_1_based_coordinate=True),
        _get_sample_variant_3(use_1_based_coordinate=True),
    ]
    with TempDir() as tempdir:
        # The returned file names are not needed; the pattern finds the files.
        tempdir.create_temp_file(
            suffix='.vcf', lines=_SAMPLE_HEADER_LINES + [VCF_LINE_1])
        tempdir.create_temp_file(
            suffix='.vcf',
            lines=_SAMPLE_HEADER_LINES + [VCF_LINE_2, VCF_LINE_3])

        pattern = os.path.join(tempdir.get_path(), '*.vcf')
        source = VcfSource(
            pattern,
            representative_header_lines=None,
            sample_name_encoding=SampleNameEncoding.WITHOUT_FILE_PATH,
            use_1_based_coordinate=True)
        read_data = source_test_utils.read_from_source(source)
        self.assertEqual(3, len(read_data))
        self._assert_variants_equal(expected_variants, read_data)
def test_pipeline_read_all_single_file(self):
    """Check the record count for one sample file with ``use_read_all=True``.

    The file holds the sample header and text lines; the expected count is
    ``len(_SAMPLE_TEXT_LINES)``.
    """
    with TempDir() as tempdir:
        sample_lines = _SAMPLE_HEADER_LINES + _SAMPLE_TEXT_LINES
        vcf_path = self._create_temp_vcf_file(sample_lines, tempdir)
        expected_count = len(_SAMPLE_TEXT_LINES)
        self._assert_pipeline_read_files_record_count_equal(
            vcf_path, expected_count, use_read_all=True)
def _create_temp_file_and_read_records(self, lines,
                                       representative_header_lines=None):
    """Write ``lines`` to a temp ``.vcf`` file and return what is read back.

    Args:
        lines: Lines to write into the temporary file.
        representative_header_lines: Forwarded positionally to
            ``_read_records`` as its second argument.

    Returns:
        Whatever ``_read_records`` returns for the temporary file.
    """
    with TempDir() as tempdir:
        vcf_path = tempdir.create_temp_file(suffix='.vcf', lines=lines)
        records = self._read_records(vcf_path, representative_header_lines)
        return records
def _create_temp_file_and_read_records(self, lines):
    """Write ``lines`` to a temp ``.vcf`` file and return what is read back.

    Args:
        lines: Lines to write into the temporary file.

    Returns:
        Whatever ``_read_records`` returns for the temporary file.
    """
    with TempDir() as tempdir:
        vcf_path = tempdir.create_temp_file(suffix='.vcf', lines=lines)
        records = self._read_records(vcf_path)
        return records