def test_read_VCF_line(self):
    """Read the example VCF file and check the parsed header and records.

    The first record is verified field by field (variant, filters, INFO
    values, per-sample fields and INFO key ordering); the last record is
    only checked for its variant, to confirm reading continues correctly.
    """
    vcf_path = os.path.join(self.data_dir, "vcf_example.vcf")
    with open(vcf_path, "r") as vcf_file:
        vcf_handler = VCFReader(vcf_file)
        vcf_handler.read_header()
        parsed_header = vcf_handler.header
        self.assertEqual(len(parsed_header.file_metadata), 7)
        self.assertEqual(len(parsed_header.samples), 2)

        records = list(vcf_handler.read_records())
        self.assertEqual(len(records), 2)

        # test first record fully
        first = records[0]
        self.variant_is_equal(first, ("20", 9, set(), "CT", "C"))  # zero-based representation
        self.assertEqual(first.filters, set())
        self.assertEqual(first.passes_filter, True)

        # INFO entries, listed in the order they are expected to be stored
        expected_info = [
            ("PP", [3000]),
            ("DP", [250]),
            ("DPR", [140]),
            ("DPF", [110]),
            ("VC", [100]),
            ("VCR", [49]),
            ("VCF", [51]),
            ("ABPV", [0.2]),
            ("SBPV", [0.3]),
            ("MQ", [70]),
            ("BR", [31]),
            ("QD", [None]),
        ]
        self.assertEqual(len(first.info), 12)
        for info_key, info_value in expected_info:
            self.assertEqual(first.info[info_key], info_value)

        self.assertEqual(first.samples, ['sample1', 'sample2'])
        sample_info = first.sample_info
        self.assertEqual(sample_info.get_field('sample1', "GT"), GenotypeCall("0/1"))
        self.assertEqual(sample_info.get_field('sample2', "GT"), GenotypeCall("1/1"))
        self.assertEqual(sample_info.get_field('sample1', 'PL'), [3000, 0, 3000])
        self.assertEqual(sample_info.get_field('sample2', 'PL'), [114, 0, 0])
        self.assertEqual(sample_info.get_field('sample1', 'GQ'), [1000])
        self.assertEqual(sample_info.get_field('sample2', 'GQ'), [None])

        # check that ordering in the dictionaries is preserved
        expected_keys = [info_key for info_key, _ in expected_info]
        self.assertEqual(list(first.info.keys()), expected_keys)

        # ensure last record is still being read correctly
        self.variant_is_equal(records[-1], ("20", 10, set(), "T", "G"))
def test_should_fail_on_unexpected_EOF(self):
    """read_header must raise 'unexpected EOF' when input ends after the fileformat line."""
    truncated_input = iter(['##fileformat=VCFv4.2\n'])
    vcf_reader = VCFReader(truncated_input)
    with self.assertRaisesRegex(Exception, 'unexpected EOF'):
        print(vcf_reader.read_header())
def test_should_fail_if_column_header_line_is_missing(self):
    """read_header must raise when the fileformat line is followed by a non-header line."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        'the line after the header\n',
    ])
    vcf_reader = VCFReader(header_lines)
    expected_message = 'expected column header line: \'the line after the header\''
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_fail_if_version_is_not_defined(self):
    """read_header must raise when the first line is not a fileformat line."""
    header_lines = iter([
        '##notFileformat=foo\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    vcf_reader = VCFReader(header_lines)
    expected_message = 'unrecognised file format line: \'##notFileformat=foo\''
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_fail_with_unexpected_version(self):
    """read_header must raise 'unexpected version' for the fileformat version 0.0."""
    lines = [
        '##fileformat=VCFv0.0\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]
    reader = VCFReader(iter(lines))
    # assertRaisesRegex interprets its second argument as a regular expression,
    # so the message is escaped; otherwise the dot in '0.0' would match any
    # character and the check would be looser than intended.
    with self.assertRaisesRegex(Exception, re.escape('unexpected version: \'0.0\'')):
        print(reader.read_header())
def test_should_store_header_as_attribute_of_parser(self):
    """The value returned by read_header must equal the reader's header attribute."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    vcf_reader = VCFReader(header_lines)
    returned_header = vcf_reader.read_header()
    self.assertEqual(returned_header, vcf_reader.header)
def test_should_fail_to_parse_malformed_header_line(self):
    """read_header must raise when a '##' line cannot be parsed as a header line."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##malformed line!\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    vcf_reader = VCFReader(header_lines)
    expected_message = 'failed to parse header line: \'##malformed line!\''
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_parse_well_formatted_version(self):
    """A fileformat line plus a column header line parses to a default-constructed Schema."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()
    self.assertEqual(Schema(), parsed_schema)
def test_should_parse_column_headers_with_format_but_no_samples(self):
    """A column header ending in FORMAT with no sample names parses to a default Schema."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()
    self.assertEqual(Schema(), parsed_schema)
def test_should_fail_with_malformed_format_column_header(self):
    """read_header must raise when the column after INFO is not FORMAT."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFOO\n',
    ])
    vcf_reader = VCFReader(header_lines)
    # The expected message contains literal backslash-t sequences, hence the doubled backslashes.
    expected_message = re.escape(
        'expected column header line: \'#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFOO\''
    )
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_parse_column_headers_with_complex_sample_names(self):
    """A sample name containing underscore, hyphen and dot is stored in the schema's samples."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN_TOBY-RHYS.JONES\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.samples = ['OWEN_TOBY-RHYS.JONES']
    self.assertEqual(expected_schema, parsed_schema)
def test_should_fail_without_required_column_headers(self):
    """read_header must raise when the column header line lacks the INFO column."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\n',
    ])
    vcf_reader = VCFReader(header_lines)
    # The expected message contains literal backslash-t sequences, hence the doubled backslashes.
    expected_message = re.escape(
        "expected column header line: '#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER'"
    )
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_parse_valid_filter_header_fields(self):
    """A ##FILTER line yields a schema equal to one built with set_filter(ID, Description)."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=key,Description="description">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.set_filter('key', 'description')
    self.assertEqual(expected_schema, parsed_schema)
def test_should_parse_well_formatted_file_metadata(self):
    """A '##key=value' line is recorded in the schema's file_metadata mapping."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##fileDate=2013-07-08\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.file_metadata['fileDate'] = '2013-07-08'
    self.assertEqual(expected_schema, parsed_schema)
def test_should_parse_valid_contig_header_fields(self):
    """A ##contig line yields a schema equal to one built with set_contig(ID, length)."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##contig=<ID=key,length=666>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.set_contig('key', 666)
    self.assertEqual(expected_schema, parsed_schema)
def test_should_parse_valid_sample_header_fields(self):
    """A ##FORMAT line yields a schema equal to one built with set_sample_data."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##FORMAT=<ID=key,Number=1,Type=String,Description="description">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.set_sample_data('key', '1', 'String', 'description')
    self.assertEqual(expected_schema, parsed_schema)
def test_should_not_parse_column_headers_with_sample_names_containing_white_space(self):
    """read_header must raise when a sample name in the column header contains a space."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN JONES\n',
    ])
    vcf_reader = VCFReader(header_lines)
    # The expected message contains literal backslash-t sequences, hence the doubled backslashes.
    expected_message = re.escape(
        'expected column header line: '
        '\'#CHROM\\tPOS\\tID\\tREF\\tALT\\tQUAL\\tFILTER\\tINFO\\tFORMAT\\tOWEN JONES\''
    )
    with self.assertRaisesRegex(Exception, expected_message):
        print(vcf_reader.read_header())
def test_should_parse_all_info_header_fields(self):
    """An ##INFO line with Source and Version yields a schema equal to one built with set_info_data."""
    header_lines = iter([
        '##fileformat=VCFv4.2\n',
        '##INFO=<ID=key,Number=1,Type=String,Description="description",Source="foo",Version="bar">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ])
    parsed_schema = VCFReader(header_lines).read_header()

    expected_schema = Schema()
    expected_schema.set_info_data('key', '1', 'String', 'description', 'foo', 'bar')
    self.assertEqual(expected_schema, parsed_schema)