def test_doesnt_give_a_flying_damn_about_spurious_filter_header(self):
    """Genotyping input whose FILTER ID is pure punctuation must not break calling.

    Builds and indexes a genotype VCF holding one record that carries a
    filter named with unusual characters, hands its compressed file to the
    driver through ``with_genotype_alleles`` and expects the variant to be
    reported as 1/1 for the sample.
    """
    contig = "22"
    snp = Variant(contig, 11, "A", "C")

    # Filter ID deliberately packed with characters that could upset a parser.
    weird_filter = '.+-*\\/~@?!%^&><=\"\'(){}[]_|'
    genotyping_schema = Schema()
    genotyping_schema.set_filter(weird_filter, 'unusual characters')

    genotype_vcf = VCFBuilder(
        join(self.work_dir, "genotype.vcf"),
        schema=genotyping_schema,
    )
    genotype_vcf.with_record_from_variant(snp, filters={weird_filter})
    genotype_vcf.build().index()

    svc = SVCDriver(self)
    sample = "bobs_your_uncle"
    (
        svc
        .with_ref_sequence("ACGCCCCCTGCAAAAAAAAAA", chrom=contig, pos_from=0)
        .with_read(
            "...........C.........",
            n_fwd=5,
            n_rev=5,
            chrom=contig,
            sample_name=sample,
        )
        .with_genotype_alleles(genotype_vcf.compressed_filename)
    )

    outcome = svc.call(expected_success=True)

    (
        outcome
        .with_output_vcf()
        .has_record_for_variant(snp)
        .with_sample(sample)
        .has_genotype("1/1")
    )
def __init__(self, filename, schema=None):
    """Set up a builder that targets ``filename``.

    Args:
        filename: path handed to the ``TabixIndexer`` (created in "vcf" mode)
            and remembered on the instance.
        schema: schema to expose as ``self.schema``; a fresh ``Schema()`` is
            created when ``None`` is given.
    """
    self.__filename = filename
    self.__indexer = TabixIndexer(filename, "vcf")
    # Compare against None explicitly so that any supplied schema is kept as is.
    self.schema = Schema() if schema is None else schema
    self.__records = []
def test_should_return_true_if_no_GL_or_PL_present(self):
    """A record whose FORMAT column is just GT reports that it has no likelihoods."""
    gt_only_schema = Schema()
    gt_only_schema.set_sample_data('GT', '1', 'String', '')
    gt_only_schema.samples = ['foo']

    vcf_columns = ['chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT', '0/1']
    parsed = list(generate_records(gt_only_schema, vcf_columns))

    first_sample_info = parsed[0].sample_info
    self.assertTrue(first_sample_info.has_no_likelihoods())
def test_should_write_filter_in_expected_format(self):
    """A filter set on the schema is written as a ``##FILTER`` header line."""
    sink = StringIO()
    schema_with_filter = Schema()
    schema_with_filter.set_filter('key', 'a filter')

    header_writer = VCFWriter(sink)
    header_writer.write_header(schema_with_filter)

    expected_lines = (
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=key,Description="a filter">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    )
    self.assertEqual(''.join(expected_lines), sink.getvalue())
def test_should_write_sample_data_in_expected_format(self):
    """Sample data set on the schema is written as a ``##FORMAT`` header line."""
    sink = StringIO()
    schema_with_format = Schema()
    schema_with_format.set_sample_data('key', '1', 'String', 'a sample field')

    header_writer = VCFWriter(sink)
    header_writer.write_header(schema_with_format)

    expected_lines = (
        '##fileformat=VCFv4.2\n',
        '##FORMAT=<ID=key,Number=1,Type=String,Description="a sample field">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    )
    self.assertEqual(''.join(expected_lines), sink.getvalue())
def test_should_parse_column_headers_with_complex_sample_names(self):
    """A sample name containing '_', '-' and '.' is read from the column header line."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tOWEN_TOBY-RHYS.JONES\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.samples = ['OWEN_TOBY-RHYS.JONES']
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_write_contig_in_expected_format(self):
    """A contig set on the schema is written as a ``##contig`` header line."""
    sink = StringIO()
    schema_with_contig = Schema()
    schema_with_contig.set_contig('key', 666)

    header_writer = VCFWriter(sink)
    header_writer.write_header(schema_with_contig)

    expected_lines = (
        '##fileformat=VCFv4.2\n',
        '##contig=<ID=key,length=666>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    )
    self.assertEqual(''.join(expected_lines), sink.getvalue())
def test_should_format_multiple_values(self):
    """Several INFO keys, including a flag, are formatted as K, K2, K3, K4 joined by ';'."""
    schema = Schema()
    field_definitions = (
        ('K', 'A', 'Float'),
        ('K2', 'A', 'String'),
        ('K3', '0', 'Flag'),
        ('K4', 'A', 'String'),
    )
    for key, number, data_type in field_definitions:
        schema.set_info_data(key, number, data_type, 'K')

    # Keys are supplied in a different order from the one expected in the output.
    raw_values = {
        'K3': None,
        'K2': ['S2'],
        'K': [1.0, 2.66, 3.0],
        'K4': ['S4'],
    }
    formatted = InfoData(schema, raw_values).to_vcf()

    self.assertEqual('K=1.0,2.66,3.0;K2=S2;K3;K4=S4', formatted)
def test_should_parse_valid_contig_header_fields(self):
    """A ``##contig`` line is read into the schema with an integer length."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '##contig=<ID=key,length=666>\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.set_contig('key', 666)
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_parse_valid_filter_header_fields(self):
    """A ``##FILTER`` line is read into the schema as a filter with its description."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '##FILTER=<ID=key,Description="description">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.set_filter('key', 'description')
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_parse_valid_sample_header_fields(self):
    """A ``##FORMAT`` line is read into the schema as sample data."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '##FORMAT=<ID=key,Number=1,Type=String,Description="description">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.set_sample_data('key', '1', 'String', 'description')
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_warn_about_too_many_alts_in_field_of_allelic_cardinality(
        self, log):
    """Three values in a Number=A field on a two-ALT line: extras are dropped with a warning.

    ``log`` is supplied by the caller; presumably a log-capture fixture
    injected by a decorator that is not visible in this block.
    """
    schema = Schema()
    schema.set_info_data('key', 'A', 'String', '')

    vcf_columns = ['chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', 'key=a,b,c']
    split_records = list(generate_records(schema, vcf_columns))

    per_record_values = [['a'], ['b']]
    for position, parsed in enumerate(split_records):
        self.assertEqual(per_record_values[position], parsed.info['key'])

    warning_text = 'expected 2 items in {!r}'.format([['a'], ['b'], ['c']])
    log.check(
        ('wecall.vcfutils.fieldmetadata', 'WARNING', warning_text),
    )
def test_should_parse_well_formatted_file_metadata(self):
    """A ``##fileDate`` line ends up in the schema's ``file_metadata`` mapping."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '##fileDate=2013-07-08\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.file_metadata['fileDate'] = '2013-07-08'
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_write_file_metadata_in_expected_format(self):
    """A ``fileDate`` entry in ``file_metadata`` is written as a ``##fileDate`` header line."""
    mock_file = StringIO()
    # isoformat() gives YYYY-MM-DD on every platform. The '%F' strftime
    # directive depends on the platform C library, and datetime.utcnow() is
    # deprecated from Python 3.12. The test only needs the same string on
    # both sides of the comparison, so the exact day is irrelevant.
    date = datetime.date.today().isoformat()
    schema = Schema()
    schema.file_metadata['fileDate'] = date

    writer = VCFWriter(mock_file)
    writer.write_header(schema)

    expected_file = (
        '##fileformat=VCFv4.2\n'
        '##fileDate={date!s}\n'
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n'
    ).format(date=date)
    self.assertEqual(expected_file, mock_file.getvalue())
def test_should_warn_when_GT_is_not_present(self, log):
    """GL without GT: values are returned as raw strings and a ploidy warning is logged.

    ``log`` is supplied by the caller; presumably a log-capture fixture
    injected by a decorator that is not visible in this block.
    """
    schema = Schema()
    schema.set_sample_data('GL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GL', '1,2,3',
    ]
    split_records = list(generate_records(schema, vcf_columns))

    for position, parsed in enumerate(split_records):
        # The position is part of both tuples so a failure names the record.
        wanted = (position, ['1', '2', '3'])
        actual = (position, parsed.sample_info.get_field('foo', 'GL'))
        self.assertEqual(wanted, actual)

    log.check(
        ('wecall.vcfutils.fieldmetadata', 'WARNING',
         'Unknown ploidy when parsing genotype likelihood'),
    )
def test_should_write_empty_file_containing_expected_version_number(self):
    """An empty schema produces only the fileformat line and the column header line."""
    sink = StringIO()
    blank_schema = Schema()

    header_writer = VCFWriter(sink)
    header_writer.write_header(blank_schema)

    expected_lines = (
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    )
    self.assertEqual(''.join(expected_lines), sink.getvalue())
def __enter__(self):
    """Open the output file, write the VCF header and return the writer.

    When no header was supplied (``self.header is None``) a default
    ``Schema`` is created and stamped with today's date under ``fileDate``.

    Returns:
        The ``VCFWriter`` bound to the newly opened file.

    Raises:
        Whatever ``open``, ``VCFWriter`` or ``write_header`` raise. The file
        is closed before the exception propagates, because ``__exit__`` is
        never invoked when ``__enter__`` itself fails.
    """
    self.fp = open(self.filename, 'w')
    try:
        if self.header is None:
            self.header = Schema()
            # isoformat() is the portable spelling of strftime('%F'), which
            # relies on a platform-specific C library directive.
            self.header.file_metadata['fileDate'] = \
                datetime.date.today().isoformat()
        self.vcf_writer = VCFWriter(self.fp)
        self.vcf_writer.write_header(self.header)
    except Exception:
        # Do not leak the handle if the header cannot be written.
        self.fp.close()
        raise
    return self.vcf_writer
def test_should_parse_all_info_header_fields(self):
    """An ``##INFO`` line with Source and Version is read into the schema in full."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '##INFO=<ID=key,Number=1,Type=String,Description="description",Source="foo",Version="bar">\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    wanted_schema = Schema()
    wanted_schema.set_info_data(
        'key', '1', 'String', 'description', 'foo', 'bar',
    )
    self.assertEqual(wanted_schema, parsed_schema)
def test_should_add_default_parsing_rule_for_unknown_key_in_multiallelic_line(self):
    """Reading an undeclared INFO key registers inferred metadata for it on the schema.

    The schema is asserted to have no INFO definitions after the records are
    generated, and exactly one after ``record.info`` has been read, so the
    order of the statements below matters.
    """
    schema = Schema()
    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', 'NEW_KEY=value',
    ]
    split_records = list(generate_records(schema, vcf_columns))

    # Nothing has been registered before the INFO column is accessed.
    self.assertEqual(0, len(list(schema.iter_info_data())))

    for position, parsed in enumerate(split_records):
        wanted = (position, ['value'])
        actual = (position, parsed.info['NEW_KEY'])
        self.assertEqual(wanted, actual)

    self.assertEqual(1, len(list(schema.iter_info_data())))

    inferred = schema.get_info_data('NEW_KEY')
    self.assertEqual('.', inferred.number)
    self.assertEqual('String', inferred.data_type)
    self.assertEqual(
        'Inferred from file content during parsing',
        inferred.description,
    )
    self.assertEqual('vcfutils', inferred.source)
    self.assertEqual('undefined', inferred.version)
def test_should_parse_column_headers_with_format_but_no_samples(self):
    """A column header ending in FORMAT with no sample names yields an empty schema."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    self.assertEqual(Schema(), parsed_schema)
def test_should_parse_well_formatted_version(self):
    """A header with only the fileformat and column lines yields an empty schema."""
    header_lines = [
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n',
    ]

    parsed_schema = VCFReader(iter(header_lines)).read_header()

    self.assertEqual(Schema(), parsed_schema)
def test_should_write_sample_names_in_column_header_line(self):
    """With a sample on the schema, the column header gains FORMAT and the sample name."""
    sink = StringIO()
    schema_with_sample = Schema()
    schema_with_sample.samples.append('FOO')

    header_writer = VCFWriter(sink)
    header_writer.write_header(schema_with_sample)

    expected_lines = (
        '##fileformat=VCFv4.2\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tFOO\n',
    )
    self.assertEqual(''.join(expected_lines), sink.getvalue())
def test_should_warn_about_unrecognised_key_in_monoallelic_line(self, log):
    """An INFO key missing from the schema is still readable and triggers a warning.

    ``log`` is supplied by the caller; presumably a log-capture fixture
    injected by a decorator that is not visible in this block.
    """
    vcf_columns = [
        'chrZ', '200', '.', 'C', 'T', '.', 'PASS', 'NEW_KEY=value',
    ]
    parsed_records = list(generate_records(Schema(), vcf_columns))

    for position, parsed in enumerate(parsed_records):
        wanted = (position, ['value'])
        actual = (position, parsed.info['NEW_KEY'])
        self.assertEqual(wanted, actual)

    warning_text = 'info field {!r} not defined in schema'.format('NEW_KEY')
    log.check(
        ('root', 'WARNING', warning_text),
    )
def test_should_return_false_if_one_sample_okay_for_GL(self):
    """``has_no_likelihoods()`` is False when a sample column carries concrete GL values."""
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('GL', 'G', 'Float', '')
    # NOTE(review): the line below supplies two sample columns while only one
    # sample name is declared here; confirm the second column is actually
    # parsed and this is the intended setup.
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT:GL',
        '0/1:90,1,120', '0/1:.,.,.',
    ]
    parsed = list(generate_records(schema, vcf_columns))

    first_sample_info = parsed[0].sample_info
    self.assertFalse(first_sample_info.has_no_likelihoods())
def test_should_return_true_if_all_likelihoods_are_none_for_PL(self):
    """``has_no_likelihoods()`` is True when every PL value is the missing marker '.'."""
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('PL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A', '.', 'PASS', '.', 'GT:PL', '0/1:.,.,.',
    ]
    parsed = list(generate_records(schema, vcf_columns))

    first_sample_info = parsed[0].sample_info
    self.assertTrue(first_sample_info.has_no_likelihoods())
def test_should_drop_genotype_likelihood_with_mismatch_ploidy(self):
    """Four GL values on a two-ALT line yield all-``None`` likelihoods in both records."""
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('GL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL',
        '0/1:1,2,3,4',
    ]
    split_records = list(generate_records(schema, vcf_columns))

    # (record index, expected genotype string, expected likelihoods)
    expectations = (
        (0, "0/1", [None, None, None]),
        (1, "0/0", [None, None, None]),
    )
    for position, genotype, likelihoods in expectations:
        self.assertEqual(
            GenotypeCall(genotype),
            split_records[position].sample_info.get_field('foo', 'GT'))
        self.assertEqual(
            likelihoods,
            split_records[position].sample_info.get_field('foo', 'GL'))
def test_should_split_genotype_likelihood_properly(self):
    """Six GL values on a two-ALT line are split into three values per record."""
    schema = Schema()
    schema.set_sample_data('GT', '1', 'String', '')
    schema.set_sample_data('GL', 'G', 'Float', '')
    schema.samples = ['foo']

    vcf_columns = [
        'chrZ', '200', '.', 'C', 'A,T', '.', 'PASS', '.', 'GT:GL',
        '0/1:1,2,3,4,5,6',
    ]
    split_records = list(generate_records(schema, vcf_columns))

    # (record index, expected genotype string, expected likelihoods)
    expectations = (
        (0, "0/1", [1.0, 2.0, 3.0]),
        (1, "0/0", [1.0, 4.0, 6.0]),
    )
    for position, genotype, likelihoods in expectations:
        self.assertEqual(
            GenotypeCall(genotype),
            split_records[position].sample_info.get_field('foo', 'GT'))
        self.assertEqual(
            likelihoods,
            split_records[position].sample_info.get_field('foo', 'GL'))
def test_should_format_a_string_list(self):
    """A list of strings under one INFO key is comma-joined after ``K=``."""
    string_schema = Schema()
    string_schema.set_info_data('K', 'A', 'String', 'K')

    formatted = InfoData(string_schema, {'K': ['V1', 'V2']}).to_vcf()

    self.assertEqual('K=V1,V2', formatted)
def test_should_format_an_int_list(self):
    """A list of ints under one INFO key is comma-joined after ``K=``."""
    int_schema = Schema()
    int_schema.set_info_data('K', 'A', 'Integer', 'K')

    formatted = InfoData(int_schema, {'K': [1, 2, 3]}).to_vcf()

    self.assertEqual('K=1,2,3', formatted)
def test_should_format_a_float_list(self):
    """A list of floats under one INFO key is comma-joined after ``K=``."""
    schema = Schema()
    # Declare the field as Float so the schema matches the float values under
    # test; it was previously declared as 'Integer', which left the float
    # type unexercised by this test.
    schema.set_info_data('K', 'A', 'Float', 'K')

    info_data = InfoData(schema, {'K': [1.0, 2.66, 3.0]})

    self.assertEqual('K=1.0,2.66,3.0', info_data.to_vcf())