def test_use_of_representative_header(self):
    """Check that a representative header can stand in for the file header.

    The file header declares INFO field `HU` as Float while the record
    carries the values `a,b`. Reading with only the file's own header is
    expected to raise ValueError; reading the same content together with
    representative header lines that declare `HU` as String is expected to
    produce exactly one variant equal to the expected one.
    """
    # Info field `HU` is defined as Float in file header while data is String.
    # This results in parser failure. We test if parser completes successfully
    # when a representative headers with String definition for field `HU` is
    # given.
    file_content = [
        '##INFO=<ID=HU,Number=.,Type=Float,Description="Info">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        '#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample1 Sample2\r\n',
        '19 2 . A T . . HU=a,b GT 0/0 0/1\n',
    ]
    representative_header_lines = [
        '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
    ]
    variant = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T'],
        info={'HU': ['a', 'b']},
    )
    variant.calls.append(VariantCall(name='Sample1', genotype=[0, 0]))
    variant.calls.append(VariantCall(name='Sample2', genotype=[0, 1]))

    # `file_headers` is used. The call is expected to raise, so its result
    # is deliberately not bound to a name.
    with self.assertRaises(ValueError):
        self._create_temp_file_and_read_records(file_content)

    # `representative_header` is used.
    read_data = self._create_temp_file_and_read_records(
        file_content, representative_header_lines)
    self.assertEqual(1, len(read_data))
    self._assert_variants_equal([variant], read_data)
def test_custom_phaseset(self):
    """Check that `PS` FORMAT values are reported as call phasesets.

    Two records are read with a `PS` FORMAT header line. A call whose `PS`
    value is `.` is expected to be built without an explicit phaseset,
    while calls with a numeric `PS` are expected to carry it as a string.
    """
    ps_header = (
        '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
    records = [
        '19 123 . A T . . . GT:PS 1|0:1111 0/1:.\n',
        '19 121 . A T . . . GT:PS 1|0:2222 0/1:2222\n'
    ]

    first_expected = Variant(
        reference_name='19', start=122, end=123,
        reference_bases='A', alternate_bases=['T'])
    for call in (
            VariantCall(name='Sample1', genotype=[1, 0], phaseset='1111'),
            VariantCall(name='Sample2', genotype=[0, 1])):
        first_expected.calls.append(call)

    second_expected = Variant(
        reference_name='19', start=120, end=121,
        reference_bases='A', alternate_bases=['T'])
    for call in (
            VariantCall(name='Sample1', genotype=[1, 0], phaseset='2222'),
            VariantCall(name='Sample2', genotype=[0, 1], phaseset='2222')):
        second_expected.calls.append(call)

    # Replace the first sample header line with the PS FORMAT definition.
    vcf_lines = [ps_header] + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([first_expected, second_expected], parsed)
def test_end_info_key_unknown_number_invalid(self):
    """Check handling of unusual `END` values when END has `Number=.`.

    A multi-valued END (`150,160`) and a fractional END (`150.9`) are both
    expected to yield a variant ending at 150. A non-numeric END (`text`)
    is expected to raise ValueError.
    """
    end_header = (
        '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')

    def make_call(sample, genotype):
        # Build a call keyed by the hashed sample name.
        return VariantCall(
            sample_id=hash_name(sample), name=sample, genotype=genotype)

    def read_record(record_line):
        # Read one record under the END header plus the shared sample headers.
        return self._create_temp_file_and_read_records(
            [end_header] + _SAMPLE_HEADER_LINES[1:] + [record_line])

    expected = Variant(
        reference_name='19',
        start=122,
        end=150,
        reference_bases='A',
        alternate_bases=['T'])
    expected.calls.append(make_call('Sample1', [1, 0]))
    expected.calls.append(make_call('Sample2', [0, 1]))

    accepted_records = (
        # PySam should only take first END field.
        '19 123 . A T . . END=150,160 GT 1/0 0/1\n',
        # END should be rounded down.
        '19 123 . A T . . END=150.9 GT 1/0 0/1\n',
    )
    for record in accepted_records:
        parsed = read_record(record)
        self.assertEqual(1, len(parsed))
        self._assert_variants_equal([expected], parsed)

    # END should not be a string.
    with self.assertRaises(ValueError):
        read_record('19 123 . A T . . END=text GT 1/0 0/1\n')
def test_format_numbers(self):
    """Check FORMAT fields declared with different `Number` settings.

    The headers declare FORMAT fields with Number `.`, `1`, `2`, `A` and
    `G`. The single record read back is expected to equal a variant whose
    calls hold a scalar for `F1` and lists for the other fields.
    """
    headers = [
        '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
        '##FORMAT=<ID=F1,Number=1,Type=Integer,Description="Format_1">\n',
        '##FORMAT=<ID=F2,Number=2,Type=Character,Description="Format_2">\n',
        '##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Format_3">\n',
        '##FORMAT=<ID=AD,Number=G,Type=Integer,Description="Format_4">\n',
    ]
    # One record: fixed columns, then the FORMAT key list and two samples.
    records = [
        '19 2 . A T,C . . . '
        'GT:FU:F1:F2:AO:AD 1/0:a1:3:a,b:1:3,4 '
        '0/1:a2,a3:4:b,c:1,2:3'
    ]

    sample1_info = {
        'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b'], 'AO': [1], 'AD': [3, 4]}
    sample2_info = {
        'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c'], 'AO': [1, 2],
        'AD': [3]}

    expected = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T', 'C'])
    expected.calls.append(
        VariantCall(
            sample_id=hash_name('Sample1'),
            name='Sample1',
            genotype=[1, 0],
            info=sample1_info))
    expected.calls.append(
        VariantCall(
            sample_id=hash_name('Sample2'),
            name='Sample2',
            genotype=[0, 1],
            info=sample2_info))

    vcf_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)

    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
def test_info_numbers_and_types(self):
    """Check INFO fields across several `Number` and `Type` declarations.

    The headers declare String/A, Integer/G, Character/R, Flag/0 and
    Float/`.` INFO fields. Two records are read; the second omits `HA` and
    `HF`, and the expected variant for it has no entries for those keys.
    """
    headers = [
        '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
        '##INFO=<ID=HG,Number=G,Type=Integer,Description="IntInfo_G">\n',
        '##INFO=<ID=HR,Number=R,Type=Character,Description="ChrInfo_R">\n',
        '##INFO=<ID=HF,Number=0,Type=Flag,Description="FlagInfo">\n',
        '##INFO=<ID=HU,Number=.,Type=Float,Description="FloatInfo_variable">\n'
    ]
    records = [
        '19 2 . A T,C . . HA=a1,a2;HG=1,2,3;HR=a,b,c;HF;HU=0.1 GT 1/0 0/1\n',
        '19 124 . A T . . HG=3,4,5;HR=d,e;HU=1.1,1.2 GT 0/0 0/1'
    ]

    def make_call(sample, genotype):
        # Build a call keyed by the hashed sample name.
        return VariantCall(
            sample_id=hash_name(sample), name=sample, genotype=genotype)

    first_expected = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T', 'C'],
        info={
            'HA': ['a1', 'a2'],
            'HG': [1, 2, 3],
            'HR': ['a', 'b', 'c'],
            'HF': True,
            'HU': [0.1],
        })
    first_expected.calls.append(make_call('Sample1', [1, 0]))
    first_expected.calls.append(make_call('Sample2', [0, 1]))

    second_expected = Variant(
        reference_name='19',
        start=123,
        end=124,
        reference_bases='A',
        alternate_bases=['T'],
        info={'HG': [3, 4, 5], 'HR': ['d', 'e'], 'HU': [1.1, 1.2]})
    second_expected.calls.append(make_call('Sample1', [0, 0]))
    second_expected.calls.append(make_call('Sample2', [0, 1]))

    vcf_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([first_expected, second_expected], parsed)
def test_end_info_key(self):
    """Check that the `END` INFO key sets the variant's end position.

    The first record carries `END=1111` and is expected to end at 1111.
    The second has no INFO and is expected to end at its POS (123).
    """
    end_header = (
        '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
    records = [
        '19 123 . A T . . END=1111 GT 1/0 0/1\n',
        '19 123 . A T . . . GT 0/1 1/1\n',
    ]

    def make_call(sample, genotype):
        # Build a call keyed by the hashed sample name.
        return VariantCall(
            sample_id=hash_name(sample), name=sample, genotype=genotype)

    with_end = Variant(
        reference_name='19',
        start=122,
        end=1111,
        reference_bases='A',
        alternate_bases=['T'])
    with_end.calls.append(make_call('Sample1', [1, 0]))
    with_end.calls.append(make_call('Sample2', [0, 1]))

    without_end = Variant(
        reference_name='19',
        start=122,
        end=123,
        reference_bases='A',
        alternate_bases=['T'])
    without_end.calls.append(make_call('Sample1', [0, 1]))
    without_end.calls.append(make_call('Sample2', [1, 1]))

    vcf_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)

    self.assertEqual(2, len(parsed))
    self._assert_variants_equal([with_end, without_end], parsed)
def test_missing_info_key(self):
    """Check encoding when one call lacks an info key another call has.

    `Sample1` has both `GQ` and `AF`; `Sample2` has only `AF`. The expected
    line lists the format as `GT:AF:GQ` and shows `.` for the absent `GQ`.
    """
    vcf_coder = self._get_coder()

    record = Variant()
    sample_infos = (
        ('Sample1', {'GQ': 10, 'AF': 20}),
        ('Sample2', {'AF': 20}),
    )
    for sample_name, call_info in sample_infos:
        record.calls.append(
            VariantCall(name=sample_name, genotype=[0, 1], info=call_info))

    expected_line = (
        '. . . . . . . . GT:AF:GQ 0/1:20:10 '
        '0/1:20:.\n')
    encoded = vcf_coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
def test_no_info(self):
    """Check reading a record whose optional fields are all `.`.

    The expected variant has only reference name, start and end set, and
    each sample's genotype is a single `vcfio.MISSING_GENOTYPE_VALUE`.
    """
    line = 'chr19 123 . . . . . . GT . .'

    expected = Variant(reference_name='chr19', start=122, end=123)
    for sample_name in ('Sample1', 'Sample2'):
        expected.calls.append(
            VariantCall(
                name=sample_name,
                genotype=[vcfio.MISSING_GENOTYPE_VALUE]))

    parsed = self._create_temp_file_and_read_records(
        _SAMPLE_HEADER_LINES + [line])

    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
def test_end_info_key_unknown_number(self):
    """Check that `END` is honoured when declared with `Number=.`.

    A single record with `END=1111` and no alternate allele is expected to
    be read as a variant ending at 1111.
    """
    end_header = (
        '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')
    records = ['19 123 . A . . . END=1111 GT 1/0 0/1\n']

    expected = Variant(
        reference_name='19',
        start=122,
        end=1111,
        reference_bases='A')
    for sample_name, genotype in (('Sample1', [1, 0]), ('Sample2', [0, 1])):
        expected.calls.append(
            VariantCall(name=sample_name, genotype=genotype))

    vcf_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(vcf_lines)

    self.assertEqual(1, len(parsed))
    self._assert_variants_equal([expected], parsed)
def test_use_of_representative_header_two_files(self):
    """Check one representative header applied to two separate inputs.

    Each input has its own sample column (`Sample1` / `Sample2`) and a file
    header that types `HU` as Float while the data holds `a,b`. Both are
    read with the same representative header lines typing `HU` as String,
    and each is expected to yield its single variant.
    """
    # Info field `HU` is defined as Float in file header while data is String.
    # This results in parser failure. We test if parser completes successfully
    # when a representative headers with String definition for field `HU` is
    # given.
    first_file = [
        '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\r\n',
        '9\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/0'
    ]
    second_file = [
        '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample2\r\n',
        '19\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/1\n',
    ]
    shared_header = [
        '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
    ]

    first_expected = Variant(
        reference_name='9',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T'],
        info={'HU': ['a', 'b']})
    first_expected.calls.append(
        VariantCall(sample_id=hash_name('Sample1'), genotype=[0, 0]))

    second_expected = Variant(
        reference_name='19',
        start=1,
        end=2,
        reference_bases='A',
        alternate_bases=['T'],
        info={'HU': ['a', 'b']})
    second_expected.calls.append(
        VariantCall(sample_id=hash_name('Sample2'), genotype=[0, 1]))

    # Read and verify each file in turn, first file first.
    cases = (
        (first_file, first_expected),
        (second_file, second_expected),
    )
    for content, expected in cases:
        parsed = self._create_temp_file_and_read_records(
            content, shared_header)
        self.assertEqual(1, len(parsed))
        self._assert_variants_equal([expected], parsed)
def test_triploid_genotype(self):
    """Check that a three-allele genotype is encoded as `1/0/1`."""
    vcf_coder = self._get_coder()

    record = Variant()
    triploid_call = VariantCall(name='Sample', genotype=[1, 0, 1])
    record.calls.append(triploid_call)

    expected_line = '. . . . . . . . GT 1/0/1\n'
    encoded = vcf_coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
def test_empty_sample_calls(self):
    """Check that a call with the scalar genotype `-1` is encoded as `.`."""
    vcf_coder = self._get_coder()

    record = Variant()
    # The genotype here is a bare integer, not a list of allele indices.
    empty_call = VariantCall(name='Sample2', genotype=-1)
    record.calls.append(empty_call)

    expected_line = '. . . . . . . . GT .\n'
    encoded = vcf_coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
def test_missing_genotype(self):
    """Check that a missing allele in a genotype is encoded as `.`.

    The genotype `[1, vcfio.MISSING_GENOTYPE_VALUE]` is expected to be
    written as `1/.`.
    """
    vcf_coder = self._get_coder()

    record = Variant()
    half_missing = [1, vcfio.MISSING_GENOTYPE_VALUE]
    record.calls.append(VariantCall(name='Sample', genotype=half_missing))

    expected_line = '. . . . . . . . GT 1/.\n'
    encoded = vcf_coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
def test_info_list(self):
    """Check encoding of a list-valued call info field containing `None`.

    The call info `{'LI': [1, None, 3]}` is expected to be written as
    `1,.,3` under the `GT:LI` format.
    """
    vcf_coder = self._get_coder()

    record = Variant()
    list_info = {'LI': [1, None, 3]}
    record.calls.append(
        VariantCall(name='Sample', genotype=[0, 1], info=list_info))

    expected_line = '. . . . . . . . GT:LI 0/1:1,.,3\n'
    encoded = vcf_coder.encode(record)
    self._assert_variant_lines_equal(encoded, expected_line)
def test_variant_equality(self):
    """Check `Variant` equality and inequality.

    Two variants built from identical arguments are expected to compare
    equal. A variant whose call genotype differs, and a variant built
    without `calls`, are both expected to compare unequal to the base.
    """
    def shared_fields():
        # Fresh containers on every call so no variant shares list/dict
        # objects with another.
        return dict(
            reference_name='a',
            start=20,
            end=22,
            reference_bases='a',
            alternate_bases=['g', 't'],
            names=['variant'],
            quality=9,
            filters=['q10'],
            info={'key': 'value'})

    base = Variant(calls=[VariantCall(genotype=[0, 0])], **shared_fields())
    same = Variant(calls=[VariantCall(genotype=[0, 0])], **shared_fields())
    other_calls = Variant(
        calls=[VariantCall(genotype=[1, 0])], **shared_fields())
    no_calls = Variant(**shared_fields())

    self.assertEqual(base, same)
    self.assertNotEqual(base, other_calls)
    self.assertNotEqual(base, no_calls)