def test_unicode_fields(self):
    """Check rows built from unicode and UTF-8 encoded filter/info values.

    The variant mixes a unicode string with its UTF-8 encoded form; the
    expected row contains the unicode string in every position.
    """
    text = u'\xc3\xb6'
    encoded = text.encode('utf-8')
    info = {
        'AS1': vcfio.VariantInfo(encoded, '1'),
        'AS2': vcfio.VariantInfo([text, encoded], '2'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[text, encoded],
        info=info)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: [text, text],
        ColumnKeyConstants.CALLS: [],
        'AS1': text,
        'AS2': [text, text],
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def test_nonstandard_float_values(self):
    """Check rows built from inf, -inf and nan info values.

    Expected mapping: inf -> sys.maxint, -inf -> -sys.maxint, nan inside a
    list -> -sys.maxint, and a scalar nan -> None.
    """
    largest = sys.maxint
    null_replacement_value = -largest
    info = {
        'F1': vcfio.VariantInfo(float('inf'), '1'),
        'F2': vcfio.VariantInfo([float('-inf'), float('nan'), 1.2], '3'),
        'F3': vcfio.VariantInfo(float('nan'), '1'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        info=info)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'F1': largest,
        'F2': [-largest, null_replacement_value, 1.2],
        'F3': None,
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def test_no_alternate_bases(self):
    """Check the row produced for a variant with an empty alternate list."""
    info = {
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10'],
        info=info)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10'],
        ColumnKeyConstants.CALLS: [],
        'A1': 'some data',
        'A2': ['data1', 'data2'],
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def test_null_repeated_fields(self):
    """Check rows built from repeated info fields that contain None.

    Expected replacements for None: -sys.maxint in the integer and float
    lists, False in the boolean list and '.' in the string list.
    """
    info = {
        'AI': vcfio.VariantInfo([0, 1, None], '3'),
        'AB': vcfio.VariantInfo([True, None, False], '3'),
        'AF': vcfio.VariantInfo([0.1, 0.2, None, 0.4], '4'),
        'AS': vcfio.VariantInfo([None, 'data1', 'data2'], '3'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10'],
        info=info)
    null_number = -sys.maxint
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10'],
        ColumnKeyConstants.CALLS: [],
        'AI': [0, 1, null_number],
        'AB': [True, False, False],
        'AF': [0.1, 0.2, null_number, 0.4],
        'AS': ['.', 'data1', 'data2'],
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def test_schema_conflict_in_info_field_number(self):
    """Check rows when info value counts differ from `self._schema_descriptor`.

    With `allow_incompatible_records=True`, list values are expected to be
    reduced to a single value ('IB', 'II') and scalar values to be wrapped in
    a list ('IBR', 'IF', 'IS').
    """
    info = {
        'IB': vcfio.VariantInfo([1, 2], '2'),
        'IBR': vcfio.VariantInfo(1, '1'),
        'II': vcfio.VariantInfo([10, 20], '2'),
        'IF': vcfio.VariantInfo(1.1, '1'),
        'IS': vcfio.VariantInfo('foo', '1'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        info=info)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'IB': True,
        'IBR': [True],
        'II': 10,
        'IF': [1.1],
        'IS': ['foo'],
    }
    expected_rows = [expected_row]
    actual_rows = self._get_row_list_from_variant(
        variant, self._schema_descriptor, allow_incompatible_records=True)
    self.assertEqual(expected_rows, actual_rows)
def _get_sample_variant(self):
    """Return a fixture variant with two alternates, two names and two calls."""
    info = {
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
    }
    first_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], info={'GQ': 20, 'HQ': [10, 20]})
    second_call = vcfio.VariantCall(
        name='Sample2', genotype=[1, 0], info={'GQ': 10, 'FLAG1': True})
    return vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=[first_call, second_call])
def _get_sample_variant_1():
    """Get first sample variant.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` and its text line.
    """
    vcf_line = ('20 1234 rs123;rs2 C A,T 50 PASS AF=0.5,0.1;NS=1 '
                'GT:GQ 0/0:48 1/0:20\n')
    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info)
    first_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 0], info={'GQ': 48})
    variant.calls.append(first_call)
    second_call = vcfio.VariantCall(
        name='Sample2', genotype=[1, 0], info={'GQ': 20})
    variant.calls.append(second_call)
    return variant, vcf_line
def test_get_merged_variants_move_info_to_calls(self):
    """Check merging when info keys matching '^A1$' are moved into calls.

    Covers merging a single variant and merging both sample variants; in both
    cases every expected call carries the 'A1' value of its source variant,
    and 'A1' is absent from the merged variant's info.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        info_keys_to_move_to_calls_regex='^A1$',
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Expected calls originating from the first sample variant.
    sample1 = vcfio.VariantCall(
        name='Sample1',
        genotype=[0, 1],
        info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'})
    sample2 = vcfio.VariantCall(
        name='Sample2',
        genotype=[1, 0],
        info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'})
    # Expected calls originating from the second sample variant.
    sample3 = vcfio.VariantCall(
        name='Sample3', genotype=[1, 1], info={'A1': 'some data2'})
    sample4 = vcfio.VariantCall(
        name='Sample4', genotype=[1, 0], info={'GQ': 20, 'A1': 'some data2'})

    # Test single variant merge.
    single_result = list(strategy.get_merged_variants([variants[0]]))
    single_merged_variant = single_result[0]
    self.assertEqual([sample1, sample2], single_merged_variant.calls)

    # Test multiple variant merge.
    merged_result = list(strategy.get_merged_variants(variants))
    merged_variant = merged_result[0]
    self._assert_common_expected_merged_fields(merged_variant)
    self.assertEqual(
        [sample1, sample2, sample3, sample4], merged_variant.calls)
    self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys())
    expected_a2 = vcfio.VariantInfo(['data1', 'data2'], '2')
    self.assertEqual(expected_a2, merged_variant.info['A2'])
    expected_a3 = vcfio.VariantInfo(['data3', 'data4'], '2')
    self.assertEqual(expected_a3, merged_variant.info['A3'])
def test_schema_conflict_in_info_field_type(self):
    """Check rows when info value types differ from `self._schema_descriptor`.

    With `allow_incompatible_records=True` the expected conversions are:
    1 -> True ('IB'), 1.1 -> 1 ('II'), [1, 2] -> [1.0, 2.0] ('IF') and
    [1.0, 2.0] -> ['1.0', '2.0'] ('IS'). A string value for the integer field
    'II' must raise ValueError.
    """
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        info={
            'IB': vcfio.VariantInfo(data=1, field_count='1'),
            'II': vcfio.VariantInfo(data=1.1, field_count='1'),
            'IF': vcfio.VariantInfo(data=[1, 2], field_count='2'),
            'IS': vcfio.VariantInfo(data=[1.0, 2.0], field_count='2'),
        })
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'IB': True,
        'II': 1,
        'IF': [1.0, 2.0],
        'IS': ['1.0', '2.0'],
    }
    self.assertEqual(
        [expected_row],
        self._get_row_list_from_variant(
            variant, self._schema_descriptor,
            allow_incompatible_records=True))

    # Build the variant outside the assertRaises block so that only row
    # generation can satisfy the expected ValueError; an error raised while
    # constructing the fixture must not make the test pass.
    # String cannot be casted to integer.
    unparsable_variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        info={
            'II': vcfio.VariantInfo(data='1.1', field_count='1'),
        })
    with self.assertRaises(ValueError):
        self._get_row_list_from_variant(
            unparsable_variant, self._schema_descriptor,
            allow_incompatible_records=True)
        # Reached only when nothing was raised; kept for its descriptive
        # failure message.
        self.fail(
            'String data for an integer schema must cause an exception')
def _get_sample_variants(self):
    """Return two fixture variants that differ in reference name, quality,
    filters, info and calls but share position, bases and names."""
    first_info = {
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
    }
    first_calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
    ]
    variant_1 = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        info=first_info,
        calls=first_calls)

    second_info = {
        'A1': vcfio.VariantInfo('some data2', '2'),
        'A3': vcfio.VariantInfo(['data3', 'data4'], '2'),
    }
    second_calls = [
        vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
        vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
    ]
    variant_2 = vcfio.Variant(
        reference_name='20',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=20,
        filters=['q10'],
        info=second_info,
        calls=second_calls)
    return [variant_1, variant_2]
def _get_sample_variant_1(self):
    """Return a fixture variant on 'chr19' with three info fields and one call."""
    info = {
        'IS': vcfio.VariantInfo('some data', '1'),
        'IF': vcfio.VariantInfo(True, '0'),
        'IA': vcfio.VariantInfo([0.1, 0.2], '2'),
    }
    sample_call = vcfio.VariantCall(
        name='Sample1',
        genotype=[0, 1],
        phaseset='*',
        info={'FI': 20, 'FU': [10.0, 20.0]})
    return vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=[sample_call])
def _get_sample_variant_with_incompatible_records(self):
    """Return a `(variant, row)` fixture pair whose values differ in type.

    The variant's info/call values and the paired row hold different
    representations of the same data (e.g. ['0.1', '0.2'] vs [0.1, 0.2],
    1 vs '1' and ['1'], [10.0, 20.0] vs [10, 20]).
    """
    info = {
        'IFR': vcfio.VariantInfo(['0.1', '0.2'], '2'),
        'IS': vcfio.VariantInfo(1, '1'),
        'ISR': vcfio.VariantInfo(1, '1'),
    }
    sample_call = vcfio.VariantCall(
        name='Sample1',
        genotype=[0, 1],
        phaseset='*',
        info={'GQ': 20, 'FIR': [10.0, 20.0]})
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        filters=['PASS'],
        info=info,
        calls=[sample_call])

    call_record = {
        ColumnKeyConstants.CALLS_NAME: 'Sample1',
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'FIR': [10, 20],
    }
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [call_record],
        'IFR': [0.1, 0.2],
        'IS': '1',
        'ISR': ['1'],
    }
    return variant, row
def _get_sample_variant_3():
    """Get third sample variant.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` and its text line.
    """
    vcf_line = ('19 12 . C <SYMBOLIC> 49 q10 AF=0.5 GT:PS:GQ 0|1:1:45 '
                '.:.:.\n')
    info = {'AF': vcfio.VariantInfo(data=[0.5], field_count='A')}
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info=info)
    phased_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], phaseset='1', info={'GQ': 45})
    variant.calls.append(phased_call)
    missing_call = vcfio.VariantCall(
        name='Sample2',
        genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    variant.calls.append(missing_call)
    return variant, vcf_line
def _get_sample_variant_2():
    """Get second sample variant.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` and its text line.
    """
    vcf_line = ('19 123 rs1234 GTC . 40 q10;s50 NS=2 GT:GQ 1|0:48 0/1:.\n')
    info = {'NS': vcfio.VariantInfo(data=2, field_count='1')}
    variant = vcfio.Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        info=info)
    phased_call = vcfio.VariantCall(
        name='Sample1',
        genotype=[1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE,
        info={'GQ': 48})
    variant.calls.append(phased_call)
    unphased_call = vcfio.VariantCall(
        name='Sample2', genotype=[0, 1], info={'GQ': None})
    variant.calls.append(unphased_call)
    return variant, vcf_line
def _get_sample_variant_with_empty_calls(self):
    """Return a `(variant, row)` fixture pair for a call with no genotype.

    The variant has one call with an empty genotype and empty info; the
    paired row has an empty calls list.
    """
    empty_call = vcfio.VariantCall(
        name='EmptySample', genotype=[], phaseset='*', info={})
    variant = vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info={'II': vcfio.VariantInfo(1234, '1')},
        calls=[empty_call])
    row = {
        ColumnKeyConstants.REFERENCE_NAME: '20',
        ColumnKeyConstants.START_POSITION: 123,
        ColumnKeyConstants.END_POSITION: 125,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10', 's10'],
        ColumnKeyConstants.CALLS: [],
        'II': 1234,
    }
    return variant, row
def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
    """Return a `(variant, row)` fixture pair with per-alternate info fields.

    Args:
      split_alternate_allele_info_fields: If true, the row stores 'AF' and
        'AF2' inside each alternate-bases record; otherwise the alternate
        records hold only the alternate and 'AF'/'AF2' are top-level lists.
    """
    info = {
        'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
        'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], '2'),
    }
    calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=calls)

    first_call_record = {
        ColumnKeyConstants.CALLS_NAME: 'Sample1',
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'HQ': [10, 20],
    }
    second_call_record = {
        ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
        'FLAG1': True,
    }
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [first_call_record, second_call_record],
        'A1': 'some data',
        'A2': ['data1', 'data2'],
    }

    if not split_alternate_allele_info_fields:
        # 'AF' and 'AF2' stay at the top level of the row.
        row[ColumnKeyConstants.ALTERNATE_BASES] = [
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A'},
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT'},
        ]
        row['AF'] = [0.1, 0.2]
        row['AF2'] = [0.2, 0.3]
    else:
        # 'AF' and 'AF2' are stored per alternate.
        row[ColumnKeyConstants.ALTERNATE_BASES] = [
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
             'AF': 0.1,
             'AF2': 0.2},
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
             'AF': 0.2,
             'AF2': 0.3},
        ]
    return variant, row
def test_create_processed_variant_mismatched_annotation_alt(self):
    """Check processing when one CSQ annotation matches no alternate.

    This is like `test_create_processed_variant_move_alt_info_and_annotation`
    with the difference that it has an extra alt annotation ('ATAT') which
    does not match any alts. Expected counters: one variant, two alt matches
    and one alt mismatch.
    """
    variant, header_fields = self._get_sample_variant_and_header_with_csq()
    variant.info['CSQ'] = vcfio.VariantInfo(
        data=[
            'A|C1|I1|S1|G1',
            'TT|C2|I2|S2|G2',
            'A|C3|I3|S3|G3',
            'ATAT|C3|I3|S3|G3',
        ],
        field_count='.')
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('A')
    alt1._info = {
        'A2': 'data1',
        'CSQ': [
            {
                processed_variant._ANNOTATION_ALT: 'A',
                'Consequence': 'C1',
                'IMPACT': 'I1',
                'SYMBOL': 'S1',
                'Gene': 'G1',
            },
            {
                processed_variant._ANNOTATION_ALT: 'A',
                'Consequence': 'C3',
                'IMPACT': 'I3',
                'SYMBOL': 'S3',
                'Gene': 'G3',
            },
        ],
    }
    alt2 = processed_variant.AlternateBaseData('TT')
    alt2._info = {
        'A2': 'data2',
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'TT',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'SYMBOL': 'S2',
            'Gene': 'G2',
        }],
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `in`-based assertions replace the deprecated dict.has_key();
    # NOTE(review): assumes non_alt_info is a dict-like supporting `in`.
    self.assertNotIn('A2', proc_var.non_alt_info)
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
def _get_sample_variant_2(self):
    """Return a fixture variant on '20' with one info field and one call."""
    info = {'IS_2': vcfio.VariantInfo('some data', '1')}
    sample_call = vcfio.VariantCall(
        name='Sample1',
        genotype=[0, 1],
        phaseset='*',
        info={'FI_2': 20})
    return vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info=info,
        calls=[sample_call])
def test_create_processed_variant_annotation_alt_prefix_but_ref(self):
    """Check annotation matching for alternates 'AA' and 'AAA'.

    Each CSQ annotation is expected to be attached to the alternate with the
    identical string. Expected counters: one variant, two alt matches, no
    mismatches and no minimal matches.
    """
    # The returned variant is ignored as we create a custom one next.
    _, header_fields = self._get_sample_variant_and_header_with_csq()
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['AA', 'AAA'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        info={
            'CSQ': vcfio.VariantInfo(
                data=['AA|C1|I1|S1|G1', 'AAA|C2|I2|S2|G2'],
                field_count='.'),
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('AA')
    alt1._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'AA',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'SYMBOL': 'S1',
            'Gene': 'G1',
        }],
    }
    alt2 = processed_variant.AlternateBaseData('AAA')
    alt2._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'AAA',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'SYMBOL': 'S2',
            'Gene': 'G2',
        }],
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `in`-based assertion replaces the deprecated dict.has_key();
    # NOTE(review): assumes non_alt_info is a dict-like supporting `in`.
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
def test_nonstandard_fields_names(self):
    """Check row keys for info names 'A-1' and '_A'.

    The expected row stores them under 'A_1' and 'field__A' respectively.
    """
    info = {
        'A-1': vcfio.VariantInfo('data1', '1'),
        '_A': vcfio.VariantInfo('data2', '2'),
    }
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        info=info)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'A_1': 'data1',
        'field__A': 'data2',
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def _get_sample_variant_2(self):
    """Return a `(variant, row)` fixture pair with one integer info field,
    no alternates and no calls."""
    info = {'INTINFO': vcfio.VariantInfo(1234, '1')}
    variant = vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info=info)
    row = {
        ColumnKeyConstants.REFERENCE_NAME: '20',
        ColumnKeyConstants.START_POSITION: 123,
        ColumnKeyConstants.END_POSITION: 125,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10', 's10'],
        ColumnKeyConstants.CALLS: [],
        'INTINFO': 1234,
    }
    return variant, row
def _get_sample_variant_and_header_with_csq(self):
    """Return the sample variant with a 'CSQ' info field plus a matching header.

    Returns:
      A `(variant, header_fields)` tuple; the header declares 'CSQ' with the
      description 'some desc Allele|Consequence|IMPACT|SYMBOL|Gene'.
    """
    annotations = ['A|C1|I1|S1|G1', 'TT|C2|I2|S2|G2', 'A|C3|I3|S3|G3']
    variant = self._get_sample_variant()
    variant.info['CSQ'] = vcfio.VariantInfo(data=annotations, field_count='.')

    csq_info = parser._Info(
        id=None,
        num='.',
        type=None,
        desc='some desc Allele|Consequence|IMPACT|SYMBOL|Gene',
        source=None,
        version=None)
    infos = {'CSQ': csq_info}
    header_fields = vcf_header_io.VcfHeader(infos=infos)
    return variant, header_fields
def test_get_merged_variants_no_custom_options(self):
    """Check merging with no info keys, quality or filter moved to calls.

    A single variant is expected back unchanged. For two variants, the calls
    are expected unchanged and concatenated, and the merged info keeps 'A1'
    (from either variant), 'A2' and 'A3'.
    """
    strategy = move_to_calls_strategy.MoveToCallsStrategy(
        info_keys_to_move_to_calls_regex=None,
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Test single variant merge.
    self.assertEqual([variants[0]],
                     strategy.get_merged_variants([variants[0]]))

    # Test multiple variant merge.
    merged_variant = strategy.get_merged_variants(variants)[0]
    self._assert_common_expected_merged_fields(merged_variant)
    self.assertEqual(
        [
            vcfio.VariantCall(
                name='Sample1',
                genotype=[0, 1],
                info={'GQ': 20, 'HQ': [10, 20]}),
            vcfio.VariantCall(
                name='Sample2',
                genotype=[1, 0],
                info={'GQ': 10, 'FLAG1': True}),
            vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
            vcfio.VariantCall(
                name='Sample4', genotype=[1, 0], info={'GQ': 20}),
        ],
        merged_variant.calls)
    self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
    # Either variant's 'A1' value is acceptable; assertIn reports the
    # offending value on failure, unlike assertTrue(x in ...).
    self.assertIn(merged_variant.info['A1'].data,
                  ('some data', 'some data2'))
    self.assertEqual(vcfio.VariantInfo(['data1', 'data2'], '2'),
                     merged_variant.info['A2'])
    self.assertEqual(vcfio.VariantInfo(['data3', 'data4'], '2'),
                     merged_variant.info['A3'])
def test_all_fields(self):
    """Check the row for a variant with names, quality, filters, per-alternate
    info, plain info and three calls (one with a missing genotype)."""
    info = {
        'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
        'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
        'I1': vcfio.VariantInfo('some data', '1'),
        'I2': vcfio.VariantInfo(['data1', 'data2'], '2'),
    }
    calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
        vcfio.VariantCall(
            name='Sample3', genotype=[vcfio.MISSING_GENOTYPE_VALUE]),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=calls)

    expected_alternates = [
        {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A', 'AF': 0.1, 'AF2': 0.2},
        {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT', 'AF': 0.2, 'AF2': 0.3},
    ]
    expected_calls = [
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'GQ': 20,
            'HQ': [10, 20],
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
            'FLAG1': True,
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample3',
            ColumnKeyConstants.CALLS_GENOTYPE: [vcfio.MISSING_GENOTYPE_VALUE],
            ColumnKeyConstants.CALLS_PHASESET: None,
        },
    ]
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: expected_alternates,
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: expected_calls,
        'I1': 'some data',
        'I2': ['data1', 'data2'],
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def test_create_processed_variant_annotation_alt_allele_num(self):
    """Check annotation matching driven by the ALLELE_NUM annotation field.

    With `use_allele_num=True`, annotations with ALLELE_NUM '1' and '2' are
    expected on the first and second alternates; the remaining four are
    expected to be counted as incorrect ALLELE_NUMs.
    """
    csq_info = parser._Info(
        id=None,
        num='.',
        type=None,
        desc='some desc Allele|Consequence|IMPACT|ALLELE_NUM',
        source=None,
        version=None)
    header_fields = vcf_header_parser.HeaderFields(
        infos={'CSQ': csq_info}, formats={})
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        # The following represent a SNV and an insertion, resp.
        alternate_bases=['T', 'CT'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        # Note that in the minimal mode of VEP, 'T' is an ambiguous
        # annotation ALT because it can map to either the 'T' SNV or the
        # 'CT' insertion. But because there is ALLELE_NUM there should be no
        # ambiguity. The last four annotations have incorrect ALLELE_NUMs.
        info={
            'CSQ': vcfio.VariantInfo(
                data=[
                    'T|C1|I1|1',
                    'T|C2|I2|2',
                    'T|C3|I3|0',
                    'T|C4|I4|3',
                    'T|C5|I5|TEST',
                    'T|C6|I6|',
                ],
                field_count='.'),
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        use_allele_num=True,
        minimal_match=True,  # This should be ignored by the factory method.
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('T')
    alt1._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'T',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'ALLELE_NUM': '1',
        }],
    }
    alt2 = processed_variant.AlternateBaseData('CT')
    alt2._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'T',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'ALLELE_NUM': '2',
        }],
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2])
    # `in`-based assertion replaces the deprecated dict.has_key();
    # NOTE(review): assumes non_alt_info is a dict-like supporting `in`.
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 2)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(),
        0)
    self.assertEqual(
        counters[CEnum.ALLELE_NUM_INCORRECT.value].get_value(), 4)
def test_create_processed_variant_annotation_alt_minimal(self):
    """Check annotation matching with `minimal_match=True`.

    Expected result: the 'T' annotation is attached to 'CCT' and flagged
    ambiguous, the '-' annotation is attached to 'C' and not flagged, and
    'CT' gets no annotations. Expected counters: one variant, two minimal
    matches, one minimal-ambiguous, no plain matches or mismatches.
    """
    # The returned variant is ignored as we create a custom one next.
    _, header_fields = self._get_sample_variant_and_header_with_csq()
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='CC',
        # The following represent a SNV, an insertion, and a deletion, resp.
        alternate_bases=['CT', 'CCT', 'C'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        # Note that in the minimal mode, 'T' is an ambiguous annotation ALT
        # because it can map to either the 'CT' SNV or the 'CCT' insertion.
        # It is not ambiguous in the non-minimal mode (it only maps to `CT`).
        info={
            'CSQ': vcfio.VariantInfo(
                data=['T|C1|I1|S1|G1', '-|C2|I2|S2|G2'],
                field_count='.'),
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        minimal_match=True,
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('CT')
    alt1._info = {}
    alt2 = processed_variant.AlternateBaseData('CCT')
    alt2._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'T',
            processed_variant._ANNOTATION_ALT_AMBIGUOUS: True,
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'SYMBOL': 'S1',
            'Gene': 'G1',
        }],
    }
    alt3 = processed_variant.AlternateBaseData('C')
    alt3._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: '-',
            processed_variant._ANNOTATION_ALT_AMBIGUOUS: False,
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'SYMBOL': 'S2',
            'Gene': 'G2',
        }],
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
    # `in`-based assertion replaces the deprecated dict.has_key();
    # NOTE(review): assumes non_alt_info is a dict-like supporting `in`.
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_MATCH.value].get_value(), 2)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MINIMAL_AMBIGUOUS.value].get_value(),
        1)
def test_create_processed_variant_symbolic_and_breakend_annotation_alt(
        self):
    """Check annotation matching for symbolic and breakend alternates.

    Annotation ALTs 'SYMBOLIC', '[13' and 'C[10' are expected on the
    alternates '<SYMBOLIC>', '[13:123457[.' and 'C[10:10357[.'; 'C[1' is
    expected to match none. Expected counters: one variant, three alt
    matches and one alt mismatch.
    """
    # The returned variant is ignored as we create a custom one next.
    _, header_fields = self._get_sample_variant_and_header_with_csq()
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>', '[13:123457[.', 'C[10:10357[.'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        info={
            'CSQ': vcfio.VariantInfo(
                data=[
                    'SYMBOLIC|C1|I1|S1|G1',
                    '[13|C2|I2|S2|G2',
                    'C[10|C3|I3|S3|G3',
                    'C[1|C3|I3|S3|G3',
                ],  # The last one does not match any alts.
                field_count='.'),
        })
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('<SYMBOLIC>')
    alt1._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'SYMBOLIC',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'SYMBOL': 'S1',
            'Gene': 'G1',
        }],
    }
    alt2 = processed_variant.AlternateBaseData('[13:123457[.')
    alt2._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: '[13',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'SYMBOL': 'S2',
            'Gene': 'G2',
        }],
    }
    alt3 = processed_variant.AlternateBaseData('C[10:10357[.')
    alt3._info = {
        'CSQ': [{
            processed_variant._ANNOTATION_ALT: 'C[10',
            'Consequence': 'C3',
            'IMPACT': 'I3',
            'SYMBOL': 'S3',
            'Gene': 'G3',
        }],
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
    # `in`-based assertion replaces the deprecated dict.has_key();
    # NOTE(review): assumes non_alt_info is a dict-like supporting `in`.
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    counters = counter_factory.counter_map
    self.assertEqual(counters[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
    self.assertEqual(
        counters[CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 1)
def test_sharded_rows(self):
    """Check that a variant is split into several rows at a reduced size limit.

    `bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES` is temporarily set to
    the JSON length of the first expected row plus 10. Two rows are expected:
    one with calls Sample1 and Sample2 and one with call Sample3, both
    repeating the variant-level fields.
    """
    info = {
        'AF': vcfio.VariantInfo([0.1, 0.2], 'A'),
        'AF2': vcfio.VariantInfo([0.2, 0.3], 'A'),
        'I1': vcfio.VariantInfo('some data', '1'),
    }
    calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
        vcfio.VariantCall(
            name='Sample3',
            genotype=[1, 0],
            info={'GQ': 30, 'FLAG1': True}),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=calls)

    def _expected_row(call_records):
        # Variant-level fields are identical in every shard; only the calls
        # differ. A fresh dict (with fresh nested lists) is built per row.
        return {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'C',
            ColumnKeyConstants.ALTERNATE_BASES: [
                {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
                 'AF': 0.1,
                 'AF2': 0.2},
                {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
                 'AF': 0.2,
                 'AF2': 0.3},
            ],
            ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
            ColumnKeyConstants.QUALITY: 2,
            ColumnKeyConstants.FILTER: ['PASS'],
            ColumnKeyConstants.CALLS: call_records,
            'I1': 'some data',
        }

    first_shard_calls = [
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'GQ': 20,
            'HQ': [10, 20],
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
            'FLAG1': True,
        },
    ]
    second_shard_calls = [
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample3',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 30,
            'FLAG1': True,
        },
    ]
    expected_rows = [
        _expected_row(first_shard_calls),
        _expected_row(second_shard_calls),
    ]

    original_max_row_size = bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES
    try:
        first_row_size = len(json.dumps(expected_rows[0]))
        bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES = first_row_size + 10
        actual_rows = self._get_row_list_from_variant(variant)
        self.assertEqual(expected_rows, actual_rows)
    finally:
        # Always restore the module-level limit for the other tests.
        bigquery_vcf_schema._MAX_BIGQUERY_ROW_SIZE_BYTES = (
            original_max_row_size)
def test_get_merged_variants_move_quality_and_filter_to_calls(self):
    """Check merging when quality and filter are copied into every call.

    Covers merging a single variant and merging both sample variants; each
    expected call carries the quality and filters of its source variant,
    while the merged info keeps 'A1' (from either variant), 'A2' and 'A3'.
    """
    strategy = move_to_calls_strategy.MoveToCallsStrategy(
        info_keys_to_move_to_calls_regex='',
        copy_quality_to_calls=True,
        copy_filter_to_calls=True)
    variants = self._get_sample_variants()

    # Test single variant merge.
    single_merged_variant = strategy.get_merged_variants([variants[0]])[0]
    self.assertEqual(
        [
            vcfio.VariantCall(
                name='Sample1',
                genotype=[0, 1],
                info={
                    'GQ': 20,
                    'HQ': [10, 20],
                    ColumnKeyConstants.QUALITY: 2,
                    ColumnKeyConstants.FILTER: ['PASS'],
                }),
            vcfio.VariantCall(
                name='Sample2',
                genotype=[1, 0],
                info={
                    'GQ': 10,
                    'FLAG1': True,
                    ColumnKeyConstants.QUALITY: 2,
                    ColumnKeyConstants.FILTER: ['PASS'],
                }),
        ],
        single_merged_variant.calls)

    # Test multiple variant merge.
    merged_variant = strategy.get_merged_variants(variants)[0]
    self._assert_common_expected_merged_fields(merged_variant)
    self.assertEqual(
        [
            vcfio.VariantCall(
                name='Sample1',
                genotype=[0, 1],
                info={
                    'GQ': 20,
                    'HQ': [10, 20],
                    ColumnKeyConstants.QUALITY: 2,
                    ColumnKeyConstants.FILTER: ['PASS'],
                }),
            vcfio.VariantCall(
                name='Sample2',
                genotype=[1, 0],
                info={
                    'GQ': 10,
                    'FLAG1': True,
                    ColumnKeyConstants.QUALITY: 2,
                    ColumnKeyConstants.FILTER: ['PASS'],
                }),
            vcfio.VariantCall(
                name='Sample3',
                genotype=[1, 1],
                info={
                    ColumnKeyConstants.QUALITY: 20,
                    ColumnKeyConstants.FILTER: ['q10'],
                }),
            vcfio.VariantCall(
                name='Sample4',
                genotype=[1, 0],
                info={
                    'GQ': 20,
                    ColumnKeyConstants.QUALITY: 20,
                    ColumnKeyConstants.FILTER: ['q10'],
                }),
        ],
        merged_variant.calls)
    self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
    # Either variant's 'A1' value is acceptable; assertIn reports the
    # offending value on failure, unlike assertTrue(x in ...).
    self.assertIn(merged_variant.info['A1'].data,
                  ('some data', 'some data2'))
    self.assertEqual(vcfio.VariantInfo(['data1', 'data2'], '2'),
                     merged_variant.info['A2'])
    self.assertEqual(vcfio.VariantInfo(['data3', 'data4'], '2'),
                     merged_variant.info['A3'])