def test_merge_many_different_alternates(self):
    """Variants that differ only in their alternate base are expected unmerged.

    Three variants share reference name, position and reference base but carry
    the alternates C, G and T (one sample each). The merged output is asserted
    to equal the input once both sides are sorted.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    variants = []
    for sample_name, alternate in (('Sample1', 'C'),
                                   ('Sample2', 'G'),
                                   ('Sample3', 'T')):
        variant = vcfio.Variant(reference_name='1', start=1, end=2,
                                reference_bases='A',
                                alternate_bases=[alternate])
        variant.calls.append(
            vcfio.VariantCall(name=sample_name, genotype=[1, 0]))
        variants.append(variant)

    merged = list(strategy.get_merged_variants(variants))
    self.assertEqual(sorted(merged), sorted(variants))
def _get_variants(self):
    """Build two variants for the same site that differ in quality/filters/info.

    Returns:
      A two-element list of `vcfio.Variant` objects.
    """
    def make_variant(quality, filters, info):
        # Location, bases and names are identical for both variants.
        return vcfio.Variant(reference_name='19', start=11, end=12,
                             reference_bases='C',
                             alternate_bases=['A', 'TT'],
                             names=['rs1'],
                             quality=quality,
                             filters=filters,
                             info=info)

    first = make_variant(
        2, ['PASS'], {'A1': 'some data', 'A2': ['data1', 'data2']})
    second = make_variant(
        20, ['q10'], {'A1': 'some data2', 'A3': ['data3', 'data4']})
    return [first, second]
def test_densify_variants_pipeline(self):
    """Run DensifyVariants in a test pipeline and check the sample ids.

    Two variants are created, each holding only two of the three samples; the
    output is checked with `asserts.has_sample_ids` against all three ids.
    """
    sample_ids = [
        hash_name(name) for name in ('sample1', 'sample2', 'sample3')
    ]
    variant_calls = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
    ]
    variants = [
        vcfio.Variant(calls=[variant_calls[0], variant_calls[1]]),
        vcfio.Variant(calls=[variant_calls[1], variant_calls[2]]),
    ]

    pipeline = TestPipeline()
    created = pipeline | Create(variants)
    densified_variants = (
        created
        | 'DensifyVariants' >> densify_variants.DensifyVariants(sample_ids))
    assert_that(densified_variants, asserts.has_sample_ids(sample_ids))
    pipeline.run()
def _get_sample_variants(self):
    """Build two sample variants (reference names '19' and '20'), two calls each.

    Returns:
      A two-element list of `vcfio.Variant` objects.
    """
    calls_for_19 = [
        vcfio.VariantCall(name='Sample1', genotype=[0, 1], phaseset='*',
                          info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(name='Sample2', genotype=[1, 0],
                          info={'GQ': 10, 'FLAG1': True}),
    ]
    on_19 = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        alternate_bases=['A', 'TT'], names=['rs1'], quality=2,
        filters=['PASS'],
        info={'A1': 'some data', 'A2': ['data1', 'data2']},
        calls=calls_for_19)

    calls_for_20 = [
        vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
        vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
    ]
    on_20 = vcfio.Variant(
        reference_name='20', start=11, end=12, reference_bases='C',
        alternate_bases=['A', 'TT'], names=['rs1'], quality=20,
        filters=['q10'],
        info={'A1': 'some data2', 'A3': ['data3', 'data4']},
        calls=calls_for_20)

    return [on_19, on_20]
def _get_sample_unmerged_variants(self):
    """Build two single-call variants described as differing from the merged ones.

    Returns:
      A two-element list of `vcfio.Variant` objects.
    """
    unmerged_call_1 = vcfio.VariantCall(
        sample_id=hash_name('Unmerged1'), genotype=[0, 1])
    # Start/end are different from merged variants.
    shifted_position = vcfio.Variant(
        reference_name='19', start=123, end=125, reference_bases='C',
        alternate_bases=['A', 'TT'], names=['rs2'],
        calls=[unmerged_call_1])

    unmerged_call_2 = vcfio.VariantCall(
        sample_id=hash_name('Unmerged2'), genotype=[0, 1])
    # Ordering of alternate_bases is different from merged variants.
    reordered_alternates = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        alternate_bases=['TT', 'A'], names=['rs3'],
        calls=[unmerged_call_2])

    return [shifted_position, reordered_alternates]
def test_schema_conflict_in_info_field_type(self):
    """Check row generation when info values do not match the expected types.

    With `allow_incompatible_records=True` the expected row holds converted
    values (for example 1 -> True and 1.1 -> 1). A string value for the 'II'
    field is expected to raise ValueError instead.
    """
    header_num_dict = {'IB': '1', 'II': '1', 'IFR': '2', 'ISR': '2'}
    convertible_variant = vcfio.Variant(
        reference_name='chr19', start=11, end=12, reference_bases='CT',
        alternate_bases=[], filters=[],
        info={'IB': 1, 'II': 1.1, 'IFR': [1, 2], 'ISR': [1.0, 2.0]})
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'IB': True,
        'II': 1,
        'IFR': [1.0, 2.0],
        'ISR': ['1.0', '2.0'],
    }
    actual_rows = self._get_row_list_from_variant(
        convertible_variant, header_num_dict,
        allow_incompatible_records=True)
    self.assertEqual([expected_row], actual_rows)

    with self.assertRaises(ValueError):
        unconvertible_variant = vcfio.Variant(
            reference_name='chr19', start=11, end=12, reference_bases='CT',
            alternate_bases=[], filters=[],
            # A string cannot be cast to an integer.
            info={'II': '1.1'})
        self._get_row_list_from_variant(
            unconvertible_variant, {'II': '1'},
            allow_incompatible_records=True)
        # Only reached when the call above did not raise.
        self.fail('String data for an integer schema must cause an exception')
def test_non_variant_split_by_snp(self):
    """A non-variant spanning [0, 10) overlapped by a SNP at [5, 6).

    The expected output is three records: [0, 5) and [6, 10) carrying only the
    non-variant call, and the SNP at [5, 6) carrying both calls.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    non_variant = vcfio.Variant(reference_name='1', start=0, end=10)
    snp = vcfio.Variant(reference_name='1', start=5, end=6,
                        reference_bases='C', alternate_bases=['A'])
    ref_call = vcfio.VariantCall(name='1', genotype=[0, 0])
    alt_call = vcfio.VariantCall(name='2', genotype=[1, 0])
    non_variant.calls.append(ref_call)
    snp.calls.append(alt_call)

    left = vcfio.Variant(reference_name='1', start=0, end=5)
    middle = vcfio.Variant(reference_name='1', start=5, end=6,
                           reference_bases='C', alternate_bases=['A'])
    right = vcfio.Variant(reference_name='1', start=6, end=10)
    left.calls.append(ref_call)
    for call in (ref_call, alt_call):
        middle.calls.append(call)
    right.calls.append(ref_call)

    actual = list(strategy.get_merged_variants([non_variant, snp]))
    self.assertEqual(sorted(actual), sorted([left, middle, right]))
def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
    """Run SampleIdsCombiner with preserve_sample_order=True in a test pipeline.

    Both variants list the samples in the order sample2, sample1, sample3; the
    combined output is asserted to be that same ordered list.
    """
    sample_ids = [
        hash_name(name) for name in ('sample2', 'sample1', 'sample3')
    ]
    variant_calls = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
    ]
    variants = [
        vcfio.Variant(
            calls=[variant_calls[0], variant_calls[1], variant_calls[2]]),
        vcfio.Variant(
            calls=[variant_calls[0], variant_calls[1], variant_calls[2]]),
    ]

    pipeline = TestPipeline()
    created = pipeline | transforms.Create(variants)
    combined_sample_ids = (
        created
        | 'CombineSampleIds' >> combine_sample_ids.SampleIdsCombiner(
            preserve_sample_order=True)
        | combiners.ToList())
    assert_that(combined_sample_ids, equal_to([sample_ids]))
    pipeline.run()
def test_merge_snp_with_non_variant(self):
    """A SNP at [5, 6) merged with a <NON_REF> record spanning [0, 10).

    Expected output: two <NON_REF> pieces ([0, 5) and [6, 10)) that keep the
    non-variant's names/filters/quality and only its call, plus the SNP with
    its own names/filters/quality and both calls.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    snp = vcfio.Variant(
        reference_name='1', start=5, end=6, reference_bases='A',
        alternate_bases=['C'], names=['v'], filters=['vf'], quality=1)
    non_variant = vcfio.Variant(
        reference_name='1', start=0, end=10, reference_bases='G',
        alternate_bases=['<NON_REF>'], names=['nv'], filters=['nvf'],
        quality=2)
    snp_call = vcfio.VariantCall(name='1', genotype=[1, 0])
    non_variant_call = vcfio.VariantCall(name='2', genotype=[0, 0])
    snp.calls.append(snp_call)
    non_variant.calls.append(non_variant_call)

    def non_variant_piece(start, end):
        # The expected pieces are built without reference_bases.
        return vcfio.Variant(
            reference_name='1', start=start, end=end,
            alternate_bases=['<NON_REF>'], names=['nv'], filters=['nvf'],
            quality=2)

    before_snp = non_variant_piece(0, 5)
    after_snp = non_variant_piece(6, 10)
    merged_snp = vcfio.Variant(
        reference_name='1', start=5, end=6, reference_bases='A',
        alternate_bases=['C'], names=['v'], filters=['vf'], quality=1)
    before_snp.calls.append(non_variant_call)
    after_snp.calls.append(non_variant_call)
    for call in (snp_call, non_variant_call):
        merged_snp.calls.append(call)

    actual = list(strategy.get_merged_variants([snp, non_variant]))
    expected = [before_snp, after_snp, merged_snp]
    self.assertEqual(sorted(actual), sorted(expected))
def test_get_snp_merge_keys(self):
    """Check the first merge key produced for two single-base variants.

    The strategy is built with a fourth constructor argument of 2; the expected
    keys are '1:2' for start=3 and '2:4' for start=4.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None, 2)
    cases = (
        (vcfio.Variant(reference_name='1', start=3, end=4), '1:2'),
        (vcfio.Variant(reference_name='2', start=4, end=5), '2:4'),
    )
    for variant, expected_key in cases:
        self.assertEqual(next(strategy.get_merge_keys(variant)), expected_key)
def test_merge_2_non_variants(self):
    """Two overlapping <NON_REF> records: [0, 10) and [5, 15).

    Expected output: the non-overlapping ends [0, 5) and [10, 15) keep their
    source record's names/filters/quality and call; the overlap [5, 10) has
    the combined names and filters, quality 1 and both calls.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    def non_ref(start, end, names, filters, quality):
        return vcfio.Variant(
            reference_name='1', start=start, end=end,
            alternate_bases=['<NON_REF>'], names=names, filters=filters,
            quality=quality)

    first = non_ref(0, 10, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    second = non_ref(5, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    first_call = vcfio.VariantCall(name='1', genotype=[0, 0])
    second_call = vcfio.VariantCall(name='2', genotype=[0, 0])
    first.calls.append(first_call)
    second.calls.append(second_call)

    only_first = non_ref(0, 5, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    only_second = non_ref(10, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    overlap = non_ref(5, 10, ['nonv1', 'nonv2', 'nonv3'],
                      ['f1', 'f2', 'f3'], 1)
    only_first.calls.append(first_call)
    only_second.calls.append(second_call)
    for call in (first_call, second_call):
        overlap.calls.append(call)

    actual = list(strategy.get_merged_variants([first, second]))
    expected = [only_first, only_second, overlap]
    self.assertEqual(sorted(actual), sorted(expected))
def _get_sample_variants(self):
    """Build four minimal variants, including one with all-None location fields.

    The second and fourth variants are constructed with identical arguments.

    Returns:
      A four-element list of `vcfio.Variant` objects.
    """
    # (reference_name, start, end, reference_bases)
    specs = (
        ('chr19', 11, 12, 'C'),
        ('20', 123, 125, 'CT'),
        ('20', None, None, None),
        ('20', 123, 125, 'CT'),
    )
    return [
        vcfio.Variant(reference_name=name, start=start, end=end,
                      reference_bases=bases)
        for name, start, end, bases in specs
    ]
def _get_sample_variant_1(is_for_nucleus=False):
    """Get first sample variant and its VCF line.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      is_for_nucleus: When truthy, return the variant/line pair without the
        SVTYPE info field and with AF=0.5,0.25.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # 0.1 -> 0.25 float precision loss due to binary floating point
        # conversion.
        vcf_line = ('20 1234 rs123;rs2 C A,T 50 '
                    'PASS AF=0.5,0.25;NS=1 GT:GQ 0/0:48 1/0:20\n')
        info = {'AF': [0.5, 0.25], 'NS': 1}
    else:
        vcf_line = ('20 1234 rs123;rs2 C A,T 50 '
                    'PASS AF=0.5,0.1;NS=1;SVTYPE=BÑD GT:GQ 0/0:48 1/0:20\n')
        info = {'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']}

    variant = vcfio.Variant(reference_name='20', start=1233, end=1234,
                            reference_bases='C',
                            alternate_bases=['A', 'T'],
                            names=['rs123', 'rs2'],
                            quality=50,
                            filters=['PASS'],
                            info=info)
    # The two calls are the same in both modes.
    for sample_name, genotype, genotype_quality in (('Sample1', [0, 0], 48),
                                                    ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample_name, genotype=genotype,
                              info={'GQ': genotype_quality}))
    return variant, vcf_line
def _get_sample_variant_3(is_for_nucleus=False):
    """Get third sample variant and its VCF line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      is_for_nucleus: When truthy, return the pair with a 'PASS' filter and a
        two-element missing genotype with empty info for Sample2.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # '.:.:.' -> './.:.:.' due to Nucleus handling of VariantCall.genotype.
        vcf_line = ('19 12 . C <SYMBOLIC> 49 PASS '
                    'AF=0.5 GT:PS:GQ 0|1:1:45 ./.:.:.\n')
        filters = ['PASS']
        missing_call = vcfio.VariantCall(
            name='Sample2',
            genotype=[vcfio.MISSING_GENOTYPE_VALUE,
                      vcfio.MISSING_GENOTYPE_VALUE],
            info={})
    else:
        vcf_line = ('19 12 . C <SYMBOLIC> 49 q10 AF=0.5 '
                    'GT:PS:GQ 0|1:1:45 .:.:.\n')
        filters = ['q10']
        missing_call = vcfio.VariantCall(
            name='Sample2',
            genotype=[vcfio.MISSING_GENOTYPE_VALUE],
            info={'GQ': None})

    variant = vcfio.Variant(reference_name='19', start=11, end=12,
                            reference_bases='C',
                            alternate_bases=['<SYMBOLIC>'],
                            quality=49,
                            filters=filters,
                            info={'AF': [0.5]})
    # Sample1 is identical in both modes.
    variant.calls.append(
        vcfio.VariantCall(name='Sample1', genotype=[0, 1], phaseset='1',
                          info={'GQ': 45}))
    variant.calls.append(missing_call)
    return variant, vcf_line
def _get_sample_variant_2(is_for_nucleus=False):
    """Get second sample variant and its VCF line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      is_for_nucleus: When truthy, return the pair with a single 'PASS' filter
        and an empty info dict for Sample2.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # 'q10;s50' -> 'PASS' due to missing header fields.
        vcf_line = ('19 123 rs1234 GTC . 40 PASS NS=2 '
                    'GT:GQ 1|0:48 0/1:.\n')
        filters = ['PASS']
        sample2_info = {}
    else:
        vcf_line = ('19 123 rs1234 GTC . 40 q10;s50 NS=2 '
                    'GT:GQ 1|0:48 0/1:.\n')
        filters = ['q10', 's50']
        sample2_info = {'GQ': None}

    variant = vcfio.Variant(reference_name='19', start=122, end=125,
                            reference_bases='GTC',
                            alternate_bases=[],
                            names=['rs1234'],
                            quality=40,
                            filters=filters,
                            info={'NS': 2})
    # Sample1 is identical in both modes.
    variant.calls.append(
        vcfio.VariantCall(name='Sample1', genotype=[1, 0],
                          phaseset=vcfio.DEFAULT_PHASESET_VALUE,
                          info={'GQ': 48}))
    variant.calls.append(
        vcfio.VariantCall(name='Sample2', genotype=[0, 1],
                          info=sample2_info))
    return variant, vcf_line
def test_overlapping_three_non_variants(self):
    """Three overlapping non-variants: [0, 10), [3, 5) and [4, 9).

    The expected output is five consecutive pieces, each carrying the calls of
    every input record that covers it.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    non_variant_1 = vcfio.Variant(reference_name='1', start=0, end=10)
    non_variant_2 = vcfio.Variant(reference_name='1', start=3, end=5)
    non_variant_3 = vcfio.Variant(reference_name='1', start=4, end=9)
    call_1 = vcfio.VariantCall('1', [0, 0])
    call_2 = vcfio.VariantCall('2', [0, 0])
    call_3 = vcfio.VariantCall('3', [0, 0])
    non_variant_1.calls.append(call_1)
    non_variant_2.calls.append(call_2)
    non_variant_3.calls.append(call_3)

    # (start, end, calls expected on that piece)
    pieces = (
        (0, 3, (call_1,)),
        (3, 4, (call_1, call_2)),
        (4, 5, (call_1, call_2, call_3)),
        (5, 9, (call_1, call_3)),
        (9, 10, (call_1,)),
    )
    expected = [
        vcfio.Variant(reference_name='1', start=start, end=end)
        for start, end, _ in pieces
    ]
    for expected_variant, (_, _, piece_calls) in zip(expected, pieces):
        for call in piece_calls:
            expected_variant.calls.append(call)

    actual = list(
        strategy.get_merged_variants(
            [non_variant_1, non_variant_2, non_variant_3]))
    self.assertEqual(sorted(actual), sorted(expected))
def test_nonstandard_float_values(self):
    """Check how inf, -inf and nan info values appear in the generated row.

    Expected: inf -> sys.maxint, -inf -> -sys.maxint, nan inside a list ->
    -sys.maxint, and a scalar nan -> None.
    """
    # NOTE(review): sys.maxint exists only on Python 2; confirm the target
    # interpreter before porting this test.
    largest = sys.maxint
    null_replacement_value = -largest

    info = {
        'F1': vcfio.VariantInfo(float('inf'), '1'),
        'F2': vcfio.VariantInfo([float('-inf'), float('nan'), 1.2], '3'),
        'F3': vcfio.VariantInfo(float('nan'), '1'),
    }
    variant = vcfio.Variant(
        reference_name='chr19', start=11, end=12, reference_bases='CT',
        alternate_bases=[], filters=[], info=info)

    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [],
        'F1': largest,
        'F2': [-largest, null_replacement_value, 1.2],
        'F3': None,
    }
    actual_rows = self._get_row_list_from_variant(variant)
    self.assertEqual([expected_row], actual_rows)
def _get_sample_variant_with_empty_calls(self):
    """Build a variant whose only call has an empty genotype and empty info.

    Returns:
      A `(variant, row, header_num_dict)` tuple; `row` is the fixture's
      expected row, which has an empty CALLS list.
    """
    empty_call = vcfio.VariantCall(
        name='EmptySample', genotype=[], phaseset='*', info={})
    variant = vcfio.Variant(reference_name='20', start=123, end=125,
                            reference_bases='CT',
                            alternate_bases=[],
                            filters=['q10', 's10'],
                            info={'II': 1234},
                            calls=[empty_call])
    row = {
        ColumnKeyConstants.REFERENCE_NAME: '20',
        ColumnKeyConstants.START_POSITION: 123,
        ColumnKeyConstants.END_POSITION: 125,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10', 's10'],
        ColumnKeyConstants.CALLS: [],
        'II': 1234,
    }
    header_num_dict = {'II': '1'}
    return variant, row, header_num_dict
def _get_sample_variant_2():
    """Get second sample variant and its VCF line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    vcf_line = ('19 123 rs1234 GTC . 40 q10;s50 NS=2 GT:GQ 1|0:48 0/1:.\n')
    info = {'NS': vcfio.VariantInfo(data=2, field_count='1')}
    variant = vcfio.Variant(
        reference_name='19', start=122, end=125, reference_bases='GTC',
        alternate_bases=[], names=['rs1234'], quality=40,
        filters=['q10', 's50'], info=info)

    phased_call = vcfio.VariantCall(
        name='Sample1', genotype=[1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE, info={'GQ': 48})
    variant.calls.append(phased_call)
    unphased_call = vcfio.VariantCall(
        name='Sample2', genotype=[0, 1], info={'GQ': None})
    variant.calls.append(unphased_call)
    return variant, vcf_line
def _get_sample_variant_1():
    """Get first sample variant and its VCF line.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    vcf_line = ('20 1234 rs123;rs2 C A,T 50 PASS AF=0.5,0.1;NS=1 '
                'GT:GQ 0/0:48 1/0:20\n')
    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(reference_name='20', start=1233, end=1234,
                            reference_bases='C',
                            alternate_bases=['A', 'T'],
                            names=['rs123', 'rs2'],
                            quality=50,
                            filters=['PASS'],
                            info=info)
    for sample_name, genotype, genotype_quality in (('Sample1', [0, 0], 48),
                                                    ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample_name, genotype=genotype,
                              info={'GQ': genotype_quality}))
    return variant, vcf_line
def _get_sample_variant_3():
    """Get third sample variant and its VCF line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    vcf_line = ('19 12 . C <SYMBOLIC> 49 q10 AF=0.5 GT:PS:GQ 0|1:1:45 '
                '.:.:.\n')
    info = {'AF': vcfio.VariantInfo(data=[0.5], field_count='A')}
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        alternate_bases=['<SYMBOLIC>'], quality=49, filters=['q10'],
        info=info)

    called = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], phaseset='1', info={'GQ': 45})
    variant.calls.append(called)
    not_called = vcfio.VariantCall(
        name='Sample2', genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    variant.calls.append(not_called)
    return variant, vcf_line
def _get_sample_variant_1(self):
    """Build a single-call variant with string, float, bool and list info values.

    Returns:
      A `vcfio.Variant`.
    """
    info = {
        'IS': 'some data',
        'ISI': '1',
        'ISF': '1.0',
        'IF': 1.0,
        'IB': True,
        'IA': [1, 2],
    }
    sample_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'FI': 20, 'FU': [10.0, 20.0]})
    return vcfio.Variant(reference_name='chr19', start=11, end=12,
                         reference_bases='C',
                         alternate_bases=['A', 'TT'],
                         names=['rs1', 'rs2'],
                         quality=2,
                         filters=['PASS'],
                         info=info,
                         calls=[sample_call])
def test_unicode_fields(self):
    """Check that unicode and utf-8 encoded inputs yield the same row values.

    Filters and info are given as a mix of a unicode string and its utf-8
    encoding; the expected row holds the unicode string in every position.
    """
    sample_unicode_str = u'\xc3\xb6'
    sample_utf8_str = sample_unicode_str.encode('utf-8')
    mixed_pair = [sample_unicode_str, sample_utf8_str]
    unicode_pair = [sample_unicode_str, sample_unicode_str]

    variant = vcfio.Variant(reference_name='chr19', start=11, end=12,
                            reference_bases='CT',
                            alternate_bases=[],
                            filters=list(mixed_pair),
                            info={
                                'IS': sample_utf8_str,
                                'ISR': list(mixed_pair),
                            })
    header_num_dict = {'IS': '1', 'ISR': '2'}
    proc_variant = _get_processed_variant(variant, header_num_dict)

    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: list(unicode_pair),
        ColumnKeyConstants.CALLS: [],
        'IS': sample_unicode_str,
        'ISR': list(unicode_pair),
    }
    actual_rows = list(self._row_generator.get_rows(proc_variant))
    self.assertEqual([expected_row], actual_rows)
def test_no_alternate_bases(self):
    """Check the generated row for a variant with an empty alternate_bases list.

    The expected row has empty ALTERNATE_BASES and CALLS lists and carries the
    filter and info values unchanged.
    """
    info = {'IS': 'some data', 'ISR': ['data1', 'data2']}
    variant = vcfio.Variant(reference_name='chr19', start=11, end=12,
                            reference_bases='CT',
                            alternate_bases=[],
                            filters=['q10'],
                            info=info)
    header_num_dict = {'IS': '1', 'ISR': '2'}
    proc_variant = _get_processed_variant(variant, header_num_dict)

    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10'],
        ColumnKeyConstants.CALLS: [],
        'IS': 'some data',
        'ISR': ['data1', 'data2'],
    }
    actual_rows = list(self._row_generator.get_rows(proc_variant))
    self.assertEqual([expected_row], actual_rows)
def test_create_processed_variant_annotation_alt_prefix(self):
    """Check CSQ annotations whose ALT field omits the leading reference base.

    The variant has reference 'C' and alternates 'CT', 'CC', 'CCC', while the
    CSQ entries name the alternates as 'T', 'C', 'CC'. Each annotation is
    expected to be attached to the corresponding alternate, 'CSQ' must not
    remain in the non-alternate info, and the counters must report one
    variant, three ALT matches and zero mismatches.
    """
    # The returned variant is ignored as we create a custom one next.
    _, header_fields = self._get_sample_variant_and_header_with_csq()
    variant = vcfio.Variant(
        reference_name='19', start=11, end=12, reference_bases='C',
        alternate_bases=['CT', 'CC', 'CCC'],
        names=['rs1'], quality=2,
        filters=['PASS'],
        info={'CSQ': ['T|C1|I1|S1|G1', 'C|C2|I2|S2|G2', 'CC|C3|I3|S3|G3']})
    counter_factory = _CounterSpyFactory()
    factory = processed_variant.ProcessedVariantFactory(
        header_fields,
        split_alternate_allele_info_fields=True,
        annotation_fields=['CSQ'],
        counter_factory=counter_factory)
    proc_var = factory.create_processed_variant(variant)

    alt1 = processed_variant.AlternateBaseData('CT')
    alt1._info = {
        'CSQ': [{
            annotation_parser.ANNOTATION_ALT: 'T',
            'Consequence': 'C1',
            'IMPACT': 'I1',
            'SYMBOL': 'S1',
            'Gene': 'G1'
        }]
    }
    alt2 = processed_variant.AlternateBaseData('CC')
    alt2._info = {
        'CSQ': [{
            annotation_parser.ANNOTATION_ALT: 'C',
            'Consequence': 'C2',
            'IMPACT': 'I2',
            'SYMBOL': 'S2',
            'Gene': 'G2'
        }]
    }
    alt3 = processed_variant.AlternateBaseData('CCC')
    alt3._info = {
        'CSQ': [{
            annotation_parser.ANNOTATION_ALT: 'CC',
            'Consequence': 'C3',
            'IMPACT': 'I3',
            'SYMBOL': 'S3',
            'Gene': 'G3'
        }]
    }
    self.assertEqual(proc_var.alternate_data_list, [alt1, alt2, alt3])
    # dict.has_key() was removed in Python 3; a membership assertion works on
    # both major versions and reports the container contents on failure.
    self.assertNotIn('CSQ', proc_var.non_alt_info)
    self.assertEqual(
        counter_factory.counter_map[CEnum.VARIANT.value].get_value(), 1)
    self.assertEqual(
        counter_factory.counter_map[
            CEnum.ANNOTATION_ALT_MATCH.value].get_value(), 3)
    self.assertEqual(
        counter_factory.counter_map[
            CEnum.ANNOTATION_ALT_MISMATCH.value].get_value(), 0)
def _get_sample_variant_1(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
    """Get first sample variant.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      file_name: Passed to `_get_hashing_function`.
      use_1_based_coordinate: Added to the start position (True adds 1).
      use_hashing: Passed to `_get_hashing_function`.
      move_hom_ref_calls: When truthy, Sample1 (genotype [0, 0]) is listed in
        `hom_ref_calls` instead of being appended to `calls`.

    Returns:
      A `vcfio.Variant`.
    """
    hash_name_method = _get_hashing_function(file_name, use_hashing)

    if move_hom_ref_calls:
        hom_ref_calls = [('Sample1', hash_name_method('Sample1'))]
    else:
        hom_ref_calls = None

    variant = vcfio.Variant(
        reference_name='20',
        start=1233 + use_1_based_coordinate,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        hom_ref_calls=hom_ref_calls,
        info={'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']})

    if not move_hom_ref_calls:
        sample1_call = vcfio.VariantCall(
            sample_id=hash_name_method('Sample1'), name='Sample1',
            genotype=[0, 0], info={'GQ': 48})
        variant.calls.append(sample1_call)
    sample2_call = vcfio.VariantCall(
        sample_id=hash_name_method('Sample2'), name='Sample2',
        genotype=[1, 0], info={'GQ': 20})
    variant.calls.append(sample2_call)
    return variant
def _get_sample_variant_2(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
    """Get second sample variant.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      file_name: Passed to `_get_hashing_function`.
      use_1_based_coordinate: Added to the start position (True adds 1).
      use_hashing: Passed to `_get_hashing_function`.
      move_hom_ref_calls: When truthy, `hom_ref_calls` is an empty list;
        otherwise it is None. Both calls are appended either way.

    Returns:
      A `vcfio.Variant`.
    """
    hash_name_method = _get_hashing_function(file_name, use_hashing)

    if move_hom_ref_calls:
        hom_ref_calls = []
    else:
        hom_ref_calls = None

    variant = vcfio.Variant(
        reference_name='19',
        start=122 + use_1_based_coordinate,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        hom_ref_calls=hom_ref_calls,
        info={'NS': 2})

    sample1_call = vcfio.VariantCall(
        sample_id=hash_name_method('Sample1'), name='Sample1',
        genotype=[-1, 0], phaseset=vcfio.DEFAULT_PHASESET_VALUE,
        info={'GQ': 48})
    variant.calls.append(sample1_call)
    sample2_call = vcfio.VariantCall(
        sample_id=hash_name_method('Sample2'), name='Sample2',
        genotype=[0, -1], info={'GQ': None})
    variant.calls.append(sample2_call)
    return variant
def _get_sample_variant(self):
    """Build a two-call variant whose info values are wrapped in VariantInfo.

    Returns:
      A `vcfio.Variant`.
    """
    info = {
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
    }
    calls = [
        vcfio.VariantCall(name='Sample1', genotype=[0, 1],
                          info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(name='Sample2', genotype=[1, 0],
                          info={'GQ': 10, 'FLAG1': True}),
    ]
    return vcfio.Variant(reference_name='19', start=11, end=12,
                         reference_bases='C',
                         alternate_bases=['A', 'TT'],
                         names=['rs1', 'rs2'],
                         quality=2,
                         filters=['PASS'],
                         info=info,
                         calls=calls)
def test_get_merge_keys(self):
    """Check the first merge key as variant fields are filled in one by one.

    The expected key is five colon-separated parts: reference name, start,
    end, and the strategy's hash of the reference bases and of the
    comma-joined alternate bases. Falsy fields contribute an empty value.
    """
    strategy = move_to_calls_strategy.MoveToCallsStrategy(None, None, None)

    def get_expected_key(reference_name, start, end, reference_bases,
                         alternate_bases):
        name_part = reference_name or ''
        start_part = str(start or '')
        end_part = str(end or '')
        reference_hash = strategy._get_hash(reference_bases or '')
        alternates_hash = strategy._get_hash(','.join(alternate_bases or []))
        return '%s:%s:%s:%s:%s' % (
            name_part, start_part, end_part, reference_hash, alternates_hash)

    def assert_key(*expected_fields):
        self.assertEqual(get_expected_key(*expected_fields),
                         next(strategy.get_merge_keys(variant)))

    variant = vcfio.Variant()
    assert_key(None, None, None, None, None)

    variant.reference_name = '19'
    assert_key(19, None, None, None, None)

    variant.start = 123
    variant.end = 125
    variant.reference_bases = 'AT'
    assert_key(19, 123, 125, 'AT', None)

    variant.alternate_bases = ['A', 'C']
    assert_key(19, 123, 125, 'AT', ['A', 'C'])
def test_schema_conflict_in_format_field_number(self):
    """Check call-level info values whose element count differs from the row's.

    With `allow_incompatible_records=True`, the expected rows turn list values
    into scalars for 'FB'/'FI' (empty lists become False/None) and wrap the
    scalar 'FSR' string in a list.
    """
    first_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], phaseset='*',
        info={'FB': [1, 2], 'FI': [1, 2], 'FSR': 'str'})
    second_call = vcfio.VariantCall(
        name='Sample2', genotype=[1, 0],
        info={'FB': [], 'FI': [], 'FSR': ''})
    variant = vcfio.Variant(reference_name='chr19', start=11, end=12,
                            reference_bases='CT',
                            alternate_bases=[],
                            filters=[],
                            calls=[first_call, second_call])
    proc_variant = _get_processed_variant(variant)

    expected_first_call = {
        ColumnKeyConstants.CALLS_NAME: 'Sample1',
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'FB': True,
        'FI': 1,
        'FSR': ['str'],
    }
    expected_second_call = {
        ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'FB': False,
        'FI': None,
        'FSR': [''],
    }
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [expected_first_call, expected_second_call],
    }

    actual_rows = list(
        self._row_generator.get_rows(proc_variant,
                                     allow_incompatible_records=True))
    self.assertEqual([expected_row], actual_rows)