def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
    """Runs SampleIdsCombiner with `preserve_sample_order=True` end to end.

    Two variants carry the same three calls in the order sample2, sample1,
    sample3. The combined output is expected to be a single list holding the
    sample ids in exactly that order.
    """
    sample_ids = [
        hash_name(sample_name)
        for sample_name in ('sample2', 'sample1', 'sample3')
    ]
    variant_calls = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
    ]
    # Each variant gets its own list holding the same three call objects.
    variants = [vcfio.Variant(calls=list(variant_calls)) for _ in range(2)]

    pipeline = TestPipeline()
    created_variants = pipeline | transforms.Create(variants)
    combiner = combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True)
    combined_sample_ids = (
        created_variants
        | 'CombineSampleIds' >> combiner
        | combiners.ToList())

    assert_that(combined_sample_ids, equal_to([sample_ids]))
    pipeline.run()
def test_merge_many_different_alternates(self):
    """Merges three variants that differ only in alternate base and sample.

    The assertion expects the strategy to hand back the same three variants
    (compared after sorting), i.e. nothing is combined.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    variants = []
    alternates_and_samples = (
        ('C', 'Sample1'),
        ('G', 'Sample2'),
        ('T', 'Sample3'),
    )
    for alternate, sample_name in alternates_and_samples:
        variant = vcfio.Variant(
            reference_name='1',
            start=1,
            end=2,
            reference_bases='A',
            alternate_bases=[alternate])
        variant.calls.append(
            vcfio.VariantCall(name=sample_name, genotype=[1, 0]))
        variants.append(variant)

    merged_variants = list(strategy.get_merged_variants(variants))
    self.assertEqual(sorted(merged_variants), sorted(variants))
def _get_sample_variant_3(file_name='', use_1_based_coordinate=False,
                          use_hashing=True):
    """Builds the third sample variant.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      file_name: Forwarded to `_get_hashing_function`.
      use_1_based_coordinate: When truthy the start is 12, otherwise 11.
      use_hashing: Forwarded to `_get_hashing_function`.

    Returns:
      A `vcfio.Variant` carrying calls for 'Sample1' and 'Sample2'.
    """
    to_sample_id = _get_hashing_function(file_name, use_hashing)

    if use_1_based_coordinate:
        start = 12
    else:
        start = 11

    variant = vcfio.Variant(
        reference_name='19',
        start=start,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info={'AF': [0.5]})

    sample_1_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample1'),
        genotype=[0, 1],
        phaseset='1',
        info={'GQ': 45})
    variant.calls.append(sample_1_call)

    # Sample2 has a single missing genotype value and no GQ.
    sample_2_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample2'),
        genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    variant.calls.append(sample_2_call)

    return variant
def test_schema_conflict_in_format_field_number(self):
    """Checks generated rows when call info values clash with the schema.

    Rows are requested with `allow_incompatible_records=True`. Per the
    expected row, 'FB' comes out as a boolean, 'FI' as a single value (or
    None for an empty list) and 'FSR' as a list.
    NOTE(review): the schema itself is set up outside this test -- confirm
    the field definitions against the fixture behind `self._row_generator`.
    """
    sample_1 = vcfio.VariantCall(
        name='Sample1',
        genotype=[0, 1],
        phaseset='*',
        info={'FB': [1, 2], 'FI': [1, 2], 'FSR': 'str'})
    sample_2 = vcfio.VariantCall(
        name='Sample2',
        genotype=[1, 0],
        info={'FB': [], 'FI': [], 'FSR': ''})
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        calls=[sample_1, sample_2])
    proc_variant = _get_processed_variant(variant)

    expected_calls = [
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'FB': True,
            'FI': 1,
            'FSR': ['str'],
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'FB': False,
            'FI': None,
            'FSR': [''],
        },
    ]
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: expected_calls,
    }

    actual_rows = list(
        self._row_generator.get_rows(
            proc_variant, allow_incompatible_records=True))
    self.assertEqual([expected_row], actual_rows)
def _get_sample_variant_3():
    """Builds the third sample variant together with its VCF text line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    vcf_line = ('19 12 . C <SYMBOLIC> 49 q10 AF=0.5 GT:PS:GQ 0|1:1:45 '
                '.:.:.\n')

    info = {'AF': vcfio.VariantInfo(data=[0.5], field_count='A')}
    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info=info)

    sample_1_call = vcfio.VariantCall(
        name='Sample1', genotype=[0, 1], phaseset='1', info={'GQ': 45})
    variant.calls.append(sample_1_call)

    sample_2_call = vcfio.VariantCall(
        name='Sample2',
        genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    variant.calls.append(sample_2_call)

    return variant, vcf_line
def _get_sample_variants(self):
    """Returns two sample variants, one on reference '19' and one on '20'.

    Both share start, end, bases and names; they differ in quality, filters,
    info and calls (Sample1/Sample2 versus Sample3/Sample4).
    """
    first_calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
    ]
    first_variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        info={'A1': 'some data', 'A2': ['data1', 'data2']},
        calls=first_calls)

    second_calls = [
        vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
        vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
    ]
    second_variant = vcfio.Variant(
        reference_name='20',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=20,
        filters=['q10'],
        info={'A1': 'some data2', 'A3': ['data3', 'data4']},
        calls=second_calls)

    return [first_variant, second_variant]
def _get_sample_variant_1():
    """Builds the first sample variant together with its VCF text line.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    vcf_line = ('20 1234 rs123;rs2 C A,T 50 PASS AF=0.5,0.1;NS=1 '
                'GT:GQ 0/0:48 1/0:20\n')

    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info)

    sample_calls = (
        ('Sample1', [0, 0], 48),
        ('Sample2', [1, 0], 20),
    )
    for sample_name, genotype, genotype_quality in sample_calls:
        variant.calls.append(
            vcfio.VariantCall(
                name=sample_name,
                genotype=genotype,
                info={'GQ': genotype_quality}))

    return variant, vcf_line
def _get_sample_variant_2():
    """Builds the second sample variant together with its VCF text line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    vcf_line = ('19 123 rs1234 GTC . 40 q10;s50 NS=2 GT:GQ 1|0:48 0/1:.\n')

    info = {'NS': vcfio.VariantInfo(data=2, field_count='1')}
    variant = vcfio.Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        info=info)

    phased_call = vcfio.VariantCall(
        name='Sample1',
        genotype=[1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE,
        info={'GQ': 48})
    variant.calls.append(phased_call)

    # Sample2 carries no phaseset and its GQ is missing.
    unphased_call = vcfio.VariantCall(
        name='Sample2', genotype=[0, 1], info={'GQ': None})
    variant.calls.append(unphased_call)

    return variant, vcf_line
def _get_sample_unmerged_variants(self):
    """Returns two variants on reference '19', each with a single call.

    The calls belong to the hashed samples 'Unmerged1' and 'Unmerged2'.
    """
    # Start/end are different from merged variants.
    first_call = vcfio.VariantCall(
        sample_id=hash_name('Unmerged1'), genotype=[0, 1])
    first_variant = vcfio.Variant(
        reference_name='19',
        start=123,
        end=125,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs2'],
        calls=[first_call])

    # Ordering of alternate_bases is different from merged variants.
    second_call = vcfio.VariantCall(
        sample_id=hash_name('Unmerged2'), genotype=[0, 1])
    second_variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['TT', 'A'],
        names=['rs3'],
        calls=[second_call])

    return [first_variant, second_variant]
def _get_sample_variant_1(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
    """Builds the first sample variant.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      file_name: Forwarded to `_get_hashing_function`.
      use_1_based_coordinate: Added to the base start of 1233.
      use_hashing: Forwarded to `_get_hashing_function`.
      move_hom_ref_calls: When truthy, 'Sample1' (genotype [0, 0]) is listed
        in `hom_ref_calls` as a (name, id) tuple instead of being appended
        to `calls`.

    Returns:
      The constructed `vcfio.Variant`.
    """
    to_sample_id = _get_hashing_function(file_name, use_hashing)

    start = 1233 + use_1_based_coordinate
    if move_hom_ref_calls:
        hom_ref_calls = [('Sample1', to_sample_id('Sample1'))]
    else:
        hom_ref_calls = None

    variant = vcfio.Variant(
        reference_name='20',
        start=start,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        hom_ref_calls=hom_ref_calls,
        info={'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']})

    if not move_hom_ref_calls:
        sample_1_call = vcfio.VariantCall(
            sample_id=to_sample_id('Sample1'),
            name='Sample1',
            genotype=[0, 0],
            info={'GQ': 48})
        variant.calls.append(sample_1_call)

    sample_2_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample2'),
        name='Sample2',
        genotype=[1, 0],
        info={'GQ': 20})
    variant.calls.append(sample_2_call)

    return variant
def _get_sample_variant(self):
    """Returns a sample variant whose info values are `vcfio.VariantInfo`.

    The variant sits on reference '19' and carries calls for 'Sample1' and
    'Sample2'.
    """
    info = {
        'A1': vcfio.VariantInfo('some data', '1'),
        'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
    }
    calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
    ]
    return vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=calls)
def _get_sample_variant_2(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
    """Builds the second sample variant.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      file_name: Forwarded to `_get_hashing_function`.
      use_1_based_coordinate: Added to the base start of 122.
      use_hashing: Forwarded to `_get_hashing_function`.
      move_hom_ref_calls: When truthy `hom_ref_calls` is an empty list,
        otherwise None. Both calls are appended either way.

    Returns:
      The constructed `vcfio.Variant`.
    """
    to_sample_id = _get_hashing_function(file_name, use_hashing)

    start = 122 + use_1_based_coordinate
    if move_hom_ref_calls:
        hom_ref_calls = []
    else:
        hom_ref_calls = None

    variant = vcfio.Variant(
        reference_name='19',
        start=start,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=['q10', 's50'],
        hom_ref_calls=hom_ref_calls,
        info={'NS': 2})

    sample_1_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample1'),
        name='Sample1',
        genotype=[-1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE,
        info={'GQ': 48})
    variant.calls.append(sample_1_call)

    sample_2_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample2'),
        name='Sample2',
        genotype=[0, -1],
        info={'GQ': None})
    variant.calls.append(sample_2_call)

    return variant
def test_overlapping_three_non_variants(self):
    """Merges three overlapping non-variants on reference '1'.

    The inputs span [0, 10), [3, 5) and [4, 9). The expected output is five
    pieces split at every input boundary, each carrying the calls of the
    inputs that cover it.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    call_1 = vcfio.VariantCall('1', [0, 0])
    call_2 = vcfio.VariantCall('2', [0, 0])
    call_3 = vcfio.VariantCall('3', [0, 0])

    non_variants = []
    for start, end, call in ((0, 10, call_1), (3, 5, call_2), (4, 9, call_3)):
        non_variant = vcfio.Variant(reference_name='1', start=start, end=end)
        non_variant.calls.append(call)
        non_variants.append(non_variant)

    # (start, end, calls covering that piece)
    expected_pieces = (
        (0, 3, (call_1,)),
        (3, 4, (call_1, call_2)),
        (4, 5, (call_1, call_2, call_3)),
        (5, 9, (call_1, call_3)),
        (9, 10, (call_1,)),
    )
    expected = []
    for start, end, piece_calls in expected_pieces:
        piece = vcfio.Variant(reference_name='1', start=start, end=end)
        for call in piece_calls:
            piece.calls.append(call)
        expected.append(piece)

    actual = list(strategy.get_merged_variants(non_variants))
    self.assertEqual(sorted(actual), sorted(expected))
def test_non_variant_split_by_snp(self):
    """Merges a non-variant spanning [0, 10) with a SNP at [5, 6).

    The expected output is three pieces: the non-variant before the SNP, the
    SNP itself carrying both calls, and the non-variant after the SNP.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    non_variant_call = vcfio.VariantCall(name='1', genotype=[0, 0])
    snp_call = vcfio.VariantCall(name='2', genotype=[1, 0])

    non_variant = vcfio.Variant(reference_name='1', start=0, end=10)
    non_variant.calls.append(non_variant_call)
    snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='C',
        alternate_bases=['A'])
    snp.calls.append(snp_call)

    before_snp = vcfio.Variant(reference_name='1', start=0, end=5)
    before_snp.calls.append(non_variant_call)

    at_snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='C',
        alternate_bases=['A'])
    at_snp.calls.append(non_variant_call)
    at_snp.calls.append(snp_call)

    after_snp = vcfio.Variant(reference_name='1', start=6, end=10)
    after_snp.calls.append(non_variant_call)

    actual = list(strategy.get_merged_variants([non_variant, snp]))
    expected = [before_snp, at_snp, after_snp]
    self.assertEqual(sorted(actual), sorted(expected))
def test_densify_variants_pipeline(self):
    """Runs DensifyVariants in a pipeline over two partially-called variants.

    The first variant has calls for samples 1 and 2, the second for samples
    2 and 3. The output is checked with `asserts.has_sample_ids` against all
    three sample ids.
    """
    sample_ids = [
        hash_name(sample_name)
        for sample_name in ('sample1', 'sample2', 'sample3')
    ]
    call_1, call_2, call_3 = [
        vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
    ]
    variants = [
        vcfio.Variant(calls=[call_1, call_2]),
        vcfio.Variant(calls=[call_2, call_3]),
    ]

    pipeline = TestPipeline()
    created_variants = pipeline | Create(variants)
    densifier = densify_variants.DensifyVariants(sample_ids)
    densified_variants = created_variants | 'DensifyVariants' >> densifier

    assert_that(densified_variants, asserts.has_sample_ids(sample_ids))
    pipeline.run()
def test_omit_empty_sample_calls(self):
    """Checks that empty calls are dropped with `omit_empty_sample_calls`.

    'Sample1' has no genotype and a None GQ, and 'Sample3' has only missing
    genotype values. The expected row keeps 'Sample2' alone.
    """
    no_genotype_call = vcfio.VariantCall(name='Sample1', info={'GQ': None})
    populated_call = vcfio.VariantCall(
        name='Sample2', genotype=[1, 0], info={'GQ': 10})
    missing_genotype_call = vcfio.VariantCall(
        name='Sample3',
        genotype=[vcfio.MISSING_GENOTYPE_VALUE,
                  vcfio.MISSING_GENOTYPE_VALUE])
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={},
        calls=[no_genotype_call, populated_call, missing_genotype_call])

    expected_call = {
        ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
    }
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [expected_call],
    }

    actual_rows = self._get_row_list_from_variant(
        variant, omit_empty_sample_calls=True)
    self.assertEqual([expected_row], actual_rows)
def test_convert_bq_row_to_variant(self):
    """Converts the fixture BigQuery row and compares it to a known variant.

    The row comes from `self._get_big_query_row()`. The conversion under test
    is `BigQueryToVariant._convert_bq_row_to_variant`.
    """
    row = self._get_big_query_row()

    expected_info = {
        'IFR': [0.2],
        'IFR2': [0.2, 0.3],
        'IS': 'some data',
        'ISR': ['data1', 'data2'],
    }
    expected_calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    expected_variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=expected_info,
        calls=expected_calls)

    bq_to_variant = bigquery_to_variant.BigQueryToVariant()
    actual_variant = bq_to_variant._convert_bq_row_to_variant(row)
    self.assertEqual(expected_variant, actual_variant)
def test_get_merged_variants_no_custom_options(self):
    """Merges sample variants with every custom option switched off.

    Covers two cases: merging a single variant returns it unchanged, and
    merging all sample variants yields one variant whose calls and info are
    the combination of the inputs.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        info_keys_to_move_to_calls_regex=None,
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Test single variant merge.
    actual = list(strategy.get_merged_variants([variants[0]]))
    self.assertEqual([variants[0]], actual)

    # Test multiple variant merge.
    merged_variant = list(strategy.get_merged_variants(variants))[0]
    self._assert_common_expected_merged_fields(merged_variant)
    expected_calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
        vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
        vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
    ]
    self.assertEqual(expected_calls, merged_variant.calls)

    # Info keys are unique, so comparing the sorted keys is an
    # order-insensitive check that does not rely on `assertItemsEqual`,
    # which only exists in Python 2's unittest.
    self.assertEqual(['A1', 'A2', 'A3'], sorted(merged_variant.info.keys()))
    # 'A1' is present in both inputs; either input's value is accepted.
    self.assertIn(merged_variant.info['A1'], ('some data', 'some data2'))
    self.assertEqual(['data1', 'data2'], merged_variant.info['A2'])
    self.assertEqual(['data3', 'data4'], merged_variant.info['A3'])
def test_merge_snp_with_non_variant(self):
    """Merges a SNP at [5, 6) with a '<NON_REF>' record spanning [0, 10).

    Per the expected variants, the non-variant pieces on either side of the
    SNP keep the non-variant's names, filters and quality, while the SNP
    keeps its own fields and gains the non-variant's call.
    """

    def make_non_ref(**overrides):
        # Fresh lists on every call so no two variants share a list object.
        fields = dict(
            reference_name='1',
            alternate_bases=['<NON_REF>'],
            names=['nv'],
            filters=['nvf'],
            quality=2)
        fields.update(overrides)
        return vcfio.Variant(**fields)

    def make_snp():
        return vcfio.Variant(
            reference_name='1',
            start=5,
            end=6,
            reference_bases='A',
            alternate_bases=['C'],
            names=['v'],
            filters=['vf'],
            quality=1)

    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    variant = make_snp()
    non_variant = make_non_ref(start=0, end=10, reference_bases='G')

    snp_call = vcfio.VariantCall(name='1', genotype=[1, 0])
    non_variant_call = vcfio.VariantCall(name='2', genotype=[0, 0])
    variant.calls.append(snp_call)
    non_variant.calls.append(non_variant_call)

    # The expected non-variant pieces are built without reference bases.
    before_snp = make_non_ref(start=0, end=5)
    after_snp = make_non_ref(start=6, end=10)
    at_snp = make_snp()
    before_snp.calls.append(non_variant_call)
    after_snp.calls.append(non_variant_call)
    at_snp.calls.append(snp_call)
    at_snp.calls.append(non_variant_call)

    actual = list(strategy.get_merged_variants([variant, non_variant]))
    expected = [before_snp, after_snp, at_snp]
    self.assertEqual(sorted(actual), sorted(expected))
def test_merge_2_non_variants(self):
    """Merges two overlapping '<NON_REF>' records, [0, 10) and [5, 15).

    Per the expected variants, the non-overlapping ends keep their source's
    names, filters and quality. The overlap [5, 10) carries both calls, the
    union of names and filters, and quality 1.
    """

    def make_non_ref(start, end, names, filters, quality):
        return vcfio.Variant(
            reference_name='1',
            start=start,
            end=end,
            alternate_bases=['<NON_REF>'],
            names=names,
            filters=filters,
            quality=quality)

    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    non_variant_1 = make_non_ref(0, 10, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    non_variant_2 = make_non_ref(5, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)

    call_1 = vcfio.VariantCall(name='1', genotype=[0, 0])
    call_2 = vcfio.VariantCall(name='2', genotype=[0, 0])
    non_variant_1.calls.append(call_1)
    non_variant_2.calls.append(call_2)

    only_first = make_non_ref(0, 5, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    only_second = make_non_ref(10, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    overlap = make_non_ref(
        5, 10, ['nonv1', 'nonv2', 'nonv3'], ['f1', 'f2', 'f3'], 1)
    only_first.calls.append(call_1)
    only_second.calls.append(call_2)
    overlap.calls.append(call_1)
    overlap.calls.append(call_2)

    actual = list(
        strategy.get_merged_variants([non_variant_1, non_variant_2]))
    expected = [only_first, only_second, overlap]
    self.assertEqual(sorted(actual), sorted(expected))
def _get_sample_variant_1(is_for_nucleus=False):
    """Builds the first sample variant together with its VCF text line.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      is_for_nucleus: When truthy, AF is [0.5, 0.25] and the SVTYPE info
        field is left out, in both the variant and the text line.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    if is_for_nucleus:
        # 0.1 -> 0.25 float precision loss due to binary floating point
        # conversion.
        vcf_line = ('20 1234 rs123;rs2 C A,T 50 '
                    'PASS AF=0.5,0.25;NS=1 GT:GQ 0/0:48 1/0:20\n')
        info = {'AF': [0.5, 0.25], 'NS': 1}
    else:
        vcf_line = ('20 1234 rs123;rs2 C A,T 50 '
                    'PASS AF=0.5,0.1;NS=1;SVTYPE=BÑD GT:GQ 0/0:48 1/0:20\n')
        info = {'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']}

    # Everything except `info` is the same for both flavours.
    variant = vcfio.Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info)
    variant.calls.append(
        vcfio.VariantCall(name='Sample1', genotype=[0, 0], info={'GQ': 48}))
    variant.calls.append(
        vcfio.VariantCall(name='Sample2', genotype=[1, 0], info={'GQ': 20}))

    return variant, vcf_line
def _get_sample_variant_3(is_for_nucleus=False):
    """Builds the third sample variant together with its VCF text line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      is_for_nucleus: When truthy, the filter is 'PASS' instead of 'q10' and
        Sample2 has two missing genotype values with empty info.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    if is_for_nucleus:
        # '.:.:.' -> './.:.:.' due to Nucleus handling of
        # VariantCall.genotype.
        vcf_line = ('19 12 . C <SYMBOLIC> 49 PASS '
                    'AF=0.5 GT:PS:GQ 0|1:1:45 ./.:.:.\n')
        filters = ['PASS']
        sample_2_genotype = [
            vcfio.MISSING_GENOTYPE_VALUE,
            vcfio.MISSING_GENOTYPE_VALUE,
        ]
        sample_2_info = {}
    else:
        vcf_line = ('19 12 . C <SYMBOLIC> 49 q10 AF=0.5 '
                    'GT:PS:GQ 0|1:1:45 .:.:.\n')
        filters = ['q10']
        sample_2_genotype = [vcfio.MISSING_GENOTYPE_VALUE]
        sample_2_info = {'GQ': None}

    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=filters,
        info={'AF': [0.5]})
    variant.calls.append(
        vcfio.VariantCall(
            name='Sample1', genotype=[0, 1], phaseset='1', info={'GQ': 45}))
    variant.calls.append(
        vcfio.VariantCall(
            name='Sample2', genotype=sample_2_genotype, info=sample_2_info))

    return variant, vcf_line
def _get_sample_variant_2(is_for_nucleus=False):
    """Builds the second sample variant together with its VCF text line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      is_for_nucleus: When truthy, the filters collapse to 'PASS' and
        Sample2's info is empty instead of holding a None GQ.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # NOTE(review): columns are separated by single spaces here; VCF records
    # are normally tab-delimited -- confirm the separators.
    if is_for_nucleus:
        # 'q10;s50' -> 'PASS' due to missing header fields.
        vcf_line = ('19 123 rs1234 GTC . 40 PASS NS=2 '
                    'GT:GQ 1|0:48 0/1:.\n')
        filters = ['PASS']
        sample_2_info = {}
    else:
        vcf_line = ('19 123 rs1234 GTC . 40 q10;s50 NS=2 '
                    'GT:GQ 1|0:48 0/1:.\n')
        filters = ['q10', 's50']
        sample_2_info = {'GQ': None}

    variant = vcfio.Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=filters,
        info={'NS': 2})
    variant.calls.append(
        vcfio.VariantCall(
            name='Sample1',
            genotype=[1, 0],
            phaseset=vcfio.DEFAULT_PHASESET_VALUE,
            info={'GQ': 48}))
    variant.calls.append(
        vcfio.VariantCall(
            name='Sample2', genotype=[0, 1], info=sample_2_info))

    return variant, vcf_line
def test_all_fields(self):
    """Checks the row generated for a variant that populates every field.

    Per the expected row, the info fields whose header number is 'A' ('IFR',
    'IFR2') are spread across the alternate-base records. 'IS' and 'ISR'
    stay at the top level of the row.
    """
    calls = [
        vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
        vcfio.VariantCall(
            name='Sample3',
            genotype=[vcfio.MISSING_GENOTYPE_VALUE]),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=calls)
    header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

    expected_alternates = [
        {
            ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
            'IFR': 0.1,
            'IFR2': 0.2,
        },
        {
            ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
            'IFR': 0.2,
            'IFR2': 0.3,
        },
    ]
    expected_calls = [
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'GQ': 20,
            'FIR': [10, 20],
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
            'FB': True,
        },
        {
            ColumnKeyConstants.CALLS_NAME: 'Sample3',
            ColumnKeyConstants.CALLS_GENOTYPE: [vcfio.MISSING_GENOTYPE_VALUE],
            ColumnKeyConstants.CALLS_PHASESET: None,
        },
    ]
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: expected_alternates,
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: expected_calls,
        'IS': 'some data',
        'ISR': ['data1', 'data2'],
    }

    actual_rows = self._get_row_list_from_variant(variant, header_num_dict)
    self.assertEqual([expected_row], actual_rows)
def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
    """Returns a sample variant, its expected row and the header numbers.

    Args:
      split_alternate_allele_info_fields: When truthy, 'IFR' and 'IFR2' are
        placed inside each alternate-base record of the row. Otherwise the
        alternate-base records hold only the alt and both fields sit at the
        top level of the row as lists.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    variant_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=variant_calls)
    header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

    row_calls = [
        {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'GQ': 20,
            'FIR': [10, 20],
        },
        {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
            'FB': True,
        },
    ]
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: row_calls,
        'IS': 'some data',
        'ISR': ['data1', 'data2'],
    }

    if not split_alternate_allele_info_fields:
        row[ColumnKeyConstants.ALTERNATE_BASES] = [
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A'},
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT'},
        ]
        row['IFR'] = [0.1, 0.2]
        row['IFR2'] = [0.2, 0.3]
        return variant, row, header_num_dict

    row[ColumnKeyConstants.ALTERNATE_BASES] = [
        {
            ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
            'IFR': 0.1,
            'IFR2': 0.2,
        },
        {
            ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
            'IFR': 0.2,
            'IFR2': 0.3,
        },
    ]
    return variant, row, header_num_dict
def _get_sample_variant_with_empty_calls(self):
    """Returns a variant whose only call is empty, plus its expected row.

    The single call ('EmptySample') has an empty genotype and empty info.
    The expected row lists no calls at all.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    empty_call = vcfio.VariantCall(
        name='EmptySample', genotype=[], phaseset='*', info={})
    variant = vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info={'II': 1234},
        calls=[empty_call])
    header_num_dict = {'II': '1'}
    row = {
        ColumnKeyConstants.REFERENCE_NAME: '20',
        ColumnKeyConstants.START_POSITION: 123,
        ColumnKeyConstants.END_POSITION: 125,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10', 's10'],
        ColumnKeyConstants.CALLS: [],
        'II': 1234,
    }
    return variant, row, header_num_dict
def _densify_variants(self, variant, all_call_names):
    # type: (vcf_parser.Variant, List[str]) -> vcf_parser.Variant
    """Cherry-picks calls for the variant.

    The calls are in the same order as the `all_call_names`.

    Args:
      variant: The variant that will be modified to contain calls for
        `all_call_names`.
      all_call_names: A list of sample names that are used to cherry-pick
        each variant's calls. If one call is missing, an empty `VariantCall`
        is added. Existing calls whose name is not listed are dropped.

    Returns:
      `variant` modified to contain calls for `all_call_names`.
    """
    # If several calls share a name, the last one wins.
    calls_by_name = {call.name: call for call in variant.calls}

    new_calls = []
    for call_name in all_call_names:
        # One lookup per name; None marks a sample with no call on this
        # variant (every dict value is a call object whose `.name` was read).
        call = calls_by_name.get(call_name)
        if call is None:
            # NOTE(review): genotype is passed as a bare
            # MISSING_GENOTYPE_VALUE rather than a list -- confirm this is
            # what `VariantCall` expects.
            call = vcfio.VariantCall(
                name=call_name, genotype=vcfio.MISSING_GENOTYPE_VALUE)
        new_calls.append(call)

    variant.calls = new_calls
    return variant
def _get_sample_variant_with_incompatible_records(self):
    """Returns a variant with mismatched value types, plus its expected row.

    In the variant 'IFR' holds strings, 'IS' and 'ISR' hold the int 1, and
    the call's 'FIR' holds floats. In the expected row 'IFR' holds floats,
    'IS' is the string '1', 'ISR' is ['1'] and 'FIR' holds ints.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    sample_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'GQ': 20, 'FIR': [10.0, 20.0]})
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        filters=['PASS'],
        info={'IFR': ['0.1', '0.2'], 'IS': 1, 'ISR': 1},
        calls=[sample_call])
    header_num_dict = {'IFR': '2', 'IS': '1', 'ISR': '1'}

    row_call = {
        ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'FIR': [10, 20],
    }
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [row_call],
        'IFR': [0.1, 0.2],
        'IS': '1',
        'ISR': ['1'],
    }
    return variant, row, header_num_dict
def test_add_missing_calls(self):
    """Checks that `_densify_variants` fills in calls for missing samples.

    The variant starts with a call for 'sample2' only. After densifying
    against three sample names it must hold one call per name, in the order
    the names were requested.
    """
    transform = densify_variants.DensifyVariants()
    variant = vcfio.Variant(calls=[vcfio.VariantCall(name='sample2')])

    new_variant = transform._densify_variants(
        variant, ['sample1', 'sample2', 'sample3'])

    call_names = [call.name for call in new_variant.calls]
    # Ordered comparison: this pins the output order as well as the names,
    # and avoids `assertItemsEqual`, which only exists in Python 2's
    # unittest.
    self.assertEqual(['sample1', 'sample2', 'sample3'], call_names)
def _get_sample_variant_1(self):
    """Returns a sample variant on 'chr19' with one call for 'Sample1'.

    The info dict mixes value types: strings ('IS', 'ISI', 'ISF'), a float
    ('IF'), a boolean ('IB') and a list of ints ('IA').
    """
    info = {
        'IS': 'some data',
        'ISI': '1',
        'ISF': '1.0',
        'IF': 1.0,
        'IB': True,
        'IA': [1, 2],
    }
    sample_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'FI': 20, 'FU': [10.0, 20.0]})
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info,
        calls=[sample_call])
    return variant