def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
    """Expects the combiner to emit sample ids in the order the calls list them.

    Both input variants carry the same three calls in the same (unsorted)
    order and the combiner runs with `preserve_sample_order=True`.
    """
    ordered_ids = [
        hash_name(name) for name in ('sample2', 'sample1', 'sample3')
    ]
    calls = [vcfio.VariantCall(sample_id=sid) for sid in ordered_ids]
    # Each variant gets its own list holding the same call objects.
    input_variants = [vcfio.Variant(calls=list(calls)) for _ in range(2)]

    pipeline = TestPipeline()
    combiner = combine_sample_ids.SampleIdsCombiner(
        preserve_sample_order=True)
    combined_sample_ids = (
        pipeline
        | transforms.Create(input_variants)
        | 'CombineSampleIds' >> combiner
        | combiners.ToList())
    assert_that(combined_sample_ids, equal_to([ordered_ids]))
    pipeline.run()
def _get_sample_variant_with_incompatible_records(self):
    """Returns a variant, its expected row and the header number dict.

    The variant's values are typed differently from the expected row's:
    `IFR` holds strings where the row holds floats, `IS` is an int where the
    row holds a string, `ISR` is a scalar where the row holds a list, and the
    call's `FIR` holds floats where the row holds ints.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    sample_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'GQ': 20, 'FIR': [10.0, 20.0]})
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        filters=['PASS'],
        info={'IFR': ['0.1', '0.2'], 'IS': 1, 'ISR': 1},
        calls=[sample_call])
    header_num_dict = {'IFR': '2', 'IS': '1', 'ISR': '1'}

    expected_call = {
        ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'FIR': [10, 20],
    }
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [expected_call],
        'IFR': [0.1, 0.2],
        'IS': '1',
        'ISR': ['1'],
    }
    return variant, row, header_num_dict
def test_merge_many_different_alternates(self):
    """Expects same-position variants with distinct alternates to stay apart.

    Three variants share reference name, range and reference base but each
    has a different alternate base; the merged output must equal the input.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    inputs = []
    for alt, sample in (('C', 'Sample1'), ('G', 'Sample2'), ('T', 'Sample3')):
        candidate = vcfio.Variant(
            reference_name='1',
            start=1,
            end=2,
            reference_bases='A',
            alternate_bases=[alt])
        candidate.calls.append(
            vcfio.VariantCall(sample_id=hash_name(sample), genotype=[1, 0]))
        inputs.append(candidate)

    merged = list(strategy.get_merged_variants(inputs))
    # Output order is not part of the contract being checked here.
    self.assertEqual(sorted(merged), sorted(inputs))
def _get_sample_unmerged_variants(self):
    """Returns two single-call variants used as "unmerged" samples."""
    # This one differs from the merged variants in its start/end.
    different_range = vcfio.Variant(
        reference_name='19',
        start=123,
        end=125,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs2'],
        calls=[
            vcfio.VariantCall(
                sample_id=hash_name('Unmerged1'), genotype=[0, 1]),
        ])
    # This one differs from the merged variants in alternate_bases ordering.
    different_alt_order = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['TT', 'A'],
        names=['rs3'],
        calls=[
            vcfio.VariantCall(
                sample_id=hash_name('Unmerged2'), genotype=[0, 1]),
        ])
    return [different_range, different_alt_order]
def test_non_variant_split_by_snp(self):
    """Expects a SNP inside a non-variant range to split it into three.

    The pieces before and after the SNP keep only the non-variant's call;
    the SNP record is expected to carry both calls.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    ref_block = vcfio.Variant(reference_name='1', start=0, end=10)
    snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='C',
        alternate_bases=['A'])
    ref_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[0, 0])
    snp_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[1, 0])
    ref_block.calls.append(ref_call)
    snp.calls.append(snp_call)

    before_snp = vcfio.Variant(reference_name='1', start=0, end=5)
    at_snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='C',
        alternate_bases=['A'])
    after_snp = vcfio.Variant(reference_name='1', start=6, end=10)
    before_snp.calls.append(ref_call)
    at_snp.calls.append(ref_call)
    at_snp.calls.append(snp_call)
    after_snp.calls.append(ref_call)

    actual = list(strategy.get_merged_variants([ref_block, snp]))
    expected = [before_snp, at_snp, after_snp]
    self.assertEqual(sorted(actual), sorted(expected))
def _get_sample_variant(self):
    """Returns one sample variant with two calls and two info fields."""
    first_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        info={'GQ': 20, 'HQ': [10, 20]})
    second_call = vcfio.VariantCall(
        sample_id=hash_name('Sample2'),
        genotype=[1, 0],
        info={'GQ': 10, 'FLAG1': True})
    return vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={'A1': 'some data', 'A2': ['data1', 'data2']},
        calls=[first_call, second_call])
def test_densify_variants_pipeline(self):
    """Expects DensifyVariants output to carry all three sample ids.

    Each input variant only has two of the three samples' calls.
    """
    sample_ids = [
        hash_name(name) for name in ('sample1', 'sample2', 'sample3')
    ]
    calls = [vcfio.VariantCall(sample_id=sid) for sid in sample_ids]
    sparse_variants = [
        vcfio.Variant(calls=calls[:2]),  # samples 1 and 2
        vcfio.Variant(calls=calls[1:]),  # samples 2 and 3
    ]

    pipeline = TestPipeline()
    densify = densify_variants.DensifyVariants(sample_ids)
    densified_variants = (
        pipeline | Create(sparse_variants) | 'DensifyVariants' >> densify)
    assert_that(densified_variants, asserts.has_sample_ids(sample_ids))
    pipeline.run()
def _get_big_query_row():
    # type: (...) -> Dict[unicode, Any]
    """Returns one sample BigQuery row for testing.

    The row describes a `chr19` record at [11, 12) with two alternates
    (`A`, `TT`) and two calls (`Sample1`, `Sample2`). Every key and most
    string values are passed through `str()`.

    Returns:
      A dict keyed by `ColumnKeyConstants` column names plus the custom
      `IS`/`ISR` fields; per-alternate `IFR`/`IFR2` values live inside the
      alternate-bases records and `GQ`/`FIR`/`FB` inside the call records.
    """
    first_call = {
        str(ColumnKeyConstants.CALLS_SAMPLE_ID): (str(hash_name('Sample1'))),
        str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1],
        str(ColumnKeyConstants.CALLS_PHASESET): str('*'),
        str('GQ'): 20,
        str('FIR'): [10, 20],
    }
    second_call = {
        str(ColumnKeyConstants.CALLS_SAMPLE_ID): (str(hash_name('Sample2'))),
        # Heterozygous alt/ref; the tests consuming this row expect [1, 0].
        str(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0],
        str(ColumnKeyConstants.CALLS_PHASESET): None,
        str('GQ'): 10,
        str('FB'): True,
    }
    alternates = [
        {
            str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('A'),
            str('IFR'): 1,
            str('IFR2'): 0.2,
        },
        {
            str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('TT'),
            str('IFR'): 0.2,
            str('IFR2'): 0.3,
        },
    ]
    row = {
        str(ColumnKeyConstants.REFERENCE_NAME): str('chr19'),
        str(ColumnKeyConstants.START_POSITION): 11,
        str(ColumnKeyConstants.END_POSITION): 12,
        str(ColumnKeyConstants.REFERENCE_BASES): 'C',
        str(ColumnKeyConstants.NAMES): [str('rs1'), str('rs2')],
        str(ColumnKeyConstants.QUALITY): 2,
        str(ColumnKeyConstants.FILTER): [str('PASS')],
        str(ColumnKeyConstants.CALLS): [first_call, second_call],
        str(ColumnKeyConstants.ALTERNATE_BASES): alternates,
        str('IS'): str('some data'),
        str('ISR'): [str('data1'), str('data2')],
    }
    return row
def test_write_to_shards_pipeline(self):
    """Runs WriteToShards over the sample variants into a temp directory.

    No output is asserted; the test passes if the pipeline runs.
    """
    with temp_dir.TempDir() as tempdir:
        pipeline = TestPipeline()
        source = pipeline | Create(self._get_variants())
        writer = write_variants_to_shards.WriteToShards(
            tempdir.get_path(),
            [hash_name('Sample 1'), hash_name('Sample 2')])
        _ = source | 'WriteToShards' >> writer
        pipeline.run()
def test_merge_snp_with_non_variant(self):
    """Expects a SNP overlapping a `<NON_REF>` block to split that block.

    The block's pieces on either side of the SNP keep the block's names,
    filters, quality and call; the SNP record keeps its own fields and is
    expected to carry both calls.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='A',
        alternate_bases=['C'],
        names=['v'],
        filters=['vf'],
        quality=1)
    ref_block = vcfio.Variant(
        reference_name='1',
        start=0,
        end=10,
        reference_bases='G',
        alternate_bases=['<NON_REF>'],
        names=['nv'],
        filters=['nvf'],
        quality=2)
    snp_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[1, 0])
    ref_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[0, 0])
    snp.calls.append(snp_call)
    ref_block.calls.append(ref_call)

    before_snp = vcfio.Variant(
        reference_name='1',
        start=0,
        end=5,
        alternate_bases=['<NON_REF>'],
        names=['nv'],
        filters=['nvf'],
        quality=2)
    after_snp = vcfio.Variant(
        reference_name='1',
        start=6,
        end=10,
        alternate_bases=['<NON_REF>'],
        names=['nv'],
        filters=['nvf'],
        quality=2)
    at_snp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=6,
        reference_bases='A',
        alternate_bases=['C'],
        names=['v'],
        filters=['vf'],
        quality=1)
    before_snp.calls.append(ref_call)
    after_snp.calls.append(ref_call)
    at_snp.calls.append(snp_call)
    at_snp.calls.append(ref_call)

    actual = list(strategy.get_merged_variants([snp, ref_block]))
    expected = [before_snp, after_snp, at_snp]
    self.assertEqual(sorted(actual), sorted(expected))
def test_omit_empty_sample_calls(self):
    """Expects only the call with real data to survive in the row.

    With `omit_empty_sample_calls=True`, the call with no genotype and a
    None `GQ`, and the call whose genotype is all missing values, are both
    expected to be absent from the generated row.
    """
    no_data_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'), info={'GQ': None})
    populated_call = vcfio.VariantCall(
        sample_id=hash_name('Sample2'), genotype=[1, 0], info={'GQ': 10})
    missing_genotype_call = vcfio.VariantCall(
        sample_id=hash_name('Sample3'),
        genotype=[vcfio.MISSING_GENOTYPE_VALUE,
                  vcfio.MISSING_GENOTYPE_VALUE])
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={},
        calls=[no_data_call, populated_call, missing_genotype_call])
    proc_variant = _get_processed_variant(variant)

    surviving_call = {
        ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
    }
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [surviving_call],
    }
    actual_rows = list(
        self._row_generator.get_rows(
            proc_variant, omit_empty_sample_calls=True))
    self.assertEqual([expected_row], actual_rows)
def test_merge_2_non_variants(self):
    """Expects two overlapping `<NON_REF>` blocks to yield three records.

    The non-overlapping head [0, 5) and tail [10, 15) keep their source
    block's fields and call. The overlap [5, 10) is expected to carry both
    calls, the combined names and filters, and quality 1.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    left_block = vcfio.Variant(
        reference_name='1',
        start=0,
        end=10,
        alternate_bases=['<NON_REF>'],
        names=['nonv1', 'nonv2'],
        filters=['f1', 'f2'],
        quality=1)
    right_block = vcfio.Variant(
        reference_name='1',
        start=5,
        end=15,
        alternate_bases=['<NON_REF>'],
        names=['nonv2', 'nonv3'],
        filters=['f2', 'f3'],
        quality=2)
    left_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[0, 0])
    right_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[0, 0])
    left_block.calls.append(left_call)
    right_block.calls.append(right_call)

    head = vcfio.Variant(
        reference_name='1',
        start=0,
        end=5,
        alternate_bases=['<NON_REF>'],
        names=['nonv1', 'nonv2'],
        filters=['f1', 'f2'],
        quality=1)
    tail = vcfio.Variant(
        reference_name='1',
        start=10,
        end=15,
        alternate_bases=['<NON_REF>'],
        names=['nonv2', 'nonv3'],
        filters=['f2', 'f3'],
        quality=2)
    overlap = vcfio.Variant(
        reference_name='1',
        start=5,
        end=10,
        alternate_bases=['<NON_REF>'],
        names=['nonv1', 'nonv2', 'nonv3'],
        filters=['f1', 'f2', 'f3'],
        quality=1)
    head.calls.append(left_call)
    tail.calls.append(right_call)
    overlap.calls.append(left_call)
    overlap.calls.append(right_call)

    actual = list(strategy.get_merged_variants([left_block, right_block]))
    expected = [head, tail, overlap]
    self.assertEqual(sorted(actual), sorted(expected))
def _get_sample_variants(self):
    """Returns two sample variants, each carrying two calls.

    The first is on reference `19` with samples 1 and 2; the second is on
    reference `20` with samples 3 and 4.
    """
    calls_on_19 = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
    ]
    variant_on_19 = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=2,
        filters=['PASS'],
        info={'A1': 'some data', 'A2': ['data1', 'data2']},
        calls=calls_on_19)

    calls_on_20 = [
        vcfio.VariantCall(sample_id=hash_name('Sample3'), genotype=[1, 1]),
        vcfio.VariantCall(
            sample_id=hash_name('Sample4'),
            genotype=[1, 0],
            info={'GQ': 20}),
    ]
    variant_on_20 = vcfio.Variant(
        reference_name='20',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1'],
        quality=20,
        filters=['q10'],
        info={'A1': 'some data2', 'A3': ['data3', 'data4']},
        calls=calls_on_20)

    return [variant_on_19, variant_on_20]
def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
    """Returns a two-call variant, its expected row and header number dict.

    Args:
      split_alternate_allele_info_fields: If True, the expected row nests
        `IFR` and `IFR2` per alternate inside the alternate-bases records.
        Otherwise those records hold only the alternate and `IFR`/`IFR2`
        are top-level row entries holding the full lists.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    variant_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=variant_calls)
    header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

    row_calls = [
        {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'GQ': 20,
            'FIR': [10, 20],
        },
        {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
            'FB': True,
        },
    ]
    row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: row_calls,
        'IS': 'some data',
        'ISR': ['data1', 'data2'],
    }

    if not split_alternate_allele_info_fields:
        # Per-alternate info stays at the top level of the row.
        row[ColumnKeyConstants.ALTERNATE_BASES] = [
            {ColumnKeyConstants.ALTERNATE_BASES_ALT: alt}
            for alt in ('A', 'TT')
        ]
        row['IFR'] = [0.1, 0.2]
        row['IFR2'] = [0.2, 0.3]
    else:
        # Per-alternate info is distributed into each alternate's record.
        row[ColumnKeyConstants.ALTERNATE_BASES] = [
            {
                ColumnKeyConstants.ALTERNATE_BASES_ALT: alt,
                'IFR': ifr,
                'IFR2': ifr2,
            }
            for alt, ifr, ifr2 in (('A', 0.1, 0.2), ('TT', 0.2, 0.3))
        ]
    return variant, row, header_num_dict
def _get_sample_variant_with_empty_calls(self):
    """Returns a variant with one data-less call, its row and header dict.

    The variant's only call has an empty genotype and empty info; the
    expected row has an empty calls list.

    Returns:
      A `(variant, row, header_num_dict)` tuple.
    """
    empty_call = vcfio.VariantCall(
        sample_id=hash_name('EmptySample'),
        genotype=[],
        phaseset='*',
        info={})
    variant = vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info={'II': 1234},
        calls=[empty_call])
    header_num_dict = {'II': '1'}
    row = {
        ColumnKeyConstants.REFERENCE_NAME: '20',
        ColumnKeyConstants.START_POSITION: 123,
        ColumnKeyConstants.END_POSITION: 125,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.FILTER: ['q10', 's10'],
        ColumnKeyConstants.CALLS: [],
        'II': 1234,
    }
    return variant, row, header_num_dict
def _get_sample_variant_1(self):
    """Returns a `chr19` sample variant with mixed-type info and one call."""
    info_fields = {
        'IS': 'some data',
        'ISI': '1',
        'ISF': '1.0',
        'IF': 1.0,
        'IB': True,
        'IA': [1, 2],
    }
    only_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'FI': 20, 'FU': [10.0, 20.0]})
    return vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info=info_fields,
        calls=[only_call])
def _get_bigquery_row_and_variant(self):
    """Returns a sample BigQuery row together with its matching variant.

    The row's first alternate record has `IFR` set to None, and the
    variant's `IFR` info holds only the second alternate's value.

    Returns:
      A `(row, variant)` tuple.
    """
    row_calls = [
        {
            unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): (
                hash_name('Sample1')),
            unicode(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1],
            unicode(ColumnKeyConstants.CALLS_PHASESET): unicode('*'),
            unicode('GQ'): 20,
            unicode('FIR'): [10, 20],
        },
        {
            unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): (
                hash_name('Sample2')),
            unicode(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0],
            unicode(ColumnKeyConstants.CALLS_PHASESET): None,
            unicode('GQ'): 10,
            unicode('FB'): True,
        },
    ]
    row_alternates = [
        {
            unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('A'),
            unicode('IFR'): None,
            unicode('IFR2'): 0.2,
        },
        {
            unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('TT'),
            unicode('IFR'): 0.2,
            unicode('IFR2'): 0.3,
        },
    ]
    row = {
        unicode(ColumnKeyConstants.REFERENCE_NAME): unicode('chr19'),
        unicode(ColumnKeyConstants.START_POSITION): 11,
        unicode(ColumnKeyConstants.END_POSITION): 12,
        unicode(ColumnKeyConstants.REFERENCE_BASES): 'C',
        unicode(ColumnKeyConstants.NAMES): ['rs1', 'rs2'],
        unicode(ColumnKeyConstants.QUALITY): 2,
        unicode(ColumnKeyConstants.FILTER): ['PASS'],
        unicode(ColumnKeyConstants.CALLS): row_calls,
        unicode(ColumnKeyConstants.ALTERNATE_BASES): row_alternates,
        unicode('IS'): unicode('some data'),
        unicode('ISR'): [unicode('data1'), unicode('data2')],
    }

    variant_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=variant_calls)
    return row, variant
def test_sample_ids_combiner_pipeline_duplicate_sample_ids(self):
    """Expects a ValueError when one variant lists the same sample twice."""
    repeated_call = vcfio.VariantCall(sample_id=hash_name('sample1'))
    input_variants = [vcfio.Variant(calls=[repeated_call, repeated_call])]

    pipeline = TestPipeline()
    combiner = combine_sample_ids.SampleIdsCombiner()
    _ = (
        pipeline
        | transforms.Create(input_variants)
        | 'CombineSampleIds' >> combiner
        | combiners.ToList())
    # The failure is expected to surface when the pipeline runs.
    with self.assertRaises(ValueError):
        pipeline.run()
def test_variant_to_bq_row_to_variant(self):
    """Expects variant -> row -> variant conversion to be lossless.

    The variant has three calls, the last with a single missing genotype
    value and no other data.
    """
    original_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample3'),
            genotype=[vcfio.MISSING_GENOTYPE_VALUE]),
    ]
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [0.1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=original_calls)
    header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

    proc_variant = _get_processed_variant(variant, header_num_dict)
    rows = list(self._row_generator.get_rows(proc_variant))
    round_tripped = self._variant_generator.convert_bq_row_to_variant(
        rows[0])
    self.assertEqual(variant, round_tripped)
def test_get_variant_calls(self):
    """Expects the sample row's call records to convert to two VariantCalls."""
    call_records = _get_big_query_row()[ColumnKeyConstants.CALLS]
    expected_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    actual_calls = self._variant_generator._get_variant_calls(call_records)
    self.assertEqual(expected_calls, actual_calls)
def test_get_merged_variants_move_info_to_calls(self):
    """Expects info keys matching `^A1$` to move from variants into calls.

    Covers merging a single variant and merging both sample variants; in
    the latter case only `A2` and `A3` are expected to remain in the merged
    variant's info.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        info_keys_to_move_to_calls_regex='^A1$',
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Single variant merge.
    merged_single = list(strategy.get_merged_variants([variants[0]]))[0]
    expected_single_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'}),
    ]
    self.assertEqual(expected_single_calls, merged_single.calls)

    # Multiple variant merge.
    merged_variant = list(strategy.get_merged_variants(variants))[0]
    self._assert_common_expected_merged_fields(merged_variant)
    expected_merged_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            info={'GQ': 20, 'HQ': [10, 20], 'A1': 'some data'}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True, 'A1': 'some data'}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample3'),
            genotype=[1, 1],
            info={'A1': 'some data2'}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample4'),
            genotype=[1, 0],
            info={'GQ': 20, 'A1': 'some data2'}),
    ]
    self.assertEqual(expected_merged_calls, merged_variant.calls)

    remaining_info = merged_variant.info
    self.assertItemsEqual(['A2', 'A3'], remaining_info.keys())
    self.assertEqual(['data1', 'data2'], remaining_info['A2'])
    self.assertEqual(['data3', 'data4'], remaining_info['A3'])
def test_get_merged_variants_no_custom_options(self):
    """Expects a plain merge when nothing is moved or copied into calls.

    A single variant must come back unchanged. Merging both sample variants
    is expected to concatenate their calls untouched and keep `A1`, `A2`
    and `A3` in the merged info, with `A1` taken from either input.
    """
    strategy = move_to_calls_strategy.MoveToCallsStrategy(
        info_keys_to_move_to_calls_regex=None,
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Single variant merge.
    self.assertEqual(
        [variants[0]], strategy.get_merged_variants([variants[0]]))

    # Multiple variant merge.
    merged_variant = strategy.get_merged_variants(variants)[0]
    self._assert_common_expected_merged_fields(merged_variant)
    expected_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            info={'GQ': 20, 'HQ': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FLAG1': True}),
        vcfio.VariantCall(sample_id=hash_name('Sample3'), genotype=[1, 1]),
        vcfio.VariantCall(
            sample_id=hash_name('Sample4'),
            genotype=[1, 0],
            info={'GQ': 20}),
    ]
    self.assertEqual(expected_calls, merged_variant.calls)

    merged_info = merged_variant.info
    self.assertItemsEqual(['A1', 'A2', 'A3'], merged_info.keys())
    # Either input's `A1` value is acceptable.
    self.assertTrue(merged_info['A1'] in ('some data', 'some data2'))
    self.assertEqual(['data1', 'data2'], merged_info['A2'])
    self.assertEqual(['data3', 'data4'], merged_info['A3'])
def test_merge_mnps(self):
    """Expects two identical-position MNPs to merge into one record.

    The merged record is expected to carry both calls, the combined names
    and filters, and the higher of the two qualities.
    """
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    first_mnp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=8,
        reference_bases='GTC',
        alternate_bases=['G', 'GTCG'],
        names=['mnp1', 'mnp2'],
        filters=['f1', 'f2'],
        quality=1)
    second_mnp = vcfio.Variant(
        reference_name='1',
        start=5,
        end=8,
        reference_bases='GTC',
        alternate_bases=['G', 'GTCG'],
        names=['mnp2', 'mnp3'],
        filters=['f2', 'f3'],
        quality=2)
    first_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[1, 2])
    second_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[2, 0])

    expected = vcfio.Variant(
        reference_name='1',
        start=5,
        end=8,
        reference_bases='GTC',
        alternate_bases=['G', 'GTCG'],
        names=['mnp1', 'mnp2', 'mnp3'],
        filters=['f1', 'f2', 'f3'],
        quality=2)
    expected.calls.append(first_call)
    expected.calls.append(second_call)
    first_mnp.calls.append(first_call)
    second_mnp.calls.append(second_call)

    actual = list(strategy.get_merged_variants([first_mnp, second_mnp]))
    self.assertEqual(actual, [expected])
def test_convert_bq_row_to_variant(self):
    """Expects the sample BigQuery row to convert to the matching variant."""
    row = _get_big_query_row()
    expected_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]}),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True}),
    ]
    expected_variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['A', 'TT'],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={
            'IFR': [1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        },
        calls=expected_calls)
    converted = self._variant_generator.convert_bq_row_to_variant(row)
    self.assertEqual(expected_variant, converted)
def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
    """Expects a ValueError when variants list different sample ids.

    With `preserve_sample_order=True`, the first variant has samples 1 and
    2 while the second has samples 2 and 3.
    """
    sample_ids = [
        hash_name(name) for name in ('sample1', 'sample2', 'sample3')
    ]
    calls = [vcfio.VariantCall(sample_id=sid) for sid in sample_ids]
    mismatched_variants = [
        vcfio.Variant(calls=calls[:2]),
        vcfio.Variant(calls=calls[1:]),
    ]

    pipeline = TestPipeline()
    combiner = combine_sample_ids.SampleIdsCombiner(
        preserve_sample_order=True)
    _ = (
        pipeline
        | transforms.Create(mismatched_variants)
        | 'CombineSampleIds' >> combiner
        | combiners.ToList())
    # The failure is expected to surface when the pipeline runs.
    with self.assertRaises(ValueError):
        pipeline.run()
def _get_sample_variant_2(self):
    """Returns a reference-`20` sample variant with no alternates, one call."""
    only_call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[0, 1],
        phaseset='*',
        info={'FI_2': 20})
    return vcfio.Variant(
        reference_name='20',
        start=123,
        end=125,
        reference_bases='CT',
        alternate_bases=[],
        filters=['q10', 's10'],
        info={'IS_2': 'some data'},
        calls=[only_call])
def test_get_merged_variants_move_everything_to_calls(self):
    """Expects all info, quality and filter to be pushed down into calls.

    Uses regex `.*` with both copy options enabled. Each call is expected
    to gain its source variant's info plus quality and filter, and the
    merged variant's info is expected to end up empty.
    """
    strategy = move_to_calls_strategy.MoveToCallsStrategy(
        info_keys_to_move_to_calls_regex='.*',
        copy_quality_to_calls=True,
        copy_filter_to_calls=True)
    variants = self._get_sample_variants()

    # Single variant merge.
    merged_single = strategy.get_merged_variants([variants[0]])[0]
    expected_single_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            info={
                'GQ': 20,
                'HQ': [10, 20],
                'A1': 'some data',
                'A2': ['data1', 'data2'],
                ColumnKeyConstants.QUALITY: 2,
                ColumnKeyConstants.FILTER: ['PASS'],
            }),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={
                'GQ': 10,
                'FLAG1': True,
                'A1': 'some data',
                'A2': ['data1', 'data2'],
                ColumnKeyConstants.QUALITY: 2,
                ColumnKeyConstants.FILTER: ['PASS'],
            }),
    ]
    self.assertEqual(expected_single_calls, merged_single.calls)

    # Multiple variant merge.
    merged_variant = strategy.get_merged_variants(variants)[0]
    self._assert_common_expected_merged_fields(merged_variant)
    expected_merged_calls = [
        vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            info={
                'GQ': 20,
                'HQ': [10, 20],
                'A1': 'some data',
                'A2': ['data1', 'data2'],
                ColumnKeyConstants.QUALITY: 2,
                ColumnKeyConstants.FILTER: ['PASS'],
            }),
        vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={
                'GQ': 10,
                'FLAG1': True,
                'A1': 'some data',
                'A2': ['data1', 'data2'],
                ColumnKeyConstants.QUALITY: 2,
                ColumnKeyConstants.FILTER: ['PASS'],
            }),
        vcfio.VariantCall(
            sample_id=hash_name('Sample3'),
            genotype=[1, 1],
            info={
                'A1': 'some data2',
                'A3': ['data3', 'data4'],
                ColumnKeyConstants.QUALITY: 20,
                ColumnKeyConstants.FILTER: ['q10'],
            }),
        vcfio.VariantCall(
            sample_id=hash_name('Sample4'),
            genotype=[1, 0],
            info={
                'GQ': 20,
                'A1': 'some data2',
                'A3': ['data3', 'data4'],
                ColumnKeyConstants.QUALITY: 20,
                ColumnKeyConstants.FILTER: ['q10'],
            }),
    ]
    self.assertEqual(expected_merged_calls, merged_variant.calls)
    self.assertEqual([], merged_variant.info.keys())
def test_schema_conflict_in_format_field_type(self):
    """Checks call-field type conflicts with `allow_incompatible_records`.

    First part: values of a convertible but different type (empty string and
    int for `FB`, float and bool for `FI`, numbers for `FSR`) are expected
    to be cast in the generated row. Second part: a non-numeric string in
    `FI` is expected to raise ValueError.
    """
    castable_variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        calls=[
            vcfio.VariantCall(
                sample_id=hash_name('Sample1'),
                genotype=[0, 1],
                phaseset='*',
                info={'FB': '', 'FI': 1.0, 'FSR': [1, 2]}),
            vcfio.VariantCall(
                sample_id=hash_name('Sample2'),
                genotype=[1, 0],
                info={'FB': 1, 'FI': True, 'FSR': [1.0, 2.0]}),
        ])
    proc_variant = _get_processed_variant(castable_variant)
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [
            {
                ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
                ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
                ColumnKeyConstants.CALLS_PHASESET: '*',
                'FB': False,
                'FI': 1,
                'FSR': ['1', '2'],
            },
            {
                ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
                ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
                ColumnKeyConstants.CALLS_PHASESET: None,
                'FB': True,
                'FI': 1,
                'FSR': ['1.0', '2.0'],
            },
        ],
    }
    actual_rows = list(
        self._row_generator.get_rows(
            proc_variant, allow_incompatible_records=True))
    self.assertEqual([expected_row], actual_rows)

    with self.assertRaises(ValueError):
        uncastable_variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='CT',
            alternate_bases=[],
            filters=[],
            # String cannot be casted to integer.
            calls=[
                vcfio.VariantCall(
                    sample_id=hash_name('Sample1'),
                    genotype=[0, 1],
                    phaseset='*',
                    info={'FI': 'string_for_int_field'}),
            ])
        uncastable_proc_variant = _get_processed_variant(uncastable_variant)
        list(
            self._row_generator.get_rows(
                uncastable_proc_variant, allow_incompatible_records=True))
        # Only reached if nothing above raised.
        self.fail(
            'String data for an integer schema must cause an exception')
def test_schema_conflict_in_format_field_number(self):
    """Checks call-field cardinality conflicts with incompatible records.

    With `allow_incompatible_records=True`, list values in `FB`/`FI` are
    expected to collapse to a scalar (None for an empty `FI` list, False
    for an empty `FB` list) and scalar `FSR` strings to be wrapped in a
    list.
    """
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='CT',
        alternate_bases=[],
        filters=[],
        calls=[
            vcfio.VariantCall(
                sample_id=hash_name('Sample1'),
                genotype=[0, 1],
                phaseset='*',
                info={'FB': [1, 2], 'FI': [1, 2], 'FSR': 'str'}),
            vcfio.VariantCall(
                sample_id=hash_name('Sample2'),
                genotype=[1, 0],
                info={'FB': [], 'FI': [], 'FSR': ''}),
        ])
    proc_variant = _get_processed_variant(variant)

    first_expected_call = {
        ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'FB': True,
        'FI': 1,
        'FSR': ['str'],
    }
    second_expected_call = {
        ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'FB': False,
        'FI': None,
        'FSR': [''],
    }
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'CT',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.CALLS: [first_expected_call, second_expected_call],
    }
    actual_rows = list(
        self._row_generator.get_rows(
            proc_variant, allow_incompatible_records=True))
    self.assertEqual([expected_row], actual_rows)
def _default_variant_call(self):
    """Returns a `Sample1` call with genotype [1, 0] and the default phaseset."""
    call = vcfio.VariantCall(
        sample_id=hash_name('Sample1'),
        genotype=[1, 0],
        phaseset=vcfio.DEFAULT_PHASESET_VALUE,
        info={'GQ': 48})
    return call