Example #1
0
    def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
        """Runs SampleIdsCombiner with `preserve_sample_order=True`.

        Both input variants list their calls as sample2, sample1, sample3,
        and the combined output is asserted to be that same ordering.
        """
        sample_ids = [
            hash_name(name) for name in ('sample2', 'sample1', 'sample3')
        ]
        variant_calls = [
            vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
        ]
        # Two variants, each holding the same call objects in the same order.
        variants = [
            vcfio.Variant(calls=list(variant_calls)),
            vcfio.Variant(calls=list(variant_calls)),
        ]

        pipeline = TestPipeline()
        combiner = combine_sample_ids.SampleIdsCombiner(
            preserve_sample_order=True)
        combined_sample_ids = (
            pipeline
            | transforms.Create(variants)
            | 'CombineSampleIds' >> combiner
            | combiners.ToList())
        assert_that(combined_sample_ids, equal_to([sample_ids]))
        pipeline.run()
Example #2
0
 def _get_sample_variant_with_incompatible_records(self):
   """Builds a variant whose value types differ from those in its row.

   For example 'IFR' is a list of strings in the variant but a list of
   floats in the row, and 'IS' is an int in the variant but a string in
   the row.

   Returns:
     A `(variant, row, header_num_dict)` tuple.
   """
   sample_call = vcfio.VariantCall(
       sample_id=hash_name('Sample1'),
       genotype=[0, 1],
       phaseset='*',
       info={'GQ': 20, 'FIR': [10.0, 20.0]})
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=[],
       filters=['PASS'],
       info={'IFR': ['0.1', '0.2'], 'IS': 1, 'ISR': 1},
       calls=[sample_call])
   header_num_dict = {'IFR': '2', 'IS': '1', 'ISR': '1'}

   call_record = {
       ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
       ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
       ColumnKeyConstants.CALLS_PHASESET: '*',
       'GQ': 20,
       'FIR': [10, 20],
   }
   row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.ALTERNATE_BASES: [],
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: [call_record],
       'IFR': [0.1, 0.2],
       'IS': '1',
       'ISR': ['1'],
   }
   return variant, row, header_num_dict
    def test_merge_many_different_alternates(self):
        """Merges three variants that differ only in alternate base and sample.

        The merged output, once sorted, is asserted to equal the sorted
        inputs.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        variants = []
        for alternate, sample_name in (('C', 'Sample1'),
                                       ('G', 'Sample2'),
                                       ('T', 'Sample3')):
            variant = vcfio.Variant(reference_name='1',
                                    start=1,
                                    end=2,
                                    reference_bases='A',
                                    alternate_bases=[alternate])
            variant.calls.append(
                vcfio.VariantCall(sample_id=hash_name(sample_name),
                                  genotype=[1, 0]))
            variants.append(variant)

        merged_variants = list(strategy.get_merged_variants(variants))
        self.assertEqual(sorted(merged_variants), sorted(variants))
 def _get_sample_unmerged_variants(self):
     """Returns two single-call variants on reference '19'."""
     first_call = vcfio.VariantCall(sample_id=hash_name('Unmerged1'),
                                    genotype=[0, 1])
     # Start/end are different from merged variants.
     different_position = vcfio.Variant(
         reference_name='19',
         start=123,
         end=125,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs2'],
         calls=[first_call])

     second_call = vcfio.VariantCall(sample_id=hash_name('Unmerged2'),
                                     genotype=[0, 1])
     # Ordering of alternate_bases is different from merged variants.
     reordered_alternates = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['TT', 'A'],
         names=['rs3'],
         calls=[second_call])

     return [different_position, reordered_alternates]
    def test_non_variant_split_by_snp(self):
        """Merges a [0, 10) non-variant with a SNP at [5, 6).

        Expects three sorted-equal results: [0, 5) and [6, 10) carrying only
        the non-variant's call, and [5, 6) carrying both calls.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        # Inputs.
        non_variant = vcfio.Variant(reference_name='1', start=0, end=10)
        snp = vcfio.Variant(reference_name='1',
                            start=5,
                            end=6,
                            reference_bases='C',
                            alternate_bases=['A'])
        ref_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[0, 0])
        snp_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[1, 0])
        non_variant.calls.append(ref_call)
        snp.calls.append(snp_call)

        # Expected pieces, in positional order.
        before_snp = vcfio.Variant(reference_name='1', start=0, end=5)
        at_snp = vcfio.Variant(reference_name='1',
                               start=5,
                               end=6,
                               reference_bases='C',
                               alternate_bases=['A'])
        after_snp = vcfio.Variant(reference_name='1', start=6, end=10)
        before_snp.calls.append(ref_call)
        at_snp.calls.extend([ref_call, snp_call])
        after_snp.calls.append(ref_call)

        actual = list(strategy.get_merged_variants([non_variant, snp]))
        self.assertEqual(sorted(actual),
                         sorted([before_snp, at_snp, after_snp]))
Example #6
0
 def _get_sample_variant(self):
     """Returns one sample variant on reference '19' with two calls."""
     first_call = vcfio.VariantCall(
         sample_id=hash_name('Sample1'),
         genotype=[0, 1],
         info={'GQ': 20, 'HQ': [10, 20]})
     second_call = vcfio.VariantCall(
         sample_id=hash_name('Sample2'),
         genotype=[1, 0],
         info={'GQ': 10, 'FLAG1': True})
     variant_info = {'A1': 'some data', 'A2': ['data1', 'data2']}
     return vcfio.Variant(reference_name='19',
                          start=11,
                          end=12,
                          reference_bases='C',
                          alternate_bases=['A', 'TT'],
                          names=['rs1', 'rs2'],
                          quality=2,
                          filters=['PASS'],
                          info=variant_info,
                          calls=[first_call, second_call])
Example #7
0
    def test_densify_variants_pipeline(self):
        """Runs DensifyVariants over two variants that each lack one sample.

        The output is checked with `asserts.has_sample_ids` against the full
        list of three sample ids.
        """
        sample_ids = [
            hash_name(name) for name in ('sample1', 'sample2', 'sample3')
        ]
        call_1, call_2, call_3 = [
            vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
        ]
        # Each variant carries only two of the three samples.
        variants = [
            vcfio.Variant(calls=[call_1, call_2]),
            vcfio.Variant(calls=[call_2, call_3]),
        ]

        pipeline = TestPipeline()
        densify = densify_variants.DensifyVariants(sample_ids)
        densified_variants = (pipeline
                              | Create(variants)
                              | 'DensifyVariants' >> densify)
        assert_that(densified_variants, asserts.has_sample_ids(sample_ids))

        pipeline.run()
Example #8
0
def _get_big_query_row():
    # type: (...) -> Dict[str, Any]
    """Returns one sample BigQuery row for testing.

    All keys are built with `str()`. The row holds two call records
    ('Sample1' and 'Sample2') and two alternate-base records ('A' and 'TT'),
    plus the top-level 'IS' and 'ISR' fields.
    """
    row = {
        str(ColumnKeyConstants.REFERENCE_NAME): str('chr19'),
        str(ColumnKeyConstants.START_POSITION): 11,
        str(ColumnKeyConstants.END_POSITION): 12,
        str(ColumnKeyConstants.REFERENCE_BASES): 'C',
        str(ColumnKeyConstants.NAMES): [str('rs1'), str('rs2')],
        str(ColumnKeyConstants.QUALITY): 2,
        str(ColumnKeyConstants.FILTER): [str('PASS')],
        str(ColumnKeyConstants.CALLS): [
            {
                str(ColumnKeyConstants.CALLS_SAMPLE_ID): (
                    str(hash_name('Sample1'))),
                str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1],
                str(ColumnKeyConstants.CALLS_PHASESET): str('*'),
                str('GQ'): 20,
                str('FIR'): [10, 20],
            },
            {
                str(ColumnKeyConstants.CALLS_SAMPLE_ID): (
                    str(hash_name('Sample2'))),
                str(ColumnKeyConstants.CALLS_GENOTYPE): [0, 0],
                str(ColumnKeyConstants.CALLS_PHASESET): None,
                str('GQ'): 10,
                str('FB'): True,
            },
        ],
        str(ColumnKeyConstants.ALTERNATE_BASES): [
            {
                str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('A'),
                str('IFR'): 1,
                str('IFR2'): 0.2,
            },
            {
                str(ColumnKeyConstants.ALTERNATE_BASES_ALT): str('TT'),
                str('IFR'): 0.2,
                str('IFR2'): 0.3,
            },
        ],
        str('IS'): str('some data'),
        str('ISR'): [str('data1'), str('data2')],
    }

    return row
Example #9
0
 def test_write_to_shards_pipeline(self):
     """Runs WriteToShards over `self._get_variants()` into a temp directory.

     The test makes no explicit assertion; it only requires the pipeline to
     run.
     """
     with temp_dir.TempDir() as tempdir:
         pipeline = TestPipeline()
         write_to_shards = write_variants_to_shards.WriteToShards(
             tempdir.get_path(),
             [hash_name('Sample 1'), hash_name('Sample 2')])
         input_variants = pipeline | Create(self._get_variants())
         _ = input_variants | 'WriteToShards' >> write_to_shards
         pipeline.run()
    def test_merge_snp_with_non_variant(self):
        """Merges a SNP at [5, 6) with a <NON_REF> record spanning [0, 10).

        Expects, after sorting, the non-variant pieces [0, 5) and [6, 10)
        with only the non-variant's call, and the SNP at [5, 6) with both
        calls.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        def make_snp():
            # Fresh SNP record; used for both the input and its expectation.
            return vcfio.Variant(reference_name='1',
                                 start=5,
                                 end=6,
                                 reference_bases='A',
                                 alternate_bases=['C'],
                                 names=['v'],
                                 filters=['vf'],
                                 quality=1)

        def make_non_variant_piece(start, end):
            # Expected non-variant fragment; no reference_bases is set.
            return vcfio.Variant(reference_name='1',
                                 start=start,
                                 end=end,
                                 alternate_bases=['<NON_REF>'],
                                 names=['nv'],
                                 filters=['nvf'],
                                 quality=2)

        variant = make_snp()
        non_variant = vcfio.Variant(reference_name='1',
                                    start=0,
                                    end=10,
                                    reference_bases='G',
                                    alternate_bases=['<NON_REF>'],
                                    names=['nv'],
                                    filters=['nvf'],
                                    quality=2)

        snp_call = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[1, 0])
        ref_call = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[0, 0])
        variant.calls.append(snp_call)
        non_variant.calls.append(ref_call)

        left_piece = make_non_variant_piece(0, 5)
        right_piece = make_non_variant_piece(6, 10)
        merged_snp = make_snp()
        left_piece.calls.append(ref_call)
        right_piece.calls.append(ref_call)
        merged_snp.calls.extend([snp_call, ref_call])

        actual = list(strategy.get_merged_variants([variant, non_variant]))
        expected = [left_piece, right_piece, merged_snp]
        self.assertEqual(sorted(actual), sorted(expected))
    def test_omit_empty_sample_calls(self):
        """Checks `get_rows(..., omit_empty_sample_calls=True)` on three calls.

        Sample1 has no genotype and a None 'GQ', Sample3 has only missing
        genotype values, and Sample2 has real data. The single expected row
        lists only Sample2 under CALLS.
        """
        empty_info_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                            info={'GQ': None})
        populated_call = vcfio.VariantCall(sample_id=hash_name('Sample2'),
                                           genotype=[1, 0],
                                           info={'GQ': 10})
        missing_genotype_call = vcfio.VariantCall(
            sample_id=hash_name('Sample3'),
            genotype=[
                vcfio.MISSING_GENOTYPE_VALUE, vcfio.MISSING_GENOTYPE_VALUE
            ])
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=[],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info={},
            calls=[empty_info_call, populated_call, missing_genotype_call])
        proc_variant = _get_processed_variant(variant)

        expected_call_record = {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'GQ': 10,
        }
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'C',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
            ColumnKeyConstants.QUALITY: 2,
            ColumnKeyConstants.FILTER: ['PASS'],
            ColumnKeyConstants.CALLS: [expected_call_record],
        }

        actual_rows = list(
            self._row_generator.get_rows(proc_variant,
                                         omit_empty_sample_calls=True))
        self.assertEqual([expected_row], actual_rows)
    def test_merge_2_non_variants(self):
        """Merges two overlapping <NON_REF> records, [0, 10) and [5, 15).

        Expects, after sorting, [0, 5) from the first record, [10, 15) from
        the second, and an overlap [5, 10) that carries both calls and the
        listed names and filters.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        def make_non_variant(start, end, names, filters, quality):
            # Every record in this test is a <NON_REF> block on reference '1'.
            return vcfio.Variant(reference_name='1',
                                 start=start,
                                 end=end,
                                 alternate_bases=['<NON_REF>'],
                                 names=names,
                                 filters=filters,
                                 quality=quality)

        non_variant_1 = make_non_variant(
            0, 10, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
        non_variant_2 = make_non_variant(
            5, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
        call_1 = vcfio.VariantCall(sample_id=hash_name('1'), genotype=[0, 0])
        call_2 = vcfio.VariantCall(sample_id=hash_name('2'), genotype=[0, 0])
        non_variant_1.calls.append(call_1)
        non_variant_2.calls.append(call_2)

        only_first = make_non_variant(
            0, 5, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
        only_second = make_non_variant(
            10, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
        overlap = make_non_variant(
            5, 10, ['nonv1', 'nonv2', 'nonv3'], ['f1', 'f2', 'f3'], 1)
        only_first.calls.append(call_1)
        only_second.calls.append(call_2)
        overlap.calls.extend([call_1, call_2])

        actual = list(
            strategy.get_merged_variants([non_variant_1, non_variant_2]))
        expected = [only_first, only_second, overlap]

        self.assertEqual(sorted(actual), sorted(expected))
 def _get_sample_variants(self):
     """Returns two sample variants with two calls each.

     The first is on reference '19' (Sample1, Sample2) and the second on
     reference '20' (Sample3, Sample4).
     """
     first_calls = [
         vcfio.VariantCall(sample_id=hash_name('Sample1'),
                           genotype=[0, 1],
                           phaseset='*',
                           info={'GQ': 20, 'HQ': [10, 20]}),
         vcfio.VariantCall(sample_id=hash_name('Sample2'),
                           genotype=[1, 0],
                           info={'GQ': 10, 'FLAG1': True}),
     ]
     first_variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1'],
         quality=2,
         filters=['PASS'],
         info={'A1': 'some data', 'A2': ['data1', 'data2']},
         calls=first_calls)

     second_calls = [
         vcfio.VariantCall(sample_id=hash_name('Sample3'), genotype=[1, 1]),
         vcfio.VariantCall(sample_id=hash_name('Sample4'),
                           genotype=[1, 0],
                           info={'GQ': 20}),
     ]
     second_variant = vcfio.Variant(
         reference_name='20',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1'],
         quality=20,
         filters=['q10'],
         info={'A1': 'some data2', 'A3': ['data3', 'data4']},
         calls=second_calls)

     return [first_variant, second_variant]
Example #14
0
 def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
   """Builds a sample variant together with its row and header counts.

   Args:
     split_alternate_allele_info_fields: If true, 'IFR' and 'IFR2' are placed
       per alternate inside the ALTERNATE_BASES records of the row. Otherwise
       they are stored as top-level lists in the row.

   Returns:
     A `(variant, row, header_num_dict)` tuple.
   """
   first_call = vcfio.VariantCall(
       sample_id=hash_name('Sample1'),
       genotype=[0, 1],
       phaseset='*',
       info={'GQ': 20, 'FIR': [10, 20]})
   second_call = vcfio.VariantCall(
       sample_id=hash_name('Sample2'),
       genotype=[1, 0],
       info={'GQ': 10, 'FB': True})
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1', 'rs2'],
       quality=2,
       filters=['PASS'],
       info={
           'IFR': [0.1, 0.2],
           'IFR2': [0.2, 0.3],
           'IS': 'some data',
           'ISR': ['data1', 'data2'],
       },
       calls=[first_call, second_call])
   header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

   call_records = [
       {
           ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
           ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
           ColumnKeyConstants.CALLS_PHASESET: '*',
           'GQ': 20,
           'FIR': [10, 20],
       },
       {
           ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
           ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
           ColumnKeyConstants.CALLS_PHASESET: None,
           'GQ': 10,
           'FB': True,
       },
   ]
   row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
       ColumnKeyConstants.QUALITY: 2,
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: call_records,
       'IS': 'some data',
       'ISR': ['data1', 'data2'],
   }

   if not split_alternate_allele_info_fields:
     # Unsplit layout: bare alternates plus top-level IFR/IFR2 lists.
     row[ColumnKeyConstants.ALTERNATE_BASES] = [
         {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A'},
         {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT'},
     ]
     row['IFR'] = [0.1, 0.2]
     row['IFR2'] = [0.2, 0.3]
     return variant, row, header_num_dict

   # Split layout: each alternate record carries its own IFR/IFR2 value.
   row[ColumnKeyConstants.ALTERNATE_BASES] = [
       {
           ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
           'IFR': 0.1,
           'IFR2': 0.2,
       },
       {
           ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
           'IFR': 0.2,
           'IFR2': 0.3,
       },
   ]
   return variant, row, header_num_dict
Example #15
0
 def _get_sample_variant_with_empty_calls(self):
     """Builds a variant whose only call has no genotype and no info.

     Returns:
       A `(variant, row, header_num_dict)` tuple; the row's CALLS list is
       empty.
     """
     empty_call = vcfio.VariantCall(sample_id=hash_name('EmptySample'),
                                    genotype=[],
                                    phaseset='*',
                                    info={})
     variant = vcfio.Variant(
         reference_name='20',
         start=123,
         end=125,
         reference_bases='CT',
         alternate_bases=[],
         filters=['q10', 's10'],
         info={'II': 1234},
         calls=[empty_call])
     header_num_dict = {'II': '1'}
     row = {
         ColumnKeyConstants.REFERENCE_NAME: '20',
         ColumnKeyConstants.START_POSITION: 123,
         ColumnKeyConstants.END_POSITION: 125,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
         ColumnKeyConstants.ALTERNATE_BASES: [],
         ColumnKeyConstants.FILTER: ['q10', 's10'],
         ColumnKeyConstants.CALLS: [],
         'II': 1234,
     }
     return variant, row, header_num_dict
Example #16
0
 def _get_sample_variant_1(self):
     """Returns a 'chr19' sample variant with mixed-type info and one call."""
     variant_info = {
         'IS': 'some data',
         'ISI': '1',
         'ISF': '1.0',
         'IF': 1.0,
         'IB': True,
         'IA': [1, 2],
     }
     sample_call = vcfio.VariantCall(
         sample_id=hash_name('Sample1'),
         genotype=[0, 1],
         phaseset='*',
         info={'FI': 20, 'FU': [10.0, 20.0]})
     return vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=variant_info,
         calls=[sample_call])
 def _get_bigquery_row_and_variant(self):
   """Returns a BigQuery-style row paired with a variant fixture.

   The row's keys are built with `unicode()`. The first alternate record
   has 'IFR' set to None, and the variant's 'IFR' info holds only [0.2].

   Returns:
     A `(row, variant)` tuple.
   """
   call_records = [
       {
           unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): (
               hash_name('Sample1')),
           unicode(ColumnKeyConstants.CALLS_GENOTYPE): [0, 1],
           unicode(ColumnKeyConstants.CALLS_PHASESET): unicode('*'),
           unicode('GQ'): 20,
           unicode('FIR'): [10, 20],
       },
       {
           unicode(ColumnKeyConstants.CALLS_SAMPLE_ID): (
               hash_name('Sample2')),
           unicode(ColumnKeyConstants.CALLS_GENOTYPE): [1, 0],
           unicode(ColumnKeyConstants.CALLS_PHASESET): None,
           unicode('GQ'): 10,
           unicode('FB'): True,
       },
   ]
   alternate_records = [
       {
           unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('A'),
           unicode('IFR'): None,
           unicode('IFR2'): 0.2,
       },
       {
           unicode(ColumnKeyConstants.ALTERNATE_BASES_ALT): unicode('TT'),
           unicode('IFR'): 0.2,
           unicode('IFR2'): 0.3,
       },
   ]
   row = {
       unicode(ColumnKeyConstants.REFERENCE_NAME): unicode('chr19'),
       unicode(ColumnKeyConstants.START_POSITION): 11,
       unicode(ColumnKeyConstants.END_POSITION): 12,
       unicode(ColumnKeyConstants.REFERENCE_BASES): 'C',
       unicode(ColumnKeyConstants.NAMES): ['rs1', 'rs2'],
       unicode(ColumnKeyConstants.QUALITY): 2,
       unicode(ColumnKeyConstants.FILTER): ['PASS'],
       unicode(ColumnKeyConstants.CALLS): call_records,
       unicode(ColumnKeyConstants.ALTERNATE_BASES): alternate_records,
       unicode('IS'): unicode('some data'),
       unicode('ISR'): [unicode('data1'), unicode('data2')],
   }

   first_call = vcfio.VariantCall(
       sample_id=hash_name('Sample1'),
       genotype=[0, 1],
       phaseset='*',
       info={'GQ': 20, 'FIR': [10, 20]})
   second_call = vcfio.VariantCall(
       sample_id=hash_name('Sample2'),
       genotype=[1, 0],
       info={'GQ': 10, 'FB': True})
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1', 'rs2'],
       quality=2,
       filters=['PASS'],
       info={
           'IFR': [0.2],
           'IFR2': [0.2, 0.3],
           'IS': 'some data',
           'ISR': ['data1', 'data2'],
       },
       calls=[first_call, second_call])
   return row, variant
Example #18
0
    def test_sample_ids_combiner_pipeline_duplicate_sample_ids(self):
        """Expects a ValueError when one variant holds the same call twice."""
        duplicated_call = vcfio.VariantCall(sample_id=hash_name('sample1'))
        variants = [vcfio.Variant(calls=[duplicated_call] * 2)]

        pipeline = TestPipeline()
        combiner = combine_sample_ids.SampleIdsCombiner()
        _ = (pipeline
             | transforms.Create(variants)
             | 'CombineSampleIds' >> combiner
             | combiners.ToList())
        # The error is expected when the pipeline runs, not while it is built.
        with self.assertRaises(ValueError):
            pipeline.run()
Example #19
0
    def test_variant_to_bq_row_to_variant(self):
        """Round-trips a variant through a row and back.

        The variant is converted to rows by `self._row_generator`; the first
        row is converted back by `self._variant_generator` and must equal the
        original variant.
        """
        calls = [
            vcfio.VariantCall(sample_id=hash_name('Sample1'),
                              genotype=[0, 1],
                              phaseset='*',
                              info={'GQ': 20, 'FIR': [10, 20]}),
            vcfio.VariantCall(sample_id=hash_name('Sample2'),
                              genotype=[1, 0],
                              info={'GQ': 10, 'FB': True}),
            vcfio.VariantCall(sample_id=hash_name('Sample3'),
                              genotype=[vcfio.MISSING_GENOTYPE_VALUE]),
        ]
        variant = vcfio.Variant(
            reference_name='chr19',
            start=11,
            end=12,
            reference_bases='C',
            alternate_bases=['A', 'TT'],
            names=['rs1', 'rs2'],
            quality=2,
            filters=['PASS'],
            info={
                'IFR': [0.1, 0.2],
                'IFR2': [0.2, 0.3],
                'IS': 'some data',
                'ISR': ['data1', 'data2'],
            },
            calls=calls)
        header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

        proc_variant = _get_processed_variant(variant, header_num_dict)
        rows = list(self._row_generator.get_rows(proc_variant))
        first_row = rows[0]
        converted_variant = self._variant_generator.convert_bq_row_to_variant(
            first_row)
        self.assertEqual(variant, converted_variant)
Example #20
0
    def test_get_variant_calls(self):
        """Compares `_get_variant_calls` on the sample row with two calls."""
        variant_call_records = _get_big_query_row()[ColumnKeyConstants.CALLS]

        first_expected = vcfio.VariantCall(
            sample_id=hash_name('Sample1'),
            genotype=[0, 1],
            phaseset='*',
            info={'GQ': 20, 'FIR': [10, 20]})
        second_expected = vcfio.VariantCall(
            sample_id=hash_name('Sample2'),
            genotype=[1, 0],
            info={'GQ': 10, 'FB': True})
        expected_calls = [first_expected, second_expected]

        actual_calls = self._variant_generator._get_variant_calls(
            variant_call_records)
        self.assertEqual(expected_calls, actual_calls)
    def test_get_merged_variants_move_info_to_calls(self):
        """Checks merging with `info_keys_to_move_to_calls_regex='^A1$'`.

        For a single variant and for both sample variants, each expected call
        carries an 'A1' entry in its info. For the multi-variant merge the
        merged variant's info is expected to keep only 'A2' and 'A3'.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            info_keys_to_move_to_calls_regex='^A1$',
            copy_quality_to_calls=False,
            copy_filter_to_calls=False)
        variants = self._get_sample_variants()

        def expected_first_variant_calls():
            # Fresh objects on every use so the two assertions share nothing.
            return [
                vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                  genotype=[0, 1],
                                  info={
                                      'GQ': 20,
                                      'HQ': [10, 20],
                                      'A1': 'some data'
                                  }),
                vcfio.VariantCall(sample_id=hash_name('Sample2'),
                                  genotype=[1, 0],
                                  info={
                                      'GQ': 10,
                                      'FLAG1': True,
                                      'A1': 'some data'
                                  }),
            ]

        # Test single variant merge.
        merged_from_single = list(strategy.get_merged_variants([variants[0]]))
        single_merged_variant = merged_from_single[0]
        self.assertEqual(expected_first_variant_calls(),
                         single_merged_variant.calls)

        # Test multiple variant merge.
        merged_variant = list(strategy.get_merged_variants(variants))[0]
        self._assert_common_expected_merged_fields(merged_variant)
        expected_all_calls = expected_first_variant_calls() + [
            vcfio.VariantCall(sample_id=hash_name('Sample3'),
                              genotype=[1, 1],
                              info={'A1': 'some data2'}),
            vcfio.VariantCall(sample_id=hash_name('Sample4'),
                              genotype=[1, 0],
                              info={
                                  'GQ': 20,
                                  'A1': 'some data2'
                              }),
        ]
        self.assertEqual(expected_all_calls, merged_variant.calls)
        self.assertItemsEqual(['A2', 'A3'], merged_variant.info.keys())
        for info_key, info_value in (('A2', ['data1', 'data2']),
                                     ('A3', ['data3', 'data4'])):
            self.assertEqual(info_value, merged_variant.info[info_key])
    def test_get_merged_variants_no_custom_options(self):
        """Checks MoveToCallsStrategy merging with every custom option off.

        The strategy is built with no INFO-key regex and with quality/filter
        copying disabled. A single variant is expected to come back unchanged.
        Merging all sample variants is expected to yield calls that carry only
        their own info, while 'A1', 'A2' and 'A3' stay in the merged
        variant's info.
        """
        strategy = move_to_calls_strategy.MoveToCallsStrategy(
            info_keys_to_move_to_calls_regex=None,
            copy_quality_to_calls=False,
            copy_filter_to_calls=False)
        variants = self._get_sample_variants()

        # Test single variant merge.
        # list() materializes the result so the comparison holds whether
        # get_merged_variants returns a list or a lazy iterable.
        self.assertEqual([variants[0]],
                         list(strategy.get_merged_variants([variants[0]])))

        # Test multiple variant merge.
        merged_variant = list(strategy.get_merged_variants(variants))[0]
        self._assert_common_expected_merged_fields(merged_variant)
        self.assertEqual([
            vcfio.VariantCall(sample_id=hash_name('Sample1'),
                              genotype=[0, 1],
                              info={
                                  'GQ': 20,
                                  'HQ': [10, 20]
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample2'),
                              genotype=[1, 0],
                              info={
                                  'GQ': 10,
                                  'FLAG1': True
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample3'), genotype=[1, 1]),
            vcfio.VariantCall(sample_id=hash_name('Sample4'),
                              genotype=[1, 0],
                              info={'GQ': 20})
        ], merged_variant.calls)
        # Order-insensitive key check that avoids assertItemsEqual, which is
        # only available on Python 2's unittest.TestCase.
        self.assertEqual(['A1', 'A2', 'A3'],
                         sorted(merged_variant.info.keys()))
        # Either input variant's 'A1' value is accepted on the merged record.
        self.assertIn(merged_variant.info['A1'], ('some data', 'some data2'))
        self.assertEqual(['data1', 'data2'], merged_variant.info['A2'])
        self.assertEqual(['data3', 'data4'], merged_variant.info['A3'])
    def test_merge_mnps(self):
        """Merges two records describing the same MNP and checks the result.

        Both inputs share reference name, position, reference bases and
        alternate bases. The expected merged variant carries the names
        'mnp1'..'mnp3', the filters 'f1'..'f3', quality 2 and the calls of
        both inputs in order.
        """
        strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        def build_mnp(names, filters, quality):
            # Fresh list literals on every call so variants never share state.
            return vcfio.Variant(reference_name='1',
                                 start=5,
                                 end=8,
                                 reference_bases='GTC',
                                 alternate_bases=['G', 'GTCG'],
                                 names=names,
                                 filters=filters,
                                 quality=quality)

        first_input = build_mnp(['mnp1', 'mnp2'], ['f1', 'f2'], 1)
        second_input = build_mnp(['mnp2', 'mnp3'], ['f2', 'f3'], 2)
        first_call = vcfio.VariantCall(sample_id=hash_name('1'),
                                       genotype=[1, 2])
        second_call = vcfio.VariantCall(sample_id=hash_name('2'),
                                        genotype=[2, 0])

        expected = build_mnp(['mnp1', 'mnp2', 'mnp3'], ['f1', 'f2', 'f3'], 2)
        expected.calls.extend([first_call, second_call])

        first_input.calls.append(first_call)
        second_input.calls.append(second_call)

        merged = strategy.get_merged_variants([first_input, second_input])
        self.assertEqual(list(merged), [expected])
Example #24
0
    def test_convert_bq_row_to_variant(self):
        """Converts the sample BigQuery row and compares it to a Variant.

        The row comes from _get_big_query_row(); the expected variant is
        spelled out field by field below.
        """
        row = _get_big_query_row()

        # Per-sample calls expected on the converted variant.
        first_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                       genotype=[0, 1],
                                       phaseset='*',
                                       info={'GQ': 20, 'FIR': [10, 20]})
        second_call = vcfio.VariantCall(sample_id=hash_name('Sample2'),
                                        genotype=[1, 0],
                                        info={'GQ': 10, 'FB': True})

        # Variant-level INFO fields expected on the converted variant.
        variant_info = {
            'IFR': [1, 0.2],
            'IFR2': [0.2, 0.3],
            'IS': 'some data',
            'ISR': ['data1', 'data2'],
        }

        expected_variant = vcfio.Variant(reference_name='chr19',
                                         start=11,
                                         end=12,
                                         reference_bases='C',
                                         alternate_bases=['A', 'TT'],
                                         names=['rs1', 'rs2'],
                                         quality=2,
                                         filters=['PASS'],
                                         info=variant_info,
                                         calls=[first_call, second_call])

        converted = self._variant_generator.convert_bq_row_to_variant(row)
        self.assertEqual(expected_variant, converted)
Example #25
0
    def test_sample_ids_combiner_pipeline_preserve_sample_order_error(self):
        """Expects a ValueError when variants carry different sample lists.

        Two variants are created whose calls cover (sample1, sample2) and
        (sample2, sample3) respectively, then combined with
        preserve_sample_order=True; running the pipeline must raise.
        """
        sample_ids = [
            hash_name(name) for name in ('sample1', 'sample2', 'sample3')
        ]
        call_a, call_b, call_c = [
            vcfio.VariantCall(sample_id=sample_id) for sample_id in sample_ids
        ]
        variants = [
            vcfio.Variant(calls=[call_a, call_b]),
            vcfio.Variant(calls=[call_b, call_c]),
        ]

        pipeline = TestPipeline()
        source = pipeline | transforms.Create(variants)
        combined = source | 'CombineSampleIds' >> (
            combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True))
        # The resulting collection is never inspected; only the failure of
        # run() below matters.
        _ = combined | combiners.ToList()
        with self.assertRaises(ValueError):
            pipeline.run()
Example #26
0
 def _get_sample_variant_2(self):
     """Returns a fixed Variant on reference '20' with one 'Sample1' call."""
     sample_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                     genotype=[0, 1],
                                     phaseset='*',
                                     info={'FI_2': 20})
     return vcfio.Variant(reference_name='20',
                          start=123,
                          end=125,
                          reference_bases='CT',
                          alternate_bases=[],
                          filters=['q10', 's10'],
                          info={'IS_2': 'some data'},
                          calls=[sample_call])
    def test_get_merged_variants_move_everything_to_calls(self):
        """Checks MoveToCallsStrategy when every field is moved to the calls.

        The strategy uses the regex '.*' for INFO keys and copies both
        quality and filter to calls. Each expected call therefore carries
        its own info plus the 'A*' INFO values, quality and filter listed
        below, and the merged variant is expected to have no INFO keys left.
        """
        strategy = move_to_calls_strategy.MoveToCallsStrategy(
            info_keys_to_move_to_calls_regex='.*',
            copy_quality_to_calls=True,
            copy_filter_to_calls=True)
        variants = self._get_sample_variants()

        # Test single variant merge.
        # list() materializes the result so indexing works whether
        # get_merged_variants returns a list or a lazy iterable.
        single_merged_variant = list(
            strategy.get_merged_variants([variants[0]]))[0]
        self.assertEqual([
            vcfio.VariantCall(sample_id=hash_name('Sample1'),
                              genotype=[0, 1],
                              info={
                                  'GQ': 20,
                                  'HQ': [10, 20],
                                  'A1': 'some data',
                                  'A2': ['data1', 'data2'],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample2'),
                              genotype=[1, 0],
                              info={
                                  'GQ': 10,
                                  'FLAG1': True,
                                  'A1': 'some data',
                                  'A2': ['data1', 'data2'],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              })
        ], single_merged_variant.calls)

        # Test multiple variant merge.
        merged_variant = list(strategy.get_merged_variants(variants))[0]
        self._assert_common_expected_merged_fields(merged_variant)
        self.assertEqual([
            vcfio.VariantCall(sample_id=hash_name('Sample1'),
                              genotype=[0, 1],
                              info={
                                  'GQ': 20,
                                  'HQ': [10, 20],
                                  'A1': 'some data',
                                  'A2': ['data1', 'data2'],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample2'),
                              genotype=[1, 0],
                              info={
                                  'GQ': 10,
                                  'FLAG1': True,
                                  'A1': 'some data',
                                  'A2': ['data1', 'data2'],
                                  ColumnKeyConstants.QUALITY: 2,
                                  ColumnKeyConstants.FILTER: ['PASS']
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample3'),
                              genotype=[1, 1],
                              info={
                                  'A1': 'some data2',
                                  'A3': ['data3', 'data4'],
                                  ColumnKeyConstants.QUALITY: 20,
                                  ColumnKeyConstants.FILTER: ['q10']
                              }),
            vcfio.VariantCall(sample_id=hash_name('Sample4'),
                              genotype=[1, 0],
                              info={
                                  'GQ': 20,
                                  'A1': 'some data2',
                                  'A3': ['data3', 'data4'],
                                  ColumnKeyConstants.QUALITY: 20,
                                  ColumnKeyConstants.FILTER: ['q10']
                              })
        ], merged_variant.calls)
        # keys() is a view on Python 3 and never compares equal to a list,
        # so it is converted before the comparison.
        self.assertEqual([], list(merged_variant.info.keys()))
    def test_schema_conflict_in_format_field_type(self):
        """Checks rows produced when call info values have conflicting types.

        With allow_incompatible_records=True the generated row is expected
        to hold the converted values listed below ('' -> False, 1.0 -> 1,
        [1, 2] -> ['1', '2'], and so on). A string in the 'FI' field is
        expected to raise ValueError instead.
        """
        first_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                       genotype=[0, 1],
                                       phaseset='*',
                                       info={
                                           'FB': '',
                                           'FI': 1.0,
                                           'FSR': [1, 2]
                                       })
        second_call = vcfio.VariantCall(sample_id=hash_name('Sample2'),
                                        genotype=[1, 0],
                                        info={
                                            'FB': 1,
                                            'FI': True,
                                            'FSR': [1.0, 2.0]
                                        })
        variant = vcfio.Variant(reference_name='chr19',
                                start=11,
                                end=12,
                                reference_bases='CT',
                                alternate_bases=[],
                                filters=[],
                                calls=[first_call, second_call])
        proc_variant = _get_processed_variant(variant)

        # Expected per-call records after conversion.
        first_call_row = {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'FB': False,
            'FI': 1,
            'FSR': ['1', '2'],
        }
        second_call_row = {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'FB': True,
            'FI': 1,
            'FSR': ['1.0', '2.0'],
        }
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.CALLS: [first_call_row, second_call_row],
        }

        actual_rows = list(
            self._row_generator.get_rows(proc_variant,
                                         allow_incompatible_records=True))
        self.assertEqual([expected_row], actual_rows)

        with self.assertRaises(ValueError):
            # A string cannot be cast to an integer.
            bad_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                         genotype=[0, 1],
                                         phaseset='*',
                                         info={'FI': 'string_for_int_field'})
            variant = vcfio.Variant(reference_name='chr19',
                                    start=11,
                                    end=12,
                                    reference_bases='CT',
                                    alternate_bases=[],
                                    filters=[],
                                    calls=[bad_call])
            proc_variant = _get_processed_variant(variant)
            list(
                self._row_generator.get_rows(proc_variant,
                                             allow_incompatible_records=True))
            # Only reached when no exception was raised above.
            self.fail(
                'String data for an integer schema must cause an exception')
    def test_schema_conflict_in_format_field_number(self):
        """Checks rows produced when call info values have conflicting counts.

        Lists are supplied for 'FB' and 'FI' and bare strings for 'FSR'.
        With allow_incompatible_records=True the generated row is expected
        to hold the values listed below ([1, 2] -> True / 1, [] -> False /
        None, 'str' -> ['str'], '' -> ['']).
        """
        first_call = vcfio.VariantCall(sample_id=hash_name('Sample1'),
                                       genotype=[0, 1],
                                       phaseset='*',
                                       info={
                                           'FB': [1, 2],
                                           'FI': [1, 2],
                                           'FSR': 'str'
                                       })
        second_call = vcfio.VariantCall(sample_id=hash_name('Sample2'),
                                        genotype=[1, 0],
                                        info={
                                            'FB': [],
                                            'FI': [],
                                            'FSR': ''
                                        })
        variant = vcfio.Variant(reference_name='chr19',
                                start=11,
                                end=12,
                                reference_bases='CT',
                                alternate_bases=[],
                                filters=[],
                                calls=[first_call, second_call])
        proc_variant = _get_processed_variant(variant)

        # Expected per-call records after conversion.
        first_call_row = {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'FB': True,
            'FI': 1,
            'FSR': ['str'],
        }
        second_call_row = {
            ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'FB': False,
            'FI': None,
            'FSR': [''],
        }
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.CALLS: [first_call_row, second_call_row],
        }

        actual_rows = list(
            self._row_generator.get_rows(proc_variant,
                                         allow_incompatible_records=True))
        self.assertEqual([expected_row], actual_rows)
 def _default_variant_call(self):
     """Returns a 'Sample1' call with the default phaseset and GQ of 48."""
     default_call = vcfio.VariantCall(
         sample_id=hash_name('Sample1'),
         genotype=[1, 0],
         phaseset=vcfio.DEFAULT_PHASESET_VALUE,
         info={'GQ': 48})
     return default_call