Exemple #1
0
    def test_sample_ids_combiner_pipeline_preserve_sample_order(self):
        # Feeds two variants whose calls are deliberately listed as sample2,
        # sample1, sample3 and asserts that, with preserve_sample_order=True,
        # the combined output is that same id list in that same order.
        expected_ids = [
            hash_name(sample) for sample in ('sample2', 'sample1', 'sample3')
        ]
        calls = [vcfio.VariantCall(sample_id=sid) for sid in expected_ids]
        # Each variant gets its own list object holding the same three calls.
        input_variants = [
            vcfio.Variant(calls=list(calls)),
            vcfio.Variant(calls=list(calls)),
        ]

        test_pipeline = TestPipeline()
        created = test_pipeline | transforms.Create(input_variants)
        combined = created | 'CombineSampleIds' >> (
            combine_sample_ids.SampleIdsCombiner(preserve_sample_order=True))
        as_list = combined | combiners.ToList()
        # The expected PCollection holds one element: the ordered id list.
        assert_that(as_list, equal_to([expected_ids]))
        test_pipeline.run()
Exemple #2
0
    def test_merge_many_different_alternates(self):
        # Three SNPs share chromosome, position and reference base but each
        # carries a different alternate; the merged output is expected to be
        # the inputs themselves (compared order-insensitively via sorting).
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        inputs = []
        for alt_base, sample in (('C', 'Sample1'),
                                 ('G', 'Sample2'),
                                 ('T', 'Sample3')):
            snp = vcfio.Variant(reference_name='1',
                                start=1,
                                end=2,
                                reference_bases='A',
                                alternate_bases=[alt_base])
            snp.calls.append(vcfio.VariantCall(name=sample, genotype=[1, 0]))
            inputs.append(snp)

        merged = list(merger.get_merged_variants(inputs))
        self.assertEqual(sorted(merged), sorted(inputs))
def _get_sample_variant_3(file_name='',
                          use_1_based_coordinate=False,
                          use_hashing=True):
    """Build the third sample variant.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      file_name: Forwarded to `_get_hashing_function`.
      use_1_based_coordinate: When truthy the start position is 12,
        otherwise 11.
      use_hashing: Forwarded to `_get_hashing_function`.

    Returns:
      The constructed `vcfio.Variant` holding calls for Sample1 and Sample2.
    """
    to_sample_id = _get_hashing_function(file_name, use_hashing)
    if use_1_based_coordinate:
        start_position = 12
    else:
        start_position = 11

    variant = vcfio.Variant(
        reference_name='19',
        start=start_position,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=['q10'],
        info={'AF': [0.5]},
    )

    sample1_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample1'),
        genotype=[0, 1],
        phaseset='1',
        info={'GQ': 45},
    )
    variant.calls.append(sample1_call)

    # Sample2 has a single missing genotype value and no GQ.
    sample2_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample2'),
        genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None},
    )
    variant.calls.append(sample2_call)
    return variant
Exemple #4
0
    def test_schema_conflict_in_format_field_number(self):
        # The calls carry FB and FI as lists and FSR as a bare string. With
        # allow_incompatible_records=True the expected row has FB as a bool,
        # FI as a single value (None for the empty list) and FSR wrapped in
        # a list.
        # NOTE(review): presumably the schema behind self._row_generator
        # declares different field numbers for FB/FI/FSR -- confirm in setUp.
        sample1_call = vcfio.VariantCall(
            name='Sample1',
            genotype=[0, 1],
            phaseset='*',
            info={'FB': [1, 2], 'FI': [1, 2], 'FSR': 'str'})
        sample2_call = vcfio.VariantCall(
            name='Sample2',
            genotype=[1, 0],
            info={'FB': [], 'FI': [], 'FSR': ''})
        variant = vcfio.Variant(reference_name='chr19',
                                start=11,
                                end=12,
                                reference_bases='CT',
                                alternate_bases=[],
                                filters=[],
                                calls=[sample1_call, sample2_call])
        proc_variant = _get_processed_variant(variant)

        expected_sample1 = {
            ColumnKeyConstants.CALLS_NAME: 'Sample1',
            ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
            ColumnKeyConstants.CALLS_PHASESET: '*',
            'FB': True,
            'FI': 1,
            'FSR': ['str'],
        }
        expected_sample2 = {
            ColumnKeyConstants.CALLS_NAME: 'Sample2',
            ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
            ColumnKeyConstants.CALLS_PHASESET: None,
            'FB': False,
            'FI': None,
            'FSR': [''],
        }
        expected_row = {
            ColumnKeyConstants.REFERENCE_NAME: 'chr19',
            ColumnKeyConstants.START_POSITION: 11,
            ColumnKeyConstants.END_POSITION: 12,
            ColumnKeyConstants.REFERENCE_BASES: 'CT',
            ColumnKeyConstants.ALTERNATE_BASES: [],
            ColumnKeyConstants.CALLS: [expected_sample1, expected_sample2],
        }

        actual_rows = list(
            self._row_generator.get_rows(proc_variant,
                                         allow_incompatible_records=True))
        self.assertEqual([expected_row], actual_rows)
Exemple #5
0
def _get_sample_variant_3():
    """Build the third sample variant together with its VCF text line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` object and the
      text line defined alongside it.
    """
    af_info = vcfio.VariantInfo(data=[0.5], field_count='A')
    variant = vcfio.Variant(reference_name='19',
                            start=11,
                            end=12,
                            reference_bases='C',
                            alternate_bases=['<SYMBOLIC>'],
                            quality=49,
                            filters=['q10'],
                            info={'AF': af_info})

    sample1_call = vcfio.VariantCall(name='Sample1',
                                     genotype=[0, 1],
                                     phaseset='1',
                                     info={'GQ': 45})
    # Sample2 has a single missing genotype value and no GQ.
    sample2_call = vcfio.VariantCall(
        name='Sample2',
        genotype=[vcfio.MISSING_GENOTYPE_VALUE],
        info={'GQ': None})
    for call in (sample1_call, sample2_call):
        variant.calls.append(call)

    vcf_line = ('19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	GT:PS:GQ	0|1:1:45	'
                '.:.:.\n')
    return variant, vcf_line
Exemple #6
0
 def _get_sample_variants(self):
   """Return two sample variants (chromosomes '19' and '20').

   Both sit at start=11/end=12 with the same reference and alternate bases;
   they differ in quality, filters, info keys and their calls.
   """
   first_calls = [
       vcfio.VariantCall(name='Sample1',
                         genotype=[0, 1],
                         phaseset='*',
                         info={'GQ': 20, 'HQ': [10, 20]}),
       vcfio.VariantCall(name='Sample2',
                         genotype=[1, 0],
                         info={'GQ': 10, 'FLAG1': True}),
   ]
   first = vcfio.Variant(reference_name='19',
                         start=11,
                         end=12,
                         reference_bases='C',
                         alternate_bases=['A', 'TT'],
                         names=['rs1'],
                         quality=2,
                         filters=['PASS'],
                         info={'A1': 'some data', 'A2': ['data1', 'data2']},
                         calls=first_calls)

   second_calls = [
       vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
       vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20}),
   ]
   second = vcfio.Variant(reference_name='20',
                          start=11,
                          end=12,
                          reference_bases='C',
                          alternate_bases=['A', 'TT'],
                          names=['rs1'],
                          quality=20,
                          filters=['q10'],
                          info={'A1': 'some data2', 'A3': ['data3', 'data4']},
                          calls=second_calls)
   return [first, second]
Exemple #7
0
def _get_sample_variant_1():
    """Build the first sample variant together with its VCF text line.

    Features:
      multiple alternates
      not phased
      multiple names

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` object and the
      text line defined alongside it.
    """
    info = {
        'AF': vcfio.VariantInfo(data=[0.5, 0.1], field_count='A'),
        'NS': vcfio.VariantInfo(data=1, field_count='1'),
    }
    variant = vcfio.Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info,
    )
    # (sample name, genotype, GQ) for each call, in order.
    for sample, genotype, gq in (('Sample1', [0, 0], 48),
                                 ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample, genotype=genotype, info={'GQ': gq}))

    vcf_line = ('20	1234	rs123;rs2	C	A,T	50	PASS	AF=0.5,0.1;NS=1	'
                'GT:GQ	0/0:48	1/0:20\n')
    return variant, vcf_line
Exemple #8
0
def _get_sample_variant_2():
    """Build the second sample variant together with its VCF text line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Returns:
      A `(variant, vcf_line)` tuple: the `vcfio.Variant` object and the
      text line defined alongside it.
    """
    ns_info = vcfio.VariantInfo(data=2, field_count='1')
    variant = vcfio.Variant(reference_name='19',
                            start=122,
                            end=125,
                            reference_bases='GTC',
                            alternate_bases=[],
                            names=['rs1234'],
                            quality=40,
                            filters=['q10', 's50'],
                            info={'NS': ns_info})

    # Only Sample1 carries an explicit phaseset.
    phased_call = vcfio.VariantCall(name='Sample1',
                                    genotype=[1, 0],
                                    phaseset=vcfio.DEFAULT_PHASESET_VALUE,
                                    info={'GQ': 48})
    variant.calls.append(phased_call)
    # Sample2's GQ is missing.
    unphased_call = vcfio.VariantCall(name='Sample2',
                                      genotype=[0, 1],
                                      info={'GQ': None})
    variant.calls.append(unphased_call)

    vcf_line = ('19	123	rs1234	GTC	.	40	q10;s50	NS=2	GT:GQ	1|0:48	0/1:.\n')
    return variant, vcf_line
 def _get_sample_unmerged_variants(self):
     """Return two variants, each with one call ('Unmerged1', 'Unmerged2').

     Per the inline notes, the first differs from the merged variants in
     start/end and the second in the ordering of its alternate bases.
     """
     unmerged1_call = vcfio.VariantCall(sample_id=hash_name('Unmerged1'),
                                        genotype=[0, 1])
     # Start/end are different from merged variants.
     different_position = vcfio.Variant(
         reference_name='19',
         start=123,
         end=125,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs2'],
         calls=[unmerged1_call],
     )

     unmerged2_call = vcfio.VariantCall(sample_id=hash_name('Unmerged2'),
                                        genotype=[0, 1])
     # Ordering of alternate_bases is different from merged variants.
     different_alt_order = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['TT', 'A'],
         names=['rs3'],
         calls=[unmerged2_call],
     )
     return [different_position, different_alt_order]
Exemple #10
0
def _get_sample_variant_1(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
  """Build the first sample variant.

  Features:
    multiple alternates
    not phased
    multiple names
    utf-8 encoded

  Args:
    file_name: Forwarded to `_get_hashing_function`.
    use_1_based_coordinate: Added to the base start position 1233.
    use_hashing: Forwarded to `_get_hashing_function`.
    move_hom_ref_calls: When truthy, Sample1 is passed to the variant as a
      `(name, sample_id)` entry in `hom_ref_calls` instead of being appended
      as a call; otherwise `hom_ref_calls` is None.

  Returns:
    The constructed `vcfio.Variant`.
  """
  to_sample_id = _get_hashing_function(file_name, use_hashing)

  if move_hom_ref_calls:
    hom_ref_calls = [('Sample1', to_sample_id('Sample1'))]
  else:
    hom_ref_calls = None

  variant = vcfio.Variant(
      reference_name='20',
      start=1233 + use_1_based_coordinate,
      end=1234,
      reference_bases='C',
      alternate_bases=['A', 'T'],
      names=['rs123', 'rs2'],
      quality=50,
      filters=['PASS'],
      hom_ref_calls=hom_ref_calls,
      info={'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']})

  # Sample1 ([0, 0]) is only a regular call when it was not moved above.
  if not move_hom_ref_calls:
    sample1_call = vcfio.VariantCall(
        sample_id=to_sample_id('Sample1'), name='Sample1',
        genotype=[0, 0], info={'GQ': 48})
    variant.calls.append(sample1_call)

  sample2_call = vcfio.VariantCall(
      sample_id=to_sample_id('Sample2'), name='Sample2',
      genotype=[1, 0], info={'GQ': 20})
  variant.calls.append(sample2_call)

  return variant
 def _get_sample_variant(self):
     """Return one sample variant with two info fields and two calls.

     The info values are `vcfio.VariantInfo` objects built positionally from
     a data value and a field-count string ('1' and 'A').
     """
     info = {
         'A1': vcfio.VariantInfo('some data', '1'),
         'A2': vcfio.VariantInfo(['data1', 'data2'], 'A'),
     }
     sample1_call = vcfio.VariantCall(name='Sample1',
                                      genotype=[0, 1],
                                      info={'GQ': 20, 'HQ': [10, 20]})
     sample2_call = vcfio.VariantCall(name='Sample2',
                                      genotype=[1, 0],
                                      info={'GQ': 10, 'FLAG1': True})
     variant = vcfio.Variant(
         reference_name='19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=info,
         calls=[sample1_call, sample2_call],
     )
     return variant
Exemple #12
0
def _get_sample_variant_2(file_name='', use_1_based_coordinate=False,
                          use_hashing=True, move_hom_ref_calls=False):
  """Build the second sample variant.

  Features:
    multiple references
    no alternate
    phased
    multiple filters
    missing format field

  Args:
    file_name: Forwarded to `_get_hashing_function`.
    use_1_based_coordinate: Added to the base start position 122.
    use_hashing: Forwarded to `_get_hashing_function`.
    move_hom_ref_calls: When truthy `hom_ref_calls` is an empty list,
      otherwise None. Both calls are appended either way.

  Returns:
    The constructed `vcfio.Variant`.
  """
  to_sample_id = _get_hashing_function(file_name, use_hashing)

  if move_hom_ref_calls:
    hom_ref_calls = []
  else:
    hom_ref_calls = None

  variant = vcfio.Variant(
      reference_name='19',
      start=122 + use_1_based_coordinate,
      end=125,
      reference_bases='GTC',
      alternate_bases=[],
      names=['rs1234'],
      quality=40,
      filters=['q10', 's50'],
      hom_ref_calls=hom_ref_calls,
      info={'NS': 2})

  # Only Sample1 carries an explicit phaseset.
  sample1_call = vcfio.VariantCall(
      sample_id=to_sample_id('Sample1'), name='Sample1',
      genotype=[-1, 0], phaseset=vcfio.DEFAULT_PHASESET_VALUE,
      info={'GQ': 48})
  variant.calls.append(sample1_call)

  sample2_call = vcfio.VariantCall(
      sample_id=to_sample_id('Sample2'), name='Sample2',
      genotype=[0, -1], info={'GQ': None})
  variant.calls.append(sample2_call)
  return variant
Exemple #13
0
    def test_overlapping_three_non_variants(self):
        # Three overlapping non-variant ranges on chromosome '1' ([0, 10),
        # [3, 5) and [4, 9)) are expected to be cut at every boundary, each
        # piece carrying the calls of the inputs that cover it.
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        call_1 = vcfio.VariantCall('1', [0, 0])
        call_2 = vcfio.VariantCall('2', [0, 0])
        call_3 = vcfio.VariantCall('3', [0, 0])

        inputs = []
        for start, end, call in ((0, 10, call_1),
                                 (3, 5, call_2),
                                 (4, 9, call_3)):
            block = vcfio.Variant(reference_name='1', start=start, end=end)
            block.calls.append(call)
            inputs.append(block)

        # (start, end, calls expected on that piece), left to right.
        expected = []
        for start, end, piece_calls in (
                (0, 3, (call_1,)),
                (3, 4, (call_1, call_2)),
                (4, 5, (call_1, call_2, call_3)),
                (5, 9, (call_1, call_3)),
                (9, 10, (call_1,))):
            piece = vcfio.Variant(reference_name='1', start=start, end=end)
            for call in piece_calls:
                piece.calls.append(call)
            expected.append(piece)

        actual = list(merger.get_merged_variants(inputs))
        self.assertEqual(sorted(actual), sorted(expected))
Exemple #14
0
    def test_non_variant_split_by_snp(self):
        # A non-variant block [0, 10) overlapped by a SNP at [5, 6) is
        # expected to come back as three pieces: the block before the SNP,
        # the SNP carrying both calls, and the block after the SNP.
        merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
            None, None, None)

        def make_snp():
            # Fresh SNP at [5, 6) with new list objects on every call.
            return vcfio.Variant(reference_name='1',
                                 start=5,
                                 end=6,
                                 reference_bases='C',
                                 alternate_bases=['A'])

        block = vcfio.Variant(reference_name='1', start=0, end=10)
        snp = make_snp()
        block_call = vcfio.VariantCall(name='1', genotype=[0, 0])
        snp_call = vcfio.VariantCall(name='2', genotype=[1, 0])
        block.calls.append(block_call)
        snp.calls.append(snp_call)

        before = vcfio.Variant(reference_name='1', start=0, end=5)
        overlap = make_snp()
        after = vcfio.Variant(reference_name='1', start=6, end=10)
        before.calls.append(block_call)
        for call in (block_call, snp_call):
            overlap.calls.append(call)
        after.calls.append(block_call)

        actual = list(merger.get_merged_variants([block, snp]))
        expected = [before, overlap, after]
        self.assertEqual(sorted(actual), sorted(expected))
Exemple #15
0
    def test_densify_variants_pipeline(self):
        # Each input variant has calls for only two of the three samples; the
        # output is checked with asserts.has_sample_ids against all three ids.
        all_ids = [
            hash_name(sample) for sample in ('sample1', 'sample2', 'sample3')
        ]
        calls = [vcfio.VariantCall(sample_id=sid) for sid in all_ids]
        sparse_variants = [
            vcfio.Variant(calls=calls[0:2]),  # sample1 + sample2
            vcfio.Variant(calls=calls[1:3]),  # sample2 + sample3
        ]

        test_pipeline = TestPipeline()
        created = test_pipeline | Create(sparse_variants)
        densified = created | 'DensifyVariants' >> (
            densify_variants.DensifyVariants(all_ids))
        assert_that(densified, asserts.has_sample_ids(all_ids))

        test_pipeline.run()
Exemple #16
0
  def test_omit_empty_sample_calls(self):
    # Sample1 has no genotype and a None GQ, Sample3 has only missing
    # genotype values; with omit_empty_sample_calls=True the expected row
    # keeps Sample2 alone.
    missing = vcfio.MISSING_GENOTYPE_VALUE
    empty_call = vcfio.VariantCall(name='Sample1', info={'GQ': None})
    real_call = vcfio.VariantCall(
        name='Sample2', genotype=[1, 0], info={'GQ': 10})
    all_missing_call = vcfio.VariantCall(
        name='Sample3', genotype=[missing, missing])
    variant = vcfio.Variant(
        reference_name='chr19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=[],
        names=['rs1', 'rs2'],
        quality=2,
        filters=['PASS'],
        info={},
        calls=[empty_call, real_call, all_missing_call])

    expected_call = {
        ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
    }
    expected_row = {
        ColumnKeyConstants.REFERENCE_NAME: 'chr19',
        ColumnKeyConstants.START_POSITION: 11,
        ColumnKeyConstants.END_POSITION: 12,
        ColumnKeyConstants.REFERENCE_BASES: 'C',
        ColumnKeyConstants.ALTERNATE_BASES: [],
        ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
        ColumnKeyConstants.QUALITY: 2,
        ColumnKeyConstants.FILTER: ['PASS'],
        ColumnKeyConstants.CALLS: [expected_call],
    }

    actual_rows = self._get_row_list_from_variant(
        variant, omit_empty_sample_calls=True)
    self.assertEqual([expected_row], actual_rows)
 def test_convert_bq_row_to_variant(self):
     # Converts the row from self._get_big_query_row() and compares the
     # result with a hand-built variant.
     row = self._get_big_query_row()

     expected_info = {
         'IFR': [0.2],
         'IFR2': [0.2, 0.3],
         'IS': 'some data',
         'ISR': ['data1', 'data2'],
     }
     sample1_call = vcfio.VariantCall(name='Sample1',
                                      genotype=[0, 1],
                                      phaseset='*',
                                      info={'GQ': 20, 'FIR': [10, 20]})
     sample2_call = vcfio.VariantCall(name='Sample2',
                                      genotype=[1, 0],
                                      info={'GQ': 10, 'FB': True})
     expected_variant = vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=expected_info,
         calls=[sample1_call, sample2_call],
     )

     converter = bigquery_to_variant.BigQueryToVariant()
     actual_variant = converter._convert_bq_row_to_variant(row)
     self.assertEqual(expected_variant, actual_variant)
Exemple #18
0
  def test_get_merged_variants_no_custom_options(self):
    # Merges the fixture variants with no info keys moved to calls and no
    # quality/filter copied to calls, then checks the merged calls and the
    # merged info fields.
    strategy = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        info_keys_to_move_to_calls_regex=None,
        copy_quality_to_calls=False,
        copy_filter_to_calls=False)
    variants = self._get_sample_variants()

    # Test single variant merge: one input is expected back unchanged.
    actual = list(strategy.get_merged_variants([variants[0]]))
    self.assertEqual([variants[0]], actual)

    # Test multiple variant merge: only the first merged variant is checked.
    merged_variant = list(strategy.get_merged_variants(variants))[0]
    self._assert_common_expected_merged_fields(merged_variant)
    self.assertEqual(
        [vcfio.VariantCall(name='Sample1', genotype=[0, 1],
                           info={'GQ': 20, 'HQ': [10, 20]}),
         vcfio.VariantCall(name='Sample2', genotype=[1, 0],
                           info={'GQ': 10, 'FLAG1': True}),
         vcfio.VariantCall(name='Sample3', genotype=[1, 1]),
         vcfio.VariantCall(name='Sample4', genotype=[1, 0], info={'GQ': 20})],
        merged_variant.calls)
    # NOTE(review): assertItemsEqual only exists on Python 2's TestCase (it
    # is assertCountEqual on Python 3) -- confirm the target interpreter.
    self.assertItemsEqual(['A1', 'A2', 'A3'], merged_variant.info.keys())
    # 'A1' is present on both inputs with different values; either value is
    # accepted. assertIn reports the offending value on failure, unlike
    # assertTrue(x in y), which only says "False is not true".
    self.assertIn(merged_variant.info['A1'], ('some data', 'some data2'))
    self.assertEqual(['data1', 'data2'], merged_variant.info['A2'])
    self.assertEqual(['data3', 'data4'], merged_variant.info['A3'])
Exemple #19
0
  def test_merge_snp_with_non_variant(self):
    # A SNP at [5, 6) overlaps a <NON_REF> block [0, 10). Expected output:
    # the block pieces before and after the SNP (keeping the block's names,
    # filters and quality, with no reference bases) plus the SNP itself
    # carrying both calls.
    merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    def make_snp():
      # Fresh SNP with new list objects on every call.
      return vcfio.Variant(
          reference_name='1', start=5, end=6, reference_bases='A',
          alternate_bases=['C'], names=['v'], filters=['vf'], quality=1)

    def make_block_piece(start, end):
      # Piece of the non-variant block; no reference_bases are set.
      return vcfio.Variant(
          reference_name='1', start=start, end=end,
          alternate_bases=['<NON_REF>'], names=['nv'], filters=['nvf'],
          quality=2)

    snp = make_snp()
    block = vcfio.Variant(
        reference_name='1', start=0, end=10, reference_bases='G',
        alternate_bases=['<NON_REF>'], names=['nv'], filters=['nvf'],
        quality=2)

    snp_call = vcfio.VariantCall(name='1', genotype=[1, 0])
    block_call = vcfio.VariantCall(name='2', genotype=[0, 0])
    snp.calls.append(snp_call)
    block.calls.append(block_call)

    before = make_block_piece(0, 5)
    after = make_block_piece(6, 10)
    overlap = make_snp()
    before.calls.append(block_call)
    after.calls.append(block_call)
    for call in (snp_call, block_call):
      overlap.calls.append(call)

    actual = list(merger.get_merged_variants([snp, block]))
    expected = [before, after, overlap]
    self.assertEqual(sorted(actual), sorted(expected))
Exemple #20
0
  def test_merge_2_non_variants(self):
    # Two <NON_REF> blocks, [0, 10) and [5, 15), overlap on [5, 10). The
    # non-overlapping pieces are expected to keep their source block's
    # names/filters/quality; the overlap is expected to carry the combined
    # names and filters, quality 1 and both calls.
    merger = merge_with_non_variants_strategy.MergeWithNonVariantsStrategy(
        None, None, None)

    def make_block(start, end, names, filters, quality):
      # Fresh <NON_REF> variant on chromosome '1'.
      return vcfio.Variant(
          reference_name='1', start=start, end=end,
          alternate_bases=['<NON_REF>'], names=names, filters=filters,
          quality=quality)

    left = make_block(0, 10, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    right = make_block(5, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    left_call = vcfio.VariantCall(name='1', genotype=[0, 0])
    right_call = vcfio.VariantCall(name='2', genotype=[0, 0])
    left.calls.append(left_call)
    right.calls.append(right_call)

    left_only = make_block(0, 5, ['nonv1', 'nonv2'], ['f1', 'f2'], 1)
    right_only = make_block(10, 15, ['nonv2', 'nonv3'], ['f2', 'f3'], 2)
    overlap = make_block(
        5, 10, ['nonv1', 'nonv2', 'nonv3'], ['f1', 'f2', 'f3'], 1)
    left_only.calls.append(left_call)
    right_only.calls.append(right_call)
    for call in (left_call, right_call):
      overlap.calls.append(call)

    actual = list(merger.get_merged_variants([left, right]))
    expected = [left_only, right_only, overlap]

    self.assertEqual(sorted(actual), sorted(expected))
def _get_sample_variant_1(is_for_nucleus=False):
    """Build the first sample variant together with its VCF text line.

    Features:
      multiple alternates
      not phased
      multiple names
      utf-8 encoded

    Args:
      is_for_nucleus: When truthy, the second AF value is 0.25 instead of
        0.1 and the SVTYPE info field is left out, in both the variant and
        the text line.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    if is_for_nucleus:
        # 0.1 -> 0.25 float precision loss due to binary floating point
        # conversion.
        vcf_line = ('20	1234	rs123;rs2	C	A,T	50	'
                    'PASS	AF=0.5,0.25;NS=1	GT:GQ	0/0:48	1/0:20\n')
        info = {'AF': [0.5, 0.25], 'NS': 1}
    else:
        vcf_line = ('20	1234	rs123;rs2	C	A,T	50	'
                    'PASS	AF=0.5,0.1;NS=1;SVTYPE=BÑD	GT:GQ	0/0:48	1/0:20\n')
        info = {'AF': [0.5, 0.1], 'NS': 1, 'SVTYPE': ['BÑD']}

    # Everything apart from `info` is the same for both flavours.
    variant = vcfio.Variant(
        reference_name='20',
        start=1233,
        end=1234,
        reference_bases='C',
        alternate_bases=['A', 'T'],
        names=['rs123', 'rs2'],
        quality=50,
        filters=['PASS'],
        info=info,
    )
    # (sample name, genotype, GQ) for each call, in order.
    for sample, genotype, gq in (('Sample1', [0, 0], 48),
                                 ('Sample2', [1, 0], 20)):
        variant.calls.append(
            vcfio.VariantCall(name=sample, genotype=genotype, info={'GQ': gq}))

    return variant, vcf_line
def _get_sample_variant_3(is_for_nucleus=False):
    """Build the third sample variant together with its VCF text line.

    Features:
      symbolic alternate
      no calls for sample 2
      alternate phaseset

    Args:
      is_for_nucleus: When truthy, the filter is 'PASS' instead of 'q10' and
        Sample2 has two missing genotype values with an empty info dict
        (instead of one missing value and a None GQ).

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    missing = vcfio.MISSING_GENOTYPE_VALUE
    if is_for_nucleus:
        # '.:.:.' -> './.:.:.' due to Nucleus handling of
        # VariantCall.genotype.
        vcf_line = ('19	12	.	C	<SYMBOLIC>	49	PASS	'
                    'AF=0.5	GT:PS:GQ	0|1:1:45	./.:.:.\n')
        filters = ['PASS']
        sample2_call = vcfio.VariantCall(name='Sample2',
                                         genotype=[missing, missing],
                                         info={})
    else:
        vcf_line = ('19	12	.	C	<SYMBOLIC>	49	q10	AF=0.5	'
                    'GT:PS:GQ	0|1:1:45	.:.:.\n')
        filters = ['q10']
        sample2_call = vcfio.VariantCall(name='Sample2',
                                         genotype=[missing],
                                         info={'GQ': None})

    variant = vcfio.Variant(
        reference_name='19',
        start=11,
        end=12,
        reference_bases='C',
        alternate_bases=['<SYMBOLIC>'],
        quality=49,
        filters=filters,
        info={'AF': [0.5]},
    )
    # The Sample1 call is identical for both flavours.
    sample1_call = vcfio.VariantCall(name='Sample1',
                                     genotype=[0, 1],
                                     phaseset='1',
                                     info={'GQ': 45})
    variant.calls.append(sample1_call)
    variant.calls.append(sample2_call)
    return variant, vcf_line
def _get_sample_variant_2(is_for_nucleus=False):
    """Build the second sample variant together with its VCF text line.

    Features:
      multiple references
      no alternate
      phased
      multiple filters
      missing format field

    Args:
      is_for_nucleus: When true, return the Nucleus flavour of the fixture:
        the filter column is 'PASS' and the second call has an empty `info`.
        Otherwise the filters are 'q10' and 's50' and the second call's
        `info` maps 'GQ' to None.

    Returns:
      A `(variant, vcf_line)` tuple.
    """
    # Only the text line, the filters and the second call's info depend on the
    # flavour; everything else is shared by both fixtures.
    if is_for_nucleus:
        # 'q10;s50' -> 'PASS' due to missing header fields.
        vcf_line = ('19	123	rs1234	GTC	.	40	PASS	NS=2	' 'GT:GQ	1|0:48	0/1:.\n')
        expected_filters = ['PASS']
        second_call_info = {}
    else:
        vcf_line = ('19	123	rs1234	GTC	.	40	q10;s50	NS=2	'
                    'GT:GQ	1|0:48	0/1:.\n')
        expected_filters = ['q10', 's50']
        second_call_info = {'GQ': None}

    variant = vcfio.Variant(
        reference_name='19',
        start=122,
        end=125,
        reference_bases='GTC',
        alternate_bases=[],
        names=['rs1234'],
        quality=40,
        filters=expected_filters,
        info={'NS': 2})
    sample_calls = (
        vcfio.VariantCall(
            name='Sample1',
            genotype=[1, 0],
            phaseset=vcfio.DEFAULT_PHASESET_VALUE,
            info={'GQ': 48}),
        vcfio.VariantCall(
            name='Sample2',
            genotype=[0, 1],
            info=second_call_info),
    )
    for sample_call in sample_calls:
        variant.calls.append(sample_call)
    return variant, vcf_line
Exemple #24
0
 def test_all_fields(self):
   """Checks the row generated for a variant that populates all fields.

   The variant carries names, quality, filters, four info fields and three
   calls (the last one with a missing genotype and no call info). In the
   expected row the `IFR`/`IFR2` values sit inside the alternate-bases
   records, while `IS` and `ISR` stay at the top level.
   """
   sample_calls = [
       vcfio.VariantCall(
           name='Sample1', genotype=[0, 1], phaseset='*',
           info={'GQ': 20, 'FIR': [10, 20]}),
       vcfio.VariantCall(
           name='Sample2', genotype=[1, 0],
           info={'GQ': 10, 'FB': True}),
       vcfio.VariantCall(
           name='Sample3', genotype=[vcfio.MISSING_GENOTYPE_VALUE]),
   ]
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1', 'rs2'],
       quality=2,
       filters=['PASS'],
       info={
           'IFR': [0.1, 0.2],
           'IFR2': [0.2, 0.3],
           'IS': 'some data',
           'ISR': ['data1', 'data2'],
       },
       calls=sample_calls)
   header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

   expected_alternates = [
       {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'A',
        'IFR': 0.1,
        'IFR2': 0.2},
       {ColumnKeyConstants.ALTERNATE_BASES_ALT: 'TT',
        'IFR': 0.2,
        'IFR2': 0.3},
   ]
   expected_calls = [
       {ColumnKeyConstants.CALLS_NAME: 'Sample1',
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'FIR': [10, 20]},
       {ColumnKeyConstants.CALLS_NAME: 'Sample2',
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
        'FB': True},
       {ColumnKeyConstants.CALLS_NAME: 'Sample3',
        ColumnKeyConstants.CALLS_GENOTYPE: [vcfio.MISSING_GENOTYPE_VALUE],
        ColumnKeyConstants.CALLS_PHASESET: None},
   ]
   expected_row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.ALTERNATE_BASES: expected_alternates,
       ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
       ColumnKeyConstants.QUALITY: 2,
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: expected_calls,
       'IS': 'some data',
       'ISR': ['data1', 'data2'],
   }

   actual_rows = self._get_row_list_from_variant(variant, header_num_dict)
   self.assertEqual([expected_row], actual_rows)
Exemple #25
0
 def _get_sample_variant_1(self, split_alternate_allele_info_fields=True):
   """Returns a sample variant, its expected row and the header number dict.

   Args:
     split_alternate_allele_info_fields: If true, the expected row carries
       the `IFR`/`IFR2` values inside each alternate-bases record; otherwise
       the alternate-bases records hold only the alternate and `IFR`/`IFR2`
       are top-level list fields of the row.

   Returns:
     A `(variant, row, header_num_dict)` tuple.
   """
   first_call = vcfio.VariantCall(
       sample_id=hash_name('Sample1'),
       genotype=[0, 1],
       phaseset='*',
       info={'GQ': 20, 'FIR': [10, 20]})
   second_call = vcfio.VariantCall(
       sample_id=hash_name('Sample2'),
       genotype=[1, 0],
       info={'GQ': 10, 'FB': True})
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=['A', 'TT'],
       names=['rs1', 'rs2'],
       quality=2,
       filters=['PASS'],
       info={
           'IFR': [0.1, 0.2],
           'IFR2': [0.2, 0.3],
           'IS': 'some data',
           'ISR': ['data1', 'data2'],
       },
       calls=[first_call, second_call])
   header_num_dict = {'IFR': 'A', 'IFR2': 'A', 'IS': '1', 'ISR': '2'}

   expected_calls = [
       {ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
        ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
        ColumnKeyConstants.CALLS_PHASESET: '*',
        'GQ': 20,
        'FIR': [10, 20]},
       {ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample2'),
        ColumnKeyConstants.CALLS_GENOTYPE: [1, 0],
        ColumnKeyConstants.CALLS_PHASESET: None,
        'GQ': 10,
        'FB': True},
   ]
   row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.NAMES: ['rs1', 'rs2'],
       ColumnKeyConstants.QUALITY: 2,
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: expected_calls,
       'IS': 'some data',
       'ISR': ['data1', 'data2'],
   }

   if not split_alternate_allele_info_fields:
     # Unsplit layout: bare alternates, per-alternate info kept as lists.
     row[ColumnKeyConstants.ALTERNATE_BASES] = [
         {ColumnKeyConstants.ALTERNATE_BASES_ALT: alt} for alt in ('A', 'TT')]
     row['IFR'] = [0.1, 0.2]
     row['IFR2'] = [0.2, 0.3]
     return variant, row, header_num_dict

   # Split layout: each alternate record carries its own IFR/IFR2 value.
   row[ColumnKeyConstants.ALTERNATE_BASES] = [
       {ColumnKeyConstants.ALTERNATE_BASES_ALT: alt, 'IFR': ifr, 'IFR2': ifr2}
       for alt, ifr, ifr2 in (('A', 0.1, 0.2), ('TT', 0.2, 0.3))]
   return variant, row, header_num_dict
Exemple #26
0
 def _get_sample_variant_with_empty_calls(self):
     """Returns a variant whose only call has an empty genotype and info.

     Returns:
       A `(variant, row, header_num_dict)` tuple. The expected row lists no
       calls and no alternate bases.
     """
     empty_call = vcfio.VariantCall(
         name='EmptySample', genotype=[], phaseset='*', info={})
     variant = vcfio.Variant(
         reference_name='20',
         start=123,
         end=125,
         reference_bases='CT',
         alternate_bases=[],
         filters=['q10', 's10'],
         info={'II': 1234},
         calls=[empty_call])
     header_num_dict = {'II': '1'}

     # Positional columns first, then the remaining columns in the same
     # insertion order as a single literal would give.
     row = {
         ColumnKeyConstants.REFERENCE_NAME: '20',
         ColumnKeyConstants.START_POSITION: 123,
         ColumnKeyConstants.END_POSITION: 125,
         ColumnKeyConstants.REFERENCE_BASES: 'CT',
     }
     row[ColumnKeyConstants.ALTERNATE_BASES] = []
     row[ColumnKeyConstants.FILTER] = ['q10', 's10']
     row[ColumnKeyConstants.CALLS] = []
     row['II'] = 1234
     return variant, row, header_num_dict
  def _densify_variants(self, variant, all_call_names):
    # type: (vcf_parser.Variant, List[str]) -> vcf_parser.Variant
    """Cherry-picks calls for the variant.

    The calls are in the same order as the `all_call_names`.
    Args:
      variant: The variant that will be modified to contain calls for
        `all_call_names`.
      all_call_names: A list of sample names that used to cherry-pick each
        variant'calls. If one call is missing, an empty `VariantCall` is added.

    Returns:
      `variant` modified to contain calls for `all_call_names`.
    """
    existing_call_name = {call.name: call for call in variant.calls}

    new_calls = []
    for call_name in all_call_names:
      if call_name in existing_call_name.keys():
        new_calls.append(existing_call_name.get(call_name))
      else:
        new_calls.append(
            vcfio.VariantCall(name=call_name,
                              genotype=vcfio.MISSING_GENOTYPE_VALUE))
    variant.calls = new_calls

    return variant
Exemple #28
0
 def _get_sample_variant_with_incompatible_records(self):
   """Returns a variant whose values differ in type/shape from its row.

   The variant's values do not match the expected row one-to-one: `IFR` holds
   strings where the row has floats, `IS` is an int where the row has a
   string, `ISR` is a scalar where the row has a one-element list of strings,
   and the call's `FIR` holds floats where the row has ints.

   Returns:
     A `(variant, row, header_num_dict)` tuple.
   """
   only_call = vcfio.VariantCall(
       sample_id=hash_name('Sample1'),
       genotype=[0, 1],
       phaseset='*',
       info={'GQ': 20, 'FIR': [10.0, 20.0]})
   variant = vcfio.Variant(
       reference_name='chr19',
       start=11,
       end=12,
       reference_bases='C',
       alternate_bases=[],
       filters=['PASS'],
       info={'IFR': ['0.1', '0.2'], 'IS': 1, 'ISR': 1},
       calls=[only_call])
   header_num_dict = {'IFR': '2', 'IS': '1', 'ISR': '1'}

   expected_call = {
       ColumnKeyConstants.CALLS_SAMPLE_ID: hash_name('Sample1'),
       ColumnKeyConstants.CALLS_GENOTYPE: [0, 1],
       ColumnKeyConstants.CALLS_PHASESET: '*',
       'GQ': 20,
       'FIR': [10, 20],
   }
   row = {
       ColumnKeyConstants.REFERENCE_NAME: 'chr19',
       ColumnKeyConstants.START_POSITION: 11,
       ColumnKeyConstants.END_POSITION: 12,
       ColumnKeyConstants.REFERENCE_BASES: 'C',
       ColumnKeyConstants.ALTERNATE_BASES: [],
       ColumnKeyConstants.FILTER: ['PASS'],
       ColumnKeyConstants.CALLS: [expected_call],
       'IFR': [0.1, 0.2],
       'IS': '1',
       'ISR': ['1'],
   }
   return variant, row, header_num_dict
Exemple #29
0
 def test_add_missing_calls(self):
     """Checks that calls for absent samples are added in the requested order.

     The input variant only has a call for 'sample2'; after densifying against
     three sample names, the variant must expose one call per name, ordered
     like the requested names.
     """
     transform = densify_variants.DensifyVariants()
     variant = vcfio.Variant(calls=[vcfio.VariantCall(name='sample2')])
     new_variant = transform._densify_variants(
         variant, ['sample1', 'sample2', 'sample3'])
     call_names = [call.name for call in new_variant.calls]
     # `assertItemsEqual` only exists on Python 2's TestCase and ignores
     # ordering. `assertEqual` on the lists runs on both Python 2 and 3 and
     # also pins the order of the resulting calls.
     self.assertEqual(['sample1', 'sample2', 'sample3'], call_names)
Exemple #30
0
 def _get_sample_variant_1(self):
     """Returns a sample variant with mixed-type info fields and one call.

     The variant's info holds string, float, boolean and list values; its
     single call holds an int and a list of floats.
     """
     variant_info = {
         'IS': 'some data',
         'ISI': '1',
         'ISF': '1.0',
         'IF': 1.0,
         'IB': True,
         'IA': [1, 2],
     }
     call_info = {'FI': 20, 'FU': [10.0, 20.0]}
     sample_call = vcfio.VariantCall(
         sample_id=hash_name('Sample1'),
         genotype=[0, 1],
         phaseset='*',
         info=call_info)
     return vcfio.Variant(
         reference_name='chr19',
         start=11,
         end=12,
         reference_bases='C',
         alternate_bases=['A', 'TT'],
         names=['rs1', 'rs2'],
         quality=2,
         filters=['PASS'],
         info=variant_info,
         calls=[sample_call])