def test_sort_variants(self):
    """Sort every permutation of a variant list and expect one fixed order."""
    expected_order = [
        Variant(reference_name='a', start=20, end=22),
        Variant(reference_name='a', start=20, end=22, quality=20),
        Variant(reference_name='b', start=20, end=22),
        Variant(reference_name='b', start=21, end=22),
        Variant(reference_name='b', start=21, end=23),
    ]

    # Whatever the input ordering, sorting must give back `expected_order`.
    for shuffled in permutations(expected_order):
        actual_order = sorted(shuffled)
        self.assertEqual(actual_order, expected_order)
Esempio n. 2
0
 def test_info_numbers_and_types(self):
     """Read records whose INFO headers use Number=A/G/R/0/. and mixed types.

     The two parsed records are compared with hand-built `Variant` objects
     whose `VariantInfo` entries carry the corresponding `field_count`.
     """
     headers = [
         '##INFO=<ID=HA,Number=A,Type=String,Description="StringInfo_A">\n',
         '##INFO=<ID=HG,Number=G,Type=Integer,Description="IntInfo_G">\n',
         '##INFO=<ID=HR,Number=R,Type=Character,Description="ChrInfo_R">\n',
         '##INFO=<ID=HF,Number=0,Type=Flag,Description="FlagInfo">\n',
         '##INFO=<ID=HU,Number=.,Type=Float,Description="FloatInfo_variable">\n'
     ]
     records = [
         '19	2	.	A	T,C	.	.	HA=a1,a2;HG=1,2,3;HR=a,b,c;HF;HU=0.1	GT	1/0	0/1\n',
         '19	124	.	A	T	.	.	HG=3,4,5;HR=d,e;HU=1.1,1.2	GT	0/0	0/1'
     ]

     # Expected result for the first record.
     first_info = {
         'HA': VariantInfo(data=['a1', 'a2'], field_count='A'),
         'HG': VariantInfo(data=[1, 2, 3], field_count='G'),
         'HR': VariantInfo(data=['a', 'b', 'c'], field_count='R'),
         'HF': VariantInfo(data=True, field_count='0'),
         'HU': VariantInfo(data=[0.1], field_count=None),
     }
     expected_first = Variant(reference_name='19', start=1, end=2,
                              reference_bases='A',
                              alternate_bases=['T', 'C'],
                              info=first_info)
     expected_first.calls.append(
         VariantCall(name='Sample1', genotype=[1, 0]))
     expected_first.calls.append(
         VariantCall(name='Sample2', genotype=[0, 1]))

     # Expected result for the second record (no HA / HF entries).
     second_info = {
         'HG': VariantInfo(data=[3, 4, 5], field_count='G'),
         'HR': VariantInfo(data=['d', 'e'], field_count='R'),
         'HU': VariantInfo(data=[1.1, 1.2], field_count=None),
     }
     expected_second = Variant(reference_name='19', start=123, end=124,
                               reference_bases='A',
                               alternate_bases=['T'],
                               info=second_info)
     expected_second.calls.append(
         VariantCall(name='Sample1', genotype=[0, 0]))
     expected_second.calls.append(
         VariantCall(name='Sample2', genotype=[0, 1]))

     all_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
     parsed = self._create_temp_file_and_read_records(all_lines)
     self.assertEqual(2, len(parsed))
     self._assert_variants_equal([expected_first, expected_second], parsed)
Esempio n. 3
0
    def test_use_of_representative_header_two_files(self):
        """Read two files using a representative header instead of their own.

        Both file headers declare INFO field `HU` as Float although the
        records hold string data. The representative header declares `HU` as
        String, and the test expects both files to be read successfully
        with it.
        """
        first_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1\r\n',
            '9     2       .       A       T       .       .       HU=a,b  GT 0/0'
        ]
        second_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample2\r\n',
            '19	2	.	A	T	.	.	HU=a,b	GT	0/1\n',
        ]
        representative = [
            '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        ]

        expected_first = Variant(
            reference_name='9', start=1, end=2, reference_bases='A',
            alternate_bases=['T'],
            info={'HU': VariantInfo(data=['a', 'b'], field_count=None)})
        expected_first.calls.append(
            VariantCall(name='Sample1', genotype=[0, 0]))

        expected_second = Variant(
            reference_name='19', start=1, end=2, reference_bases='A',
            alternate_bases=['T'],
            info={'HU': VariantInfo(data=['a', 'b'], field_count=None)})
        expected_second.calls.append(
            VariantCall(name='Sample2', genotype=[0, 1]))

        # Each file is read on its own, always with the same representative
        # header, and must yield exactly its one expected variant.
        cases = ((first_file, expected_first), (second_file, expected_second))
        for content, expected in cases:
            parsed = self._create_temp_file_and_read_records(
                content, representative)
            self.assertEqual(1, len(parsed))
            self._assert_variants_equal([expected], parsed)
Esempio n. 4
0
    def test_use_of_representative_header_two_files(self):
        """Read two files using a representative header instead of their own.

        Both file headers declare INFO field `HU` as Float although the
        records hold string data. The representative header declares `HU` as
        String, and the test expects both files to be read successfully
        with it.
        """
        first_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample1\r\n',
            '9\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/0'
        ]
        second_file = [
            '##INFO=<ID=HU,Number=.,Type=Float,Descri\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
            '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSample2\r\n',
            '19\t2\t.\tA\tT\t.\t.\tHU=a,b\tGT\t0/1\n',
        ]
        representative = [
            '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
            '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        ]

        expected_first = Variant(reference_name='9', start=1, end=2,
                                 reference_bases='A',
                                 alternate_bases=['T'],
                                 info={'HU': ['a', 'b']})
        expected_first.calls.append(
            VariantCall(sample_id=hash_name('Sample1'), genotype=[0, 0]))

        expected_second = Variant(reference_name='19', start=1, end=2,
                                  reference_bases='A',
                                  alternate_bases=['T'],
                                  info={'HU': ['a', 'b']})
        expected_second.calls.append(
            VariantCall(sample_id=hash_name('Sample2'), genotype=[0, 1]))

        # Each file is read on its own, always with the same representative
        # header, and must yield exactly its one expected variant.
        cases = ((first_file, expected_first), (second_file, expected_second))
        for content, expected in cases:
            parsed = self._create_temp_file_and_read_records(
                content, representative)
            self.assertEqual(1, len(parsed))
            self._assert_variants_equal([expected], parsed)
 def test_empty_sample_calls(self):
   """Encode a variant whose single call has genotype -1; expect '.' for it."""
   encoder = self._get_coder()
   subject = Variant()
   lone_call = VariantCall(name='Sample2', genotype=-1)
   subject.calls.append(lone_call)
   expected_line = '.	.	.	.	.	.	.	.	GT	.\n'
   encoded = encoder.encode(subject)
   self._assert_variant_lines_equal(encoded, expected_line)
    def test_triploid_genotype(self):
        """Encode a call with a three-allele genotype and expect '1/0/1'."""
        encoder = self._get_coder()
        subject = Variant()
        triploid_call = VariantCall(name='Sample', genotype=[1, 0, 1])
        subject.calls.append(triploid_call)
        expected_line = '.	.	.	.	.	.	.	.	GT	1/0/1\n'

        encoded = encoder.encode(subject)
        self._assert_variant_lines_equal(encoded, expected_line)
Esempio n. 7
0
  def test_format_numbers(self):
    """Read one record whose FORMAT fields use Number=., 1, 2, A and G.

    The parsed variant must equal a hand-built `Variant` whose two calls
    carry the per-sample FORMAT values in their `info` dicts.
    """
    headers = [
        '##FORMAT=<ID=FU,Number=.,Type=String,Description="Format_variable">\n',
        '##FORMAT=<ID=F1,Number=1,Type=Integer,Description="Format_1">\n',
        '##FORMAT=<ID=F2,Number=2,Type=Character,Description="Format_2">\n',
        '##FORMAT=<ID=AO,Number=A,Type=Integer,Description="Format_3">\n',
        '##FORMAT=<ID=AD,Number=G,Type=Integer,Description="Format_4">\n',
    ]
    records = [
        ('19	2	.	A	T,C	.	.	.	'
         'GT:FU:F1:F2:AO:AD	1/0:a1:3:a,b:1:3,4	'
         '0/1:a2,a3:4:b,c:1,2:3'),
    ]

    expected = Variant(
        reference_name='19', start=1, end=2, reference_bases='A',
        alternate_bases=['T', 'C'])
    sample1_info = {
        'FU': ['a1'], 'F1': 3, 'F2': ['a', 'b'], 'AO': [1], 'AD': [3, 4]}
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample1'), name='Sample1', genotype=[1, 0],
        info=sample1_info))
    sample2_info = {
        'FU': ['a2', 'a3'], 'F1': 4, 'F2': ['b', 'c'], 'AO': [1, 2],
        'AD': [3]}
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample2'), name='Sample2', genotype=[0, 1],
        info=sample2_info))

    all_lines = headers + _SAMPLE_HEADER_LINES[1:] + records
    parsed = self._create_temp_file_and_read_records(all_lines)
    self.assertEqual(1, len(parsed))
    self.assertEqual(expected, parsed[0])
Esempio n. 8
0
  def test_use_of_representative_header(self):
    """Compare reading with the file's own header vs. a representative one.

    The file header declares INFO field `HU` as Float while the record
    holds strings. With the file header, `HU` is expected to come back as
    `[None, None]`; with a representative header declaring `HU` as String
    the record must parse into the expected variant.
    """
    vcf_lines = [
        '##INFO=<ID=HU,Number=.,Type=Float,Description="Info">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
        '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	Sample1	Sample2\r\n',
        '19	2	.	A	T	.	.	HU=a,b	GT	0/0	0/1\n',
    ]
    representative = [
        '##INFO=<ID=HU,Number=.,Type=String,Description="Info">\n',
        '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">\r\n',
    ]

    expected = Variant(
        reference_name='19', start=1, end=2, reference_bases='A',
        alternate_bases=['T'], info={'HU': ['a', 'b']})
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample1'), name='Sample1', genotype=[0, 0]))
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample2'), name='Sample2', genotype=[0, 1]))

    # First pass: only the headers embedded in the file are used.
    parsed_with_file_header = self._create_temp_file_and_read_records(
        vcf_lines)
    # Pysam expects a Float for HU and yields Nones when given a string list.
    self.assertEqual([None, None], parsed_with_file_header[0].info['HU'])

    # Second pass: the representative header overrides the file's header.
    parsed_with_representative = self._create_temp_file_and_read_records(
        vcf_lines, representative)
    self.assertEqual(1, len(parsed_with_representative))
    self._assert_variants_equal([expected], parsed_with_representative)
Esempio n. 9
0
  def test_end_info_key_unknown_number_invalid(self):
    """Read END values that are a list, a float and text, with Number='.'.

    The list and float forms must both give a variant ending at 150; the
    textual form must raise `ValueError`.
    """
    end_header = (
        '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')
    header_lines = [end_header] + _SAMPLE_HEADER_LINES[1:]

    expected = Variant(
        reference_name='19', start=122, end=150, reference_bases='A',
        alternate_bases=['T'])
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample1'), name='Sample1', genotype=[1, 0]))
    expected.calls.append(VariantCall(
        sample_id=hash_name('Sample2'), name='Sample2', genotype=[0, 1]))

    accepted_records = (
        # PySam should only take the first END field.
        '19	123	.	A	T	.	.	END=150,160	GT	1/0	0/1\n',
        # END should be rounded down.
        '19	123	.	A	T	.	.	END=150.9	GT	1/0	0/1\n',
    )
    for record in accepted_records:
      parsed = self._create_temp_file_and_read_records(
          header_lines + [record])
      self.assertEqual(1, len(parsed))
      self._assert_variants_equal([expected], parsed)

    # END should not be a string.
    text_end_record = '19	123	.	A	T	.	.	END=text	GT	1/0	0/1\n'
    with self.assertRaises(ValueError):
      self._create_temp_file_and_read_records(
          header_lines + [text_end_record])
  def test_missing_genotype(self):
    """Encode a genotype containing the missing-value marker; expect '1/.'."""
    encoder = self._get_coder()
    subject = Variant()
    partial_call = VariantCall(
        name='Sample', genotype=[1, vcfio.MISSING_GENOTYPE_VALUE])
    subject.calls.append(partial_call)
    expected_line = '.	.	.	.	.	.	.	.	GT	1/.\n'

    encoded = encoder.encode(subject)
    self._assert_variant_lines_equal(encoded, expected_line)
  def test_info_list(self):
    """Encode a call whose info list contains None; expect '1,.,3'."""
    encoder = self._get_coder()
    subject = Variant()
    call_with_list = VariantCall(
        name='Sample', genotype=[0, 1], info={'LI': [1, None, 3]})
    subject.calls.append(call_with_list)
    expected_line = '.	.	.	.	.	.	.	.	GT:LI	0/1:1,.,3\n'

    encoded = encoder.encode(subject)
    self._assert_variant_lines_equal(encoded, expected_line)
 def test_end_info_key(self):
   """Read one record with END=1111 and one without an END value.

   The first expected variant ends at 1111, the second at 123.
   """
   end_header = (
       '##INFO=<ID=END,Number=1,Type=Integer,Description="End of record.">\n')
   records = ['19	123	.	A	.	.	.	END=1111	GT	1/0	0/1\n',
              '19	123	.	A	.	.	.	.	GT	0/1	1/1\n']

   with_end = Variant(
       reference_name='19', start=122, end=1111, reference_bases='A')
   with_end.calls.append(VariantCall(name='Sample1', genotype=[1, 0]))
   with_end.calls.append(VariantCall(name='Sample2', genotype=[0, 1]))

   without_end = Variant(
       reference_name='19', start=122, end=123, reference_bases='A')
   without_end.calls.append(VariantCall(name='Sample1', genotype=[0, 1]))
   without_end.calls.append(VariantCall(name='Sample2', genotype=[1, 1]))

   all_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(all_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([with_end, without_end], parsed)
Esempio n. 13
0
    def test_variant_equality(self):
        """Compare a variant with an identical copy and two differing ones.

        One differing variant has another genotype in its call, the other
        is built without any `calls` argument.
        """

        def build(**extra):
            # Fresh list/dict literals on every call, so the variants built
            # here never share mutable state with one another.
            return Variant(reference_name='a',
                           start=20,
                           end=22,
                           reference_bases='a',
                           alternate_bases=['g', 't'],
                           names=['variant'],
                           quality=9,
                           filters=['q10'],
                           info={'key': 'value'},
                           **extra)

        reference = build(calls=[VariantCall(genotype=[0, 0])])
        identical = build(calls=[VariantCall(genotype=[0, 0])])
        other_genotype = build(calls=[VariantCall(genotype=[1, 0])])
        without_calls = build()

        self.assertEqual(reference, identical)
        self.assertNotEqual(reference, other_genotype)
        self.assertNotEqual(reference, without_calls)
Esempio n. 14
0
 def test_no_samples(self):
   """Read a record from a file whose column header lists no samples."""
   column_header = '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO\n'
   record = '19	123	.	G	A	.	PASS	AF=0.2'
   expected = Variant(
       reference_name='19', start=122, end=123, reference_bases='G',
       alternate_bases=['A'], filters=['PASS'], info={'AF': [0.2]})

   # Replace the last shared header line with the sample-free header.
   all_lines = _SAMPLE_HEADER_LINES[:-1] + [column_header, record]
   parsed = self._create_temp_file_and_read_records(all_lines)
   self.assertEqual(1, len(parsed))
   self.assertEqual(expected, parsed[0])
 def test_no_info(self):
   """Read a record where every field except CHROM, POS and FORMAT is '.'."""
   record = 'chr19	123	.	.	.	.	.	.	GT	.	.'
   expected = Variant(reference_name='chr19', start=122, end=123)
   # Both samples are expected to carry a single missing genotype value.
   for sample in ('Sample1', 'Sample2'):
     expected.calls.append(
         VariantCall(name=sample, genotype=[vcfio.MISSING_GENOTYPE_VALUE]))

   parsed = self._create_temp_file_and_read_records(
       _SAMPLE_HEADER_LINES + [record])
   self.assertEqual(1, len(parsed))
   self.assertEqual(expected, parsed[0])
  def test_missing_info_key(self):
    """Encode two calls where only the first has 'GQ'; expect '.' for the gap."""
    encoder = self._get_coder()
    subject = Variant()
    full_call = VariantCall(
        name='Sample1', genotype=[0, 1], info={'GQ': 10, 'AF': 20})
    subject.calls.append(full_call)
    partial_call = VariantCall(
        name='Sample2', genotype=[0, 1], info={'AF': 20})
    subject.calls.append(partial_call)
    expected_line = ('.	.	.	.	.	.	.	.	GT:AF:GQ	0/1:20:10	'
                     '0/1:20:.\n')

    encoded = encoder.encode(subject)
    self._assert_variant_lines_equal(encoded, expected_line)
    def get_merged_variants(self, variants, unused_key=None):
        # type: (List[Variant], str) -> List[Variant]
        """Merges all of `variants` into a single variant.

        The merged variant takes its reference name, start, end and bases
        from the first variant. Names and filters of all variants are
        combined, deduplicated and sorted; calls are concatenated in input
        order; quality is the largest non-missing quality seen (or stays
        unset when none of the variants has one).

        Args:
          variants: The variants to merge. All are expected to share the
            same `reference_bases` and `alternate_bases`.
          unused_key: Ignored by this method.

        Returns:
          A one-element list holding the merged variant, or an empty list
          when `variants` is empty.

        Raises:
          AssertionError: If a variant's reference or alternate bases differ
            from those of the first variant.
        """
        if not variants:
            return []
        merged_variant = None
        for variant in variants:
            # Compare against None explicitly so the check does not depend
            # on how (or whether) `Variant` defines truthiness.
            if merged_variant is None:
                merged_variant = Variant(
                    reference_name=variant.reference_name,
                    start=variant.start,
                    end=variant.end,
                    reference_bases=variant.reference_bases,
                    alternate_bases=variant.alternate_bases)
            # Since we use hash function in generating the merge key, there is
            # a chance (extremely low though) to have variants with different
            # `reference_bases` or `alternate_base` here due to a collision in
            # the hash function.
            assert variant.reference_bases == merged_variant.reference_bases, (
                'Cannot merge variants with different reference bases. {} vs {}'
                .format(variant.reference_bases,
                        merged_variant.reference_bases))
            assert variant.alternate_bases == merged_variant.alternate_bases, (
                'Cannot merge variants with different alternate bases. {} vs {}'
                .format(variant.alternate_bases,
                        merged_variant.alternate_bases))

            merged_variant.names.extend(variant.names)
            merged_variant.filters.extend(variant.filters)

            # `max(None, x)` raises TypeError on Python 3, so a missing
            # quality on either side is handled explicitly. The outcome
            # matches the old Python 2 behaviour, where None sorted below
            # every number.
            if variant.quality is not None:
                if merged_variant.quality is None:
                    merged_variant.quality = variant.quality
                else:
                    merged_variant.quality = max(merged_variant.quality,
                                                 variant.quality)

            # NOTE(review): these helpers are defined outside this block;
            # presumably they relocate per-variant fields before the calls
            # are copied over -- confirm against their definitions.
            self.move_data_to_calls(variant)
            self.move_data_to_merged(variant, merged_variant)

            merged_variant.calls.extend(variant.calls)

        # Deduplicate names and filters.
        merged_variant.names = sorted(set(merged_variant.names))
        merged_variant.filters = sorted(set(merged_variant.filters))
        return [merged_variant]
 def test_custom_phaseset(self):
   """Read records with a PS FORMAT field and check each call's phaseset.

   The call whose PS value is '.' is expected to be built without a
   `phaseset` argument.
   """
   phaseset_header = (
       '##FORMAT=<ID=PS,Number=1,Type=Integer,Description="Phaseset">\n')
   records = ['19	123	.	A	T	.	.	.	GT:PS	1|0:1111	0/1:.\n',
              '19	121	.	A	T	.	.	.	GT:PS	1|0:2222	0/1:2222\n']

   expected_first = Variant(
       reference_name='19', start=122, end=123, reference_bases='A',
       alternate_bases=['T'])
   expected_first.calls.append(
       VariantCall(name='Sample1', genotype=[1, 0], phaseset='1111'))
   expected_first.calls.append(
       VariantCall(name='Sample2', genotype=[0, 1]))

   expected_second = Variant(
       reference_name='19', start=120, end=121, reference_bases='A',
       alternate_bases=['T'])
   expected_second.calls.append(
       VariantCall(name='Sample1', genotype=[1, 0], phaseset='2222'))
   expected_second.calls.append(
       VariantCall(name='Sample2', genotype=[0, 1], phaseset='2222'))

   all_lines = [phaseset_header] + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(all_lines)
   self.assertEqual(2, len(parsed))
   self._assert_variants_equal([expected_first, expected_second], parsed)
Esempio n. 19
0
 def test_end_info_key_unknown_number(self):
   """Read END=1111 when the END header declares Number='.'."""
   end_header = (
       '##INFO=<ID=END,Number=.,Type=Integer,Description="End of record.">\n')
   records = ['19	123	.	A	T	.	.	END=1111	GT	1/0	0/1\n']

   expected = Variant(
       reference_name='19', start=122, end=1111, reference_bases='A',
       alternate_bases=['T'])
   expected.calls.append(VariantCall(
       sample_id=hash_name('Sample1'), name='Sample1', genotype=[1, 0]))
   expected.calls.append(VariantCall(
       sample_id=hash_name('Sample2'), name='Sample2', genotype=[0, 1]))

   all_lines = [end_header] + _SAMPLE_HEADER_LINES[1:] + records
   parsed = self._create_temp_file_and_read_records(all_lines)
   self.assertEqual(1, len(parsed))
   self._assert_variants_equal([expected], parsed)
  def test_info_field_count(self):
    """Encode a variant with scalar, list, flag and multi-string info values."""
    encoder = self._get_coder()
    subject = Variant()
    # Assigned one by one, in this order, exactly as the expected line lists
    # them.
    info_entries = (
        ('NS', 3),
        ('AF', [0.333, 0.667]),
        ('DB', True),
        ('CSQ', ['G|upstream_gene_variant||MODIFIER', 'T|||MODIFIER']),
    )
    for key, value in info_entries:
      subject.info[key] = value
    expected_line = ('.	.	.	.	.	.	.	NS=3;AF=0.333,0.667;DB;'
                     'CSQ=G|upstream_gene_variant||MODIFIER,T|||MODIFIER	.\n')

    encoded = encoder.encode(subject)
    self._assert_variant_lines_equal(encoded, expected_line)