Python ReferenceRegion Exemples, deepvariant.labeler.haplotype_labeler.ReferenceRegion Python Exemples

Exemple #1

0

Afficher le fichier

 def test_single_variants(self, candidate_alleles, truth_alleles,
                          truth_genotype, expected_genotype):
   # Labels one candidate against one truth variant, both at position 42, and
   # checks the genotype assigned to the candidate.
   position = 42
   called_variant = _test_variant(position, candidate_alleles)
   truth_variant = _test_variant(position, truth_alleles, truth_genotype)
   # The reference is the shorter of the two ref alleles (the candidate's wins
   # a tie), flanked by one placeholder base on each side.
   shortest_ref = min(candidate_alleles[0], truth_alleles[0], key=len)
   region = haplotype_labeler.ReferenceRegion('x' + shortest_ref + 'y', 41)
   self.assertGetsCorrectLabels(
       candidates=[called_variant],
       true_variants=[truth_variant],
       ref=region,
       expected_genotypes=[expected_genotype])

Exemple #2

0

Afficher le fichier

 def test_multi_allelic(self, candidate_alleles, truth_alleles,
                        truth_genotypes_and_expected):
   # For every (truth genotype -> expected label) pair, labels a single
   # candidate at position 42 against a truth variant at the same position
   # carrying that genotype, and checks the label the candidate receives.
   candidate = _test_variant(42, candidate_alleles)
   # The shorter of the two ref alleles does not depend on the genotype, so it
   # is computed once outside the loop.
   ref_allele = min(candidate_alleles[0], truth_alleles[0], key=len)
   # dict.items() exists on both Python 2 and 3; dict.iteritems() is Python 2
   # only and raises AttributeError on Python 3.
   for true_gt, expected_gt in truth_genotypes_and_expected.items():
     truth = _test_variant(42, truth_alleles, true_gt)
     self.assertGetsCorrectLabels(
         candidates=[candidate],
         true_variants=[truth],
         ref=haplotype_labeler.ReferenceRegion('x' + ref_allele + 'y', 41),
         expected_genotypes=[expected_gt])

Exemple #3

0

Afficher le fichier

 def test_build_all_haplotypes_next_pos_is_correct(self, ref, alt):
   # Verifies that build_all_haplotypes reports a next_pos of
   # pos + len(ref) for each of the genotypes below.
   pos = 10
   expected_next_pos = pos + len(ref)
   for gt in [(0, 0), (0, 1), (1, 1)]:
     variant_and_gt = haplotype_labeler.VariantAndGenotypes(
         _test_variant(pos, [ref, alt]), gt)
     region = haplotype_labeler.ReferenceRegion(ref, pos)
     _, next_pos = haplotype_labeler.build_all_haplotypes(
         [variant_and_gt], last_pos=pos, ref=region)
     self.assertEqual(next_pos, expected_next_pos)

Exemple #4

0

Afficher le fichier

 def test_example7(self):
   # Two candidates (a SNP and a multi-allelic site) against one heterozygous
   # truth SNP; expected labels are listed in candidate order.
   candidate_variants = [
       _test_variant(279768, ['G', 'C']),
       _test_variant(279773, ['ATA', 'C', 'CTA']),
   ]
   truth_variants = [
       _test_variant(279773, ['A', 'C'], [0, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('CGCCCCATACCTTTT', 279767)
   expected = [
       [0, 0],
       [0, 2],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #5

0

Afficher le fichier

 def test_example6(self):
   # Two candidate deletions at adjacent positions; only the first one is in
   # the truth set, so the second is expected to be labeled [0, 0].
   candidate_variants = [
       _test_variant(2525696, ['AAT', 'A']),
       _test_variant(2525697, ['AT', 'T']),
   ]
   truth_variants = [
       _test_variant(2525696, ['AAT', 'A'], [0, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('xAATT', 2525695)
   expected = [
       [0, 1],
       [0, 0],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #6

0

Afficher le fichier

 def test_example2(self):
   # Two candidates against a single homozygous truth variant; both
   # candidates are expected to be labeled [1, 1].
   candidate_variants = [
       _test_variant(4030067, ['TC', 'T']),
       _test_variant(4030072, ['C', 'G']),
   ]
   truth_variants = [
       _test_variant(4030071, ['CC', 'G'], [1, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('xTCCCCCA', 4030066)
   expected = [
       [1, 1],
       [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #7

0

Afficher le fichier

 def test_example5(self):
   # Two candidates against two homozygous truth variants at different
   # positions; both candidates are expected to be labeled [1, 1].
   candidate_variants = [
       _test_variant(2401510, ['ATGT', 'A']),
       _test_variant(2401515, ['C', 'T']),
   ]
   truth_variants = [
       _test_variant(2401511, ['TG', 'A'], [1, 1]),
       _test_variant(2401513, ['TAC', 'T'], [1, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('xATGTACACAG', 2401509)
   expected = [
       [1, 1],
       [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #8

0

Afficher le fichier

 def test_example1(self):
   # Two candidates against three homozygous truth variants; both candidates
   # are expected to be labeled [1, 1].
   candidate_variants = [
       _test_variant(3528531, ['ATAG', 'A']),
       _test_variant(3528537, ['A', 'ATT']),
   ]
   truth_variants = [
       _test_variant(3528533, ['A', 'T'], [1, 1]),
       _test_variant(3528534, ['G', 'A'], [1, 1]),
       _test_variant(3528536, ['TA', 'T'], [1, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('xATAGTTATC', 3528530)
   expected = [
       [1, 1],
       [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #9

0

Afficher le fichier

 def test_bad_scoring_bug(self):
   # Candidates identical to the truth variants (a deletion and a
   # multi-allelic site); each candidate is expected to receive the truth
   # genotype of its matching variant.
   candidate_variants = [
       _test_variant(9508943, ['GGT', 'G']),
       _test_variant(9508967, ['T', 'C', 'TGC']),
   ]
   truth_variants = [
       _test_variant(9508943, ['GGT', 'G'], [0, 1]),
       _test_variant(9508967, ['T', 'C', 'TGC'], [1, 2]),
   ]
   region = haplotype_labeler.ReferenceRegion(
       'GGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTTTGTGTTG', 9508942)
   expected = [
       [0, 1],
       [1, 2],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #10

0

Afficher le fichier

 def test_false_variants_get_homref_genotype(self):
   # Mixes two true variants with every non-empty combination of extra
   # candidates whose genotype is [0, 0], and checks that each candidate is
   # labeled with the genotype it was constructed with.
   ref = haplotype_labeler.ReferenceRegion('xACGTAy', 10)
   het_snp = _test_variant(11, ['A', 'T'], [0, 1])
   hom_insertion = _test_variant(13, ['G', 'GG'], [1, 1])
   false_positives = [
       _test_variant(12, ['C', 'G'], [0, 0]),
       _test_variant(14, ['T', 'A'], [0, 0]),
       _test_variant(15, ['A', 'AA'], [0, 0]),
   ]
   for group_size in range(1, len(false_positives) + 1):
     for extras in itertools.combinations(false_positives, group_size):
       called = variant_utils.sorted_variants(
           [het_snp, hom_insertion] + list(extras))
       self.assertGetsCorrectLabels(
           candidates=called,
           true_variants=[het_snp, hom_insertion],
           ref=ref,
           expected_genotypes=haplotype_labeler._variant_genotypes(called))

Exemple #11

0

Afficher le fichier

 def test_exome_variants_multiple_equivalent_representations(self):
   # Three candidates against a single homozygous truth SNP that is identical
   # to the third candidate.
   candidate_variants = [
       _test_variant(214012390, ['G', 'GAC']),
       _test_variant(214012402, ['CAA', 'C']),
       _test_variant(214012404, ['A', 'C']),
   ]
   truth_variants = [
       _test_variant(214012404, ['A', 'C'], [1, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('AGACACACACACACAAAAAAAAATCAT',
                                              214012389)
   expected = [
       [0, 1],
       [0, 1],
       [0, 1],
       # This configuration makes the most sense but we cannot choose it
       # if we want to minimize the number of FNs, FPs, and then TPs.
       # [0, 0],
       # [0, 0],
       # [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #12

0

Afficher le fichier

 def test_example3(self):
   # Four candidates against five homozygous truth variants; every candidate
   # is expected to be labeled [1, 1].
   candidate_variants = [
       _test_variant(4568151, ['AC', 'A']),
       _test_variant(4568154, ['TG', 'T']),
       _test_variant(4568156, ['G', 'T']),
       _test_variant(4568157, ['A', 'ATACCCTTT']),
   ]
   truth_variants = [
       _test_variant(4568152, ['C', 'A'], [1, 1]),
       _test_variant(4568153, ['A', 'T'], [1, 1]),
       _test_variant(4568155, ['G', 'A'], [1, 1]),
       _test_variant(4568156, ['G', 'T'], [1, 1]),
       _test_variant(4568157, ['A', 'ACCCTTT'], [1, 1]),
   ]
   region = haplotype_labeler.ReferenceRegion('xACATGGATGGA', 4568150)
   expected = [
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #13

0

Afficher le fichier

 def test_exome_complex_example(self):
   # Five bi-allelic candidates against two multi-allelic truth variants with
   # genotype [1, 2]; expected labels are listed in candidate order.
   candidate_variants = [
       _test_variant(167012240, ['GT', 'G']),
       _test_variant(167012246, ['TTTT', 'A']),
       _test_variant(167012247, ['T', 'A']),
       _test_variant(167012248, ['T', 'A']),
       _test_variant(167012249, ['T', 'A']),
   ]
   truth_variants = [
       _test_variant(167012240, ['GTTT', 'G', 'GTT'], [1, 2]),
       _test_variant(167012249, ['T', 'A', 'TAA'], [1, 2]),
   ]
   region = haplotype_labeler.ReferenceRegion('TGTTTTTTTTTAAAAAAATTATTTCTTCTTT',
                                              167012239)
   expected = [
       [1, 1],
       [0, 0],
       [0, 0],
       [0, 1],
       [1, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #14

0

Afficher le fichier

 def test_example4(self):
   # CTGTAAACAGAA [phased alts] + CGTGAATGAAA [phased ref]
   #
   # Four candidates against five truth variants that all carry genotype
   # [1, 0]; every candidate is expected to be labeled [0, 1].
   candidate_variants = [
       _test_variant(1689633, ['C', 'CT']),
       _test_variant(1689635, ['TG', 'T']),
       _test_variant(1689638, ['ATG', 'A']),
       _test_variant(1689641, ['A', 'ACAG']),
   ]
   truth_variants = [
       _test_variant(1689633, ['C', 'CT'], [1, 0]),
       _test_variant(1689636, ['G', 'A'], [1, 0]),
       _test_variant(1689639, ['T', 'C'], [1, 0]),
       _test_variant(1689640, ['G', 'A'], [1, 0]),
       _test_variant(1689641, ['A', 'G'], [1, 0]),
   ]
   region = haplotype_labeler.ReferenceRegion('xCGTGAATGAAA', 1689632)
   expected = [
       [0, 1],
       [0, 1],
       [0, 1],
       [0, 1],
   ]
   self.assertGetsCorrectLabels(
       candidates=candidate_variants,
       true_variants=truth_variants,
       ref=region,
       expected_genotypes=expected)

Exemple #15

0

Afficher le fichier

class LabelExamplesTest(parameterized.TestCase):
  # Many of these tests are cases from our labeler analysis doc:
  # https://docs.google.com/document/d/1V89IIT0YM3P0gH_tQb-ahodf8Jvnz0alXEnjCf6JVNo

  def assertGetsCorrectLabels(self,
                              candidates,
                              true_variants,
                              ref,
                              expected_genotypes,
                              start=None,
                              end=None):
    # Labels `candidates` against `true_variants` over `ref` with
    # haplotype_labeler.label_variants and asserts that the genotypes of the
    # labeled variants equal `expected_genotypes`.
    #
    # Args:
    #   candidates: candidate variants, forwarded to label_variants.
    #   true_variants: truth variants, forwarded to label_variants.
    #   ref: reference region, forwarded to label_variants.
    #   expected_genotypes: one genotype per labeled variant, in order; each
    #     entry is converted to a tuple before comparison, so lists and tuples
    #     are both accepted.
    #   start: unused; kept so existing callers passing it keep working.
    #   end: unused; kept so existing callers passing it keep working.
    #
    # The previous implementation defaulted start/end from ref.start/ref.end
    # but never read the results, so those dead assignments were removed.
    del start, end  # Unused.

    labeled_variants = haplotype_labeler.label_variants(candidates,
                                                        true_variants, ref)
    self.assertIsNotNone(labeled_variants)

    # Check that the genotypes of our labeled variants are the ones we expect.
    actual_genotypes = haplotype_labeler._variant_genotypes(labeled_variants)
    self.assertEqual(actual_genotypes,
                     [tuple(x) for x in expected_genotypes])

  @parameterized.parameters(
      dict(genotype=[0, 0], expected={(0, 0)}),
      dict(genotype=[0, 1], expected={(0, 0), (0, 1)}),
      dict(genotype=[1, 1], expected={(0, 0), (0, 1), (1, 1)}),
      dict(genotype=[0, 2], expected={(0, 0), (0, 2)}),
      dict(genotype=[2, 2], expected={(0, 0), (0, 2), (2, 2)}),
      dict(genotype=[1, 2], expected={(0, 0), (0, 2), (0, 1), (1, 2)}),
  )
  def test_with_false_negative_genotypes(self, genotype, expected):
    # Checks the set of genotypes returned by with_false_negative_genotypes
    # for each input genotype listed above.
    actual = haplotype_labeler.with_false_negative_genotypes(genotype)
    self.assertEqual(actual, expected)

  @parameterized.parameters(
      # All possible genotypes for a simple tri-allelic case.
      # dict.items() is used (not the Python 2-only dict.iteritems()) so that
      # this decorator also evaluates on Python 3.
      (
          dict(
              variants=[
                  _test_variant(11, ['TG', 'A', 'TGC'], gt),
              ],
              ref=haplotype_labeler.ReferenceRegion('TG', 11),
              expected_frags=expected,
              expected_next_pos=13) for gt, expected in {
                  # Simple bi-allelic configurations:
                  (0, 0): {
                      (0,): 'TG'
                  },
                  (0, 1): {
                      (0,): 'TG',
                      (1,): 'A'
                  },
                  (1, 0): {
                      (0,): 'TG',
                      (1,): 'A'
                  },
                  (1, 1): {
                      (1,): 'A'
                  },
                  # Multi-allelic configurations:
                  (0, 2): {
                      (0,): 'TG',
                      (2,): 'TGC'
                  },
                  (1, 2): {
                      (1,): 'A',
                      (2,): 'TGC'
                  },
                  (2, 2): {
                      (2,): 'TGC'
                  },
              }.items()),)
  def test_build_all_haplotypes_single_variant(
      self, variants, ref, expected_frags, expected_next_pos):
    # Builds all haplotypes for a single variant, using the genotype stored on
    # the variant's first call, and checks both the haplotype fragments and
    # the reported next position.
    variants_and_genotypes = [
        haplotype_labeler.VariantAndGenotypes(v, tuple(v.calls[0].genotype))
        for v in variants
    ]
    frags, next_pos = haplotype_labeler.build_all_haplotypes(
        variants_and_genotypes, ref.start, ref)
    self.assertEqual(frags, expected_frags)
    self.assertEqual(next_pos, expected_next_pos)

  @parameterized.parameters(
      ('G', 'A'),
      ('GG', 'A'),
      ('GGG', 'A'),
      ('GGGG', 'A'),
      ('A', 'G'),
      ('A', 'GG'),
      ('A', 'GGG'),
      ('A', 'GGGG'),
  )
  def test_build_all_haplotypes_next_pos_is_correct(self, ref, alt):
    # Verifies that build_all_haplotypes reports a next_pos of
    # pos + len(ref) for each of the genotypes below.
    pos = 10
    expected_next_pos = pos + len(ref)
    for gt in [(0, 0), (0, 1), (1, 1)]:
      variant_and_gt = haplotype_labeler.VariantAndGenotypes(
          _test_variant(pos, [ref, alt]), gt)
      region = haplotype_labeler.ReferenceRegion(ref, pos)
      _, next_pos = haplotype_labeler.build_all_haplotypes(
          [variant_and_gt], last_pos=pos, ref=region)
      self.assertEqual(next_pos, expected_next_pos)

  @parameterized.parameters(
      # A single deletion overlapping a SNP:
      # ref: xTG
      # v1:   A-
      # v2:    C
      dict(
          variants=[
              _test_variant(11, ['TG', 'A'], (0, 1)),
              _test_variant(12, ['G', 'C'], (0, 1)),
          ],
          ref=haplotype_labeler.ReferenceRegion('xTG', 10),
          expected_frags={
              (0, 0): 'xTG',  # haplotype 0|0.
              (0, 1): 'xTC',  # haplotype 0|1.
              (1, 0): 'xA',  # haplotype 1|0.
              (1, 1): None,  # haplotype 1|1 => invalid.
          },
          expected_next_pos=13),
      # Deletion overlapping two downstream events (SNP and insertion):
      # ref: xTGC
      # v1:   A--
      # v2:    C
      # v3:     TTT
      dict(
          variants=[
              _test_variant(11, ['TGC', 'A'], (0, 1)),
              _test_variant(12, ['G', 'C'], (0, 1)),
              _test_variant(13, ['C', 'TTT'], (0, 1)),
          ],
          ref=haplotype_labeler.ReferenceRegion('xTGC', 10),
          expected_frags={
              (0, 0, 0): 'xTGC',  # haplotype 0|0|0.
              (0, 0, 1): 'xTGTTT',  # haplotype 0|0|1.
              (0, 1, 0): 'xTCC',  # haplotype 0|1|0.
              (0, 1, 1): 'xTCTTT',  # haplotype 0|1|1.
              (1, 0, 0): 'xA',  # haplotype 1|0|0.
              (1, 0, 1): None,  # haplotype 1|0|1 => invalid.
              (1, 1, 0): None,  # haplotype 1|1|0 => invalid.
              (1, 1, 1): None,  # haplotype 1|1|1 => invalid.
          },
          expected_next_pos=14),
      # Two incompatible deletions to check that the end extension is working:
      # pos: 01234
      # ref: xTGCA
      # v1:   T-
      # v2:    G-
      dict(
          variants=[
              _test_variant(11, ['TG', 'T'], (0, 1)),
              _test_variant(12, ['GC', 'G'], (0, 1)),
          ],
          ref=haplotype_labeler.ReferenceRegion('xTGCA', 10),
          expected_frags={
              (0, 0): 'xTGC',  # haplotype 0|0.
              (0, 1): 'xTG',  # haplotype 0|1.
              (1, 0): 'xTC',  # haplotype 1|0.
              (1, 1): None,  # haplotype 1|1 => invalid.
          },
          expected_next_pos=14),
      # Multiple overlapping deletions with complex incompatibilities:
      # ref: xTGCGA
      # v1:   A--
      # v2:    G---  [conflicts with v1]
      # v3:     C-   [conflicts with v1 and v2]
      # v4:      G-  [conflicts with v2 and v3, ok with v1]
      # v5:       C  [conflicts with v2 and v4, ok with v1, v3]
      dict(
          variants=[
              _test_variant(11, ['TGC', 'A'], (0, 1)),
              _test_variant(12, ['GCGA', 'G'], (0, 1)),
              _test_variant(13, ['CG', 'C'], (0, 1)),
              _test_variant(14, ['GA', 'G'], (0, 1)),
              _test_variant(15, ['A', 'C'], (0, 1)),
          ],
          ref=haplotype_labeler.ReferenceRegion('xTGCGA', 10),
          expected_frags={
              (0, 0, 0, 0, 0): 'xTGCGA',  # haplotype 0|0|0|0|0.
              (0, 0, 0, 0, 1): 'xTGCGC',  # haplotype 0|0|0|0|1.
              (0, 0, 0, 1, 0): 'xTGCG',  # haplotype 0|0|0|1|0.
              (0, 0, 0, 1, 1): None,  # haplotype 0|0|0|1|1.
              (0, 0, 1, 0, 0): 'xTGCA',  # haplotype 0|0|1|0|0.
              (0, 0, 1, 0, 1): 'xTGCC',  # haplotype 0|0|1|0|1.
              (0, 0, 1, 1, 0): None,  # haplotype 0|0|1|1|0.
              (0, 0, 1, 1, 1): None,  # haplotype 0|0|1|1|1.
              (0, 1, 0, 0, 0): 'xTG',  # haplotype 0|1|0|0|0.
              (0, 1, 0, 0, 1): None,  # haplotype 0|1|0|0|1.
              (0, 1, 0, 1, 0): None,  # haplotype 0|1|0|1|0.
              (0, 1, 0, 1, 1): None,  # haplotype 0|1|0|1|1.
              (0, 1, 1, 0, 0): None,  # haplotype 0|1|1|0|0.
              (0, 1, 1, 0, 1): None,  # haplotype 0|1|1|0|1.
              (0, 1, 1, 1, 0): None,  # haplotype 0|1|1|1|0.
              (0, 1, 1, 1, 1): None,  # haplotype 0|1|1|1|1.
              (1, 0, 0, 0, 0): 'xAGA',  # haplotype 1|0|0|0|0.
              (1, 0, 0, 0, 1): 'xAGC',  # haplotype 1|0|0|0|1.
              (1, 0, 0, 1, 0): 'xAG',  # haplotype 1|0|0|1|0.
              (1, 0, 0, 1, 1): None,  # haplotype 1|0|0|1|1.
              (1, 0, 1, 0, 0): None,  # haplotype 1|0|1|0|0.
              (1, 0, 1, 0, 1): None,  # haplotype 1|0|1|0|1.
              (1, 0, 1, 1, 0): None,  # haplotype 1|0|1|1|0.
              (1, 0, 1, 1, 1): None,  # haplotype 1|0|1|1|1.
              (1, 1, 0, 0, 0): None,  # haplotype 1|1|0|0|0.
              (1, 1, 0, 0, 1): None,  # haplotype 1|1|0|0|1.
              (1, 1, 0, 1, 0): None,  # haplotype 1|1|0|1|0.
              (1, 1, 0, 1, 1): None,  # haplotype 1|1|0|1|1.
              (1, 1, 1, 0, 0): None,  # haplotype 1|1|1|0|0.
              (1, 1, 1, 0, 1): None,  # haplotype 1|1|1|0|1.
              (1, 1, 1, 1, 0): None,  # haplotype 1|1|1|1|0.
              (1, 1, 1, 1, 1): None,  # haplotype 1|1|1|1|1.
          },
          expected_next_pos=16),
  )
  def test_build_all_haplotypes_overlapping(
      self, variants, ref, expected_frags, expected_next_pos):
    # Builds all haplotypes for a group of overlapping variants, using the
    # genotype stored on each variant's first call, and checks the resulting
    # fragments and next position.
    # redacted
    variants_and_genotypes = [
        haplotype_labeler.VariantAndGenotypes(v, tuple(v.calls[0].genotype))
        for v in variants
    ]
    frags, next_pos = haplotype_labeler.build_all_haplotypes(
        variants_and_genotypes, ref.start, ref)
    # Entries whose expected value is None mark invalid haplotypes; they are
    # dropped here because the comparison is against only the valid ones.
    # dict.items() replaces the Python 2-only dict.iteritems().
    valid_expected_frags = {
        k: v for k, v in expected_frags.items() if v is not None
    }
    self.assertEqual(frags, valid_expected_frags)
    self.assertEqual(next_pos, expected_next_pos)

  @parameterized.parameters(
      # Check that simple bi-allelic matching works for all possible
      # genotypes and a variety of types of alleles.
      (dict(
          candidate_alleles=alleles,
          truth_alleles=alleles,
          truth_genotype=gt,
          # Returns [0, 1] even if truth is [1, 0], so sort the genotypes for
          # the expected value.
          expected_genotype=sorted(gt),
      )
       for gt in [[0, 1], [1, 0], [1, 1]]
       for alleles in [['A', 'C'], ['ACC', 'A'], ['A', 'ATG'], ['AC', 'GT']]),
  )
  def test_single_variants(self, candidate_alleles, truth_alleles,
                           truth_genotype, expected_genotype):
    # Labels one candidate against one truth variant, both at position 42, and
    # checks the genotype assigned to the candidate.
    position = 42
    called_variant = _test_variant(position, candidate_alleles)
    truth_variant = _test_variant(position, truth_alleles, truth_genotype)
    # The reference is the shorter of the two ref alleles (the candidate's wins
    # a tie), flanked by one placeholder base on each side.
    shortest_ref = min(candidate_alleles[0], truth_alleles[0], key=len)
    region = haplotype_labeler.ReferenceRegion('x' + shortest_ref + 'y', 41)
    self.assertGetsCorrectLabels(
        candidates=[called_variant],
        true_variants=[truth_variant],
        ref=region,
        expected_genotypes=[expected_genotype])

  @parameterized.parameters(
      dict(
          candidate_alleles=['A', 'C'],
          truth_alleles=['A', 'G', 'C'],
          truth_genotypes_and_expected={
              (0, 2): (0, 1),  # A/C => 0/C
              (1, 2): (0, 1),  # G/C => 0/C
              (1, 1): (0, 0),  # G/G => 0/0
              (2, 2): (1, 1),  # C/C => C/C
          }
      ),
      dict(
          candidate_alleles=['A', 'C'],
          truth_alleles=['A', 'C', 'G'],
          truth_genotypes_and_expected={
              (0, 1): (0, 1),  # A/C => 0/C
              (2, 1): (0, 1),  # G/C => 0/C
              (2, 2): (0, 0),  # G/G => 0/0
              (1, 1): (1, 1),  # C/C => C/C
          },
      ),
      dict(
          candidate_alleles=['A', 'TT', 'TTT'],
          truth_alleles=['A', 'C', 'G'],
          truth_genotypes_and_expected={
              (i, j): (0, 0) for i, j in itertools.combinations([0, 1, 2], 2)
          }
      ),
      # Here the candidate is also multi-allelic
      dict(
          candidate_alleles=['A', 'G', 'C'],
          truth_alleles=['A', 'C', 'G'],
          truth_genotypes_and_expected={
              (0, 1): (0, 2),
              (0, 2): (0, 1),
              (1, 1): (2, 2),
              (1, 2): (1, 2),
              (2, 2): (1, 1),
          },
      ),
  )
  def test_multi_allelic(self, candidate_alleles, truth_alleles,
                         truth_genotypes_and_expected):
    # For every (truth genotype -> expected label) pair, labels a single
    # candidate at position 42 against a truth variant at the same position
    # carrying that genotype, and checks the label the candidate receives.
    candidate = _test_variant(42, candidate_alleles)
    # The shorter of the two ref alleles does not depend on the genotype, so
    # it is computed once outside the loop.
    ref_allele = min(candidate_alleles[0], truth_alleles[0], key=len)
    # dict.items() exists on both Python 2 and 3; dict.iteritems() is Python 2
    # only and raises AttributeError on Python 3.
    for true_gt, expected_gt in truth_genotypes_and_expected.items():
      truth = _test_variant(42, truth_alleles, true_gt)
      self.assertGetsCorrectLabels(
          candidates=[candidate],
          true_variants=[truth],
          ref=haplotype_labeler.ReferenceRegion('x' + ref_allele + 'y', 41),
          expected_genotypes=[expected_gt])

  def test_false_variants_get_homref_genotype(self):
    # Mixes two true variants with every non-empty combination of extra
    # candidates whose genotype is [0, 0], and checks that each candidate is
    # labeled with the genotype it was constructed with.
    ref = haplotype_labeler.ReferenceRegion('xACGTAy', 10)
    het_snp = _test_variant(11, ['A', 'T'], [0, 1])
    hom_insertion = _test_variant(13, ['G', 'GG'], [1, 1])
    false_positives = [
        _test_variant(12, ['C', 'G'], [0, 0]),
        _test_variant(14, ['T', 'A'], [0, 0]),
        _test_variant(15, ['A', 'AA'], [0, 0]),
    ]
    for group_size in range(1, len(false_positives) + 1):
      for extras in itertools.combinations(false_positives, group_size):
        called = variant_utils.sorted_variants(
            [het_snp, hom_insertion] + list(extras))
        self.assertGetsCorrectLabels(
            candidates=called,
            true_variants=[het_snp, hom_insertion],
            ref=ref,
            expected_genotypes=haplotype_labeler._variant_genotypes(called))

  def test_false_negatives(self):
    # Adds truth variants that have no matching candidate (false negatives)
    # and checks that the two candidates still receive the genotypes they
    # were constructed with.
    ref = haplotype_labeler.ReferenceRegion('xACGTAy', 10)
    het_snp = _test_variant(11, ['A', 'T'], [0, 1])
    hom_insertion = _test_variant(13, ['G', 'GG'], [1, 1])
    missed_truth = [
        _test_variant(12, ['C', 'G'], [0, 1]),
        _test_variant(14, ['T', 'A', 'G'], [1, 2]),
        _test_variant(15, ['A', 'AA'], [1, 1]),
    ]
    # Only single false negatives are exercised for now; iterating over
    # range(1, len(missed_truth) + 1) instead would cover larger groups.
    for group_size in [1]:
      for missing in itertools.combinations(missed_truth, group_size):
        called = [het_snp, hom_insertion]
        all_truth = variant_utils.sorted_variants(
            [het_snp, hom_insertion] + list(missing))
        self.assertGetsCorrectLabels(
            candidates=called,
            true_variants=all_truth,
            ref=ref,
            expected_genotypes=haplotype_labeler._variant_genotypes(called))

  # example 20:3528533 and 20:3528534
  def test_example1(self):
    # Two candidates against three homozygous truth variants; both candidates
    # are expected to be labeled [1, 1].
    candidate_variants = [
        _test_variant(3528531, ['ATAG', 'A']),
        _test_variant(3528537, ['A', 'ATT']),
    ]
    truth_variants = [
        _test_variant(3528533, ['A', 'T'], [1, 1]),
        _test_variant(3528534, ['G', 'A'], [1, 1]),
        _test_variant(3528536, ['TA', 'T'], [1, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('xATAGTTATC', 3528530)
    expected = [
        [1, 1],
        [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # example 20:4030071
  def test_example2(self):
    # Two candidates against a single homozygous truth variant; both
    # candidates are expected to be labeled [1, 1].
    candidate_variants = [
        _test_variant(4030067, ['TC', 'T']),
        _test_variant(4030072, ['C', 'G']),
    ]
    truth_variants = [
        _test_variant(4030071, ['CC', 'G'], [1, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('xTCCCCCA', 4030066)
    expected = [
        [1, 1],
        [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # example 20:4568152
  def test_example3(self):
    # Four candidates against five homozygous truth variants; every candidate
    # is expected to be labeled [1, 1].
    candidate_variants = [
        _test_variant(4568151, ['AC', 'A']),
        _test_variant(4568154, ['TG', 'T']),
        _test_variant(4568156, ['G', 'T']),
        _test_variant(4568157, ['A', 'ATACCCTTT']),
    ]
    truth_variants = [
        _test_variant(4568152, ['C', 'A'], [1, 1]),
        _test_variant(4568153, ['A', 'T'], [1, 1]),
        _test_variant(4568155, ['G', 'A'], [1, 1]),
        _test_variant(4568156, ['G', 'T'], [1, 1]),
        _test_variant(4568157, ['A', 'ACCCTTT'], [1, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('xACATGGATGGA', 4568150)
    expected = [
        [1, 1],
        [1, 1],
        [1, 1],
        [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # example 20:1689636, 20:1689639, 20:1689640, 20:1689641
  def test_example4(self):
    # CTGTAAACAGAA [phased alts] + CGTGAATGAAA [phased ref]
    #
    # Four candidates against five truth variants that all carry genotype
    # [1, 0]; every candidate is expected to be labeled [0, 1].
    candidate_variants = [
        _test_variant(1689633, ['C', 'CT']),
        _test_variant(1689635, ['TG', 'T']),
        _test_variant(1689638, ['ATG', 'A']),
        _test_variant(1689641, ['A', 'ACAG']),
    ]
    truth_variants = [
        _test_variant(1689633, ['C', 'CT'], [1, 0]),
        _test_variant(1689636, ['G', 'A'], [1, 0]),
        _test_variant(1689639, ['T', 'C'], [1, 0]),
        _test_variant(1689640, ['G', 'A'], [1, 0]),
        _test_variant(1689641, ['A', 'G'], [1, 0]),
    ]
    region = haplotype_labeler.ReferenceRegion('xCGTGAATGAAA', 1689632)
    expected = [
        [0, 1],
        [0, 1],
        [0, 1],
        [0, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # 20:2401511
  def test_example5(self):
    # Two candidates against two homozygous truth variants at different
    # positions; both candidates are expected to be labeled [1, 1].
    candidate_variants = [
        _test_variant(2401510, ['ATGT', 'A']),
        _test_variant(2401515, ['C', 'T']),
    ]
    truth_variants = [
        _test_variant(2401511, ['TG', 'A'], [1, 1]),
        _test_variant(2401513, ['TAC', 'T'], [1, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('xATGTACACAG', 2401509)
    expected = [
        [1, 1],
        [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # 20:2525695: genotype assign was incorrect in a previous run. This is because
  # the candidate variants overlap:
  #
  # ref: AAATT
  #  v1:  A--
  #  v2:   A-
  #
  # And this is causing us to construct incorrect haplotypes.
  def test_example6(self):
    # Two candidate deletions at adjacent positions; only the first one is in
    # the truth set, so the second is expected to be labeled [0, 0].
    candidate_variants = [
        _test_variant(2525696, ['AAT', 'A']),
        _test_variant(2525697, ['AT', 'T']),
    ]
    truth_variants = [
        _test_variant(2525696, ['AAT', 'A'], [0, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('xAATT', 2525695)
    expected = [
        [0, 1],
        [0, 0],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # Variants were getting incorrect genotypes due to complex region.
  #
  # variants: candidates
  #   20:279768:G->C gt=(-1, -1)
  #   20:279773:ATA->C/CTA gt=(-1, -1)
  # variants: truth
  #   20:279773:A->C gt=(1, 0)
  #
  # pos    : 789012345678901
  # ref    : CGCCCCATACCTTTT
  # truth  :       C          => CGCCCCCTACCTTTT
  # DV 1   :  C               => CCCCCCATACCTTTT [bad]
  # DV 2.a :       C--        => CGCCCCCCTTTT    [bad]
  # DV 2.b :       CTA        => CGCCCCCTACCTTTT [match]
  #
  def test_example7(self):
    # Two candidates (a SNP and a multi-allelic site) against one heterozygous
    # truth SNP; expected labels are listed in candidate order.
    candidate_variants = [
        _test_variant(279768, ['G', 'C']),
        _test_variant(279773, ['ATA', 'C', 'CTA']),
    ]
    truth_variants = [
        _test_variant(279773, ['A', 'C'], [0, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('CGCCCCATACCTTTT', 279767)
    expected = [
        [0, 0],
        [0, 2],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # redacted
  # that accepts a whole region of variants so we make sure it divides up the
  # problem into more fine-grained pieces that run quickly. The current call is
  # to a lower-level API that doesn't do variant chunking.
  # Commented out because this remains super slow.
  # def test_super_slow_example(self):
  #   self.assertGetsCorrectLabels(
  #       candidates=[
  #           _test_variant(32274452, ['C', 'G']),
  #           _test_variant(32274453, ['T', 'G']),
  #           _test_variant(32274456, ['A', 'G']),
  #           _test_variant(32274459, ['C', 'G']),
  #           _test_variant(32274461, ['T', 'G']),
  #           _test_variant(32274465, ['GACA', 'G']),
  #           _test_variant(32274467, ['CA', 'C']),
  #           _test_variant(32274470, ['C', 'G']),
  #           _test_variant(32274473, ['A', 'G']),
  #           _test_variant(32274474, ['AC', 'A']),
  #           _test_variant(32274475, ['C', 'A']),
  #           _test_variant(32274477, ['T', 'A']),
  #           _test_variant(32274480, ['G', 'C']),
  #       ],
  #       true_variants=[
  #           _test_variant(32274470, ['C', 'G'], (1, 1)),
  #       ],
  #       ref=haplotype_labeler.ReferenceRegion(
  #           'GCTGGAGGCGTGGGGACACCGGAACATAGGCCCCGCCCCGCCCCGACGC', 32274451),
  #       expected_genotypes=[
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [1, 1],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #           [0, 0],
  #       ])

# Variants were getting incorrect genotypes in an exome callset.
#
# ref: AGACACACACACACAAAAAAAAATCATAAAATGAAG, start=214012389
# candidates 2:214012390:G->GAC
# candidates 2:214012402:CAA->C
# candidates 2:214012404:A->C
# true_variants 2:214012404:A->C
#
# 2:214012390:G->GAC => gt=(1, 1) new_label=2 old_label=0 alts=[0]
# 2:214012402:CAA->C => gt=(1, 1) new_label=2 old_label=0 alts=[0]
# 2:214012404:A->C => gt=(0, 0) new_label=0 old_label=2 alts=[0]
#
#           90--------- 0---------10--------20---
# pos    : 90  1234567890123456789012345678901234
# ref    : AG  ACACACACACACAAAAAAAAATCATAAAATGAAG
# truth  :                  C => AGACACACACACACACAAAAAAATCATAAAATGAAG
# DV 1   :  GAC               => [doesn't match]
# DV 2   :                C-- => [doesn't match]
# DV 1+2 : AGACACACACACACAC  AAAAAAATCATAAAATGAAG
# DV 1+2 :                    => AGACACACACACACACAAAAAAATCATAAAATGAAG [match]
# DV 3   :                  C => AGACACACACACACACAAAAAAATCATAAAATGAAG [match]
#
# So this is an interesting case. G->GAC + CAA->C matches the true haplotype,
# and the SNP itself gets assigned a FP status since we can have either two
# FPs (dv1 and dv2 candidates) or have just one (dv3). What's annoying here is
# that DV3 exactly matches the variant as described in the truth set. It's
# also strange that we've generated multiple equivalent potential variants
# here.
#
# This test ensures that we are picking the most parsimonious genotype
# assignment (e.g., fewest number of TPs) needed to explain the truth, after
# accounting for minimizing the number of FNs and FPs.
  def test_exome_variants_multiple_equivalent_representations(self):
    # Three candidates against a single homozygous truth SNP that is identical
    # to the third candidate.
    candidate_variants = [
        _test_variant(214012390, ['G', 'GAC']),
        _test_variant(214012402, ['CAA', 'C']),
        _test_variant(214012404, ['A', 'C']),
    ]
    truth_variants = [
        _test_variant(214012404, ['A', 'C'], [1, 1]),
    ]
    region = haplotype_labeler.ReferenceRegion('AGACACACACACACAAAAAAAAATCAT',
                                               214012389)
    expected = [
        [0, 1],
        [0, 1],
        [0, 1],
        # This configuration makes the most sense but we cannot choose it
        # if we want to minimize the number of FNs, FPs, and then TPs.
        # [0, 0],
        # [0, 0],
        # [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # Variant group: 5 candidates 2 truth variants
  # ref: ReferenceRegion(bases=TGTTTTTTTTTAAAAAAATTATTTCTTCTTT, start=167012239)
  #   candidates 4:167012240:GT->G
  #   candidates 4:167012246:TTTT->A
  #   candidates 4:167012247:T->A
  #   candidates 4:167012248:T->A
  #   candidates 4:167012249:T->A
  #   true_variants 4:167012240:GTTT->G/GTT [2, 1]
  #   true_variants 4:167012249:T->A/TAA [2, 1]
  #   4:167012240:GT->G => gt=(1, 1) new_label=2 old_label=1 alts=[0]
  #   4:167012246:TTTT->A => gt=(0, 0) new_label=0 old_label=0 alts=[0]
  #   4:167012247:T->A => gt=(0, 0) new_label=0 old_label=0 alts=[0]
  #   4:167012248:T->A => gt=(0, 1) new_label=1 old_label=0 alts=[0]
  #   4:167012249:T->A => gt=(1, 1) new_label=2 old_label=1 alts=[0]
  def test_exome_complex_example(self):
    # Five bi-allelic candidates against two multi-allelic truth variants with
    # genotype [1, 2]; expected labels are listed in candidate order.
    candidate_variants = [
        _test_variant(167012240, ['GT', 'G']),
        _test_variant(167012246, ['TTTT', 'A']),
        _test_variant(167012247, ['T', 'A']),
        _test_variant(167012248, ['T', 'A']),
        _test_variant(167012249, ['T', 'A']),
    ]
    truth_variants = [
        _test_variant(167012240, ['GTTT', 'G', 'GTT'], [1, 2]),
        _test_variant(167012249, ['T', 'A', 'TAA'], [1, 2]),
    ]
    region = haplotype_labeler.ReferenceRegion('TGTTTTTTTTTAAAAAAATTATTTCTTCTTT',
                                               167012239)
    expected = [
        [1, 1],
        [0, 0],
        [0, 0],
        [0, 1],
        [1, 1],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)

  # ref: GGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTTTGTGTTG, start=9508942
  #   candidates 20:9508943:GGT->G
  #   candidates 20:9508967:T->C/TGC
  #   candidates 20:9508967:T->C/TGC
  #   candidates 20:9508967:T->C/TGC
  #   true_variants 20:9508943:GGT->G [0, 1]
  #   true_variants 20:9508967:T->C/TGC [1, 2]
  #   20:9508943:GGT->G => gt=(0, 0) new_label=0 old_label=1 alts=[0]
  #   20:9508967:T->C/TGC => gt=(1, 1) new_label=2 old_label=1 alts=[0]
  #   20:9508967:T->C/TGC => gt=(1, 1) new_label=0 old_label=1 alts=[1]
  #   20:9508967:T->C/TGC => gt=(1, 1) new_label=2 old_label=2 alts=[0, 1]
  #
  # This test fixes a bug where we weren't scoring our matches properly.
  # Previously we were not accounting for FPs in our score, so we were taking
  # a match with 0 FN, 1 FP, 1 TP over one with 0 FN, 0 FP, and 2 TP!
  #
  #      40------50--------60---------
  # pos: 2345678901234567890123456789012345678901234567
  # ref: GGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTTTGTGTTG
  # t1:   G--
  # t2a:                          C
  # t2b:                          Tgc
  #
  def test_bad_scoring_bug(self):
    # Candidates identical to the truth variants (a deletion and a
    # multi-allelic site); each candidate is expected to receive the truth
    # genotype of its matching variant.
    candidate_variants = [
        _test_variant(9508943, ['GGT', 'G']),
        _test_variant(9508967, ['T', 'C', 'TGC']),
    ]
    truth_variants = [
        _test_variant(9508943, ['GGT', 'G'], [0, 1]),
        _test_variant(9508967, ['T', 'C', 'TGC'], [1, 2]),
    ]
    region = haplotype_labeler.ReferenceRegion(
        'GGGTGTGTGTGTGTGTGTGTGTGTGTGCGTGTGTGTGTTTGTGTTG', 9508942)
    expected = [
        [0, 1],
        [1, 2],
    ]
    self.assertGetsCorrectLabels(
        candidates=candidate_variants,
        true_variants=truth_variants,
        ref=region,
        expected_genotypes=expected)