コード例 #1
0
    def test_label_variant(self):
        """Checks that label_variant writes the label and truth into an example.

        The processor's labeler is replaced by a mock whose match() returns
        [True, tvariant] and whose match_to_alt_count() returns 1.  After
        label_variant() runs on a copy of the example, every feature of the
        original example must still be present and equal in the copy, and the
        label / truth variant read back through tf_utils must be 1 / tvariant.
        """
        variant = test_utils.make_variant(start=10, alleles=['A', 'C'])
        tvariant = test_utils.make_variant(start=10,
                                           alleles=['A', 'C'],
                                           gt=[0, 1])
        example = tf_utils.make_example(variant, ['C'], 'foo',
                                        self.default_shape,
                                        self.default_format)
        labeler = mock.Mock()
        labeler.match = mock.Mock(return_value=[True, tvariant])
        labeler.match_to_alt_count = mock.Mock(return_value=1)
        self.processor.labeler = labeler

        # Label a copy so the pristine example stays available for comparison.
        labeled = example_pb2.Example()
        labeled.CopyFrom(example)
        self.processor.label_variant(labeled, variant)

        labeler.match.assert_called_once_with(variant)
        labeler.match_to_alt_count.assert_called_once_with(
            variant, tvariant, ['C'])

        # items() instead of iteritems(): the latter is Python 2 only, while
        # items() is available on both Python 2 and Python 3 mappings.
        for key, value in example.features.feature.items():
            self.assertEqual(value, labeled.features.feature[key])
        self.assertEqual(1, tf_utils.example_label(labeled))
        self.assertEqual(tvariant, tf_utils.example_truth_variant(labeled))
コード例 #2
0
 def test_match_to_genotype_label(self, variant_alleles, alt_alleles,
                                  truth_alleles, truth_gt, expected_n_alts):
   """Asserts match_to_alt_count() yields expected_n_alts for the given alleles.

   A candidate (from variant_alleles) and a genotyped truth variant (from
   truth_alleles / truth_gt) are both created at start=10 and handed to the
   labeler together with alt_alleles.
   """
   candidate = test_utils.make_variant(start=10, alleles=variant_alleles)
   truth = test_utils.make_variant(
       start=10, alleles=truth_alleles, gt=truth_gt)
   actual_n_alts = self.labeler.match_to_alt_count(candidate, truth,
                                                   alt_alleles)
   self.assertEqual(expected_n_alts, actual_n_alts)
コード例 #3
0
 def test_match_selects_variant_by_start(self):
     """match() must pick the truth variant that shares the candidate's start.

     The candidate (start=21, CC/A) has the same alleles as the truth variant
     at start=20 but a different start; the truth variant at start=21 has
     different alleles.  The one at start=21 is the expected result.
     """
     same_alleles_other_start = test_utils.make_variant(
         start=20, alleles=['CC', 'A'])
     same_start_other_alleles = test_utils.make_variant(
         start=21, alleles=['AAA', 'A'])
     trailing = test_utils.make_variant(start=22, alleles=['AA', 'A'])
     truth_variants = [
         same_alleles_other_start,
         same_start_other_alleles,
         trailing,
     ]
     self.labeler = variant_labeler.VariantLabeler(
         vcf_reader=mock_vcf_reader(truth_variants))

     candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])
     match_result = self.labeler.match(candidate)
     # Element 1 of the match result is the matched truth variant.
     self.assertEqual(match_result[1], truth_variants[1])
コード例 #4
0
  def testSelectVariantsWeights(self):
    """select_variants_weights() with is_snp should weight matching rows 1.0.

    Four serialized variants are fed in; the expected weights are 1.0 for the
    single-base substitutions (starts 10 and 12) and 0.0 for the other two.
    The resulting op's name must start with the requested name.
    """
    site_alleles = (
        (10, ['C', 'T']),
        (11, ['C', 'TA']),
        (12, ['C', 'A']),
        (13, ['CA', 'T']),
    )
    serialized = []
    for start, alleles in site_alleles:
      variant = test_utils.make_variant(start=start, alleles=alleles)
      serialized.append(variant.SerializeToString())
    encoded = tf.constant(serialized)

    expected_weights = [1.0, 0.0, 1.0, 0.0]
    with self.test_session() as sess:
      sess.run(tf.global_variables_initializer())
      op = model_eval.select_variants_weights(
          variantutils.is_snp, encoded, name='tf_is_snp')
      self.assertTrue(op.name.startswith('tf_is_snp'))
      npt.assert_array_equal(op.eval(), expected_weights)
コード例 #5
0
 def test_match_to_genotype_label_no_gt_truth_variant_raises(self):
     """match_to_alt_count() must raise ValueError for a truth without gt.

     The truth variant is created without a `gt` argument; the error message
     has to match 'truth_variant needs genotypes'.
     """
     truth_without_gt = test_utils.make_variant(start=10, alleles=['A', 'C'])
     expected_message = 'truth_variant needs genotypes'
     with self.assertRaisesRegexp(ValueError, expected_message):
         self.labeler.match_to_alt_count(self.snp, truth_without_gt,
                                         self.snp.alternate_bases)
コード例 #6
0
def _create_variant(ref_name, start, ref_base, alt_bases, qual, filter_field,
                    genotype, gq, likelihoods):
    """Builds a Variant test record through test_utils.make_variant.

    The allele list passed on is the reference allele followed by the
    alternates, and the call is made with the module-level
    _DEFAULT_SAMPLE_NAME as sample name.

    Args:
      ref_name: reference name for this variant
      start: start position on the contig
      ref_base: reference base(s)
      alt_bases: list(str). alternate base(s)
      qual: PHRED scaled detection probability
      filter_field: filter string for this variant
      genotype: list of integers corresponding to the called genotype
      gq: PHRED scaled genotype quality
      likelihoods: genotype likelihoods for this variant

    Returns:
      Whatever test_utils.make_variant returns for these arguments.
    """
    # Reference allele first, alternates after it.
    all_alleles = [ref_base] + alt_bases
    return test_utils.make_variant(
        chrom=ref_name,
        start=start,
        alleles=all_alleles,
        qual=qual,
        filters=filter_field,
        gt=genotype,
        gq=gq,
        gls=likelihoods,
        sample_name=_DEFAULT_SAMPLE_NAME,
    )
コード例 #7
0
    def test_create_pileup_examples(self):
        """Checks create_pileup_examples() emits one example per pileup image.

        The pileup image creator is mocked to return two (alt_alleles, tensor)
        pairs and `_encode_tensor` is mocked to return the matching encoded
        tensors.  Each produced example must carry the corresponding alt
        alleles, the call's variant, the encoded image, and the default image
        shape and format.
        """
        self.processor.pic = mock.Mock()
        self.add_mock('_encode_tensor',
                      side_effect=[
                          ('tensor1', self.default_shape, self.default_format),
                          ('tensor2', self.default_shape, self.default_format)
                      ])
        dv_call = mock.Mock()
        dv_call.variant = test_utils.make_variant(start=10,
                                                  alleles=['A', 'C', 'G'])
        alt1, alt2 = ['C'], ['G']
        self.processor.pic.create_pileup_images.return_value = [
            (alt1, 'tensor1'), (alt2, 'tensor2')
        ]

        actual = self.processor.create_pileup_examples(dv_call)

        self.processor.pic.create_pileup_images.assert_called_once_with(
            dv_call)

        # assertEqual rather than the deprecated assertEquals alias, which was
        # removed from unittest in Python 3.12.  The length check also guards
        # the zip() below against silently truncating.
        self.assertEqual(len(actual), 2)
        for ex, (alt, img) in zip(actual, [(alt1, 'tensor1'),
                                           (alt2, 'tensor2')]):
            self.assertEqual(tf_utils.example_alt_alleles(ex), alt)
            self.assertEqual(tf_utils.example_variant(ex), dv_call.variant)
            self.assertEqual(tf_utils.example_encoded_image(ex), img)
            self.assertEqual(tf_utils.example_image_shape(ex),
                             self.default_shape)
            self.assertEqual(tf_utils.example_image_format(ex),
                             self.default_format)
コード例 #8
0
 def test_label_variant_raises_for_non_confident_variant(self):
   """label_variant() must return a falsy value for a non-confident match.

   Despite the test's name, nothing is expected to raise here: the mocked
   labeler's match() returns [False, variant] and the test only asserts that
   label_variant()'s return value is falsy.
   """
   variant = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
   non_confident_labeler = mock.Mock()
   non_confident_labeler.match = mock.Mock(return_value=[False, variant])
   self.processor.labeler = non_confident_labeler
   example = tf_utils.make_example(variant, ['C'], 'foo', self.default_shape,
                                   self.default_format)
   was_labeled = self.processor.label_variant(example, variant)
   self.assertFalse(was_labeled)
コード例 #9
0
def _var(chrom='1',
         start=5,
         end=None,
         ref=None,
         alt=None,
         qual=50,
         genotype=None,
         likelihoods=None,
         sample_name='NA12878'):
    """Builds a Variant test record, filling in defaults for omitted fields.

    When `ref` is omitted it becomes a run of 'A' of length `end - start`, or a
    single 'A' if `end` is omitted too.  Omitted `alt`, `genotype` and
    `likelihoods` default to ['C'], [0, 1] and a fixed three-value list.

    Args:
      chrom: reference name for this variant
      start: start position on the contig
      end: end position on the contig
      ref: reference base(s)
      alt: list(str). alternate base(s)
      qual: PHRED scaled detection probability
      genotype: list of integers corresponding to the called genotype
      likelihoods: genotype likelihoods for this variant
      sample_name: sample name for the single call in the variant

    Returns:
      Whatever test_utils.make_variant returns for the resolved arguments.

    Raises:
      ValueError: Both ref and end are specified, and are inconsistent.
    """
    if ref is None:
        # Synthesize a reference allele that spans [start, end) when an end
        # is given; otherwise fall back to a single base.
        ref = 'A' if end is None else 'A' * (end - start)
    elif end is not None and end != start + len(ref):
        raise ValueError('Inconsistent end and reference allele.')

    alt = ['C'] if alt is None else alt
    genotype = [0, 1] if genotype is None else genotype
    if likelihoods is None:
        likelihoods = [-1.0, -0.0506099933550872, -2.0]

    return test_utils.make_variant(
        chrom=chrom,
        start=start,
        alleles=[ref] + alt,
        qual=qual,
        filters=None,
        gt=genotype,
        gls=likelihoods,
        sample_name=sample_name,
    )
コード例 #10
0
def _create_nonvariant(ref_name, start, end):
  """Builds a non-variant test record covering [start, end) on ref_name.

  The record's alleles are 'A' followed by variantutils.GVCF_ALT_ALLELE.

  Args:
    ref_name: str. Reference name for this variant.
    start: int. start position on the contig [0-based, half open).
    end: int. end position on the contig [0-based, half open).

  Returns:
    Whatever test_utils.make_variant returns for these arguments.
  """
  nonvariant_alleles = ['A', variantutils.GVCF_ALT_ALLELE]
  return test_utils.make_variant(
      chrom=ref_name, start=start, end=end, alleles=nonvariant_alleles)
コード例 #11
0
def _simple_variant(ref_name, start, ref_base):
  """Builds a bi-allelic test record for variant / non-variant merge tests.

  The alternate allele is 'A', unless the reference is itself 'A', in which
  case 'C' is used so that the two alleles differ.  The end position is
  start plus the length of ref_base.

  Args:
    ref_name: str. Reference name for this variant.
    start: int. start position on the contig [0-based, half open).
    ref_base: str. reference base(s).

  Returns:
    Whatever test_utils.make_variant returns for these arguments.
  """
  if ref_base == 'A':
    alt_base = 'C'
  else:
    alt_base = 'A'
  end = start + len(ref_base)
  return test_utils.make_variant(
      chrom=ref_name, start=start, end=end, alleles=[ref_base, alt_base])
コード例 #12
0
class VariantLabelerTest(parameterized.TestCase):
    """Tests for VariantLabeler.match() and VariantLabeler.match_to_alt_count().

    The class-level variants below serve both as the contents of the mocked
    VCF reader used in setUp() and as inputs / expected values in the
    parameterized cases, so they must be defined before the decorators that
    reference them.
    """
    # Confident variants: SNP, deletion, and multi-allelic.
    snp = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
    deletion = test_utils.make_variant(start=20, alleles=['ACG', 'A'])
    multiallelic = test_utils.make_variant(start=30,
                                           alleles=['ACT', 'ACTGT', 'A'])
    # Outside our confident regions.
    non_confident = test_utils.make_variant(start=200, alleles=['A', 'C'])
    # Carries a filter value; test_match expects `filtered_match` back for it
    # rather than the filtered record itself.
    filtered = test_utils.make_variant(start=40,
                                       alleles=['A', 'C'],
                                       filters='FAILED')
    # Expected match() result for `filtered`: same site and alleles, no
    # filter, and gt=[0, 0].  Note it is not part of `variants`.
    filtered_match = test_utils.make_variant(start=40,
                                             alleles=['A', 'C'],
                                             gt=[0, 0])

    # The records served by the mocked VCF reader in setUp().
    variants = [snp, deletion, multiallelic, non_confident, filtered]

    def setUp(self):
        """Creates a labeler over `variants` with one confident range.

        The confident range is built with ranges.make_range() from the SNP's
        reference name and the bounds 10 and 100.
        """
        self.labeler = variant_labeler.VariantLabeler(
            vcf_reader=mock_vcf_reader(self.variants),
            confident_regions=ranges.RangeSet(
                [ranges.make_range(self.snp.reference_name, 10, 100)]))

    @parameterized.parameters(
        # Simple tests: we get back our matching variants in the confident regions
        (snp, True, snp),
        (deletion, True, deletion),
        (multiallelic, True, multiallelic),

        # Test the behavior outside of our confident regions.
        # We get back non_confident since it matches but we're not confident.
        (non_confident, False, non_confident),
        # No matching variant, so we get a None as well as False.
        (test_utils.make_variant(start=300, alleles=['A', 'C']), False, None),

        # This variant doesn't have any match but we're confident in it, so
        # the expected result is the same variant with gt=[0, 0].
        (test_utils.make_variant(start=15, alleles=['C', 'A']), True,
         test_utils.make_variant(start=15, alleles=['C', 'A'], gt=[0, 0])),

        # These variants start at our SNP but have different alleles. We are
        # confident and we get back the true snp variant, despite having the
        # different alleles.
        (test_utils.make_variant(start=snp.start, alleles=['A', 'G'
                                                           ]), True, snp),
        (test_utils.make_variant(start=snp.start, alleles=['AC', 'C'
                                                           ]), True, snp),
        (test_utils.make_variant(start=snp.start, alleles=['A', 'CA'
                                                           ]), True, snp),

        # We don't match filtered variants.
        (filtered, True, filtered_match),
    )
    def test_match(self, candidate, expected_confident, expected_variant):
        """Checks both elements of match()'s (confident, variant) result."""
        actual_confident, actual_variant = self.labeler.match(candidate)
        self.assertEqual(expected_confident, actual_confident)
        self.assertEqual(expected_variant, actual_variant)

    def test_match_selects_variant_by_start(self):
        """Checks match() prefers the truth variant with the candidate's start."""
        # Tests that match() selects the variant at the same start even if that
        # variant doesn't have the same alleles as the candidate and there's an
        # overlapping variant with the same alleles.
        overlapping = [
            test_utils.make_variant(start=20, alleles=['CC', 'A']),
            test_utils.make_variant(start=21, alleles=['AAA', 'A']),
            test_utils.make_variant(start=22, alleles=['AA', 'A']),
        ]
        self.labeler = variant_labeler.VariantLabeler(
            vcf_reader=mock_vcf_reader(overlapping))
        candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])
        self.assertEqual(self.labeler.match(candidate)[1], overlapping[1])

    # Each case below is:
    #   (variant_alleles, alt_alleles, truth_alleles, truth_gt, expected_n_alts)
    @parameterized.parameters(
        # Make sure we get the right alt counts for all diploid genotypes.
        (['A', 'C'], ['C'], ['A', 'C'], [0, 0], 0),
        (['A', 'C'], ['C'], ['A', 'C'], [0, 1], 1),
        (['A', 'C'], ['C'], ['A', 'C'], [1, 0], 1),
        (['A', 'C'], ['C'], ['A', 'C'], [1, 1], 2),

        # Basic multi-allelic tests, without having to deal with simplifying
        # alleles as all of the alleles are SNPs. Our candidates have an extra
        # allele, but the true GT is A/C.
        (['A', 'C', 'G'], ['C'], ['A', 'C'], [0, 1], 1),
        (['A', 'C', 'G'], ['C'], ['A', 'C'], [1, 1], 2),

        # When considering A/G our answer should be 0 as we have no copies
        # of the G allele.
        (['A', 'C', 'G'], ['G'], ['A', 'C'], [0, 1], 0),
        (['A', 'C', 'G'], ['G'], ['A', 'C'], [1, 1], 0),

        # We are considering the het-alt configuration here of A vs. C+G. We've
        # got one copy of the C allele so our true genotype is het. If truth is
        # hom-var for the C, though, we again label the composite as hom_var as
        # we have two copies of the C/G alt.
        (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [0, 1], 1),
        (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [1, 1], 2),

        # Here we have an extra allele in truth, while candidate is bi-allelic.
        # This example 'G' is unused in truth, so we are simply the normal
        # bi-allelic result.
        (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 0], 0),
        (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 1], 1),
        (['A', 'C'], ['C'], ['A', 'C', 'G'], [1, 1], 2),

        # We check here that we get the bi-allelic result even when the extra
        # allele is in position 1 not 2.
        (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 0], 0),
        (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 2], 1),
        (['A', 'G'], ['G'], ['A', 'C', 'G'], [2, 2], 2),

        # Now for a real het-alt. We've got three alleles in both, and the true
        # genotype is 1/2.
        (['A', 'C', 'G'], ['C'], ['A', 'C', 'G'], [1, 2], 1),
        (['A', 'C', 'G'], ['G'], ['A', 'C', 'G'], [1, 2], 1),
        (['A', 'C', 'G'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], 2),

        # Test all possible values in candidate against het-alt:
        (['A', 'C', 'G', 'T'], ['C'], ['A', 'C', 'G'], [1, 2], 1),
        (['A', 'C', 'G', 'T'], ['G'], ['A', 'C', 'G'], [1, 2], 1),
        (['A', 'C', 'G', 'T'], ['T'], ['A', 'C', 'G'], [1, 2], 0),
        (['A', 'C', 'G', 'T'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], 2),
        (['A', 'C', 'G', 'T'], ['C', 'T'], ['A', 'C', 'G'], [1, 2], 1),
        (['A', 'C', 'G', 'T'], ['G', 'T'], ['A', 'C', 'G'], [1, 2], 1),

        # Simple start for indel alleles => exact matching works here.
        (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 0], 0),
        (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 1], 1),
        (['A', 'AC'], ['AC'], ['A', 'AC'], [1, 1], 2),

        # We've got a multi-allelic truth, but again exact matching is enough.
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 0], 0),
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 1], 1),
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 1], 2),
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 2], 0),
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 2], 1),
        (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [2, 2], 0),

        # This case has an extra allele (A) in truth but the true genotype
        # corresponds to our candidate alleles exactly.
        (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 2], 1),
        (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [2, 2], 2),
        # If the true genotype involved just the deletion (A) allele, we don't
        # have that allele in our candidate so we always get 0 copies.
        (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 1], 0),
        (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 1], 0),
        # If the truth is het-alt, we can't match the deletion A allele but we do
        # in fact have the A => AC allele as this matches the AC => ACC allele in
        # truth set.
        (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 2], 1),

        # We have a multi-allelic candidate but a simple bi-allelic truth. Make
        # sure we match correctly. This is a key case, as we should expect that
        # our candidates frequently have extra alleles changing the representation
        # relative to our truth candidates.
        (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [0, 1], 0),
        (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [1, 1], 0),
        (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [0, 1], 1),
        (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [1, 1], 2),
        (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [0, 1], 1),
        (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [1, 1], 2),

        # The whole complexity: multi-allelic candidate and truth, all with
        # different allele representations.
        # True genotype here is A/AGTGT where ref is AGT [common
        # dinucleotide expansion]. Both candidate and truth have this but each
        # as a different ref so none of the alleles exactly match.
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['A'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 0),
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['AGT'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['AGTGTGT'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['A', 'AGT'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['A', 'AGTGTGT'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 1),
        (['AGTGT', 'A', 'AGT', 'AGTGTGT'
          ], ['AGT', 'AGTGTGT'], ['AGT', 'A', 'AGTGT', 'AGTGTGT'], [1, 2], 2),

        # Misc. checks with block substitutions.
        (['AT', 'A', 'GC'], ['A'], ['ATT', 'AT', 'A'], [0, 1], 1),
        (['AT', 'A', 'GT'], ['A'], ['A', 'G'], [0, 1], 0),
        (['AT', 'A', 'GT'], ['GT'], ['A', 'G'], [0, 1], 1),
    )
    def test_match_to_genotype_label(self, variant_alleles, alt_alleles,
                                     truth_alleles, truth_gt, expected_n_alts):
        """Checks match_to_alt_count() for a candidate / truth pair at start=10."""
        variant = test_utils.make_variant(start=10, alleles=variant_alleles)
        truth_variant = test_utils.make_variant(start=10,
                                                alleles=truth_alleles,
                                                gt=truth_gt)
        self.assertEqual(
            expected_n_alts,
            self.labeler.match_to_alt_count(variant, truth_variant,
                                            alt_alleles))

    def test_match_to_genotype_label_none_truth_variant_raises(self):
        """A None truth variant must raise ValueError."""
        with self.assertRaisesRegexp(ValueError,
                                     'truth_variant cannot be None'):
            self.labeler.match_to_alt_count(self.snp, None,
                                            self.snp.alternate_bases)

    def test_match_to_genotype_label_no_gt_truth_variant_raises(self):
        """A truth variant created without `gt` must raise ValueError."""
        with self.assertRaisesRegexp(ValueError,
                                     'truth_variant needs genotypes'):
            self.labeler.match_to_alt_count(
                self.snp, test_utils.make_variant(start=10, alleles=['A',
                                                                     'C']),
                self.snp.alternate_bases)

    def test_match_to_genotype_label_none_variant_raises(self):
        """A None candidate variant must raise ValueError."""
        with self.assertRaisesRegexp(ValueError, 'variant cannot be None'):
            self.labeler.match_to_alt_count(None, self.snp,
                                            self.snp.alternate_bases)

    def test_match_to_genotype_label_ref_variant_raises(self):
        """A candidate with only a reference allele must raise ValueError."""
        with self.assertRaisesRegexp(
                ValueError, 'variant must have at least one alternate allele'):
            self.labeler.match_to_alt_count(
                test_utils.make_variant(start=10, alleles=['A']), self.snp,
                self.snp.alternate_bases)
コード例 #13
0
 def test_match_to_genotype_label_ref_variant_raises(self):
     """match_to_alt_count() must reject a candidate without alternates.

     The candidate is created with a single allele ('A'); a ValueError matching
     'variant must have at least one alternate allele' is expected.
     """
     ref_only_variant = test_utils.make_variant(start=10, alleles=['A'])
     expected_message = 'variant must have at least one alternate allele'
     with self.assertRaisesRegexp(ValueError, expected_message):
         self.labeler.match_to_alt_count(ref_only_variant, self.snp,
                                         self.snp.alternate_bases)
コード例 #14
0
 def test_invalid_nonref_genotype_count(self):
     """_nonref_genotype_count() must raise for a default make_variant() record.

     The variant is created with no arguments at all; a ValueError matching
     'Expecting only single-sample' is expected.
     """
     variant_from_defaults = test_utils.make_variant()
     expected_message = 'Expecting only single-sample'
     with self.assertRaisesRegexp(ValueError, expected_message):
         haplotypes._nonref_genotype_count(variant_from_defaults)
コード例 #15
0
    def test_calls_from_allele_counts(self, include_gvcfs):
        """Checks candidates are passed through and gVCF records are produced.

        The C++ caller is mocked to return two fixed candidates; the Python
        caller must return exactly those.  With include_gvcfs set, four gVCF
        records are expected and the first three are compared field by field;
        otherwise the gVCF result must be empty.
        """
        # Our test AlleleCounts are 5 positions:
        #
        # 10: A ref [no reads]
        # 11: G/C variant
        # 12: G ref [no reads]
        # 13: G ref [no reads]
        # 14: T/C variant
        #
        # The ref sites carry no reads for ref or any alt because that keeps
        # the expected genotype likelihoods simple. The correctness of the gvcf
        # calculation is tested elsewhere; the focus here is on separating
        # variants from gvcf records and on the automatic merging of gvcf
        # blocks.
        per_site_counts = [
            (0, 0, 'A'),
            (10, 10, 'G'),
            (0, 0, 'G'),
            (0, 0, 'G'),
            (10, 10, 'T'),
        ]
        allele_counter = self.fake_allele_counter(10, per_site_counts)

        variant_at_11 = test_utils.make_variant(alleles=['G', 'C'], start=11)
        variant_at_14 = test_utils.make_variant(alleles=['T', 'C'], start=14)
        fake_candidates = [
            deepvariant_pb2.DeepVariantCall(variant=variant_at_11),
            deepvariant_pb2.DeepVariantCall(variant=variant_at_14),
        ]

        caller = self.make_test_caller(0.01, 100)
        with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
            mock_cpp.calls_from_allele_counter.return_value = fake_candidates
            candidates, gvcfs = caller.calls_from_allele_counter(
                allele_counter, include_gvcfs)

        mock_cpp.calls_from_allele_counter.assert_called_once_with(
            allele_counter)
        self.assertEqual(candidates, fake_candidates)

        if not include_gvcfs:
            # gVCF output disabled: nothing should have been produced.
            self.assertEmpty(gvcfs)
            return

        # With gVCF output enabled we expect a record at position 10 and that
        # 12 and 13 have been merged into a single 2 bp block. Only the first
        # three of the four records are inspected below.
        self.assertLen(gvcfs, 4)
        # Expected diploid genotype likelihoods when there's no coverage. The
        # chance of having each genotype is 1/3, in log10 space.
        flat_gls = np.log10([1.0 / 3] * 3)
        covered_site_gls = np.array([
            -14.0230482368, -8.32667268469e-15, -14.0230482368
        ])
        expected_records = [
            dict(ref='A', start=10, end=11, gq=1, gls=flat_gls),
            dict(ref='G', start=11, end=12, gq=0, gls=covered_site_gls),
            dict(ref='G', start=12, end=14, gq=1, gls=flat_gls),
        ]
        for index, expected in enumerate(expected_records):
            self.assertGVCF(gvcfs[index], **expected)