def _make_dv_call(ref_bases='A', alt_bases='C'):
    """Build a DeepVariantCall fixture for a one-base site on chr1.

    The variant spans start=10, end=11 on 'chr1'. Allele support is always
    stored under the key 'C', independent of `alt_bases`, and holds whatever
    `_supporting_reads('read1/1', 'read2/1')` returns.

    Args:
      ref_bases: Reference bases to store on the variant.
      alt_bases: The single alternate allele to store on the variant.

    Returns:
      A deepvariant_pb2.DeepVariantCall wrapping the variant described above.
    """
    variant = variants_pb2.Variant(
        reference_name='chr1',
        start=10,
        end=11,
        reference_bases=ref_bases,
        alternate_bases=[alt_bases],
    )
    # NOTE(review): the support key is fixed to 'C' even when a different
    # `alt_bases` is passed -- confirm callers rely on this.
    support = {'C': _supporting_reads('read1/1', 'read2/1')}
    return deepvariant_pb2.DeepVariantCall(variant=variant,
                                           allele_support=support)
    def test_calls_from_allele_counts(self):
        """Checks that calls_and_gvcfs hands back the C++ caller's candidates.

        The C++ variant caller is replaced by a mock, so only the plumbing is
        verified: the per-sample allele counts and the target sample name are
        forwarded to `calls_from_allele_counts` exactly once, and the mock's
        return value comes back unchanged as the candidates. gVCF output is
        switched off and the second return value is ignored.
        """
        # The fake allele counter starts at position 10 and covers 5 sites:
        #
        #   10: A ref [no reads]
        #   11: G/C variant
        #   12: G ref [no reads]
        #   13: G ref [no reads]
        #   14: T/C variant
        #
        # The ref sites carry no reads for ref or alt, which keeps the fixture
        # as simple as possible.
        sample = 'SAMPLE_ID'
        site_counts = [
            (0, 0, 'A'),
            (10, 10, 'G'),
            (0, 0, 'G'),
            (0, 0, 'G'),
            (10, 10, 'T'),
        ]
        allele_counter = self.fake_allele_counter(10, site_counts)

        variant_at_11 = test_utils.make_variant(alleles=['G', 'C'], start=11)
        call_at_11 = deepvariant_pb2.DeepVariantCall(variant=variant_at_11)
        variant_at_14 = test_utils.make_variant(alleles=['T', 'C'], start=14)
        call_at_14 = deepvariant_pb2.DeepVariantCall(variant=variant_at_14)
        fake_candidates = [call_at_11, call_at_14]

        caller = self.make_test_caller(0.01, 100)
        with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
            mock_cpp.calls_from_allele_counts.return_value = fake_candidates
            candidates, _ = caller.calls_and_gvcfs(
                allele_counters={sample: allele_counter},
                target_sample=sample,
                include_gvcfs=False,
            )

        # The caller is expected to pass the counters' `counts()` results,
        # keyed by sample, rather than the counter objects themselves.
        expected_counts_by_sample = {
            sample: allele_counter.counts.return_value,
        }
        mock_cpp.calls_from_allele_counts.assert_called_once_with(
            expected_counts_by_sample, sample)
        self.assertEqual(candidates, fake_candidates)
Example #3
0
    def test_ignores_reads_with_low_mapping_quality(self, min_base_qual,
                                                    min_mapping_qual):
        """Checks that reads below the mapping-quality threshold are dropped.

        Scenario:

          position    0    1    2    3    4    5
          reference        A    A    C    A    G
          read             A    A    A
          variant               C

        The read's mapping quality is swept over
        `range(min_mapping_qual + 5)`. Every base in the read has base
        quality equal to `min_base_qual`. `encode_read` must return None
        while `mapping_qual < min_mapping_qual` and a non-None value once
        `mapping_qual >= min_mapping_qual`.

        Args:
          min_base_qual: Minimum base quality placed in the read
            requirements; also used as the quality of every base in the read.
          min_mapping_qual: Minimum mapping quality placed in the read
            requirements.
        """
        variant = variants_pb2.Variant(
            reference_name='chr1',
            start=2,
            end=3,
            reference_bases='A',
            alternate_bases=['C'],
        )
        dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)

        requirements = reads_pb2.ReadRequirements(
            min_base_quality=min_base_qual,
            min_mapping_quality=min_mapping_qual,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT),
        )
        encoder = _make_encoder(read_requirements=requirements)

        for mapq in range(min_mapping_qual + 5):
            read = test_utils.make_read(
                'AAA',
                start=1,
                cigar='3M',
                quals=[min_base_qual] * 3,
                mapq=mapq,
            )
            encoded = encoder.encode_read(dv_call, 'AACAG', read, 1, 'C')
            if mapq >= min_mapping_qual:
                self.assertIsNotNone(encoded)
            else:
                self.assertIsNone(encoded)
Example #4
0
    def test_keeps_reads_with_low_quality_bases(self, min_base_qual,
                                                min_mapping_qual):
        """Checks that only the base at the variant start gates the read.

        Scenario:

          position    0    1    2    3    4    5
          reference        A    A    C    A    G
          read             A    A    A
          variant               C

        The first and third bases of the read get qualities `base_qual - 1`
        and `base_qual + 1` while `base_qual` sweeps
        `range(min_base_qual + 5)`. The middle base, where the variant
        starts, always has quality equal to `min_base_qual`, and the read's
        mapping quality equals `min_mapping_qual`. `encode_read` must
        therefore return a non-None value on every iteration.

        Args:
          min_base_qual: Minimum base quality placed in the read
            requirements; also the quality of the read's middle base.
          min_mapping_qual: Minimum mapping quality placed in the read
            requirements; also the read's mapping quality.
        """
        variant = variants_pb2.Variant(
            reference_name='chr1',
            start=2,
            end=3,
            reference_bases='A',
            alternate_bases=['C'],
        )
        dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)

        requirements = reads_pb2.ReadRequirements(
            min_base_quality=min_base_qual,
            min_mapping_quality=min_mapping_qual,
            min_base_quality_mode=(
                reads_pb2.ReadRequirements.ENFORCED_BY_CLIENT),
        )
        encoder = _make_encoder(read_requirements=requirements)

        for flank_qual in range(min_base_qual + 5):
            # NOTE(review): the first flank quality is -1 on the first
            # iteration (flank_qual == 0) -- confirm this is intended.
            flanked_quals = [flank_qual - 1, min_base_qual, flank_qual + 1]
            read = test_utils.make_read(
                'AAA',
                start=1,
                cigar='3M',
                quals=flanked_quals,
                mapq=min_mapping_qual,
            )
            encoded = encoder.encode_read(dv_call, 'AACAG', read, 1, 'C')
            self.assertIsNotNone(encoded)
Example #5
0
  def test_ignores_reads_with_low_quality_bases(self):
    """Checks that a low-quality base at the variant start drops the read.

    A three-base read 'AAA' starting at position 1 is encoded against an
    A->C variant at position 2. The first and last bases are pinned just
    below and just above the threshold; only the middle base's quality is
    swept. `encode_read` must return None while that quality is below the
    threshold and a non-None value otherwise.
    """
    variant = variants_pb2.Variant(
        reference_name='chr1',
        start=2,
        end=3,
        reference_bases='A',
        alternate_bases=['C'],
    )
    dv_call = deepvariant_pb2.DeepVariantCall(variant=variant)
    encoder = _make_encoder()

    # Threshold taken from self.options; presumably the same options the
    # default encoder is built from -- verify against _make_encoder.
    threshold = self.options.read_requirements.min_base_quality

    for middle_qual in range(threshold + 5):
      read = test_utils.make_read(
          'AAA',
          start=1,
          cigar='3M',
          quals=[threshold - 1, middle_qual, threshold + 1])
      encoded = encoder.encode_read(dv_call, 'AACAG', read, 1, 'C')
      if middle_qual >= threshold:
        self.assertIsNotNone(encoded)
      else:
        self.assertIsNone(encoded)
Example #6
0
    def test_calls_from_allele_counts(self, include_gvcfs):
        """Checks candidate pass-through and gVCF emission/merging.

        The C++ variant caller is mocked, so the candidates are whatever the
        mock returns. What is verified here is that the allele counter is
        forwarded to the C++ caller exactly once, that the candidates come
        back unchanged, and that gVCF records are produced (and adjacent
        reference blocks merged) only when requested. The correctness of the
        gVCF likelihood calculation itself is not the focus.

        Args:
          include_gvcfs: Forwarded to `calls_from_allele_counter`. When
            truthy, gVCF records are expected; otherwise none are.
        """
        # The fake allele counter starts at position 10 and covers 5 sites:
        #
        #   10: A ref [no reads]
        #   11: G/C variant
        #   12: G ref [no reads]
        #   13: G ref [no reads]
        #   14: T/C variant
        #
        # The ref sites carry no reads for ref or alt, which makes their
        # expected genotype likelihoods trivial to state.
        site_counts = [
            (0, 0, 'A'),
            (10, 10, 'G'),
            (0, 0, 'G'),
            (0, 0, 'G'),
            (10, 10, 'T'),
        ]
        allele_counter = self.fake_allele_counter(10, site_counts)

        variant_at_11 = test_utils.make_variant(alleles=['G', 'C'], start=11)
        call_at_11 = deepvariant_pb2.DeepVariantCall(variant=variant_at_11)
        variant_at_14 = test_utils.make_variant(alleles=['T', 'C'], start=14)
        call_at_14 = deepvariant_pb2.DeepVariantCall(variant=variant_at_14)
        fake_candidates = [call_at_11, call_at_14]

        caller = self.make_test_caller(0.01, 100)
        with mock.patch.object(caller, 'cpp_variant_caller') as mock_cpp:
            mock_cpp.calls_from_allele_counter.return_value = fake_candidates
            candidates, gvcfs = caller.calls_from_allele_counter(
                allele_counter, include_gvcfs)

        mock_cpp.calls_from_allele_counter.assert_called_once_with(
            allele_counter)
        self.assertEqual(candidates, fake_candidates)

        if not include_gvcfs:
            # gVCF output disabled: nothing should be emitted.
            self.assertEmpty(gvcfs)
            return

        # A record is expected at position 10, and positions 12 and 13 are
        # expected to be merged into a single 2 bp block.
        # NOTE(review): four records are required but only the first three
        # are inspected below.
        self.assertLen(gvcfs, 4)

        # Diploid genotype likelihoods with no coverage: each of the three
        # genotypes has probability 1/3, expressed in log10 space.
        flat_gls = np.log10([1.0 / 3] * 3)

        self.assertGVCF(
            gvcfs[0],
            ref='A',
            start=10,
            end=11,
            gq=1,
            min_dp=0,
            gls=flat_gls,
        )
        # At the variant site the genotype must NOT be called ("./.")
        # because the het likelihood exceeds the hom_ref likelihood.
        het_favoured_gls = np.array(
            [-14.0230482368, -7.993606e-15, -14.0230482368])
        self.assertGVCF(
            gvcfs[1],
            ref='G',
            start=11,
            end=12,
            gq=0,
            min_dp=20,
            gls=het_favoured_gls,
            gts=[-1, -1],
        )
        self.assertGVCF(
            gvcfs[2],
            ref='G',
            start=12,
            end=14,
            gq=1,
            min_dp=0,
            gls=flat_gls,
        )
Example #7
0
class AlleleFrequencyTest(parameterized.TestCase):
    """Parameterized tests for the `allele_frequency` module."""

    def _variant(start, end, ref, alts, chrom='chr20'):  # pylint: disable=no-self-argument
        """Builds a Variant proto for the parameter tables below.

        Deliberately a plain function without `self`: it is only called
        while the class body is being evaluated (inside the `parameterized`
        decorator arguments), before any test instance exists.

        Args:
          start: Value for the variant's `start` field.
          end: Value for the variant's `end` field.
          ref: Reference bases.
          alts: List of alternate alleles.
          chrom: Reference name.

        Returns:
          A variants_pb2.Variant with the given fields.
        """
        return variants_pb2.Variant(
            reference_name=chrom,
            start=start,
            end=end,
            reference_bases=ref,
            alternate_bases=alts,
        )

    @parameterized.parameters(
        # A SNP.
        dict(
            variant=_variant(60168, 60169, 'C', ['T']),
            reference_haplotype='GCACCT',
            reference_offset=60165,
            expected_return=[{
                'haplotype': 'GCATCT',
                'alt': 'T',
                'variant': _variant(60168, 60169, 'C', ['T']),
            }],
        ),
        # A deletion, with reference context extending past the variant.
        dict(
            variant=_variant(60284, 60291, 'ATTCCAG', ['AT']),
            reference_haplotype='TTTCCATTCCAGTCCAT',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCAT',
                'alt': 'AT',
                'variant': _variant(60284, 60291, 'ATTCCAG', ['AT']),
            }],
        ),
        # An insertion, with reference context extending past the variant.
        dict(
            variant=_variant(60279, 60285, 'TTTCCA', ['TTTCCATTCCA']),
            reference_haplotype='TTTCCATTCCAGTCCAT',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCATTCCAGTCCAT',
                'alt': 'TTTCCATTCCA',
                'variant': _variant(60279, 60285, 'TTTCCA', ['TTTCCATTCCA']),
            }],
        ),
        # The same deletion against a shorter reference haplotype.
        dict(
            variant=_variant(60284, 60291, 'ATTCCAG', ['AT']),
            reference_haplotype='TTTCCATTCCAG',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCAT',
                'alt': 'AT',
                'variant': _variant(60284, 60291, 'ATTCCAG', ['AT']),
            }],
        ),
        # The same insertion against a shorter reference haplotype.
        dict(
            variant=_variant(60279, 60285, 'TTTCCA', ['TTTCCATTCCA']),
            reference_haplotype='TTTCCATTCCAG',
            reference_offset=60279,
            expected_return=[{
                'haplotype': 'TTTCCATTCCATTCCAG',
                'alt': 'TTTCCATTCCA',
                'variant': _variant(60279, 60285, 'TTTCCA', ['TTTCCATTCCA']),
            }],
        ),
    )
    def test_update_haplotype(self, variant, reference_haplotype,
                              reference_offset, expected_return):
        """update_haplotype returns the expected list of haplotype dicts."""
        haplotypes = allele_frequency.update_haplotype(
            variant, reference_haplotype, reference_offset)
        self.assertListEqual(haplotypes, expected_return)

    @parameterized.parameters([
        dict(
            dv_variant=_variant(60284, 60291, 'ATTCCAG', ['AT']),
            cohort_variants=[
                _variant(60279, 60285, 'TTTCCA', ['T', 'TTTCCATTCCA']),
                _variant(60285, 60291, 'TTTCCA', ['T']),
            ],
            expected_ref_haplotype='TTTCCATTCCAG',
            expected_ref_offset=60279,
        ),
    ])
    def test_get_ref_haplotype_and_offset(self, dv_variant, cohort_variants,
                                          expected_ref_haplotype,
                                          expected_ref_offset):
        """get_ref_haplotype_and_offset yields the expected bases and offset."""
        reference = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)
        haplotype, offset = allele_frequency.get_ref_haplotype_and_offset(
            dv_variant, cohort_variants, reference)
        self.assertEqual(haplotype, expected_ref_haplotype)
        self.assertEqual(offset, expected_ref_offset)

    @parameterized.parameters(
        # A matched SNP.
        dict(
            variant=_variant(60168, 60169, 'C', ['T']),
            expected_return={'C': 0.9998, 'T': 0.0002},
            label='matched_snp_1',
        ),
        # A matched deletion.
        dict(
            variant=_variant(60285, 60291, 'TTCCAG', ['T']),
            expected_return={'T': 0.001198, 'TTCCAG': 0.998802},
            label='matched_del_1',
        ),
        # An unmatched deletion.
        dict(
            variant=_variant(60284, 60291, 'ATTCCAG', ['A']),
            expected_return={'A': 0, 'ATTCCAG': 1},
            label='unmatched_del_1',
        ),
        # A matched deletion, where the candidate is formatted differently.
        dict(
            variant=_variant(60284, 60291, 'ATTCCAG', ['AT']),
            expected_return={'AT': 0.001198, 'ATTCCAG': 0.998802},
            label='matched_del_2: diff representation',
        ),
        # An unmatched SNP.
        dict(
            variant=_variant(60150, 60151, 'C', ['T']),
            expected_return={'C': 1, 'T': 0},
            label='unmatched_snp_1',
        ),
        # A matched SNP and an unmatched SNP.
        dict(
            variant=_variant(60168, 60169, 'C', ['T', 'A']),
            expected_return={'C': 0.9998, 'T': 0.0002, 'A': 0},
            label='mixed_snp_1',
        ),
        # An unmatched SNP, where the REF allele frequency is not 1.
        dict(
            variant=_variant(60168, 60169, 'C', ['A']),
            expected_return={'C': 0.9998, 'A': 0},
            label='unmatched_snp_2: non-1 ref allele',
        ),
        # A multi-allelic candidate at a multi-allelic locus (all matched).
        dict(
            variant=_variant(60279, 60285, 'TTTCCA', ['T', 'TTTCCATTCCA']),
            expected_return={
                'TTTCCA': 0.999401,
                'T': 0.000399,
                'TTTCCATTCCA': 0.0002,
            },
            label='matched_mult_1',
        ),
        # A multi-allelic candidate at a multi-allelic locus (one unmatched).
        dict(
            variant=_variant(60279, 60285, 'TTTCCA', ['T', 'TATCCATTCCA']),
            expected_return={
                'TTTCCA': 0.999401,
                'T': 0.000399,
                'TATCCATTCCA': 0,
            },
            label='unmatched_mult_1',
        ),
        # [Different representation]
        # A deletion where the cohort variant is represented differently.
        # Here the REF frequency is calculated by going over all cohort ALTs,
        # so the dict values do not sum to 1.
        dict(
            variant=_variant(60295, 60301, 'TTCCAT', ['T']),
            expected_return={'T': 0.000399, 'TTCCAT': 0.923922},
            label='matched_del_3: diff representation',
        ),
        # [Non-candidate allele]
        # One allele of a multi-allelic cohort variant is not in the
        # candidate. The non-candidate allele should be ignored.
        dict(
            variant=_variant(60279, 60285, 'TTTCCA', ['T']),
            expected_return={'TTTCCA': 0.999401, 'T': 0.000399},
            label='matched_del_4: multi-allelic cohort',
        ),
        # A left-align example (multi-allelic).
        dict(
            variant=_variant(9074790, 9074794, 'CT', ['C', 'CTTT']),
            expected_return={'C': 0.167732, 'CTTT': 0.215256, 'CT': 0.442092},
            label='matched_mult_2: left align',
        ),
        # A left-align example (matched insertion).
        dict(
            variant=_variant(9074790, 9074794, 'C', ['CTTT']),
            expected_return={'CTTT': 0.145367, 'C': 0.442092},
            label='matched_ins_1: left align',
        ),
        # A left-align example (unmatched insertion).
        dict(
            variant=_variant(9074790, 9074793, 'CTT', ['CTTA']),
            expected_return={'CTTA': 0, 'CTT': 0.442092},
            label='unmatched_ins_1: left align',
        ),
        # A matched MNP (per the case label).
        dict(
            variant=_variant(61065, 61066, 'T', ['C']),
            expected_return={'C': 0.079872, 'T': 0.919729},
            label='matched_mnps_1',
        ),
        # A matched SNP alongside an allele with zero frequency.
        dict(
            variant=_variant(62022, 62023, 'G', ['C', 'T']),
            expected_return={'G': 0.996206, 'C': 0.003594, 'T': 0},
            label='matched_snp_2',
        ),
    )
    def test_find_matching_allele_frequency(self, variant, expected_return,
                                            label):
        """find_matching_allele_frequency returns the expected frequencies.

        Args:
          variant: Candidate variant to look up.
          expected_return: Mapping of allele to expected frequency.
          label: Case name, attached to assertion failures.
        """
        ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)
        vcf_reader = vcf.VcfReader(testdata.VCF_WITH_ALLELE_FREQUENCIES)
        observed = allele_frequency.find_matching_allele_frequency(
            variant, vcf_reader, ref_reader)
        # The allele sets must agree exactly...
        self.assertSetEqual(set(observed), set(expected_return), msg=label)
        # ...and each frequency must agree approximately.
        for allele in observed:
            self.assertAlmostEqual(observed[allele],
                                   expected_return[allele],
                                   msg=label)

    def test_make_population_vcf_readers_with_multiple_vcfs(self):
        """Per-chromosome VCFs yield one reader per chromosome."""
        readers = allele_frequency.make_population_vcf_readers(
            [testdata.AF_VCF_CHR20, testdata.AF_VCF_CHR21])

        covered = ('chr20', 'chr21')
        for chrom in covered:
            self.assertIsInstance(readers[chrom], vcf.VcfReader)
        for chrom in covered:
            self.assertEqual(next(readers[chrom]).reference_name, chrom)
        # chr22 must have no reader, rather than falling back to a reader
        # for a different chromosome.
        self.assertIsNone(readers['chr22'])

    def test_make_population_vcf_readers_with_one_vcf(self):
        """A single VCF is used for every chromosome name."""
        readers = allele_frequency.make_population_vcf_readers(
            [testdata.AF_VCF_CHR20_AND_21])

        chroms = ('chr20', 'chr21', 'chr22')
        for chrom in chroms:
            self.assertIsInstance(readers[chrom], vcf.VcfReader)
        # Every reference name maps to the same VCF, which starts with chr20.
        for chrom in chroms:
            self.assertEqual(next(readers[chrom]).reference_name, 'chr20')

    def test_make_population_vcf_readers_raises_on_shared_chromosomes(self):
        """Overlapping chromosome coverage across VCFs raises ValueError."""
        overlapping = [
            testdata.AF_VCF_CHR20,
            testdata.AF_VCF_CHR21,
            testdata.AF_VCF_CHR20_AND_21,
        ]

        with self.assertRaisesRegex(
                ValueError,
                'Variants on chr20 are included in multiple VCFs'):
            allele_frequency.make_population_vcf_readers(overlapping)

    @parameterized.parameters(
        dict(
            dv_calls=iter([
                deepvariant_pb2.DeepVariantCall(
                    variant=_variant(60168, 60169, 'C', ['T']),
                    allele_support=None),
            ]),
            expected_return={'C': 0.9998, 'T': 0.0002},
            testcase='valid',
        ),
        dict(
            dv_calls=iter([
                deepvariant_pb2.DeepVariantCall(
                    variant=_variant(10000, 10001, 'T', ['G'], chrom='chrM'),
                    allele_support=None),
            ]),
            expected_return={'T': 1, 'G': 0},
            testcase='no VCF',
        ),
    )
    def test_add_allele_frequencies_to_candidates(self, dv_calls,
                                                  expected_return, testcase):
        """add_allele_frequencies_to_candidates annotates the first call.

        Args:
          dv_calls: Iterator of DeepVariantCall protos to annotate.
          expected_return: Mapping of allele to expected frequency for the
            first annotated call.
          testcase: 'valid' to use real population VCF and reference readers,
            'no VCF' to pass None for both.

        Raises:
          ValueError: If `testcase` is neither 'valid' nor 'no VCF'.
        """
        if testcase not in ('valid', 'no VCF'):
            raise ValueError('Invalid testcase for parameterized test.')

        pop_vcf_reader = None
        ref_reader = None
        if testcase == 'valid':
            pop_vcf_reader = vcf.VcfReader(
                testdata.VCF_WITH_ALLELE_FREQUENCIES)
            ref_reader = fasta.IndexedFastaReader(testdata.GRCH38_FASTA)

        annotated_calls = list(
            allele_frequency.add_allele_frequencies_to_candidates(
                dv_calls, pop_vcf_reader, ref_reader))
        observed = annotated_calls[0].allele_frequency
        # The allele sets must agree exactly...
        self.assertSetEqual(set(observed.keys()),
                            set(expected_return.keys()))
        # ...and each frequency must agree approximately.
        for allele in observed.keys():
            self.assertAlmostEqual(observed[allele], expected_return[allele])