Esempio n. 1
0
    def testOneLowFrequencyDeletion(self):
        """
        A deletion that is seen in only one of the three reads covering a
        site falls short of the deletionThreshold (0.5) used here, so the
        consensus must keep a base at that site and contain no deletion
        symbol ('x').
        """
        alignment = (
            'CACGTG',
            ' A-A',
            ' ?-?',
            ' AGA',
            ' ???',
            ' AGA',
            ' ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                deletionThreshold=0.5,
                deletionSymbol='x',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CAGATG', consensus.sequence)
Esempio n. 2
0
    def testOmicronEPE214Insertion(self):
        """
        Inserting the nucleotides 'GAGCCAGAA' (amino acids EPE) into the
        SARS-CoV-2 spike nucleotide sequence at location 642 (amino acid
        location 214) must produce a consensus containing the insertion.

        The reference nucleotides used below can be obtained via:

        $ ncbi-fetch-id.py MN908947.3 > MN908947.3.fasta
        $ describe-genome.py --feature S --printNtSeq < MN908947.3.fasta |
              filter-fasta.py --keepSites 630-673 --quiet | tail -n 1

        The 9 nucleotide sequence GAGCCAGAA was then inserted (in the read)
        before the TGAT... that starts at position 642.
        """
        alignment = (
            'TAATTTAGTGCG---------TGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC',
            '      AGTGCGGAGCCAGAATGATCTCCCTCAGGGTTTTTCGGCTTT',
            '      ??????????????????????????????????????????',
        )
        expected = 'TAATTTAGTGCGGAGCCAGAATGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC'

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual(expected, consensus.sequence)
Esempio n. 3
0
    def testLowReadsCharNoCoverageX(self):
        """
        When a site is covered by fewer reads than minCoverage and the
        low-coverage symbol is 'N' (or '?'), that symbol must be used at the
        low-coverage sites, while sites with no coverage at all must get the
        no-coverage symbol (here 'X').
        """
        alignment = (
            'ACGTTCCG',
            '  GTT',
            '  ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            # Check each low-coverage symbol against the same BAM file.
            for char in 'N?':
                consensus = consensusFromBAM(
                    bam,
                    referenceFasta=refFasta,
                    minCoverage=2,
                    lowCoverage=char,
                    noCoverage='X',
                    ignoreQuality=self.ignoreQuality,
                    quiet=True)

                self.assertEqual('XX' + char * 3 + 'XXX',
                                 consensus.sequence)
Esempio n. 4
0
    def testTwoReadsDifferingFromPartOfTheReferenceLowAndNoCoverage(self):
        """
        Two reads that disagree with the reference, with minCoverage=2: sites
        covered by both reads must take the read bases, the site covered by
        only one read must get the low-coverage symbol (here '+'), and the
        uncovered sites must get the no-coverage symbol (here 'N').
        """
        alignment = (
            'ACGTTCCG',
            '  AAA',
            '  ???',
            '  AAAT',
            '  ????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                minCoverage=2,
                lowCoverage='+',
                noCoverage='N',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('NNAAA+NN', consensus.sequence)
Esempio n. 5
0
    def testNoReadsReference(self):
        """
        With no reads at all and noCoverage='reference', the consensus must
        be identical to the reference sequence.
        """
        reference = 'ACGTTCCG'

        with makeBAM((reference,)) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                noCoverage='reference',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual(reference, consensus.sequence)
Esempio n. 6
0
    def testNoReadsN(self):
        """
        With no reads at all and a single-character no-coverage symbol ('N'
        or '?'), the consensus must be that symbol repeated for the full
        length of the reference.
        """
        reference = 'ACGTTCCG'

        with makeBAM((reference,)) as (refFasta, bam):
            # Check each no-coverage symbol against the same BAM file.
            for char in 'N?':
                consensus = consensusFromBAM(
                    bam,
                    referenceFasta=refFasta,
                    noCoverage=char,
                    ignoreQuality=self.ignoreQuality,
                    quiet=True)

                self.assertEqual(char * len(reference), consensus.sequence)
Esempio n. 7
0
    def testReferenceAndReferenceIdNotGiven(self):
        """
        When no BAM reference id (bamId) is passed, the single reference in
        the BAM file should be used. A string of 'N's is expected because
        there are no reads and both noCoverage and lowCoverage are 'N'.

        NOTE(review): the test name says the reference is "not given", yet
        referenceFasta is passed below. Confirm whether that argument is
        required or whether the name is left over from an earlier API.
        """
        reference = 'ACGTTCCG'

        with makeBAM((reference,)) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                lowCoverage='N',
                noCoverage='N',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('N' * len(reference), consensus.sequence)
Esempio n. 8
0
    def testNoReadsReferenceFromId(self):
        """
        The reference can be selected using just its id (bamId), without
        passing a reference FASTA file. A string of 'N's is expected because
        there are no reads and both noCoverage and lowCoverage are 'N'.
        """
        reference = 'ACGTTCCG'

        # The FASTA file made by makeBAM is deliberately unused here.
        with makeBAM((reference,)) as (_, bam):
            # 'ref-id' is presumably the id makeBAM gives to the reference
            # in the BAM file -- confirm against makeBAM.
            consensus = consensusFromBAM(
                bam,
                bamId='ref-id',
                lowCoverage='N',
                noCoverage='N',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('N' * len(reference), consensus.sequence)
Esempio n. 9
0
    def testReadHasLaterSitesNoSoftClipped(self):
        """
        Bases of a read that extend past the end of the reference must not
        appear in the consensus when the inclusion of soft-clipped bases has
        not been requested.
        """
        alignment = (
            'ACGT',
            '   TAA',
            '   ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACGT', consensus.sequence)
Esempio n. 10
0
    def testTwoDeletionsFromReferenceUnmarked(self):
        """
        Two deletions from the reference must be handled correctly. With an
        empty deletionSymbol the deleted sites leave no trace, so the
        consensus is shorter than the reference.
        """
        alignment = (
            'CACGTG',
            ' A-A-G',
            ' ?-?-?',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                deletionSymbol='',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CAAG', consensus.sequence)
Esempio n. 11
0
    def testOneInsertionInReference(self):
        """
        A single insertion into the reference, supported by one read, must
        appear in the consensus when insertionCountThreshold is 1.
        """
        alignment = (
            'CA-CGTG',
            ' AAC',
            ' ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CAACGTG', consensus.sequence)
Esempio n. 12
0
    def testGeneiousExamplesTie(self):
        """
        Check the tied counts example from
        https://assets.geneious.com/manual/2020.1/static/GeneiousManualse43.html
        at three different thresholds.
        """
        # Twelve single-base reads at the third site: eight A, two G and two
        # T, so G and T are tied for second place.
        reads = ('  A', '  ?') * 8 + ('  G', '  ?') * 2 + ('  T', '  ?') * 2
        alignment = ('ACGT',) + reads

        cases = (
            ('A', 0.4),
            ('D', 0.7),
            ('D', 0.95),
        )

        with makeBAM(alignment) as (refFasta, bam):
            for expected, threshold in cases:
                consensus = consensusFromBAM(
                    bam,
                    referenceFasta=refFasta,
                    threshold=threshold,
                    ignoreQuality=self.ignoreQuality,
                    quiet=True)

                self.assertEqual(f'AC{expected}T', consensus.sequence)
Esempio n. 13
0
    def testOneReadMatchingPartOfTheReference(self):
        """
        A single read that agrees with the part of the reference it covers,
        combined with noCoverage='reference' for the uncovered sites, must
        give a consensus identical to the reference.
        """
        reference = 'ACGTTCCG'
        alignment = (
            reference,
            '  GTT',
            '  ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                noCoverage='reference',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual(reference, consensus.sequence)
Esempio n. 14
0
    def testLowReadsReference(self):
        """
        When sites are covered by fewer reads than minCoverage and
        lowCoverage='reference', the consensus must be identical to the
        reference.
        """
        reference = 'ACGTTCCG'
        alignment = (
            reference,
            '  GTT',
            '  ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                minCoverage=2,
                lowCoverage='reference',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual(reference, consensus.sequence)
Esempio n. 15
0
    def testOneReadDifferingFromPartOfTheReference(self):
        """
        A single read that disagrees with the part of the reference it
        covers, combined with noCoverage='reference', must give a hybrid:
        read bases at the covered sites and reference bases elsewhere.
        """
        alignment = (
            'ACGTTCCG',
            '  AAA',
            '  ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                noCoverage='reference',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACAAACCG', consensus.sequence)
Esempio n. 16
0
    def testTwoAgreeingSoftClipsNothingBeforeNoSoftClipped(self):
        """
        Two reads whose bases before the start of the reference agree with
        each other must have those bases ignored when soft-clipped bases are
        not included.
        """
        alignment = (
            '        TGATCTCC',
            'AGCCAGAATGATCTCC',
            '????????????????',
            '    AGAATGATCTCC',
            '    ????????????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TGATCTCC', consensus.sequence)
Esempio n. 17
0
    def testReadHasEarlierAndLaterSites(self):
        """
        When a read extends both before the start and past the end of the
        reference and includeSoftClipped is True, the extra bases on both
        sides must be part of the consensus.
        """
        alignment = (
            '  ACGT',
            'TCACGTGA',
            '????????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                includeSoftClipped=True,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TCACGTGA', consensus.sequence)
Esempio n. 18
0
    def testOneInsertionInReferenceLowFrequency(self):
        """
        An insertion supported by a single read must be left out of the
        consensus when insertionCountThreshold is 2.
        """
        alignment = (
            'CA-CGTG',
            ' ATC',
            ' ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                insertionCountThreshold=2,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CACGTG', consensus.sequence)
Esempio n. 19
0
    def testTwoInsertions(self):
        """
        Two reads, each carrying an insertion at a different place in the
        reference (and the first also matching the reference bases before
        its insertion), must both have their insertions in the consensus
        when insertionCountThreshold is 1.
        """
        alignment = (
            'TT--------TGAT--CTCC',
            'TTAGCCAGAATGA',
            '?????????????',
            '           GATGGCTCC',
            '           ?????????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TTAGCCAGAATGATGGCTCC', consensus.sequence)
Esempio n. 20
0
    def testOmicronEPE214PartialNoSoftClipped(self):
        """
        A trailing part (here 'AGCCAGAA') of the EPE insertion that is
        usually found at location 642 of the SARS-CoV-2 spike nucleotide
        sequence (amino acid location 214) must be excluded from the
        consensus when it appears as a partial soft-clipped region and
        soft-clipped bases are not included.
        """
        alignment = (
            '        TGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC',
            'AGCCAGAATGATCTCCCTCAGGGTTTTTCGGCTTT',
            '???????????????????????????????????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC',
                             consensus.sequence)
Esempio n. 21
0
    def testOmicronEPE214InsertionLeftSideExactSoftClipped(self):
        """
        The EPE insertion (here 'GAGCCAGAA') at location 642 of the
        SARS-CoV-2 spike nucleotide sequence (amino acid location 214) must
        work as expected when the inserted bases are the very beginning of
        the read, in which case they will be marked as a soft-clipped region.

        In the expected consensus the nine soft-clipped bases take the place
        of the nine reference bases that precede the match, so the consensus
        has the same length as the (ungapped) reference.
        """
        alignment = (
            'TAATTTAGTGCG---------TGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC',
            '            GAGCCAGAATGATCT',
            '            ???????????????',
        )
        expected = 'TAAGAGCCAGAATGATCTCCCTCAGGGTTTTTCGGCTTTAGAAC'

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                includeSoftClipped=True,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual(expected, consensus.sequence)
Esempio n. 22
0
    def testTwoInsertionsAndTwoDeletionsInReference(self):
        """
        Two insertions into and two deletions from the reference must all be
        handled correctly, with each deleted site shown in the consensus
        using the requested deletion symbol (here 'x').
        """
        alignment = (
            'CA-CGT-G',
            ' AA--',
            ' ??--',
            '     TAG',
            '     ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                insertionCountThreshold=1,
                deletionSymbol='x',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CAAxxTAG', consensus.sequence)
Esempio n. 23
0
    def testTwoInsertionsAndOneDeletionInReference(self):
        """
        Two insertions into and one deletion from the reference must all be
        handled correctly, with the deleted site shown in the consensus
        using the requested deletion symbol (here 'x').
        """
        alignment = (
            'CA-CGT-G',
            ' AA-G',
            ' ??-?',
            '     TAG',
            '     ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                insertionCountThreshold=1,
                deletionSymbol='x',
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('CAAxGTAG', consensus.sequence)
Esempio n. 24
0
    def testOmicronEPE214PartialInsertionInTwoReads(self):
        """
        Companion to testOmicronEPE214PartialInsertionInTwoReadsSoftClipped:
        here the reads are fully aligned with the reference so there is no
        soft clipping. The alignment of the reads is the same and so is the
        expected result.
        """
        alignment = (
            'TAATTTAGTGCGTGATCTCCCTCA',
            '      AGTGCGGAGCCA',
            '      ????????????',
            '    AGCCAGAATGATCTCCCTCA',
            '    ????????????????????',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TAATAGMSWGMRKRRYCWCCCTCA', consensus.sequence)
Esempio n. 25
0
    def testSimpleMajorityBelowThreshold(self):
        """
        When the most frequent base at a site (here two reads out of three)
        does not reach the threshold (0.7), the consensus must contain the
        ambiguous code for the conflicting bases instead.
        """
        alignment = (
            'ACGT',
            '  A',
            '  ?',
            '  A',
            '  ?',
            '  C',
            '  ?',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACMT', consensus.sequence)
Esempio n. 26
0
    def testSimpleMajority(self):
        """
        When two of the three reads at a site agree and the threshold is
        0.5, the majority base must be the one in the consensus.
        """
        alignment = (
            'ACGT',
            '  A',
            '  ?',
            '  A',
            '  ?',
            '  C',
            '  ?',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.5,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACAT', consensus.sequence)
Esempio n. 27
0
    def testOmicronEPE214PartialInsertionInTwoReadsSoftClipped(self):
        """
        The EPE insertion (here 'GAGCCAGAA') at location 642 of the
        SARS-CoV-2 spike nucleotide sequence (amino acid location 214) must
        work as expected when two reads each cover it only partially and are
        therefore both soft-clipped in that region. The reads overlap in a
        non-agreeing way, so ambiguous nucleotide codes are expected.
        """
        alignment = (
            'TAATTTAGTGCG---------TGATCTCCCTCA',
            '      AGTGCGGAGCCA',
            '      ????????????',
            '             AGCCAGAATGATCTCCCTCA',
            '             ????????????????????',
        )

        # This case is a little involved. Once the '-' characters are
        # dropped from the reference, the reference and the two reads line
        # up as shown below. The final row of hyphens marks the region in
        # which ambiguous codes are produced.
        #
        # TAATTTAGTGCGTGATCTCCCTCA
        #       AGTGCGGAGCCA
        #     AGCCAGAATGATCTCCCTCA
        #       ------------

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                includeSoftClipped=True,
                insertionCountThreshold=1,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('TAATAGMSWGMRKRRYCWCCCTCA', consensus.sequence)
Esempio n. 28
0
    def testReadsHaveEarlierAndLaterSitesNoSoftClipped(self):
        """
        When one read extends before the start of the reference and another
        extends past its end, none of those extra bases may appear in the
        consensus if soft-clipped bases are not included. This is the
        two-read counterpart of testReadHasEarlierAndLaterSitesNoSoftClipped,
        which uses a single read to supply both the before and after sites.
        """
        alignment = (
            '  ACGT',
            'TCA',
            '???',
            '     TGA',
            '     ???',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.7,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACGT', consensus.sequence)
Esempio n. 29
0
    def testHighQualityDominates(self):
        """
        A single read with a very high quality base (here 'C', with quality
        ']' = 60) must win out over two reads that agree with each other but
        whose bases have lower quality ('5' = 30).
        """
        alignment = (
            'ACGT',
            '  A',
            '  5',
            '  A',
            '  5',
            '  C',
            '  ]',
        )

        with makeBAM(alignment) as (refFasta, bam):
            consensus = consensusFromBAM(
                bam,
                referenceFasta=refFasta,
                threshold=0.5,
                ignoreQuality=self.ignoreQuality,
                quiet=True)

            self.assertEqual('ACCT', consensus.sequence)