Example #1
    def testQueryTooLong(self):
        """
        If the query sequence is longer than the total of the lengths in the
        CIGAR operations, a ValueError must be raised.
        """
        # This test just returns. Prior to the fix of
        # https://github.com/acorg/dark-matter/issues/630 it was possible to
        # reach the "Query ... not fully consumed when parsing CIGAR string."
        # ValueError in sam.py. Now a CIGAR string whose total length differs
        # from the query sequence length cannot reach our code in sam.py,
        # because pysam catches the error first. I'm leaving this test here
        # because it documents that the error checked for in sam.py cannot
        # currently be reached, and because the test may become useful again.
        # For now it just returns.
        return
        data = '\n'.join([
            '@SQ SN:ref1 LN:90',
            'query1 0 ref1 1 60 4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            error = ('^Query TCTAGG not fully consumed when parsing CIGAR '
                     'string\\.')
            assertRaisesRegex(self, ValueError, error, list, ps.queries())
Example #2
    def testHardClippingInCIGARButQueryNotHardClipped(self):
        """
        As documented in https://github.com/acorg/dark-matter/issues/630 we
        must deal correctly with a case in which the CIGAR string says a
        query is hard-clipped but the query sequence in the SAM file
        actually isn't. This can be due to a prior alignment with a soft clip,
        in which case the full query sequence has to be given before the
        secondary alignment with the hard clip.
        """
        data = '\n'.join([
            '@SQ SN:Chimp-D00220 LN:8',
            '@SQ SN:D-AM494716 LN:8',
            '@SQ SN:D-XXX LN:8',
            '@SQ SN:Chimp-YYY LN:8',
            'query1 0 Chimp-D00220 1 0 3S5M * 0 0 TTTTGGTT 12345678',
            'query1 256 D-AM494716 1 0 3H5M * 0 0 * *',
            'query1 256 D-XXX 1 0 5H3M * 0 0 * *',
            'query1 0 Chimp-YYY 1 0 8M * 0 0 * *',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2, read3, read4) = list(ps.queries(addAlignment=True))

            self.assertEqual(Read('query1', 'TGGTT---', '45678!!!'), read1)
            self.assertEqual('TTTTGGTT', read1.alignment.query_sequence)

            self.assertEqual(Read('query1/1', 'TGGTT---', '45678!!!'), read2)
            self.assertEqual('TGGTT', read2.alignment.query_sequence)

            self.assertEqual(Read('query1/2', 'GTT-----', '678!!!!!'), read3)
            self.assertEqual('GTT', read3.alignment.query_sequence)

            self.assertEqual(Read('query1/3', 'TTTTGGTT', '12345678'), read4)
            self.assertEqual('TTTTGGTT', read4.alignment.query_sequence)
Example #3
def parseCommandLineOptions(args, returnSignificantOffsets=True):
    """
    Deal with the various command-line options added to the ArgumentParser
    instance by addCommandLineOptions.

    @param args: The result of calling C{parse_args} on an C{ArgumentParser}
        instance (the one that was passed to C{addCommandLineOptions}, unless
        we're testing).
    @param returnSignificantOffsets: If C{True} also return a list of the
        significant offsets (else that element of the return value will be
        C{None}).
    @return: A C{tuple}: (genomeLength, alignedReads, paddedSAM,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets).
    """
    genomeLength = None
    alignedReads = []
    samFilter = SAMFilter.parseFilteringOptions(args)

    if samFilter.referenceIds and len(samFilter.referenceIds) > 1:
        raise ValueError('Only one reference id can be given.')

    referenceLengths = samFilter.referenceLengths()

    if len(referenceLengths) == 1:
        referenceId, genomeLength = referenceLengths.popitem()
    else:
        raise ValueError(
            'If you do not specify a reference sequence with '
            '--referenceId, the SAM/BAM file must contain exactly one '
            'reference. But %s contains %d.' %
            (args.samfile, len(referenceLengths)))

    paddedSAM = PaddedSAM(samFilter)

    for query in paddedSAM.queries():
        alignedReads.append(AlignedRead(query.id, query.sequence))

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    if returnSignificantOffsets:
        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   args.minReads, args.homogeneousCutoff))
        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)
    else:
        significantOffsets = None

    return (genomeLength, alignedReads, paddedSAM, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets)
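
A minimal usage sketch (hypothetical, not taken from the project) showing how the tuple returned by parseCommandLineOptions might be unpacked. It assumes addCommandLineOptions has already added the options this function reads (args.samfile, args.minReads, args.homogeneousCutoff, and the SAM filtering options), and that both functions are importable from the project's module.

import argparse

# addCommandLineOptions and parseCommandLineOptions are assumed to be
# importable from the project; the parser setup here is illustrative only.
parser = argparse.ArgumentParser(
    description='Find significant offsets in a SAM/BAM alignment.')
addCommandLineOptions(parser)
args = parser.parse_args()

(genomeLength, alignedReads, paddedSAM, readCountAtOffset,
 baseCountAtOffset, readsAtOffset, significantOffsets) = \
    parseCommandLineOptions(args)

print('Genome length %d, %d aligned reads, %d significant offsets.' %
      (genomeLength, len(alignedReads), len(significantOffsets)))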
Example #4
    def testAllMMatch(self):
        """
        A simple all-'M' match must result in the expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read)
Example #5
    def testQuerySoftClipReachesRightEdge(self):
        """
        A match with a soft-clipped region that reaches to the right edge of
        the reference must result in the expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 5 60 2M4S * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '----TCTAGG', '!!!!ZZZZZZ'), read)
Example #6
    def testQuerySoftClipProtrudesLeft(self):
        """
        A match with a soft-clipped region that extends to the left of the
        reference must result in the expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 4S2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', 'AGG-------', 'ZZZ!!!!!!!'), read)
Example #7
    def testDropSecondary(self):
        """
        Dropping matches flagged as secondary must give the expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename, dropSecondary=True))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read)
Example #8
    def testQuerySoftClipProtrudesBothSides(self):
        """
        A match with a soft-clipped region that extends to both the left and
        right of the reference must result in the expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 4 60 5S5M5S * 0 0 TCTAGGCTGACTAAG ZZZZZZZZZZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', 'TAGGCTGACT', 'ZZZZZZZZZZ'), read)
Example #9
    def testReferencesToStr(self):
        """
        The referencesToStr method must return the expected string.
        """
        data = '\n'.join([
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:91',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            self.assertEqual('id1 (length 90)\nid2 (length 91)',
                             ps.referencesToStr())
            ps.close()
Example #10
    def testReferenceDeletion(self):
        """
        A deletion of reference bases must result in the expected padded
        sequence (with Ns inserted for the deleted reference bases).
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCNNTAGG-', '!ZZ!!ZZZZ!'), read)
Example #11
    def testMixedMatch(self):
        """
        A match that is a mix of M, =, and X must result in the expected
        padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read)
Example #12
    def testMixedMatchSpecificReferenceButNoMatches(self):
        """
        A request for reads aligned against a reference that exists but that
        has no matches must result in an empty list.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:15',
            '@SQ SN:ref2 LN:15',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename, referenceIds={'ref2'}))
            self.assertEqual([], list(ps.queries()))
Example #13
    def testRcNeeded(self):
        """
        A reverse-complemented match (flag = 16) when rcNeeded=True is passed
        must result in the expected (reverse-complemented) padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(rcNeeded=True))
            self.assertEqual(Read('query1', '-CCTAGA---'), read)
            ps.close()
Example #14
    def testReferenceSkipAlternateChar(self):
        """
        A skip of reference bases must result in the expected padded sequence
        (with the passed query insertion character) when a queryInsertionChar
        is passed.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(queryInsertionChar='X'))
            self.assertEqual(Read('query1', '-TCXXTAGG-'), read)
            ps.close()
Example #15
    def testRcNeeded(self):
        """
        A reverse-complemented match (flag = 16) when rcNeeded=True is passed
        must result in the expected (reverse complemented) padded sequence
        and reversed quality string.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries(rcNeeded=True))
            self.assertEqual(Read('query1', '-CCTAGA---', '!654321!!!'), read)
Example #16
    def testHardClipRight(self):
        """
        A simple all-'M' match with a hard clip right must result in the
        expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 6M10H * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---'), read)
            ps.close()
Example #17
    def testRcSuffix(self):
        """
        A reverse-complemented sequence should have the rcSuffix string added
        to its id when an rcSuffix value is passed.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(rcSuffix='-rc'))
            self.assertEqual(Read('query1-rc', '-TCTAGG---'), read)
            ps.close()
Example #18
    def testReferenceSkip(self):
        """
        A skip of reference bases must result in the expected padded
        sequence with the passed unknown quality character when the
        unknownQualityChar argument is passed.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries(unknownQualityChar='.'))
            self.assertEqual(Read('query1', '-TCNNTAGG-', '.ZZ..ZZZZ.'), read)
Example #19
    def testRcSuffix(self):
        """
        A reverse-complemented sequence should have the rcSuffix string added
        to its id when an rcSuffix value is passed.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries(rcSuffix='-rc', rcNeeded=True))
            self.assertEqual(Read('query1-rc', '-CCTAGA---', '!654321!!!'),
                             read)
Example #20
    def testQuerySoftClipProtrudesRight(self):
        """
        A match with a soft-clipped region that extends to the right of
        the reference must result in the expected padded sequence.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 6 60 2M4S * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-----TCTAG'), read)
            ps.close()
Example #21
    def testQueryHardClipAndSoftClipProtrudesBothSides(self):
        """
        A match with a soft-clipped region that extends to both the left and
        right of the reference must result in the expected padded sequence
        when hard clipping is also indicated by the CIGAR string.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 4 0 3H5S5M4S5H * 0 0 TCTAGGCTGACTAA ZZZZZZZZZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', 'TAGGCTGACT'), read)
            ps.close()
Example #22
    def testMixedMatchSpecificReference(self):
        """
        A match that is a mix of M, =, and X must result in the expected
        padded sequence when a reference sequence is specified.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            '@SQ SN:ref2 LN:15',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(referenceName='ref1'))
            self.assertEqual(Read('query1', '-TCTAGG---'), read)
            ps.close()
Example #23
    def testKeepQualityControlFailures(self):
        """
        Keeping matches flagged as quality control failures must give the
        expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename, keepQCFailures=True))
            (read1, read2) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read1)
            self.assertEqual(Read('query2', '---TC-----', '!!!ZZ!!!!!'), read2)
Example #24
    def testMinLength(self):
        """
        A request for reads longer than a certain value must give the
        expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(minLength=6))
            self.assertEqual(Read('query1', '-TCTAGG---'), read)
            ps.close()
Example #25
    def testMinLengthWithReferenceDeletion(self):
        """
        The minLength specification must be applied after deletion of
        reference bases (which results in the query being lengthened to
        continue the match).
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(minLength=7))
            self.assertEqual(Read('query1', '-TCNNTAGG-'), read)
            ps.close()
Example #26
    def testDropDuplicates(self):
        """
        Dropping matches flagged as optical or PCR duplicates must give the
        expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 1024 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(filename)
            (read,) = list(ps.queries(dropDuplicates=True))
            self.assertEqual(Read('query1', '-TCTAGG---'), read)
            ps.close()
Example #27
    def testMinLength(self):
        """
        A request for only those reads longer than a certain value must give
        the expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            filterRead = ReadFilter(minLength=6).filter
            ps = PaddedSAM(SAMFilter(filename, filterRead=filterRead))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read)
Example #28
    def testReferenceSkipAlternateChars(self):
        """
        A skip of reference bases must result in the expected padded
        sequence (with the passed query insertion character and unknown
        quality character) when queryInsertionChar and unknownQualityChar
        arguments are passed.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries(queryInsertionChar='X',
                                      unknownQualityChar='+'))
            self.assertEqual(Read('query1', '-TCXXTAGG-', '+ZZ++ZZZZ+'), read)
Example #29
    def testKF414679SoftClipLeft(self):
        """
        Regression test for a left soft-clip case (from sequence KF414679)
        that previously wasn't handled correctly.
        """
        seq = ('GCCATGCAGTGGAACTCCACAGCATTCCACCAAGCTCTGC'
               'AGAATCCCAAAGTCAGGGGTTTGTATCTTCTTGCTGGTGGC')
        quality = ('ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ'
                   'ZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZZ')
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 5 60 18S63M * 0 0 %s %s' % (seq, quality),
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', seq[14:], quality[14:]), read)
Example #30
    def testNotSecondaryAndNotSupplementaryWithNoSequence(self):
        """
        An alignment with a '*' seq that is not secondary or supplementary
        must result in a ValueError being raised.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query 0 ref1 3 60 4M * 0 0 * *',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            error = ('^pysam produced an alignment \\(number 1\\) with no '
                     'query sequence without previously giving an alignment '
                     'with a sequence\\.$')
            queries = ps.queries()
            assertRaisesRegex(self, InvalidSAM, error, list, queries)
Example #31
    def testAllowDuplicateIds(self):
        """
        It must be possible to allow duplicate ids (in this case due to a
        secondary match).
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query1 0 ref1 3 60 2= * 0 0 TC ZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2) = list(ps.queries(allowDuplicateIds=True))
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read1)
            self.assertEqual(Read('query1', '--TC------', '!!ZZ!!!!!!'),
                             read2)
Example #32
    def testSupplementaryWithNoSequence(self):
        """
        A supplementary match with a '*' seq must result in the sequence from
        the previous query being used.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 3M * 0 0 TCT ZZZ',
            'query2 0 ref1 2 60 4M * 0 0 TCTA ZZZZ',
            'query2 2048 ref1 6 60 4M * 0 0 * *',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2, read3) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCT------', '!ZZZ!!!!!!'), read1)
            self.assertEqual(Read('query2', '-TCTA-----', '!ZZZZ!!!!!'), read2)
            self.assertEqual(Read('query2/1', '-----TCTA-', '!!!!!ZZZZ!'),
                             read3)
Example #33
    def testReferenceInsertion(self):
        """
        An insertion into the reference must result in the expected padded
        sequence and the expected value in the referenceInsertions dictionary.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2I2M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read,) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCGG-----', '!ZZZZ!!!!!'), read)
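            # referenceInsertions maps each query id to a list of (0-based
            # reference offset, inserted bases) tuples.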
            self.assertEqual(
                {
                    'query1': [(3, 'TA')],
                },
                ps.referenceInsertions)
Example #34
    def testDuplicateIdDisambiguation(self):
        """
        Duplicate ids must be disambiguated if allowDuplicateIds is not given.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query1 0 ref1 3 60 2= * 0 0 TC ZZ',
            'query1 0 ref1 5 60 2S2= * 0 0 TCGA ZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2, read3) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read1)
            self.assertEqual(Read('query1/1', '--TC------', '!!ZZ!!!!!!'),
                             read2)
            self.assertEqual(Read('query1/2', '--TCGA----', '!!ZZZZ!!!!'),
                             read3)
Example #35
    def testSecondaryAlignmentHasQuery(self):
        """
        If the first alignment of a query is against a reference that is not
        wanted, a subsequent secondary alignment (SAM flag = 256) must have
        the original query and quality strings (even though these are only
        present in the SAM as * characters and the query is None when it comes
        back from pysam).
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            '@SQ SN:ref2 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query1 256 ref2 2 60 2=2X2M * 0 0 * *',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2) = list(ps.queries(addAlignment=True))
            self.assertEqual(Read('query1', '-TCTAGG---', '!ZZZZZZ!!!'), read1)
            self.assertEqual('ref1', read1.alignment.reference_name)
            self.assertEqual(Read('query1/1', '-TCTAGG---', '!ZZZZZZ!!!'),
                             read2)
            self.assertEqual('ref2', read2.alignment.reference_name)
Example #36
    def testAlsoYieldAlignments(self):
        """
        A request for queries with their pysam alignments should have the
        expected result.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG 123456',
            'query2 0 ref1 2 60 2= * 0 0 TC 78',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2) = list(ps.queries(addAlignment=True))

            self.assertEqual(Read('query1', '-TCTAGG---', '!123456!!!'), read1)
            self.assertEqual('TCTAGG', read1.alignment.query_sequence)
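            # pysam's query_qualities holds integer Phred scores; adding 33
            # converts each score to its SAM/FASTQ ASCII quality character.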
            self.assertEqual('123456', ''.join(
                map(lambda x: chr(x + 33), read1.alignment.query_qualities)))

            self.assertEqual(Read('query2', '-TC-------', '!78!!!!!!!'), read2)
            self.assertEqual('TC', read2.alignment.query_sequence)
            self.assertEqual('78', ''.join(
                map(lambda x: chr(x + 33), read2.alignment.query_qualities)))
Example #37
    def testPrimaryAndSecondaryReferenceInsertion(self):
        """
        A primary and secondary insertion into the reference (of the same
        query) must result in the expected padded sequences and the expected
        value in the referenceInsertions dictionary.
        """
        data = '\n'.join([
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2I2M * 0 0 TCTAGG ZZZZZZ',
            'query1 256 ref1 4 60 2M3I1M * 0 0 * *',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            (read1, read2) = list(ps.queries())
            self.assertEqual(Read('query1', '-TCGG-----', '!ZZZZ!!!!!'), read1)
            self.assertEqual(Read('query1/1', '---TCG----', '!!!ZZZ!!!!'),
                             read2)
            self.assertEqual(
                {
                    'query1': [(3, 'TA')],
                    'query1/1': [(5, 'TAG')],
                },
                ps.referenceInsertions)
Example #38
    def initialReferenceIdAnalysis(self, referenceId, alignmentFile,
                                   outputDir):
        """
        Analyze the given reference id in the given alignment file (if an
        alignment to the reference id is present).

        @param referenceId: The C{str} id of the reference sequence to analyze.
        @param alignmentFile: The C{str} name of an alignment file.
        @param outputDir: The C{str} name of the output directory.
        @return: C{None} if C{referenceId} is not present in C{alignmentFile}
            or if no significant offsets are found. Else, a C{tuple} of
            (genomeLength, alignedReads, readCountAtOffset, baseCountAtOffset,
            readsAtOffset, significantOffsets, samFilter, paddedSAM).
        """

        # Make sure this reference id is in this alignment file and if so
        # get its length (and check it's the same as the length of the
        # sequence given in the reference file).
        with samfile(alignmentFile) as sam:
            tid = sam.get_tid(referenceId)
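            # get_tid returns -1 when the reference name is not present in
            # the file's header.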
            if tid == -1:
                # This referenceId is not in this alignment file.
                self.report('    Reference %s not in alignment file.' %
                            referenceId)
                return
            else:
                genomeLength = sam.lengths[tid]
                # Sanity check.
                assert genomeLength == len(self.referenceGenomes[referenceId])

        if self.plotSAM:
            filename = join(outputDir, 'reads.html')
            self.report('    Saving reads alignment plot to %s' % filename)
            plotSAM(SAMFilter(alignmentFile, referenceIds={referenceId}),
                    filename,
                    title=referenceId,
                    jitter=0.45)

        alignedReads = []
        samFilter = SAMFilter(
            alignmentFile,
            referenceIds={referenceId},
            dropDuplicates=True,
            dropSupplementary=True,
            # dropSecondary=True,
            storeQueryIds=True)
        paddedSAM = PaddedSAM(samFilter)
        for query in paddedSAM.queries(addAlignment=True):
            assert len(query) == genomeLength
            alignedReads.append(
                AlignedRead(query.id, query.sequence, query.alignment))

        # Sanity check that all aligned reads have different ids. This
        # should be the case because the padded SAM queries method adds /1,
        # /2, etc. to queries that have more than one alignment.
        assert len(alignedReads) == len(set(read.id for read in alignedReads))

        readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
            genomeLength, alignedReads)

        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   self.minReads, self.homogeneousCutoff))

        self.report(
            '    %d alignment%s (of %d unique %s) read from %s' %
            (samFilter.alignmentCount, s(samFilter.alignmentCount),
             len(samFilter.queryIds),
             'query' if len(samFilter.queryIds) == 1 else 'queries',
             alignmentFile))
        self.report('    %d of which %s aligned to %s' %
                    (len(alignedReads),
                     'was' if len(alignedReads) == 1 else 'were', referenceId))
        self.report('    Reference genome length %d' % genomeLength)
        self.report('    Found %d significant location%s' %
                    (len(significantOffsets), s(len(significantOffsets))))

        self.saveBaseFrequencies(outputDir, genomeLength, baseCountAtOffset)

        if not significantOffsets:
            self.report('    No significant locations found.')
            return

        if self.saveReducedFASTA:
            self.saveReducedFasta(significantOffsets, outputDir)

        self._plotCoverageAndSignificantLocations(referenceId, alignmentFile,
                                                  readCountAtOffset,
                                                  genomeLength,
                                                  significantOffsets,
                                                  outputDir)

        self.saveSignificantOffsets(significantOffsets, outputDir)

        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)

        self.saveReferenceBaseFrequencyPlot(referenceId, genomeLength,
                                            significantOffsets,
                                            baseCountAtOffset,
                                            readCountAtOffset, outputDir)

        # Save the reference.
        filename = join(outputDir, 'reference.fasta')
        self.report('    Saving reference to %s' % filename)
        reference = self.referenceGenomes[referenceId]
        Reads([reference]).save(filename)

        # Extract a consensus according to bcftools.
        self.writeBcftoolsConsensus(referenceId, alignmentFile, outputDir)

        return (genomeLength, alignedReads, readCountAtOffset,
                baseCountAtOffset, readsAtOffset, significantOffsets,
                samFilter, paddedSAM)
          'fact we typically do not have the reference in the SAM/BAM file), '
          'we cut the inserted bases out of the aligned query and save the '
          'information about what would have been inserted and where. That '
          'information is printed by this option. The output gives the '
          '0-based offset where the inserted base would be placed, followed '
          'by a list of the nucleotides that were suggested as being '
          'inserted and the number of times each nucleotide was suggested. '
          'So for example the output might contain "27: T:3, G:10" which '
          'indicates that 13 query (3 with T and 10 with G) matches would '
          'insert a nucleotide into the reference at offset 27.'))

SAMFilter.addFilteringOptions(parser)
addFASTAFilteringCommandLineOptions(parser)

args = parser.parse_args()
reads = parseFASTAFilteringCommandLineOptions(args, Reads())
samFilter = SAMFilter.parseFilteringOptions(args, filterRead=reads.filterRead)
paddedSAM = PaddedSAM(samFilter)

for read in paddedSAM.queries(rcSuffix=args.rcSuffix, rcNeeded=args.rcNeeded):
    print(read.toString('fasta'), end='')

if args.listReferenceInsertions:
    if paddedSAM.referenceInsertions:
        print('(0-based) insertions into the reference:\n%s' %
              nucleotidesToStr(paddedSAM.referenceInsertions, '  '),
              file=sys.stderr)
    else:
        print('No matches required an insertion into the reference.',
              file=sys.stderr)