コード例 #1
0
    def testMinScoreNoScores(self):
        """
        Asking for alignments whose score is at least some value must give
        an empty result if none of the alignments has a score.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, minScore=6)
            found = list(samFilter.alignments())
            self.assertEqual([], found)
コード例 #2
0
    def testDropSecondary(self):
        """
        When dropSecondary is True, an alignment flagged as secondary (256)
        must not be returned.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, dropSecondary=True)
            # Exactly one alignment is expected to survive the filter.
            [kept] = list(samFilter.alignments())
            self.assertEqual('query1', kept.query_name)
コード例 #3
0
    def testStoreQueryIds(self):
        """
        When storeQueryIds is True, the set of query ids seen must be
        available on the filter once the alignments have been read.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG 123456',
            'query2 0 ref1 2 60 2= * 0 0 TC XY',
            'query2 0 ref1 2 60 2= * 0 0 TC XY',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, storeQueryIds=True)
            # Consume every alignment; only the side effect matters here.
            for _ in samFilter.alignments():
                pass
            self.assertEqual({'query1', 'query2'}, samFilter.queryIds)
コード例 #4
0
    def testAlignmentCount(self):
        """
        After all alignments have been yielded, the filter's alignment
        count must equal the number of alignments in the file.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG 123456',
            'query2 0 ref1 2 60 2= * 0 0 TC XY',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath)
            # Consume every alignment so that the count is final.
            for _ in samFilter.alignments():
                pass
            self.assertEqual(2, samFilter.alignmentCount)
コード例 #5
0
    def testMaxScore(self):
        """
        Asking for alignments whose score is at most some value must give
        the expected result when only some alignments have scores.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:10',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
            'query3 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:3',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, maxScore=6)
            # Exactly one alignment is expected to survive the filter.
            [kept] = list(samFilter.alignments())
            self.assertEqual('query3', kept.query_name)
コード例 #6
0
    def testKeepQualityControlFailures(self):
        """
        When keepQCFailures is True, an alignment flagged as a quality
        control failure (512) must be returned along with the others.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, keepQCFailures=True)
            first, second = list(samFilter.alignments())
            self.assertEqual('query1', first.query_name)
            self.assertEqual('query2', second.query_name)
コード例 #7
0
    def testNoQuality(self):
        """
        An alignment whose quality string is * must be returned with a
        quality value of C{None}.
        """
        lines = (
            '@SQ SN:ref LN:10',
            'query1 4 * 0 0 6M * 0 0 TCTAGG *',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath)
            [found] = list(samFilter.alignments())
            self.assertEqual('query1', found.query_name)
            self.assertEqual('TCTAGG', found.query_sequence)
            self.assertIsNone(found.query_qualities)
コード例 #8
0
    def testHardClippingInCIGARButQueryNotHardClipped(self):
        """
        See https://github.com/acorg/dark-matter/issues/630 for background.
        A CIGAR string may say that a query is hard-clipped when the query
        sequence in the SAM file is in fact not clipped. This can happen
        when an earlier alignment was soft-clipped, in which case the full
        query sequence is given before the secondary alignment that has the
        hard clip. Such cases must be handled correctly.
        """
        lines = (
            '@SQ SN:Chimp-D00220 LN:8',
            '@SQ SN:D-AM494716 LN:8',
            '@SQ SN:D-XXX LN:8',
            '@SQ SN:Chimp-YYY LN:8',
            'query1 0 Chimp-D00220 1 0 3S5M * 0 0 TTTTGGTT 12345678',
            'query1 256 D-AM494716 1 0 3H5M * 0 0 * *',
            'query1 256 D-XXX 1 0 5H3M * 0 0 * *',
            'query1 0 Chimp-YYY 1 0 8M * 0 0 * *',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            first, second, third, fourth = list(
                paddedSAM.queries(addAlignment=True))

            # The soft-clipped primary alignment.
            expected = Read('query1', 'TGGTT---', '45678!!!')
            self.assertEqual(expected, first)
            self.assertEqual('TTTTGGTT', first.alignment.query_sequence)

            # The first hard-clipped secondary alignment (3H5M).
            expected = Read('query1/1', 'TGGTT---', '45678!!!')
            self.assertEqual(expected, second)
            self.assertEqual('TGGTT', second.alignment.query_sequence)

            # The second hard-clipped secondary alignment (5H3M).
            expected = Read('query1/2', 'GTT-----', '678!!!!!')
            self.assertEqual(expected, third)
            self.assertEqual('GTT', third.alignment.query_sequence)

            # The final, unclipped, alignment (8M).
            expected = Read('query1/3', 'TTTTGGTT', '12345678')
            self.assertEqual(expected, fourth)
            self.assertEqual('TTTTGGTT', fourth.alignment.query_sequence)
コード例 #9
0
    def testMinLength(self):
        """
        Filtering with a read filter that imposes a minimum length must
        give the expected result.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            lengthFilter = ReadFilter(minLength=6)
            samFilter = SAMFilter(samPath, filterRead=lengthFilter.filter)
            # Exactly one alignment is expected to survive the filter.
            [kept] = list(samFilter.alignments())
            self.assertEqual('query1', kept.query_name)
コード例 #10
0
    def testQueryTooLong(self):
        """
        If the query sequence is longer than the total of the lengths in the
        CIGAR operations, a ValueError must be raised.

        NOTE: this test is currently disabled. It returns immediately, so
        none of the code after the C{return} statement is executed.
        """
        # This test just returns. It used to be possible to reach the
        # "Query ... not fully consumed when parsing CIGAR string."
        # ValueError in sam.py, prior to the fix of
        # https://github.com/acorg/dark-matter/issues/630 but it is not
        # possible to get a CIGAR string that has a different total length
        # from the sequence length through to our code in sam.py because
        # pysam catches the error.  I'm leaving this test here because it
        # documents that the error checked for in sam.py cannot currently
        # be reached and the test may become useful. For now it just returns.
        return
        # Everything below is intentionally unreachable (see above). The
        # CIGAR string (4M) covers fewer bases than the 6-base query.
        data = '\n'.join([
            '@SQ SN:ref1 LN:90',
            'query1 0 ref1 1 60 4M * 0 0 TCTAGG ZZZZZZ',
        ]).replace(' ', '\t')

        with dataFile(data) as filename:
            ps = PaddedSAM(SAMFilter(filename))
            error = ('^Query TCTAGG not fully consumed when parsing CIGAR '
                     'string\\.')
            assertRaisesRegex(self, ValueError, error, list, ps.queries())
コード例 #11
0
    def testCloseButNoCIGAR(self):
        """
        If dropUnmapped is not given, an unmapped query that has no CIGAR
        string must be returned with its name, sequence and quality intact.
        """
        lines = (
            '@SQ SN:ref LN:10',
            'query1 4 * 0 0 * * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath)
            [found] = list(samFilter.alignments())
            self.assertEqual('query1', found.query_name)
            self.assertEqual('TCTAGG', found.query_sequence)
            # Convert the numeric qualities back to characters by adding 33
            # to each value, so they can be compared to the original string.
            quality = ''.join(
                chr(value + 33) for value in found.query_qualities)
            self.assertEqual('ZZZZZZ', quality)
コード例 #12
0
    def testMinAndMaxScore(self):
        """
        Asking for alignments whose score lies between a minimum and a
        maximum value must give the expected result.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:10',
            'query2 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:12',
            'query3 0 ref1 2 60 2= * 0 0 TC ZZ',
            'query4 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:3',
            'query5 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ AS:i:2',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, minScore=3, maxScore=10)
            # Both limits are inclusive: scores of 10 and 3 are kept.
            first, second = list(samFilter.alignments())
            self.assertEqual('query1', first.query_name)
            self.assertEqual('query4', second.query_name)
コード例 #13
0
def addCommandLineOptions(parser, outfileDefaultName=None):
    """
    Register the standard command-line options on an argument parser.

    @param parser: An C{ArgumentParser} instance.
    @param outfileDefaultName: The C{str} name of the output file to fall
        back on if the user does not give one on the command line.
    """
    # Options shared with other scripts come first, followed by the
    # SAM filtering options.
    addCommonOptions(parser)
    SAMFilter.addFilteringOptions(parser)

    outfileHelp = 'The filename to store the resulting HTML.'
    parser.add_argument(
        '--outfile', default=outfileDefaultName, help=outfileHelp)

    showHelp = 'If specified, show the figure interactively.'
    parser.add_argument(
        '--show', action='store_true', default=False, help=showHelp)
コード例 #14
0
def parseCommandLineOptions(args, returnSignificantOffsets=True):
    """
    Deal with the various command-line options added to the ArgumentParser
    instance by addCommandLineOptions.

    @param args: The result of calling C{parse_args} on an C{ArgumentParser}
        instance (the one that was passed to C{addCommandLineOptions}, unless
        we're testing).
    @param returnSignificantOffsets: If C{True} also return a list of the
        significant offsets (else that element of the return value will be
        C{None}).
    @raise ValueError: If more than one reference id is given, or if the
        reference lengths obtained from the SAM filter do not contain
        exactly one reference.
    @return: A C{tuple}: (genomeLength, alignedReads, paddedSAM,
        readCountAtOffset, baseCountAtOffset, readsAtOffset,
        significantOffsets).
    """
    samFilter = SAMFilter.parseFilteringOptions(args)

    if samFilter.referenceIds and len(samFilter.referenceIds) > 1:
        raise ValueError('Only one reference id can be given.')

    referenceLengths = samFilter.referenceLengths()

    if len(referenceLengths) != 1:
        raise ValueError(
            'If you do not specify a reference sequence with '
            '--referenceId, the SAM/BAM file must contain exactly one '
            'reference. But %s contains %d.' %
            (args.samfile, len(referenceLengths)))

    # There is exactly one reference. Read its length without popping it,
    # so that the dict returned by the filter is left unmodified.
    (genomeLength,) = referenceLengths.values()

    paddedSAM = PaddedSAM(samFilter)

    alignedReads = [AlignedRead(query.id, query.sequence)
                    for query in paddedSAM.queries()]

    readCountAtOffset, baseCountAtOffset, readsAtOffset = gatherData(
        genomeLength, alignedReads)

    if returnSignificantOffsets:
        significantOffsets = list(
            findSignificantOffsets(baseCountAtOffset, readCountAtOffset,
                                   args.minReads, args.homogeneousCutoff))
        # Every read is told about the same list of significant offsets.
        for read in alignedReads:
            read.setSignificantOffsets(significantOffsets)
    else:
        significantOffsets = None

    return (genomeLength, alignedReads, paddedSAM, readCountAtOffset,
            baseCountAtOffset, readsAtOffset, significantOffsets)
コード例 #15
0
    def testAllMMatch(self):
        """
        A match whose CIGAR string consists only of 'M' operations must
        give the expected padded sequence.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 6M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '-TCTAGG---', '!ZZZZZZ!!!')
            self.assertEqual(expected, padded)
コード例 #16
0
    def testMixedMatchSpecificReferenceButNoMatches(self):
        """
        Asking for the reads aligned against a reference that is present in
        the file but has no matches must give an empty list.
        """
        lines = (
            '@SQ SN:ref1 LN:15',
            '@SQ SN:ref2 LN:15',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, referenceIds={'ref2'})
            paddedSAM = PaddedSAM(samFilter)
            found = list(paddedSAM.queries())
            self.assertEqual([], found)
コード例 #17
0
    def testReferenceDeletion(self):
        """
        A deletion of reference bases must give the expected padded
        sequence, with Ns inserted for the deleted reference bases.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2D4M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '-TCNNTAGG-', '!ZZ!!ZZZZ!')
            self.assertEqual(expected, padded)
コード例 #18
0
    def testQuerySoftClipProtrudesBothSides(self):
        """
        A match whose soft-clipped regions extend beyond both the left and
        the right end of the reference must give the expected padded
        sequence.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 4 60 5S5M5S * 0 0 TCTAGGCTGACTAAG ZZZZZZZZZZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', 'TAGGCTGACT', 'ZZZZZZZZZZ')
            self.assertEqual(expected, padded)
コード例 #19
0
    def testQuerySoftClipReachesRightEdge(self):
        """
        A match whose soft-clipped region reaches the right edge of the
        reference must give the expected padded sequence.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 5 60 2M4S * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '----TCTAGG', '!!!!ZZZZZZ')
            self.assertEqual(expected, padded)
コード例 #20
0
    def testQuerySoftClipProtrudesLeft(self):
        """
        A match whose soft-clipped region extends beyond the left end of
        the reference must give the expected padded sequence.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 4S2M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', 'AGG-------', 'ZZZ!!!!!!!')
            self.assertEqual(expected, padded)
コード例 #21
0
    def testMixedMatch(self):
        """
        A match whose CIGAR string mixes M, =, and X operations must give
        the expected padded sequence.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '-TCTAGG---', '!ZZZZZZ!!!')
            self.assertEqual(expected, padded)
コード例 #22
0
    def testDropSecondary(self):
        """
        When dropSecondary is True, a match flagged as secondary (256) must
        not appear in the padded queries.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 256 ref1 2 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, dropSecondary=True)
            paddedSAM = PaddedSAM(samFilter)
            # Exactly one query is expected to survive the filter.
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '-TCTAGG---', '!ZZZZZZ!!!')
            self.assertEqual(expected, padded)
コード例 #23
0
    def testReferenceSkip(self):
        """
        A skip of reference bases must give the expected padded sequence,
        and the quality string must use the character passed in the
        unknownQualityChar argument.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2M2N4M * 0 0 TCTAGG ZZZZZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries(unknownQualityChar='.'))
            expected = Read('query1', '-TCNNTAGG-', '.ZZ..ZZZZ.')
            self.assertEqual(expected, padded)
コード例 #24
0
    def testRcNeeded(self):
        """
        When rcNeeded=True is passed, a reverse-complemented match (flag 16)
        must give a reverse-complemented padded sequence and a reversed
        quality string.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(paddedSAM.queries(rcNeeded=True))
            expected = Read('query1', '-CCTAGA---', '!654321!!!')
            self.assertEqual(expected, padded)
コード例 #25
0
    def testUnequalReferenceLengths(self):
        """
        If no reference ids are passed and the references in the file have
        different lengths, an UnequalReferenceLengthError must be raised.
        """
        lines = (
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:91',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            expectedMessage = (
                '^Your 2 SAM/BAM file reference sequence lengths '
                '\\(id1=90, id2=91\\) are not all identical\\.$')
            samFilter = SAMFilter(samPath)
            # The exception is expected when the PaddedSAM is constructed.
            assertRaisesRegex(self, UnequalReferenceLengthError,
                              expectedMessage, PaddedSAM, samFilter)
コード例 #26
0
    def testRcSuffix(self):
        """
        When an rcSuffix value is passed, the id of a reverse-complemented
        sequence must have that suffix appended.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 16 ref1 2 60 6M * 0 0 TCTAGG 123456',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            [padded] = list(
                paddedSAM.queries(rcSuffix='-rc', rcNeeded=True))
            expected = Read('query1-rc', '-CCTAGA---', '!654321!!!')
            self.assertEqual(expected, padded)
コード例 #27
0
    def testUnknownReferences(self):
        """
        If the filter was given a reference id that is not in the file,
        calling its referenceLengths method must raise UnknownReference.
        """
        lines = (
            '@SQ SN:id1 LN:90',
            '@SQ SN:id2 LN:90',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, referenceIds={'unknown'})
            expectedMessage = ("^Reference 'unknown' is not present in the "
                               "SAM/BAM file\\.$")
            assertRaisesRegex(self, UnknownReference, expectedMessage,
                              samFilter.referenceLengths)
コード例 #28
0
    def testMinLength(self):
        """
        Using a read filter that imposes a minimum length must give the
        expected padded queries.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 0 ref1 2 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            lengthFilter = ReadFilter(minLength=6)
            samFilter = SAMFilter(samPath, filterRead=lengthFilter.filter)
            paddedSAM = PaddedSAM(samFilter)
            # Exactly one query is expected to survive the filter.
            [padded] = list(paddedSAM.queries())
            expected = Read('query1', '-TCTAGG---', '!ZZZZZZ!!!')
            self.assertEqual(expected, padded)
コード例 #29
0
    def testKeepQualityControlFailures(self):
        """
        When keepQCFailures is True, a match flagged as a quality control
        failure (512) must appear in the padded queries with the others.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query1 0 ref1 2 60 2=2X2M * 0 0 TCTAGG ZZZZZZ',
            'query2 512 ref1 4 60 2= * 0 0 TC ZZ',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            samFilter = SAMFilter(samPath, keepQCFailures=True)
            paddedSAM = PaddedSAM(samFilter)
            first, second = list(paddedSAM.queries())
            expectedFirst = Read('query1', '-TCTAGG---', '!ZZZZZZ!!!')
            self.assertEqual(expectedFirst, first)
            expectedSecond = Read('query2', '---TC-----', '!!!ZZ!!!!!')
            self.assertEqual(expectedSecond, second)
コード例 #30
0
    def testNotSecondaryAndNotSupplementaryWithNoSequence(self):
        """
        If an alignment that is neither secondary nor supplementary has '*'
        as its sequence, an InvalidSAM exception must be raised.
        """
        lines = (
            '@SQ SN:ref1 LN:10',
            'query 0 ref1 3 60 4M * 0 0 * *',
        )
        samText = '\n'.join(lines).replace(' ', '\t')

        with dataFile(samText) as samPath:
            paddedSAM = PaddedSAM(SAMFilter(samPath))
            expectedMessage = (
                '^pysam produced an alignment \\(number 1\\) with no '
                'query sequence without previously giving an alignment '
                'with a sequence\\.$')
            pending = paddedSAM.queries()
            # The error is expected when the queries are consumed by list.
            assertRaisesRegex(self, InvalidSAM, expectedMessage, list,
                              pending)
コード例 #31
0
    help=('If given, information about reference sequence insertions will be '
          'printed to standard error. These correspond to "I" CIGAR '
          'operations that for the match would require inserting query bases '
          'into the reference. Because we cannot change the reference (in '
          'fact we typically do not have the reference in the SAM/BAM file), '
          'we cut the inserted bases out of the aligned query and save the '
          'information about what would have been inserted and where. That '
          'information is printed by this option. The output gives the '
          '0-based offset where the inserted base would be placed, followed '
          'by a list of the nucleotides that were suggested as being '
          'inserted and the number of times each nucleotide was suggested. '
          'So for example the output might contain "27: T:3, G:10" which '
          'indicates that 13 query (3 with T and 10 with G) matches would '
          'insert a nucleotide into the reference at offset 27.'))

# Add the SAM filtering and FASTA filtering command-line options to the
# parser.
SAMFilter.addFilteringOptions(parser)
addFASTAFilteringCommandLineOptions(parser)

args = parser.parse_args()
# The FASTA filtering options are applied to an initially empty Reads
# instance, whose filterRead method is then passed to the SAM filter.
reads = parseFASTAFilteringCommandLineOptions(args, Reads())
samFilter = SAMFilter.parseFilteringOptions(
    args, filterRead=reads.filterRead)
paddedSAM = PaddedSAM(samFilter)

# Print each padded query as FASTA. end='' stops print from adding a
# newline of its own after each record.
for read in paddedSAM.queries(rcSuffix=args.rcSuffix, rcNeeded=args.rcNeeded):
    print(read.toString('fasta'), end='')

if args.listReferenceInsertions:
    if paddedSAM.referenceInsertions:
        print('(0-based) insertions into the reference:\n%s' %
              nucleotidesToStr(paddedSAM.referenceInsertions, '  '),