Ejemplo n.º 1
0
 def testSummary(self):
     """
     The summary method must report the best score, coverage, HSP count,
     median score, read count, subject length and subject title for the
     alignments added to a title.
     """
     alignments = TitleAlignments('subject title', 10)

     # First read: a single HSP.
     read1 = Read('id1', 'ACGT')
     hsps1 = [HSP(30, subjectStart=0, subjectEnd=2)]
     alignments.addAlignment(TitleAlignment(read1, hsps1))

     # Second read: two HSPs.
     read2 = Read('id2', 'ACGT')
     hsps2 = [
         HSP(55, subjectStart=2, subjectEnd=4),
         HSP(40, subjectStart=8, subjectEnd=9),
     ]
     alignments.addAlignment(TitleAlignment(read2, hsps2))

     expected = {
         'bestScore': 55,
         'coverage': 0.5,
         'hspCount': 3,
         'medianScore': 40,
         'readCount': 2,
         'subjectLength': 10,
         'subjectTitle': 'subject title',
     }
     self.assertEqual(expected, alignments.summary())
Ejemplo n.º 2
0
 def testLength(self):
     """
     Sorting titles on 'length' must give the expected order, including
     a secondary sort on title.
     """
     # One JSON document per line: the parameters, then the records.
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mockOpen(read_data=jsonLines)

     # The trailing comments give the subject length of each title.
     expected = [
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',  # 38000
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 37000
         'gi|887699|gb|DQ37780 Monkeypox virus 456',  # 35000
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',  # 35000
         'gi|887699|gb|DQ37780 Cowpox virus 15',  # 30000
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = BlastReadsAlignments(
             reads, 'file.json', scoreClass=LowerIsBetterScore)
         titlesAlignments = TitlesAlignments(readsAlignments)
         self.assertEqual(expected, titlesAlignments.sortTitles('length'))
Ejemplo n.º 3
0
 def testMaxScore_EValue(self):
     """
     Sorting titles on 'maxScore' must work when the scores are e values
     (i.e., when using LowerIsBetterScore), including a secondary sort on
     title.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     # The trailing comments give the e value associated with each title.
     expected = [
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 1e-11
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 1e-10
         'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 1e-8
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 1e-7
         'gi|887699|gb|DQ37780 Cowpox virus 15',            # 1e-6
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(
             reads, 'file.json', scoreClass=LowerIsBetterScore)
         titlesAlignments = TitlesAlignments(readsAlignments)
         self.assertEqual(expected,
                          titlesAlignments.sortTitles('maxScore'))
Ejemplo n.º 4
0
    def __init__(self, id_, sequence, alignment=None):
        """
        Initialize a read whose sequence may be padded with '-' gap
        characters at either end.

        Leading and trailing gaps are removed and the remaining sequence
        is upper-cased before being handed to C{Read.__init__}. The number
        of leading gaps is stored in C{self.offset} and the length of the
        unstripped sequence in C{self._originalLength}.

        @param id_: The read id, passed on to C{Read.__init__}.
        @param sequence: The (possibly gap-padded) sequence.
        @param alignment: An optional alignment object, stored unchanged
            in C{self.alignment}.
        @raise ValueError: If C{sequence} contains nothing but gaps (this
            includes an empty sequence).
        """
        self.significantOffsets = OrderedDict()
        self._originalLength = fullLength = len(sequence)
        self.alignment = alignment

        # Index of the first non-gap character (i.e., the number of leading
        # gaps), or the full length if there is no such character.
        leadingGaps = next(
            (index for index, base in enumerate(sequence) if base != '-'),
            fullLength)

        if leadingGaps == fullLength:
            raise ValueError('Read is all gaps.')

        # Same scan from the other end to count the trailing gaps.
        trailingGaps = next(
            (index for index, base in enumerate(sequence[::-1])
             if base != '-'),
            fullLength)

        # Make sure the read is not all gaps.
        assert leadingGaps + trailingGaps < fullLength
        self.offset = leadingGaps

        stripped = sequence[leadingGaps:fullLength - trailingGaps]
        Read.__init__(self, id_, stripped.upper())
Ejemplo n.º 5
0
 def testReadSetFilterAllowAnything(self):
     """
     Filtering with minNewReads=0.0 (a threshold under which any read set
     counts as sufficiently novel) must keep all titles.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Cowpox virus 15',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         filtered = titlesAlignments.filter(minNewReads=0.0)
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 6
0
 def testCoverageIncludesSome(self):
     """
     Filtering on minCoverage must return a TitlesAlignments instance
     holding only the expected titles when just some of the titles have
     sufficient coverage.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Cowpox virus 15',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         # To understand why the following produces the result it does,
         # you need to look at the HSP coverage in sample_data.py and
         # calculate the coverage by hand.
         filtered = titlesAlignments.filter(minCoverage=0.0003)
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 7
0
 def testGapAmbiguous(self):
     """
     Ambiguous characters that are paired with gaps must be reported in
     ambiguousOffsets, and the pairings must be counted as gap mismatches.
     """
     read1 = Read('id1', 'AN--T')
     read2 = Read('id2', 'A--NT')
     expected = {
         'match': {
             'identicalMatchCount': 2,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [1],
             'extraCount': 0,
             'gapOffsets': [2, 3],
         },
         'read2': {
             'ambiguousOffsets': [3],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 8
0
 def testMaxMatchingReads(self):
     """
     Filtering with a maxMatchingReads value must exclude titles that are
     matched by more than that number of reads.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     # Cowpox virus 15 is not in the results as it is matched by two
     # reads.
     expected = sorted([
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.'
     ])

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         filtered = titlesAlignments.filter(maxMatchingReads=1)
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 9
0
 def testMatchWithIdenticalAmbiguityButStrict(self):
     """
     With matchAmbiguous=False, two otherwise identical sequences that
     have the same ambiguous code at the same offset must have that
     offset counted as a non-gap mismatch and reported as ambiguous in
     both reads.
     """
     read1 = Read('id1', 'ACGTTN')
     read2 = Read('id2', 'ACGTTN')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(read1, read2, matchAmbiguous=False)
     self.assertEqual(expected, actual)
Ejemplo n.º 10
0
 def testNonDefaultGapChars(self):
     """
     Each character passed in gapChars must be treated as a gap.
     """
     gapChars = '+$'
     # The same result is expected whichever gap character is used.
     expected = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [2],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [0],
         },
     }
     for gap in gapChars:
         read1 = Read('id1', 'AC' + gap + 'TT')
         read2 = Read('id2', gap + 'CGTT')
         self.assertEqual(
             expected, compareDNAReads(read1, read2, gapChars='+$'))
Ejemplo n.º 11
0
 def testMatchWithIncompatibleAmbiguityInBoth(self):
     """
     Two sequences that are identical apart from having different
     ambiguous codes (W and S) at the same offset must have that offset
     counted as a non-gap mismatch and reported as ambiguous in both
     reads.
     """
     read1 = Read('id1', 'ACGTTW')
     read2 = Read('id2', 'ACGTTS')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 12
0
 def testNonMatchingAmbiguityInFirst(self):
     """
     Two sequences that are identical apart from one ambiguous code in
     the first sequence (W, paired with C in the second) must have that
     offset counted as a non-gap mismatch and reported as ambiguous in
     the first read only.
     """
     read1 = Read('id1', 'ACGTTW')
     read2 = Read('id2', 'ACGTTC')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 13
0
 def testMatchWithAmbiguityButStrict(self):
     """
     With matchAmbiguous=False (ambiguous matching disallowed), two
     sequences that are identical apart from one ambiguous code in the
     first sequence must have that offset counted as a non-gap mismatch.
     """
     read1 = Read('id1', 'ACGTTS')
     read2 = Read('id2', 'ACGTTC')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(read1, read2, matchAmbiguous=False)
     self.assertEqual(expected, actual)
Ejemplo n.º 14
0
    def testTitleCollection(self):
        """
        When a title occurs in the alignments of more than one read, the
        data from every such read must be collected under that title.
        """
        jsonLines = ''.join(
            dumps(item) + '\n' for item in (PARAMS, RECORD2, RECORD3))
        mockOpener = mock_open(read_data=jsonLines)

        with patch.object(builtins, 'open', mockOpener):
            read2 = Read('id2', 'A' * 70)
            read3 = Read('id3', 'A' * 70)
            reads = Reads()
            reads.add(read2)
            reads.add(read3)
            readsAlignments = DiamondReadsAlignments(reads, 'file.json')
            titlesAlignments = TitlesAlignments(readsAlignments)

            title = 'gi|887699|gb|DQ37780 Cowpox virus 15'
            titleAlignments = titlesAlignments[title]
            self.assertEqual(title, titleAlignments.subjectTitle)
            self.assertEqual(30000, titleAlignments.subjectLength)
            self.assertEqual(2, len(titleAlignments))

            # The alignments must be in the order the reads were added.
            first = titleAlignments[0]
            self.assertEqual(read2, first.read)
            self.assertEqual(HSP(20), first.hsps[0])

            second = titleAlignments[1]
            self.assertEqual(read3, second.read)
            self.assertEqual(HSP(20), second.hsps[0])
Ejemplo n.º 15
0
 def testExtraInSecond(self):
     """
     Bases in the second sequence beyond the end of the first must be
     counted in the second read's extraCount.
     """
     read1 = Read('id1', 'ACGTT')
     read2 = Read('id2', 'ACGTTCC')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 2,
             'gapOffsets': [],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 16
0
 def testFilterWithNoArguments(self):
     """
     Calling filter with no arguments must return a TitlesAlignments
     instance that has all the titles of the original.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Cowpox virus 15',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(3):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         filtered = titlesAlignments.filter()
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 17
0
 def testExtraAmbiguous(self):
     """
     If the first sequence has extra characters that include a gap and an
     ambiguous code, they must be counted in its extraCount and their
     offsets reported in its gapOffsets and ambiguousOffsets.
     """
     read1 = Read('id1', 'ACGTT-N')
     read2 = Read('id2', 'ACGTT')
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [6],
             'extraCount': 2,
             'gapOffsets': [5],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 18
0
 def testOffsets(self):
     """
     When a set of wanted offsets is given, only those offsets must
     contribute to the result.
     """
     read1 = Read('id1', 'ATT-T')
     read2 = Read('id2', 'A-GTC')
     # Only offsets 0 (A/A) and 4 (T/C) are considered, so the gaps at
     # offsets 1 and 3 must not appear in the result.
     expected = {
         'match': {
             'identicalMatchCount': 1,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(read1, read2, offsets={0, 4})
     self.assertEqual(expected, actual)
Ejemplo n.º 19
0
 def testMismatch(self):
     """
     Mismatched (non-ambiguous) bases must be counted in the
     nonGapMismatchCount.
     """
     read1 = Read('id1', 'ACGTT')
     read2 = Read('id2', 'ACGCC')
     expected = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 2,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     self.assertEqual(expected, compareDNAReads(read1, read2))
Ejemplo n.º 20
0
 def testCoverageIncludesAll(self):
     """
     Filtering on minCoverage must return a TitlesAlignments instance
     holding all titles when all of them have sufficient coverage.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Cowpox virus 15',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         filtered = titlesAlignments.filter(minCoverage=0.0)
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 21
0
    def testMatchWithAmbiguityAndNotStrict(self):
        """
        With matchAmbiguous=True (ambiguous matching allowed), the string
        form of the comparison of two sequences that are identical apart
        from one ambiguous code in the first must be as expected.
        """
        read1 = Read('id1', 'ACGTTS')
        read2 = Read('id2', 'ACGTTC')
        match = compareDNAReads(read1, read2, matchAmbiguous=True)

        # The expected text, one element per output line (there is no
        # trailing newline).
        expected = '\n'.join([
            'Exact matches: 5/6 (83.33%)',
            'Ambiguous matches: 1/6 (16.67%)',
            'Exact or ambiguous matches: 6/6 (100.00%)',
            'Mismatches: 0',
            '  Not involving gaps (i.e., conflicts): 0',
            '  Involving a gap in one sequence: 0',
            '  Involving a gap in both sequences: 0',
            '  Id: id1',
            '    Length: 6',
            '    Gaps: 0',
            '    Ambiguous: 1/6 (16.67%)',
            '  Id: id2',
            '    Length: 6',
            '    Gaps: 0',
            '    Ambiguous: 0',
        ])

        self.assertEqual(
            expected,
            matchToString(match, read1, read2, matchAmbiguous=True))
Ejemplo n.º 22
0
 def testMedianScore_Bits(self):
     """
     Sorting titles on 'medianScore' must work when the scores are bit
     scores, including a secondary sort on title.
     """
     records = (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3, RECORD4)
     jsonLines = ''.join(dumps(item) + '\n' for item in records)
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',       # 25
         'gi|887699|gb|DQ37780 Monkeypox virus 456',        # 20
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',   # 20
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',  # 20
         'gi|887699|gb|DQ37780 Cowpox virus 15',            # 20
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(5):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(reads, 'file.json')
         titlesAlignments = TitlesAlignments(readsAlignments)
         self.assertEqual(expected,
                          titlesAlignments.sortTitles('medianScore'))
Ejemplo n.º 23
0
    def testResidueCountsTwoReadsTwoHSPsLeftOverhang(self):
        """
        The residueCounts method must return the correct result when two
        reads, each with one HSP, are aligned to a title and the HSPs start
        before the left edge of the subject (so the result includes
        negative subject offsets).

        Subject:      GTT
        HSP1:       ACGT
        HSP2:        CGTT
        """
        alignments = TitleAlignments('subject title', 55)

        # The first read starts two positions before the subject.
        firstRead = Read('id', 'ACGT')
        firstHsp = HSP(
            33, readStart=0, readEnd=4, readStartInSubject=-2,
            readEndInSubject=2, subjectStart=0, subjectEnd=2,
            readMatchedSequence='GT', subjectMatchedSequence='GT')

        # The second read starts one position before the subject.
        secondRead = Read('id', 'CGTT')
        secondHsp = HSP(
            33, readStart=0, readEnd=4, readStartInSubject=-1,
            readEndInSubject=3, subjectStart=0, subjectEnd=3,
            readMatchedSequence='GTT', subjectMatchedSequence='GTT')

        alignments.addAlignment(TitleAlignment(firstRead, [firstHsp]))
        alignments.addAlignment(TitleAlignment(secondRead, [secondHsp]))

        expected = {
            -2: {'A': 1},
            -1: {'C': 2},
            0: {'G': 2},
            1: {'T': 2},
            2: {'T': 1},
        }
        self.assertEqual(expected, alignments.residueCounts())
Ejemplo n.º 24
0
 def testTitle(self):
     """
     Sorting titles on 'title' must give the titles in sorted order.
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Cowpox virus 15',
         'gi|887699|gb|DQ37780 Monkeypox virus 456',
         'gi|887699|gb|DQ37780 Mummypox virus 3000 B.C.',
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(
             reads, 'file.json', scoreClass=LowerIsBetterScore)
         titlesAlignments = TitlesAlignments(readsAlignments)
         self.assertEqual(expected, titlesAlignments.sortTitles('title'))
Ejemplo n.º 25
0
 def testMinMedianScore_EValue(self):
     """
     Filtering with a minMedianScore value must work when the scores are
     e values (i.e., when using LowerIsBetterScore).
     """
     jsonLines = ''.join(
         dumps(item) + '\n'
         for item in (PARAMS, RECORD0, RECORD1, RECORD2, RECORD3))
     mockOpener = mock_open(read_data=jsonLines)

     expected = [
         'gi|887699|gb|DQ37780 Squirrelpox virus 1296/99',
         'gi|887699|gb|DQ37780 Squirrelpox virus 55',
     ]

     with patch.object(builtins, 'open', mockOpener):
         reads = Reads()
         for index in range(4):
             reads.add(Read('id%d' % index, 'A' * 70))
         readsAlignments = DiamondReadsAlignments(
             reads, 'file.json', scoreClass=LowerIsBetterScore)
         titlesAlignments = TitlesAlignments(readsAlignments)
         filtered = titlesAlignments.filter(minMedianScore=1e-9)
         self.assertEqual(expected, sorted(filtered))
Ejemplo n.º 26
0
    def testYP_009259545(self):
        """
        A query made from codons for residues 10 to 49 of the
        YP_009259545.1 sample protein must produce exactly one search
        result, with the expected fields.
        """
        protein = SAMPLE_DATA['proteins']['YP_009259545.1']
        proteinSequence = protein['protein']
        proteinId = protein['id']

        # Build the query from the first codon listed for each of the
        # chosen amino acids.
        qseq = ''.join(CODONS[aa][0] for aa in proteinSequence[10:50])
        qqual = 'E' * len(qseq)
        qid = 'query'

        with DiamondExecutor() as de:
            de.addSubject(Read(proteinId, proteinSequence))
            queries = Reads([Read(qid, qseq, qqual)])
            # Unpacking fails unless there is exactly one result.
            [result] = list(de.search(queries))

        expected = {
            'bitscore': 83.6,
            'btop': '40',
            'qframe': 1,
            'qend': 120,
            'full_qqual': qqual,
            'qlen': len(qseq),
            'full_qseq': qseq,
            'qseqid': 'query',
            'qstart': 1,
            'slen': len(proteinSequence),
            'sstart': 11,
            'stitle': proteinId,
        }
        self.assertEqual(expected, result)
Ejemplo n.º 27
0
 def testNoOffsets(self):
     """
     When the set of wanted offsets is empty, all counts must be zero and
     all offset lists must be empty.
     """
     read1 = Read('id1', 'ATT-T')
     read2 = Read('id2', 'A-GTC')
     expected = {
         'match': {
             'identicalMatchCount': 0,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(read1, read2, offsets=set())
     self.assertEqual(expected, actual)
Ejemplo n.º 28
0
 def testWikiAnswerAsDict(self):
     """
     Align the example sequences given in Wikipedia:
     http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
     using a match score of two and check the dict that is returned.
     """
     seq1 = Read('seq1', 'ACACACTA')
     seq2 = Read('seq2', 'AGCACACA')
     expected = {
         'cigar': '1=1I5=1D1=',
         'sequence1Start': 1,
         'sequence1End': 8,
         'sequence2Start': 1,
         'sequence2End': 8,
         'text': [
             'seq1 1 A-CACACTA 8',
             '       | ||||| |',
             'seq2 1 AGCACAC-A 8',
         ],
     }
     aligner = LocalAlignment(seq1, seq2, match=2)
     self.assertEqual(expected, aligner.createAlignment())
Ejemplo n.º 29
0
 def testWikiAnswerWithMatchOneAsDict(self):
     """
     Align the example sequences given in Wikipedia:
     http://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
     Wikipedia uses a match score of two; here a score of one is used.
     Check the dict that is returned.
     """
     seq1 = Read('seq1', 'ACACACTA')
     seq2 = Read('seq2', 'AGCACACA')
     expected = {
         'cigar': '5=1D1=',
         'sequence1Start': 2,
         'sequence1End': 8,
         'sequence2Start': 3,
         'sequence2End': 8,
         'text': [
             'seq1 2 CACACTA 8',
             '       ||||| |',
             'seq2 3 CACAC-A 8',
         ],
     }
     aligner = LocalAlignment(seq1, seq2, match=1)
     self.assertEqual(expected, aligner.createAlignment())
Ejemplo n.º 30
0
 def testLengthOne(self):
     """
     De-duping a collection that holds a single read must yield just that
     read.
     """
     reads = Reads()
     reads.add(Read('id', 'GGG'))
     deduped = list(dedupFasta(reads))
     self.assertEqual(deduped, [Read('id', 'GGG')])
Ejemplo n.º 31
0
 def testReverseComplementAmbiguousRNA(self):
     """
     Reverse complementing an RNA read that includes ambiguous codes must
     give the expected (upper case) sequence.
     """
     rna = Read('id', 'aucgmrwsykvhxn', type='rna')
     reverseComplemented = rna.reverseComplement()
     self.assertEqual('NXDBMRSWYKCGAU', reverseComplemented.sequence)
Ejemplo n.º 32
0
 def testReverseComplementAA(self):
     """
     The reverseComplement function must raise a C{ValueError} when called
     on an amino acid sequence.
     """
     read = Read('id', 'atcg', type='aa')
     error = 'Cannot reverse complement an amino acid sequence'
     # assertRaisesRegexp is a deprecated alias that was removed in Python
     # 3.12, so check the exception type and message separately. The
     # message has no regex metacharacters, so a substring test is
     # equivalent to the regex search the alias performed.
     with self.assertRaises(ValueError) as context:
         read.reverseComplement()
     self.assertIn(error, str(context.exception))
Ejemplo n.º 33
0
    def queries(self, rcSuffix='', rcNeeded=False, padChar='-',
                queryInsertionChar='N', unknownQualityChar='!',
                allowDuplicateIds=False, addAlignment=False):
        """
        Produce padded (with gaps) queries according to the CIGAR string and
        reference sequence length for each matching query sequence.

        As a side effect, bases that a CIGAR insertion operation would insert
        into the reference are appended to C{self.referenceInsertions} (keyed
        by query id), and the C{query_name} of reverse-flagged alignments is
        modified in place when C{rcSuffix} is given.

        @param rcSuffix: A C{str} to add to the end of query names that are
            reverse complemented. This is added before the /1, /2, etc., that
            are added for duplicated ids (if there are duplicates and
            C{allowDuplicateIds} is C{False}).
        @param rcNeeded: If C{True}, queries that are flagged as matching when
            reverse complemented will be reverse complemented when
            preparing the output sequences. This must be used if the program
            that created the SAM/BAM input flags reversed matches but does not
            also store the reverse complemented query.
        @param padChar: A C{str} of length one to use to pad queries with to
            make them the same length as the reference sequence.
        @param queryInsertionChar:  A C{str} of length one to use to insert
            into queries when the CIGAR string indicates that the alignment
            of a query would cause a deletion in the reference. This character
            is inserted as a 'missing' query character (i.e., a base that can
            be assumed to have been lost due to an error) whose existence is
            necessary for the match to continue.
        @param unknownQualityChar: The character to put into the quality
            string when unknown bases are inserted in the query or the query
            is padded on the left/right with gaps. It is also used for every
            base of a query whose alignment has no quality values.
        @param allowDuplicateIds: If C{True}, repeated query ids (due to
            secondary or supplemental matches) will not have /1, /2, etc.
            appended to their ids. So repeated ids may appear in the yielded
            FASTA.
        @param addAlignment: If C{True} the reads yielded by the returned
            generator will also have an C{alignment} attribute, being the
            C{pysam.AlignedSegment} for the query.
        @raises InvalidSAM: If a query has an empty SEQ field and either there
            is no previous alignment or the alignment is not marked as
            secondary or supplementary. NOTE(review): this is not raised
            directly by this method; presumably it comes from
            C{self.samFilter.alignments()} - confirm.
        @raises ValueError: If an unknown CIGAR operation is encountered or
            if the CIGAR operations do not consume the entire query.
        @return: A generator that yields C{Read} instances that are padded
            with gap characters to align them to the length of the reference
            sequence. See C{addAlignment}, above, to yield reads with the
            corresponding C{pysam.AlignedSegment}.
        """
        referenceLength = self.referenceLength

        # Hold the count for each id so we can add /1, /2 etc to duplicate
        # ids (unless --allowDuplicateIds was given).
        idCount = Counter()

        MATCH_OPERATIONS = {CMATCH, CEQUAL, CDIFF}

        for alignment in self.samFilter.alignments():

            query = alignment.query_sequence
            qualities = alignment.query_qualities
            if qualities is None:
                # pysam returns None when the record has no quality string.
                # Fall back to the unknown quality character so the quality
                # string stays the same length as the query.
                quality = unknownQualityChar * len(query)
            else:
                # Convert numeric Phred scores to their Phred+33 characters.
                quality = ''.join(chr(q + 33) for q in qualities)

            if alignment.is_reverse:
                if rcNeeded:
                    query = DNARead('id', query).reverseComplement().sequence
                    quality = quality[::-1]
                if rcSuffix:
                    alignment.query_name += rcSuffix

            # Adjust the query id if it's a duplicate and we're not allowing
            # duplicates.
            if allowDuplicateIds:
                queryId = alignment.query_name
            else:
                count = idCount[alignment.query_name]
                idCount[alignment.query_name] += 1
                queryId = alignment.query_name + (
                    '' if count == 0 else '/%d' % count)

            referenceStart = alignment.reference_start
            atStart = True
            queryIndex = 0
            referenceIndex = referenceStart
            alignedSequence = ''
            alignedQuality = ''

            for operation, length in alignment.cigartuples:

                # The operations are tested in the order they appear in
                # https://samtools.github.io/hts-specs/SAMv1.pdf It would be
                # more efficient to test them in order of frequency of
                # occurrence.
                if operation in MATCH_OPERATIONS:
                    atStart = False
                    alignedSequence += query[queryIndex:queryIndex + length]
                    alignedQuality += quality[queryIndex:queryIndex + length]
                elif operation == CINS:
                    # Insertion to the reference. This consumes query bases but
                    # we don't output them because the reference cannot be
                    # changed.  I.e., these bases in the query would need to be
                    # inserted into the reference.  Remove these bases from the
                    # query but record what would have been inserted into the
                    # reference.
                    atStart = False
                    self.referenceInsertions[queryId].append(
                        (referenceIndex,
                         query[queryIndex:queryIndex + length]))
                elif operation == CDEL:
                    # Delete from the reference. Some bases from the reference
                    # would need to be deleted to continue the match. So we put
                    # an insertion into the query to compensate.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                    alignedQuality += unknownQualityChar * length
                elif operation == CREF_SKIP:
                    # Skipped reference. Opens a gap in the query. For
                    # mRNA-to-genome alignment, an N operation represents an
                    # intron.  For other types of alignments, the
                    # interpretation of N is not defined. So this is unlikely
                    # to occur.
                    atStart = False
                    alignedSequence += queryInsertionChar * length
                    alignedQuality += unknownQualityChar * length
                elif operation == CSOFT_CLIP:
                    # Bases in the query that are not part of the match. We
                    # remove these from the query if they protrude before the
                    # start or after the end of the reference. According to the
                    # SAM docs, 'S' operations may only have 'H' operations
                    # between them and the ends of the CIGAR string.
                    if atStart:
                        # Don't set atStart=False, in case there's another 'S'
                        # operation.
                        unwantedLeft = length - referenceStart
                        if unwantedLeft > 0:
                            # The query protrudes left. Copy its right part.
                            alignedSequence += query[
                                queryIndex + unwantedLeft:queryIndex + length]
                            alignedQuality += quality[
                                queryIndex + unwantedLeft:queryIndex + length]
                            referenceStart = 0
                        else:
                            # The clipped bases fit to the left of the match,
                            # so move the start of the output left to include
                            # them.
                            referenceStart -= length
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                            alignedQuality += quality[
                                queryIndex:queryIndex + length]
                    else:
                        unwantedRight = (
                            (referenceStart + len(alignedSequence) + length) -
                            referenceLength)

                        if unwantedRight > 0:
                            # The query protrudes right. Copy its left part.
                            alignedSequence += query[
                                queryIndex:queryIndex + length - unwantedRight]
                            alignedQuality += quality[
                                queryIndex:queryIndex + length - unwantedRight]
                        else:
                            alignedSequence += query[
                                queryIndex:queryIndex + length]
                            alignedQuality += quality[
                                queryIndex:queryIndex + length]
                elif operation == CHARD_CLIP:
                    # Some bases have been completely removed from the query.
                    # This (H) can only be present as the first and/or last
                    # operation. There is nothing to do as the bases are simply
                    # not present in the query string in the SAM/BAM file.
                    pass
                elif operation == CPAD:
                    # This is "silent deletion from the padded reference",
                    # which consumes neither query nor reference.
                    atStart = False
                else:
                    # Format the message into a single string so that it does
                    # not render as a tuple of exception arguments.
                    raise ValueError(
                        'Unknown CIGAR operation: %r' % (operation,))

                if operation in _CONSUMES_QUERY:
                    queryIndex += length

                if operation in _CONSUMES_REFERENCE:
                    referenceIndex += length

            if queryIndex != len(query):
                # Oops, we did not consume the entire query.
                raise ValueError(
                    'Query %r not fully consumed when parsing CIGAR string. '
                    'Query %r (len %d), final query index %d, CIGAR: %r' %
                    (alignment.query_name, query, len(query), queryIndex,
                     alignment.cigartuples))

            # We cannot test we consumed the entire reference.  The CIGAR
            # string applies to (and exhausts) the query but is silent
            # about the part of the reference that lies to the right of the
            # aligned query.

            # Put gap characters before and after the aligned sequence so that
            # it is offset properly and matches the length of the reference.
            padRightLength = (referenceLength -
                              (referenceStart + len(alignedSequence)))
            paddedSequence = (padChar * referenceStart +
                              alignedSequence +
                              padChar * padRightLength)
            paddedQuality = (unknownQualityChar * referenceStart +
                             alignedQuality +
                             unknownQualityChar * padRightLength)

            read = Read(queryId, paddedSequence, paddedQuality)

            if addAlignment:
                read.alignment = alignment

            yield read
Ejemplo n.º 34
0
 def testReverseComplementRNA(self):
     """
     Reverse complementing an RNA read must give the expected sequence.
     """
     rnaRead = Read('id', 'aucg', type='rna')
     complemented = rnaRead.reverseComplement()
     self.assertEqual('CGAU', complemented.sequence)
Ejemplo n.º 35
0
 def testReverseComplementDNA(self):
     """
     Reverse complementing a DNA read (here one that also carries a quality
     string) must give the expected sequence.
     """
     dnaRead = Read('id', 'atcg', quality='!@#$', type='dna')
     complemented = dnaRead.reverseComplement()
     self.assertEqual('CGAT', complemented.sequence)
Ejemplo n.º 36
0
 def testReverseComplementReversesQuality(self):
     """
     The read returned by reverseComplement must have its quality string
     reversed.
     """
     original = Read('id', 'atcg', quality='!@#$')
     complemented = original.reverseComplement()
     self.assertEqual('$#@!', complemented.quality)