Beispiel #1
0
 def testNoOffsets(self):
     """
     Restricting the comparison to an empty set of offsets must give
     all-zero match counts and empty offset lists for both reads.
     """
     expected = {
         'match': {
             'identicalMatchCount': 0,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(
         Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets=set())
     self.assertEqual(expected, actual)
Beispiel #2
0
    def testMatchWithAmbiguityAndNotStrict(self):
        """
        Two sequences that match exactly, apart from one ambiguity in the first
        sequence, must compare as expected when we specify matchAmbiguous=True
        to allow ambiguous matching.
        """
        read1 = Read('id1', 'ACGTTS')
        read2 = Read('id2', 'ACGTTC')
        match = compareDNAReads(read1, read2, matchAmbiguous=True)

        self.assertEqual(
            '''\
Exact matches: 5/6 (83.33%)
Ambiguous matches: 1/6 (16.67%)
Exact or ambiguous matches: 6/6 (100.00%)
Mismatches: 0
  Not involving gaps (i.e., conflicts): 0
  Involving a gap in one sequence: 0
  Involving a gap in both sequences: 0
  Id: id1
    Length: 6
    Gaps: 0
    Ambiguous: 1/6 (16.67%)
  Id: id2
    Length: 6
    Gaps: 0
    Ambiguous: 0''',
            matchToString(match, read1, read2, matchAmbiguous=True)
        )
def collectData(reads1, reads2, square, matchAmbiguous):
    """
    Compute pairwise match statistics between two collections of reads.

    @param reads1: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These are the table rows.
    @param reads2: An C{OrderedDict} mapping C{str} read ids to C{Read}
        instances. These are the table columns.
    @param square: If C{True}, the table compares a set of sequences with
        itself, so pairs with equal ids (the diagonal) are not computed.
    @param matchAmbiguous: If C{True}, ambiguous nucleotides that could be
        correct are counted as correct. If C{False}, matching is strict and
        only unambiguous nucleotides may count as matches.
    @return: A C{defaultdict} of C{dict}s. The 'match' section of each
        comparison is stored under both C{[id1][id2]} and C{[id2][id1]}.
    """
    table = defaultdict(dict)
    for rowId, rowRead in reads1.items():
        for colId, colRead in reads2.items():
            # Skip the diagonal of a square table.
            if square and rowId == colId:
                continue
            comparison = compareDNAReads(
                rowRead, colRead, matchAmbiguous=matchAmbiguous)
            stats = comparison['match']
            if not matchAmbiguous:
                # Strict matching must never report an ambiguous match.
                assert stats['ambiguousMatchCount'] == 0
            # Store the same statistics in both directions.
            table[rowId][colId] = stats
            table[colId][rowId] = stats

    return table
Beispiel #4
0
 def testGapAmbiguous(self):
     """
     An ambiguous character that is paired with a gap in the other read
     must still be listed in that read's ambiguousOffsets.
     """
     expected = {
         'match': {
             'identicalMatchCount': 2,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [1],
             'extraCount': 0,
             'gapOffsets': [2, 3],
         },
         'read2': {
             'ambiguousOffsets': [3],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     actual = compareDNAReads(Read('id1', 'AN--T'), Read('id2', 'A--NT'))
     self.assertEqual(expected, actual)
Beispiel #5
0
 def testExtraAmbiguous(self):
     """
     When the first sequence is longer and its extra part contains an
     ambiguous base, the extra bases must be counted in extraCount and
     the ambiguous one must appear in ambiguousOffsets.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [6],
             'extraCount': 2,
             'gapOffsets': [5],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTT-N'), Read('id2', 'ACGTT'))
     self.assertEqual(expected, actual)
Beispiel #6
0
 def testNonMatchingAmbiguityInFirst(self):
     """
     Two sequences that agree everywhere except for one (incompatible)
     ambiguous code in the first sequence must be reported with one
     non-gap mismatch and the ambiguous offset recorded for read1.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTTW'), Read('id2', 'ACGTTC'))
     self.assertEqual(expected, actual)
Beispiel #7
0
 def testMatchWithIdenticalAmbiguityButStrict(self):
     """
     With matchAmbiguous=False, two identical sequences that share the
     same ambiguous code at the same offset must have that position
     counted as a non-gap mismatch.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(
         Read('id1', 'ACGTTN'), Read('id2', 'ACGTTN'), matchAmbiguous=False)
     self.assertEqual(expected, actual)
Beispiel #8
0
    def testExcludeGapLocations(self):
        """
        If gap locations are not wanted, they should not appear in the result
        of a call to matchToString.
        """
        read1 = Read('id1', 'TTTTTAAAAAAGCGCG')
        read2 = Read('id2', 'TTTTT------GCGCG')
        match = compareDNAReads(read1, read2)
        self.maxDiff = None
        self.assertEqual(
            '''\
Exact matches: 10/16 (62.50%)
Ambiguous matches: 0
Mismatches: 6/16 (37.50%)
  Not involving gaps (i.e., conflicts): 0
  Involving a gap in one sequence: 6/16 (37.50%)
  Involving a gap in both sequences: 0
  Id: id1
    Length: 16
    Gaps: 0
    Ambiguous: 0
  Id: id2
    Length: 16
    Gaps: 6/16 (37.50%)
    Ambiguous: 0''',
            matchToString(match, read1, read2, includeGapLocations=False)
        )
Beispiel #9
0
 def testNoOffsets(self):
     """
     Passing an empty set of wanted offsets must result in zero for every
     match count and no offsets reported for either read.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 0,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(
         Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets=set())
     self.assertEqual(wanted, result)
Beispiel #10
0
 def testEmptySequences(self):
     """
     Comparing two empty sequences must give zero counts and empty
     offset lists throughout.
     """
     expected = {
         'match': {
             'identicalMatchCount': 0,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', ''), Read('id2', ''))
     self.assertEqual(expected, actual)
Beispiel #11
0
    def testGapLocations(self):
        """
        Gap locations must be returned correctly.
        """
        read1 = Read('id1', 'TTTTTAAAAAAGCGCG')
        read2 = Read('id2', 'TTTTT------GCGCG')
        match = compareDNAReads(read1, read2)
        self.maxDiff = None
        self.assertEqual(
            '''\
Exact matches: 10/16 (62.50%)
Ambiguous matches: 0
Mismatches: 6/16 (37.50%)
  Not involving gaps (i.e., conflicts): 0
  Involving a gap in one sequence: 6/16 (37.50%)
  Involving a gap in both sequences: 0
  Id: id1
    Length: 16
    Gaps: 0
    Ambiguous: 0
  Id: id2
    Length: 16
    Gaps: 6/16 (37.50%)
    Gap locations (1-based): 6, 7, 8, 9, 10, 11
    Ambiguous: 0''',
            matchToString(match, read1, read2)
        )
Beispiel #12
0
 def testGapGap(self):
     """
     A position where both sequences have a gap must be counted in
     gapGapMismatchCount, separately from single-gap mismatches.
     """
     expected = {
         'match': {
             'identicalMatchCount': 2,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [2, 3],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     actual = compareDNAReads(Read('id1', 'AC--T'), Read('id2', 'A--TT'))
     self.assertEqual(expected, actual)
Beispiel #13
0
 def testGapInSecond(self):
     """
     Gaps that occur only in the second sequence must be counted as gap
     mismatches and listed in read2's gapOffsets.
     """
     expected = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'A--TT'))
     self.assertEqual(expected, actual)
Beispiel #14
0
 def testExactMatch(self):
     """
     Two identical sequences must be reported as matching at every
     position, with no mismatches, gaps or ambiguities.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGTT'))
     self.assertEqual(expected, actual)
Beispiel #15
0
 def testOffsets(self):
     """
     When a set of wanted offsets is given, only those offsets may
     contribute to the result.
     """
     expected = {
         'match': {
             'identicalMatchCount': 1,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(
         Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets={0, 4})
     self.assertEqual(expected, actual)
Beispiel #16
0
 def testOffsets(self):
     """
     Supplying a set of wanted offsets must restrict the comparison to
     exactly those offsets.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 1,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(
         Read('id1', 'ATT-T'), Read('id2', 'A-GTC'), offsets={0, 4})
     self.assertEqual(wanted, result)
Beispiel #17
0
 def testMatchWithAmbiguityButStrict(self):
     """
     With matchAmbiguous=False, a single ambiguous code in the first
     sequence (the rest matching exactly) must be counted as a non-gap
     mismatch and not as an ambiguous match.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(
         Read('id1', 'ACGTTS'), Read('id2', 'ACGTTC'), matchAmbiguous=False)
     self.assertEqual(expected, actual)
Beispiel #18
0
 def testMatchWithIdenticalAmbiguityButStrict(self):
     """
     In strict mode (matchAmbiguous=False), an identical ambiguous code
     at the same offset in both sequences must count as a non-gap
     mismatch, with the offset listed as ambiguous for both reads.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(
         Read('id1', 'ACGTTN'), Read('id2', 'ACGTTN'), matchAmbiguous=False)
     self.assertEqual(wanted, result)
Beispiel #19
0
 def testMatchWithIncompatibleAmbiguityInBoth(self):
     """
     Two otherwise identical sequences with incompatible ambiguous codes
     at the same offset must be reported with one non-gap mismatch and
     the offset listed as ambiguous for both reads.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTTW'), Read('id2', 'ACGTTS'))
     self.assertEqual(expected, actual)
Beispiel #20
0
 def testGapAmbiguous(self):
     """
     Ambiguous characters that line up with a gap in the other read must
     still be reported in the ambiguousOffsets of their own read.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 2,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [1],
             'extraCount': 0,
             'gapOffsets': [2, 3],
         },
         'read2': {
             'ambiguousOffsets': [3],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     result = compareDNAReads(Read('id1', 'AN--T'), Read('id2', 'A--NT'))
     self.assertEqual(wanted, result)
Beispiel #21
0
 def testNonDefaultGapChars(self):
     """
     Every character passed in gapChars must be treated as a gap.
     """
     gapChars = '+$'
     expected = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [2],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [0],
         },
     }
     # The same result is expected whichever gap character is used.
     for gap in gapChars:
         read1 = Read('id1', 'AC%sTT' % gap)
         read2 = Read('id2', '%sCGTT' % gap)
         self.assertEqual(
             expected, compareDNAReads(read1, read2, gapChars=gapChars))
Beispiel #22
0
 def testExtraInSecond(self):
     """
     Bases by which the second sequence is longer than the first must be
     counted in read2's extraCount.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 2,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGTTCC'))
     self.assertEqual(expected, actual)
Beispiel #23
0
 def testExactMatch(self):
     """
     Comparing a sequence with an identical one must give an identical
     match at every position and nothing else.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGTT'))
     self.assertEqual(wanted, result)
Beispiel #24
0
 def testExtraAmbiguous(self):
     """
     If the first sequence has extra bases and one of them is ambiguous,
     the extras must be counted in extraCount and the ambiguous base
     must be listed in ambiguousOffsets.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [6],
             'extraCount': 2,
             'gapOffsets': [5],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTT-N'), Read('id2', 'ACGTT'))
     self.assertEqual(wanted, result)
Beispiel #25
0
 def testMatchWithAmbiguityInSecond(self):
     """
     Two sequences that agree everywhere except for one ambiguous code
     in the second sequence must be reported with one ambiguous match
     when no matchAmbiguous argument is given.
     """
     expected = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 1,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTTC'), Read('id2', 'ACGTTS'))
     self.assertEqual(expected, actual)
Beispiel #26
0
 def testGapInSecond(self):
     """
     Gaps present only in the second sequence must appear as gap
     mismatches and in the gapOffsets of read2.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'A--TT'))
     self.assertEqual(wanted, result)
Beispiel #27
0
 def testMatchWithIncompatibleAmbiguityInBoth(self):
     """
     Incompatible ambiguous codes at the same offset of two otherwise
     identical sequences must give one non-gap mismatch, with the offset
     recorded as ambiguous in both reads.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTTW'), Read('id2', 'ACGTTS'))
     self.assertEqual(wanted, result)
Beispiel #28
0
 def testGapGap(self):
     """
     Offsets at which both sequences have a gap must be counted under
     gapGapMismatchCount and not under gapMismatchCount.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 2,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 1,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [2, 3],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [1, 2],
         },
     }
     result = compareDNAReads(Read('id1', 'AC--T'), Read('id2', 'A--TT'))
     self.assertEqual(wanted, result)
Beispiel #29
0
 def testNonDefaultGapChars(self):
     """
     It must be possible to specify which characters count as gaps.
     """
     gapChars = '+$'
     wanted = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 2,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [2],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [0],
         },
     }
     # Each gap character in turn must produce the same result.
     for gap in gapChars:
         first = Read('id1', 'AC%sTT' % gap)
         second = Read('id2', '%sCGTT' % gap)
         result = compareDNAReads(first, second, gapChars=gapChars)
         self.assertEqual(wanted, result)
Beispiel #30
0
 def testMismatch(self):
     """
     Positions where the sequences hold different (non-ambiguous) bases
     must be counted in nonGapMismatchCount.
     """
     expected = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 2,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     actual = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGCC'))
     self.assertEqual(expected, actual)
Beispiel #31
0
 def testExtraInSecond(self):
     """
     When the second sequence is longer than the first, the number of
     additional bases must be given in read2's extraCount.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 2,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGTTCC'))
     self.assertEqual(wanted, result)
Beispiel #32
0
 def testEmptySequences(self):
     """
     Two empty sequences must compare with every count at zero and every
     offset list empty.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 0,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 0,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', ''), Read('id2', ''))
     self.assertEqual(wanted, result)
Beispiel #33
0
    def testMatchWithAmbiguityAndNotStrict(self):
        """
        Two sequences that match exactly, apart from one ambiguity in the first
        sequence, must compare as expected when we specify matchAmbiguous=True
        to allow ambiguous matching.
        """
        read1 = Read('id1', 'ACGTTS')
        read2 = Read('id2', 'ACGTTC')
        match = compareDNAReads(read1, read2, matchAmbiguous=True)

        self.assertEqual(
            '''\
Exact matches: 5/6 (83.33%)
Ambiguous matches: 1/6 (16.67%)
Exact or ambiguous matches: 6/6 (100.00%)
Mismatches: 0
  Not involving gaps (i.e., conflicts): 0
  Involving a gap in one sequence: 0
  Involving a gap in both sequences: 0
  Id: id1
    Length: 6
    Gaps: 0
    Ambiguous: 1/6 (16.67%)
  Id: id2
    Length: 6
    Gaps: 0
    Ambiguous: 0''', matchToString(match, read1, read2, matchAmbiguous=True))
Beispiel #34
0
 def testMismatch(self):
     """
     The number of positions with differing (non-ambiguous) bases must
     be reported in nonGapMismatchCount.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 3,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 2,
         },
         'read1': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(Read('id1', 'ACGTT'), Read('id2', 'ACGCC'))
     self.assertEqual(wanted, result)
Beispiel #35
0
 def testMatchWithAmbiguityButStrict(self):
     """
     In strict mode (matchAmbiguous=False), one ambiguous code in the
     first of two otherwise identical sequences must be counted as a
     non-gap mismatch, with no ambiguous matches reported.
     """
     wanted = {
         'match': {
             'identicalMatchCount': 5,
             'ambiguousMatchCount': 0,
             'gapMismatchCount': 0,
             'gapGapMismatchCount': 0,
             'nonGapMismatchCount': 1,
         },
         'read1': {
             'ambiguousOffsets': [5],
             'extraCount': 0,
             'gapOffsets': [],
         },
         'read2': {
             'ambiguousOffsets': [],
             'extraCount': 0,
             'gapOffsets': [],
         },
     }
     result = compareDNAReads(
         Read('id1', 'ACGTTS'), Read('id2', 'ACGTTC'), matchAmbiguous=False)
     self.assertEqual(wanted, result)
Beispiel #36
0
def collectData(reads1, reads2, square, matchAmbiguous):
    """
    Gather the pairwise match statistics for two sets of reads.

    @param reads1: An C{OrderedDict} whose keys are C{str} read ids and
        whose values are C{Read} instances. These form the table rows.
    @param reads2: An C{OrderedDict} whose keys are C{str} read ids and
        whose values are C{Read} instances. These form the table columns.
    @param square: If C{True}, a set of sequences is being compared with
        itself and pairs with equal ids (the diagonal) are left out.
    @param matchAmbiguous: If C{True}, ambiguous nucleotides that are
        possibly correct are counted as correct. If C{False}, only
        non-ambiguous nucleotides may contribute to the match count.
    @return: A C{defaultdict} of C{dict}s in which the 'match' part of each
        comparison is stored under both C{[id1][id2]} and C{[id2][id1]}.
    """
    pairStats = defaultdict(dict)
    for firstId, firstRead in reads1.items():
        for secondId, secondRead in reads2.items():
            # Nothing is computed for the diagonal of a square table.
            if square and firstId == secondId:
                continue
            details = compareDNAReads(
                firstRead, secondRead, matchAmbiguous=matchAmbiguous)
            matchInfo = details['match']
            if not matchAmbiguous:
                # A strict comparison cannot contain ambiguous matches.
                assert matchInfo['ambiguousMatchCount'] == 0
            # Record the result for both orderings of the pair.
            pairStats[firstId][secondId] = matchInfo
            pairStats[secondId][firstId] = matchInfo

    return pairStats
Beispiel #37
0
def processFeature(featureName, features, genome, fps, featureNumber, args):
    """
    Write the requested reports and sequences for one feature of a genome.

    The nucleotide and amino acid sequences for the feature are always
    obtained (for both the reference and the genome). What is then written,
    and to which stream in C{fps}, is decided by the C{print*} attributes of
    C{args}.

    @param featureName: A C{str} feature name.
    @param features: A C{Features} instance.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
    @param featureNumber: The C{int} 0-based count of the features requested.
        This will be zero for the first feature, 1 for the second, etc.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    """
    def withoutGaps(read):
        # A new read with the same id and every '-' removed from its sequence.
        return Read(read.id, read.sequence.replace('-', ''))

    featureResult = genome.feature(featureName)
    featureInfo = features.getFeature(featureName)
    refNt, gnmNt = featureResult.ntSequences()
    refAa, gnmAa = featureResult.aaSequences()

    # Set once the nucleotide match report has been written, so that the
    # amino acid report (if any) is preceded by a blank line.
    wroteNtMatch = False

    if args.printNtMatch:
        out = fps['nt-match']
        # Separate this feature's report from the previous feature's.
        if featureNumber:
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        print(f'  Reference nt location {featureInfo["start"] + 1}, genome nt '
              f'location {featureResult.genomeOffset + 1}', file=out)
        ntMatch = compareDNAReads(refNt, gnmNt)
        ntSummary = dnaMatchToString(ntMatch, refNt, gnmNt,
                                     matchAmbiguous=False, indent='  ')
        print(ntSummary, file=out)
        printDiffs(refNt, gnmNt, True, featureInfo['start'], out,
                   indent='    ')
        wroteNtMatch = True

    if args.printAaMatch:
        out = fps['aa-match']
        if wroteNtMatch or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(refAa, gnmAa)
        aaSummary = aaMatchToString(aaMatch, refAa, gnmAa, indent='  ')
        print(aaSummary, file=out)
        printDiffs(refAa, gnmAa, False, featureInfo['start'], out,
                   indent='    ')

    # Gap-free genome sequences.
    if args.printNtSequence:
        Reads([withoutGaps(gnmNt)]).save(fps['nt-sequence'])

    if args.printAaSequence:
        Reads([withoutGaps(gnmAa)]).save(fps['aa-sequence'])

    # Alignments: the genome sequence is saved first, then the reference.
    if args.printNtAlignment:
        Reads([gnmNt, refNt]).save(fps['nt-align'])

    if args.printAaAlignment:
        Reads([gnmAa, refAa]).save(fps['aa-align'])
Beispiel #38
0
    def _writeOverallResultSummarySummary(self, results, outputDir):
        """
        Write a condensed summary of the overall results.

        For each alignment file, one line per reference is written giving the
        count and percentage of reference positions at which the consensus
        matches (identically or ambiguously), ordered by decreasing fraction.
        The file ends with the alignment file / reference pair(s) that had
        the highest fraction.

        @param results: A C{dict} of C{dicts}. Keyed by C{str} short alignment
           file name, then C{str} short reference name, and with values being
           C{dict}s with significant offsets and best consensus sequence for
           the corresponding reference in the alignment file.
        @param outputDir: A C{str} directory path. The summary is written to
           C{result-summary-summary.txt} in this directory.
        """
        summaryPath = join(outputDir, 'result-summary-summary.txt')
        self.report('Writing overall result summary summary to', summaryPath)

        # The highest match fraction seen so far and all the
        # (alignment file, reference id) pairs that achieved it.
        topFraction = 0.0
        topPairs = []

        with open(summaryPath, 'w') as out:
            for alignmentName in sorted(results):
                print(alignmentName, file=out)
                byReference = results[alignmentName]
                lines = []

                for refId in sorted(byReference):
                    entry = byReference[refId]
                    reference = self.referenceGenomes[refId]
                    consensus = entry['consensusRead']
                    stats = compareDNAReads(reference, consensus)['match']
                    nMatches = (stats['identicalMatchCount'] +
                                stats['ambiguousMatchCount'])
                    refLength = len(reference)
                    identity = nMatches / refLength

                    pair = (alignmentName, refId)
                    if identity > topFraction:
                        # A new outright winner replaces all earlier ones.
                        topFraction = identity
                        topPairs = [pair]
                    elif identity == topFraction:
                        topPairs.append(pair)

                    text = '  %s: %d/%d (%.2f%%)' % (
                        refId, nMatches, refLength, identity * 100.0)
                    lines.append((identity, text))

                # Print in order of decreasing nucleotide identity fraction.
                for _, text in sorted(lines, reverse=True):
                    print(text, file=out)

                print(file=out)

            plural = '' if len(topPairs) == 1 else 'es'
            print('Best match%s (%.2f%%):' % (plural, topFraction * 100.0),
                  file=out)
            for alignmentName, refId in topPairs:
                print('  %s: %s' % (alignmentName, refId), file=out)
Beispiel #39
0
    def _writeOverallResultSummary(self, results, outputDir):
        """
        Write a summary of the overall results.

        For every reference in every alignment file, three comparisons of
        the reference with its consensus are written: over all offsets, over
        the significant offsets only, and over the non-significant offsets
        only.

        @param results: A C{dict} of C{dicts}. Keyed by C{str} short alignment
           file name, then C{str} short reference name, and with values being
           C{dict}s with significant offsets and best consensus sequence for
           the corresponding reference in the alignment file.
        @param outputDir: A C{str} directory path. The summary is written to
           C{result-summary.txt} in this directory.
        """
        def writeComparison(out, heading, reference, consensus, **offsetArgs):
            # Compare first, then print the heading followed by the formatted
            # match. Any 'offsets' restriction is passed on unchanged to both
            # the comparison and the formatting function.
            comparison = compareDNAReads(reference, consensus, **offsetArgs)
            print(heading, file=out)
            print(matchToString(comparison, reference, consensus,
                                indent='    ', **offsetArgs),
                  file=out)

        summaryPath = join(outputDir, 'result-summary.txt')
        self.report('Writing overall result summary to', summaryPath)

        with open(summaryPath, 'w') as out:
            for alignmentName in sorted(results):
                print('Alignment file', alignmentName, file=out)
                byReference = results[alignmentName]

                for refId in sorted(byReference):
                    entry = byReference[refId]
                    reference = self.referenceGenomes[refId]
                    consensus = entry['consensusRead']
                    refLength = len(reference)
                    significant = entry['significantOffsets']

                    print('\n  Reference %s (length %d)' %
                          (refId, refLength), file=out)
                    print('    %d significant offsets found.' %
                          len(significant), file=out)

                    # All offsets.
                    writeComparison(
                        out,
                        '\n    Overall match of reference with consensus:',
                        reference, consensus)

                    # Just the significant offsets.
                    writeComparison(
                        out,
                        '\n    Match of reference with consensus at '
                        '%d SIGNIFICANT sites:' % len(significant),
                        reference, consensus, offsets=significant)

                    # Every offset that is not significant.
                    everyOffset = set(range(refLength))
                    nonSignificant = everyOffset - set(significant)
                    writeComparison(
                        out,
                        '\n    Match of reference with consensus at '
                        '%d NON-SIGNIFICANT sites:' % len(nonSignificant),
                        reference, consensus, offsets=nonSignificant)
Beispiel #40
0
    def saveClosestReferenceConsensus(self, referenceId, components,
                                      baseCountAtOffset, genomeLength,
                                      alignedReads, referenceInsertions,
                                      outputDir):
        """
        Calculate and save the best consensus to a reference genome.

        The consensus is built in three passes: first the offsets of each
        component (taking nucleotides from its best-matching consistent
        components), then the remaining offsets using all reads except those
        in unused consistent components, and finally anything still
        undecided using all reads (or C{'-'} if no read covers the offset).
        A text report (C{reference-consensus.txt}), the consensus
        (C{reference-consensus.fasta}) and the reads from the consistent
        components that were used (C{cc-wanted.fastq}) are written to
        C{outputDir}.

        @param referenceId: The C{str} id of the reference sequence.
        @param components: A C{list} of C{ComponentByOffsets} instances.
        @param baseCountAtOffset: A C{list} of C{Counter} instances giving
            the count of each nucleotide at each genome offset.
        @param genomeLength: The C{int} length of the genome the reads were
            aligned to.
        @param alignedReads: A list of C{AlignedRead} instances.
        @param referenceInsertions: A C{dict} keyed by read id (the read
            that would cause a reference insertion). The values are lists
            of 2-tuples, with each 2-tuple containing an offset into the
            reference sequence and the C{str} of nucleotide that would be
            inserted starting at that offset.
        @param outputDir: A C{str} directory path.
        @return: A tuple of (consensus, unwantedReads, wantedCcReadCount,
                 wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset).
                 C{consensus} is a C{Read}. C{unwantedReads} is the C{set}
                 of aligned reads that were not in any consistent component
                 used for the consensus. C{wantedCcReadCount} is the C{int}
                 number of reads written to C{cc-wanted.fastq}. The last two
                 values are the first two values returned by C{gatherData}
                 for the aligned reads minus those in unused consistent
                 components.
        """
        def ccMatchCount(cc, reference, drawFp, drawMessage):
            """
            Count the matches between a consistent component and a reference
            genome.

            @param cc: A C{ConsistentComponent} instance.
            @param reference: A C{Read} instance.
            @param drawFp: A file pointer to write information about draws (if
                any) to.
            @param drawMessage: A C{str} message prefix. The location and a
                '%(baseCounts)s' placeholder are appended to it before it is
                passed (as the draw message) to C{commonest}.
            @return: The C{int} count of bases that match the reference
                for the offsets covered by the consistent component.
            """
            referenceSequence = reference.sequence
            nucleotides = cc.nucleotides
            count = 0
            for offset in nucleotides:
                # Locations in messages are 1-based.
                message = (drawMessage + (' location %d: base counts' %
                                          (offset + 1)) + ' %(baseCounts)s.')
                referenceBase = referenceSequence[offset]
                componentBase = commonest(nucleotides[offset],
                                          referenceBase,
                                          drawFp=drawFp,
                                          drawMessage=message)
                count += int(componentBase == referenceBase)
            return count

        def sortedConsistentComponent(component, reference, fp):
            """
            Sort the consistent components in the given C{ComponentByOffsets}
            instance according to how well they match the passed reference.
            The sort order is by increasing match score, so the best
            consistent component is last.

            @param component: A C{ComponentByOffsets} instance.
            @param reference: A C{Read} instance.
            @param fp: A file pointer to write information to.
            @return: A C{list} of 4-tuples, each holding the match score (the
                fraction of covered offsets that match the reference), the
                number of offsets covered, the 0-based index of the
                consistent component, and the consistent component itself.
                The list is sorted in increasing order, so the best
                consistent component is last.
            """
            result = []
            for index, cc in enumerate(component.consistentComponents):
                matchCount = ccMatchCount(
                    cc, reference, fp,
                    '    Consistent component %d base draw' % (index + 1))
                score = matchCount / len(cc.nucleotides)
                print('  Consistent component %d (%d reads) has %d exact '
                      'matches with the reference, out of the %d offsets it '
                      'covers (%.2f%%).' %
                      (index + 1, len(cc.reads), matchCount, len(
                          cc.nucleotides), score * 100.0),
                      file=fp)
                result.append((score, len(cc.nucleotides), index, cc))

            # Tuples sort by score, then by number of offsets covered, then
            # by index.
            result.sort()
            return result

        reference = self.referenceGenomes[referenceId]
        # Anything following the first whitespace in the reference id is
        # kept so it can be appended to the consensus id.
        fields = reference.id.split(maxsplit=1)
        if len(fields) == 1:
            referenceIdRest = ''
        else:
            referenceIdRest = ' ' + fields[1]

        infoFile = join(outputDir, 'reference-consensus.txt')
        self.report('    Saving closest consensus to reference info to',
                    infoFile)

        with open(infoFile, 'w') as infoFp:
            print('Building consensus at significant offsets.', file=infoFp)
            # The consensus base at each offset (None until decided).
            consensus = [None] * genomeLength
            # Offsets whose consensus base has been decided.
            offsetsDone = set()
            # Reads from consistent components that were / were not used.
            wantedReads = set()
            unwantedReads = set()
            for count, component in enumerate(components, start=1):
                print('\nExamining component %d with %d locations: %s' %
                      (count, len(component.offsets),
                       commas(map(lambda offset: offset + 1,
                                  component.offsets))),
                      file=infoFp)
                componentOffsets = set(component.offsets)
                sortedCcs = sortedConsistentComponent(component, reference,
                                                      infoFp)

                # Take consistent components, best first, until every offset
                # of this component has a consensus base.
                while componentOffsets - offsetsDone:
                    # The following pop call will raise an IndexError if
                    # the sorted cc list is empty. But if it's empty we
                    # shouldn't be here, because the set of included
                    # offsets should at that point include everything in
                    # this component. Having the naked pop here ensures we
                    # get an exception if this assumption is incorrect.
                    # It's like having an assert to test that we found all
                    # the component's offsets following the loop.
                    score, _, ccIndex, cc = sortedCcs.pop()

                    print('  Incorporating nucleotides from consistent '
                          'component %d (%d reads, score %.2f, covering %d '
                          'locations (%d still undecided in consensus)) to '
                          'consensus.' %
                          (ccIndex + 1, len(
                              cc.reads), score, len(cc.nucleotides),
                           len(set(cc.nucleotides) - offsetsDone)),
                          file=infoFp)

                    wantedReads |= cc.reads
                    for offset in sorted(cc.nucleotides):
                        # Offsets decided by an earlier consistent component
                        # are not overwritten.
                        if offset in offsetsDone:
                            continue
                        nucleotides = cc.nucleotides[offset]
                        referenceBase = reference.sequence[offset]
                        base = commonest(
                            nucleotides,
                            referenceBase,
                            drawFp=infoFp,
                            drawMessage=('      WARNING: base count draw at '
                                         'location %d ' %
                                         (offset + 1)) + ' %(baseCounts)s.')
                        assert consensus[offset] is None
                        consensus[offset] = base
                        offsetsDone.add(offset)

                        # Do some reporting on the base just added.
                        if base == referenceBase:
                            mismatch = ''
                        else:
                            # For the report, also find the commonest base
                            # at this offset according to the counts in
                            # baseCountAtOffset.
                            consensusBase = commonest(
                                baseCountAtOffset[offset],
                                referenceBase,
                                drawFp=infoFp,
                                drawMessage=(
                                    '      WARNING: consensus base count '
                                    'draw at location %d ' % (offset + 1)) +
                                ' %(baseCounts)s.')
                            mismatch = (
                                ' (mismatch: reference has %s, all-read '
                                'consensus has %s)' %
                                (referenceBase, consensusBase))

                        print('    Location %d: %s from nucleotides %s%s' %
                              (offset + 1, base, nucleotides.baseCountsToStr(),
                               mismatch),
                              file=infoFp)

                # Print info about the consistent components that were not
                # needed to cover all the offsets in this component. Reverse
                # the list so we print them in decreasing match score order.
                for score, _, ccIndex, cc in reversed(sortedCcs):
                    unwantedReads |= cc.reads
                    print('  Will NOT incorporate nucleotides from consistent '
                          'component %d (%d reads, score %.2f, covering %d '
                          'locations) to consensus.' %
                          (ccIndex + 1, len(
                              cc.reads), score, len(cc.nucleotides)),
                          file=infoFp)

            # Get the base counts at each offset, from the full set of
            # aligned reads minus the reads in consistent components we're
            # not using.
            (wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset,
             _) = gatherData(genomeLength,
                             set(alignedReads) - unwantedReads)

            # Process the insignificant offsets, based on all reads EXCEPT
            # those not used in the connected components.
            offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
            print('\nAttempting to add bases from %d non-significant '
                  'consensus locations, using all reads, EXCEPT those '
                  'belonging to unused consistent components:' %
                  len(offsetsToTry),
                  file=infoFp)
            for offset in offsetsToTry:
                assert consensus[offset] is None
                baseCount = wantedReadsBaseCountAtOffset[offset]
                # Offsets with no base counts here are left for the final
                # pass below.
                if baseCount:
                    referenceBase = reference.sequence[offset]
                    base = commonest(
                        baseCount,
                        referenceBase,
                        drawFp=infoFp,
                        drawMessage=(
                            '    WARNING: consensus base count draw at '
                            'location %d' % (offset + 1)) + ' %(baseCounts)s.')
                    print('  Location %d: %s from nucleotides %s' %
                          (offset + 1, base, baseCountsToStr(baseCount)),
                          file=infoFp,
                          end='')

                    # Finish the line just started (it was printed with
                    # end='').
                    if base == referenceBase:
                        print(file=infoFp)
                    else:
                        print(' (mismatch: reference has %s)' % referenceBase,
                              file=infoFp)
                    consensus[offset] = base
                    offsetsDone.add(offset)

            # Process remaining insignificant offsets, using ALL reads
            # (i.e., including those in consistent components that we wanted
            # to avoid using).  At this point, this is the best we can do
            # with these final offsets (otherwise we will get gaps - which
            # in some cases may actually be preferable because the reference
            # sequence may not be fully covered by the actual infection
            # sequence).
            offsetsToTry = sorted(set(range(genomeLength)) - offsetsDone)
            print('\nAttempting to add bases from %d non-significant '
                  'consensus locations, using all reads, INCLUDING those '
                  'belonging to unused consistent components:' %
                  len(offsetsToTry),
                  file=infoFp)
            for offset in offsetsToTry:
                assert consensus[offset] is None
                referenceBase = reference.sequence[offset]
                baseCount = baseCountAtOffset[offset]
                if baseCount:
                    base = commonest(
                        baseCount,
                        referenceBase,
                        drawFp=infoFp,
                        drawMessage=(
                            '    WARNING: consensus base count draw at '
                            'location %d' % (offset + 1)) + ' %(baseCounts)s.')
                    print('  Location %d: %s from nucleotides %s' %
                          (offset + 1, base, baseCountsToStr(baseCount)),
                          file=infoFp,
                          end='')
                else:
                    # The reads did not cover this offset.
                    base = '-'
                    print('  Location %d: -' % (offset + 1),
                          file=infoFp,
                          end='')

                # Finish the line just started (it was printed with end='').
                if base == referenceBase:
                    print(file=infoFp)
                else:
                    print(' (mismatch: reference has %s)' % referenceBase,
                          file=infoFp)
                consensus[offset] = base
                offsetsDone.add(offset)

            # Sanity check: make sure we processed all offsets.
            assert offsetsDone == set(range(genomeLength))

            consensusId = (
                '%s-consensus%s' %
                (self.shortReferenceId[referenceId], referenceIdRest))

            # From here on, 'consensus' is a Read, not a list of bases.
            consensus = Read(consensusId, ''.join(consensus))

            # Print details of the match of the consensus to the reference.
            match = compareDNAReads(reference, consensus)
            print('\nOVERALL match with reference:', file=infoFp)
            print(matchToString(match, reference, consensus, indent='  '),
                  file=infoFp)

            # Print any insertions to the reference.
            wantedReadsWithInsertions = (set(referenceInsertions) &
                                         (set(alignedReads) - unwantedReads))
            if wantedReadsWithInsertions:
                print('\nReference insertions present in %d read%s:' %
                      (len(wantedReadsWithInsertions),
                       s(len(wantedReadsWithInsertions))),
                      file=infoFp)
                # Count the inserted bases, keyed by the offset each base
                # would have (insertion offset plus index in the inserted
                # sequence).
                nucleotides = defaultdict(Counter)
                for readId in wantedReadsWithInsertions:
                    for (offset, sequence) in referenceInsertions[readId]:
                        for index, base in enumerate(sequence):
                            nucleotides[offset + index][base] += 1
                print(nucleotidesToStr(nucleotides, prefix='  '), file=infoFp)
            else:
                print('\nReference insertions: none.', file=infoFp)

        filename = join(outputDir, 'reference-consensus.fasta')
        self.report('    Saving consensus to', filename)
        Reads([consensus]).save(filename)

        # Save the reads from the consistent components that were used,
        # skipping any whose alignment is secondary or supplementary.
        wantedCcReadCount = 0
        filename = join(outputDir, 'cc-wanted.fastq')
        with open(filename, 'w') as fp:
            for wantedCcRead in wantedReads:
                alignment = wantedCcRead.alignment
                if not (alignment.is_secondary or alignment.is_supplementary):
                    wantedCcReadCount += 1
                    print(Read(alignment.query_name, alignment.query_sequence,
                               alignmentQuality(alignment)).toString('fastq'),
                          end='',
                          file=fp)
        self.report(
            '    Saved %d read%s wanted in consistent connected components '
            'to %s' % (wantedCcReadCount, s(wantedCcReadCount), filename))

        # Note that this replaces the set of unwanted reads collected above
        # (the reads in unused consistent components): the value returned is
        # every aligned read that was not in a consistent component used for
        # the consensus.
        unwantedReads = set(alignedReads) - wantedReads

        return (consensus, unwantedReads, wantedCcReadCount,
                wantedReadsCountAtOffset, wantedReadsBaseCountAtOffset)
Beispiel #41
0
    def saveAlternateConsensus(self, referenceId, consensusRead,
                               baseCountAtOffset, readCountAtOffset,
                               genomeLength, outputDir):
        """
        Calculate and save an alternate consensus to a reference genome.

        At each offset the alternate consensus takes the most frequent
        nucleotide that differs from the original consensus, provided it was
        seen more than once and its frequency exceeds
        C{self.alternateNucleotideMinFreq}. Otherwise the original consensus
        nucleotide is kept. Offsets with no base counts get a C{'-'}.

        @param referenceId: The C{str} id of the reference sequence.
        @param consensusRead: The C{dark.reads.Read} consensus sequence to
            calculate an alternative to.
        @param baseCountAtOffset: A C{list} of C{Counter} instances giving
            the count of each nucleotide at each genome offset.
        @param readCountAtOffset: A C{list} of C{int} counts of the total
            number of reads at each genome offset (i.e., just the sum of the
            values in C{baseCountAtOffset})
        @param genomeLength: The C{int} length of the genome the reads were
            aligned to.
        @param outputDir: A C{str} directory path.
        @return: An alternate consensus C{dark.reads.Read} instance.
        """
        infoPath = join(outputDir, 'reference-alternate-consensus.txt')
        self.report('    Writing alternate consensus info to', infoPath)
        chosenBases = []

        referenceRead = self.referenceGenomes[referenceId]
        # Keep whatever follows the first whitespace in the reference id, so
        # it can be appended to the alternate consensus id.
        idFields = referenceRead.id.split(maxsplit=1)
        referenceIdRest = '' if len(idFields) == 1 else ' ' + idFields[1]

        with open(infoPath, 'w') as infoFp:
            print(
                'The three nucleotides shown on each line are for the '
                'reference, the consensus (made by clustering) and the '
                'alternate nucleotide.\nThese are followed by up to two '
                'asterisks: the first if the alternate nucleotide does not '
                'agree with the reference, the second if it does not agree '
                'with the consensus.',
                file=infoFp)

            for offset in range(genomeLength):
                referenceBase = referenceRead.sequence[offset]
                consensusBase = consensusRead.sequence[offset]
                counts = baseCountAtOffset[offset]

                if not counts:
                    # The reads did not cover this offset.
                    print('  Location %d: -' % (offset + 1), file=infoFp)
                    chosenBases.append('-')
                    continue

                # Unless a good enough alternative turns up below, stay with
                # the original consensus base. When only one nucleotide was
                # seen at this location there is no alternative to consider.
                base = consensusBase
                if len(counts) != 1:
                    # The most frequent nucleotide that is not the consensus
                    # nucleotide. With at least two distinct nucleotides
                    # counted, one of them must differ from the consensus.
                    alternateBase, alternateCount = next(
                        pair for pair in counts.most_common()
                        if pair[0] != consensusBase)

                    # Check that we found a base that's not the consensus
                    # base.
                    assert alternateBase != consensusBase

                    # Only switch if the alternate base is frequent enough.
                    alternateFraction = (alternateCount /
                                         readCountAtOffset[offset])
                    if (alternateCount > 1 and alternateFraction >
                            self.alternateNucleotideMinFreq):
                        base = alternateBase

                referenceFlag = ' ' if referenceBase == base else '*'
                consensusFlag = ' ' if consensusBase == base else '*'
                print('Location %d: %s %s %s %s %s nucleotides %s' %
                      (offset + 1, referenceBase, consensusBase, base,
                       referenceFlag, consensusFlag,
                       baseCountsToStr(counts)),
                      file=infoFp)
                chosenBases.append(base)

            alternateConsensusId = (
                '%s-alternate-consensus%s' %
                (self.shortReferenceId[referenceId], referenceIdRest))

            alternateConsensusRead = Read(alternateConsensusId,
                                          ''.join(chosenBases))

            # Report how the alternate consensus matches the reference, how
            # it matches the original consensus, and how the original
            # consensus matches the reference.
            comparisons = (
                ('\nAlternate consensus match with reference:',
                 referenceRead, alternateConsensusRead),
                ('\nAlternate consensus match with original consensus:',
                 consensusRead, alternateConsensusRead),
                ('\nOriginal consensus match with reference:',
                 referenceRead, consensusRead),
            )
            for heading, first, second in comparisons:
                match = compareDNAReads(first, second)
                print(heading, file=infoFp)
                print(matchToString(match, first, second, indent='  '),
                      file=infoFp)

        fastaPath = join(outputDir, 'reference-alternate-consensus.fasta')
        self.report('    Saving alternate consensus FASTA to', fastaPath)
        Reads([alternateConsensusRead]).save(fastaPath)

        return alternateConsensusRead
def processFeature(featureName, genome, fps, featureNumber, args):
    """
    Report on one feature of a genome, as requested on the command line.

    Depending on the options in C{args}, nucleotide and/or amino acid match
    summaries, gap-free sequences, and alignments for the feature are written
    to the corresponding streams in C{fps}.

    @param featureName: A C{str} feature name.
    @param genome: A C{SARS2Genome} instance.
    @param fps: A C{dict} of file pointers for the various output streams.
        The keys used here are 'nt-match', 'aa-match', 'nt-sequence',
        'aa-sequence', 'nt-align', and 'aa-align'.
    @param featureNumber: The C{int} 0-based count of the features requested.
        This will be zero for the first feature, 1 for the second, etc.
    @param args: A C{Namespace} instance as returned by argparse with
        values for command-line options.
    @raise TranslationError: If the feature cannot be translated and
        C{args.onError} is 'raise'.
    """
    referenceNt, genomeNt = genome.ntSequences(featureName)
    feature = genome.features[featureName]

    # The amino acid reads stay as None unless they are wanted and the
    # translation succeeds. All amino acid output below is skipped when
    # genomeAa is false.
    referenceAa = genomeAa = None

    aaWanted = (args.printAaMatch or args.printAaSequence or
                args.printAaAlignment)
    if aaWanted:
        try:
            referenceAa, genomeAa = genome.aaSequences(featureName)
        except TranslationError as e:
            if args.onError == 'raise':
                raise
            if args.onError == 'print':
                print(
                    f'Could not translate feature {featureName} in genome '
                    f'{genome.genome.id}: {e}',
                    file=sys.stderr)
            # Any other onError value ignores the failure silently.

    ntMatchPrinted = False

    if args.printNtMatch:
        out = fps['nt-match']
        # Separate this feature's output from that of the previous one.
        if featureNumber:
            print(file=out)
        print(f'Feature: {featureName} nucleotide match', file=out)
        print(f'  Reference nt location {feature["start"] + 1}', file=out)
        ntMatch = compareDNAReads(referenceNt, genomeNt)
        ntSummary = dnaMatchToString(ntMatch,
                                     referenceNt,
                                     genomeNt,
                                     matchAmbiguous=False,
                                     indent='  ')
        print(ntSummary, file=out)
        printDiffs(referenceNt, genomeNt, True, feature['start'], out,
                   indent='    ')
        ntMatchPrinted = True

    if args.printAaMatch and genomeAa:
        out = fps['aa-match']
        # A blank line is needed if a nucleotide match was just printed for
        # this feature or if this is not the first feature.
        if ntMatchPrinted or featureNumber:
            print(file=out)
        print(f'Feature: {featureName} amino acid match', file=out)
        aaMatch = compareAaReads(referenceAa, genomeAa)
        aaSummary = aaMatchToString(aaMatch, referenceAa, genomeAa,
                                    indent='  ')
        print(aaSummary, file=out)
        printDiffs(referenceAa, genomeAa, False, feature['start'], out,
                   indent='    ')

    if args.printNtSequence:
        # Save the genome nucleotides with the alignment gaps removed.
        ungappedNt = Read(genomeNt.id, genomeNt.sequence.replace('-', ''))
        Reads([ungappedNt]).save(fps['nt-sequence'])

    if args.printAaSequence and genomeAa:
        # Save the genome amino acids with the alignment gaps removed.
        ungappedAa = Read(genomeAa.id, genomeAa.sequence.replace('-', ''))
        Reads([ungappedAa]).save(fps['aa-sequence'])

    if args.printNtAlignment:
        Reads([genomeNt, referenceNt]).save(fps['nt-align'])

    if args.printAaAlignment and genomeAa:
        Reads([genomeAa, referenceAa]).save(fps['aa-align'])
Beispiel #43
0
def plotConsistentComponents(referenceId,
                             genomeLength,
                             components,
                             significantOffsets,
                             outfile,
                             infoFile,
                             outputDir,
                             title='xxx',
                             show=False,
                             titleFontSize=12,
                             axisFontSize=12):
    """
    Plot consistent connected components and write a text summary of them.

    @param referenceId: Not used by this function.
    @param genomeLength: The C{int} genome length, used to set the upper
        limit of the x axis.
    @param components: An iterable of component objects, each with
        C{offsets} and C{consistentComponents} attributes.
    @param significantOffsets: A collection of C{int} 0-based significant
        offsets.
    @param outfile: The C{str} name of the file to write the plot to.
    @param infoFile: The C{str} name of a file to write text details to.
    @param outputDir: The C{str} directory holding the
        'component-N-consensuses.fasta' files (one per component, with N
        counting from 1).
    @param title: The C{str} plot title.
    @param show: Passed to plotly as C{auto_open}.
    @param titleFontSize: The font size for the plot title.
    @param axisFontSize: The font size for the axis titles.
    """
    def toLocationsStr(offsets):
        # Sorted 0-based offsets as a comma-separated string of 1-based
        # locations.
        return ', '.join(str(offset + 1) for offset in sorted(offsets))

    def boundaryLine(location):
        # A faint vertical line at a (1-based) genome location.
        return go.Scatter(
            x=(location, location),
            y=(-0.05, 1.05),
            mode='lines',
            hoverinfo='none',
            line={
                'color': '#eee',
            },
            showlegend=False,
        )

    def axisSpec(axisRange, axisTitle):
        # The layout settings for one plot axis.
        return {
            'range': axisRange,
            'title': axisTitle,
            'titlefont': {
                'size': axisFontSize,
            },
        }

    traces = []
    nSignificant = len(significantOffsets)

    with open(infoFile, 'w') as fp:

        print('There are %d significant location%s: %s' %
              (nSignificant, s(nSignificant),
               toLocationsStr(significantOffsets)),
              file=fp)

        for count, component in enumerate(components, start=1):

            nConsistent = len(component.consistentComponents)
            print('Processing component %d, with %d consistent component%s' %
                  (count, nConsistent, s(nConsistent)),
                  file=fp)

            # Read the sequences for the component. The first is used as
            # the reference and the others are indexed by consistent
            # component number below.
            fastaFile = join(outputDir,
                             'component-%d-consensuses.fasta' % count)
            reads = list(FastaReads(fastaFile))

            reference = reads[0]
            referenceLength = len(reference)
            sortedOffsets = sorted(component.offsets)
            nOffsets = len(component.offsets)

            # Plotted values are 1-based locations, not 0-based offsets.
            firstLocation = min(component.offsets) + 1
            lastLocation = max(component.offsets) + 1

            print('  Offset range: %d to %d' % (firstLocation, lastLocation),
                  file=fp)

            # Add a top line to represent the reference.
            traces.append(
                go.Scatter(x=(firstLocation, lastLocation),
                           y=(1.05, 1.05),
                           hoverinfo='text',
                           name=('Reference component %s' % count),
                           text=('Reference component %s, %d offsets' %
                                 (count, nOffsets))))

            # Add vertical lines at the start and end of this component.
            traces.append(boundaryLine(firstLocation))
            traces.append(boundaryLine(lastLocation))

            for ccCount, cc in enumerate(component.consistentComponents,
                                         start=1):

                nCovered = len(cc.nucleotides)
                ccSummary = ('Component read count %d, offsets covered %d/%d' %
                             (len(cc.reads), nCovered, nOffsets))

                # Get the consistent connected component consensus.
                consensus = reads[ccCount]
                assert ('consistent-component-%d' % ccCount) in consensus.id

                print('  Processing consistent component', ccCount, file=fp)
                print('  Component sequence:', consensus.sequence, file=fp)
                print('  %d offset%s: %s' %
                      (nCovered, s(nCovered),
                       toLocationsStr(cc.nucleotides)),
                      file=fp)

                match = compareDNAReads(reference, consensus)
                print(matchToString(match, reference, consensus,
                                    indent='    '),
                      file=fp)

                # The match fraction will ignore gaps in the consensus
                # sequence as it is padded with '-' chars to align it to
                # the reference.
                matchCounts = match['match']
                matched = (matchCounts['identicalMatchCount'] +
                           matchCounts['ambiguousMatchCount'])
                ungappedLength = (referenceLength -
                                  len(match['read2']['gapOffsets']))
                fraction = matched / ungappedLength

                locations = []
                hoverTexts = []
                # Indices (into locations) of the points where the
                # consensus agrees with the reference.
                identical = []
                for index, offset in enumerate(sortedOffsets):
                    if offset not in cc.nucleotides:
                        continue

                    consensusBase = consensus.sequence[index]
                    referenceBase = reference.sequence[index]

                    if consensusBase == referenceBase:
                        identical.append(len(locations))

                    # x axis values are 1-based (locations, not offsets)
                    locations.append(offset + 1)

                    hoverTexts.append(
                        'Location: %d, component: %s, reference: %s'
                        '<br>Component nucleotides: %s<br>%s' %
                        (offset + 1, consensusBase, referenceBase,
                         baseCountsToStr(cc.nucleotides[offset]),
                         ccSummary))

                # Every point of a consistent component is drawn at the
                # same height (its match fraction). Points identical to the
                # reference are blue, the others red.
                traces.append(
                    go.Scatter(x=locations,
                               y=[fraction] * nCovered,
                               hoverinfo='text',
                               selectedpoints=identical,
                               showlegend=False,
                               text=hoverTexts,
                               mode='markers',
                               selected={'marker': {
                                   'color': 'blue',
                               }},
                               unselected={'marker': {
                                   'color': 'red',
                               }}))

    # Add the significant offsets.
    traces.append(
        go.Scatter(x=[offset + 1 for offset in significantOffsets],
                   y=[-0.05] * nSignificant,
                   text=[
                       'Location %d' % (offset + 1)
                       for offset in significantOffsets
                   ],
                   hoverinfo='text',
                   mode='markers',
                   name='Significant locations'))

    layout = go.Layout(
        title=title,
        titlefont={
            'size': titleFontSize,
        },
        xaxis=axisSpec((0, genomeLength + 1), 'Genome location'),
        yaxis=axisSpec((-0.1, 1.1),
                       'Nucleotide identity with reference sequence'),
        hovermode='closest',
    )

    figure = go.Figure(data=traces, layout=layout)
    plotly.offline.plot(figure, filename=outfile, auto_open=show,
                        show_link=False)
Beispiel #44
0
    # Align.
    reads = needle(reads)

    if args.alignmentFile:
        assert reads.save(args.alignmentFile) == 2

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

result = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict))

match = result['match']
identicalMatchCount = match['identicalMatchCount']
ambiguousMatchCount = match['ambiguousMatchCount']
gapMismatchCount = match['gapMismatchCount']
gapGapMismatchCount = match['gapGapMismatchCount']
nonGapMismatchCount = match['nonGapMismatchCount']

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' % (x, len1, len2,
                                                  abs(len1 - len2)))
Beispiel #45
0
    if args.alignmentFile:
        assert reads.save(args.alignmentFile) == 2

offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareDNAReads(read1, read2, matchAmbiguous=(not args.strict),
                        offsets=offsets)

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' % (x, len1, len2,
                                                  abs(len1 - len2)))

print(matchToString(match, read1, read2, matchAmbiguous=(not args.strict),
                    offsets=offsets))

if args.showDiffs:
    # Print all sites where the sequences differ.
    width = int(log10(max(len1, len2))) + 1
    headerPrinted = False
    if args.alignmentFile:
        assert reads.save(args.alignmentFile) == 2

offsets = (parseRangeString(args.sites, convertToZeroBased=True)
           if args.sites else None)

read1, read2 = reads
len1, len2 = map(len, reads)
identicalLengths = len1 == len2

# Sanity check.
if args.align:
    assert identicalLengths

match = compareDNAReads(read1,
                        read2,
                        matchAmbiguous=(not args.strict),
                        offsets=offsets)

x = 'Post-alignment, sequence' if args.align else 'Sequence'
if identicalLengths:
    print('%s lengths are identical: %s' % (x, len1))
else:
    print('%s lengths: %d, %d (difference %d)' %
          (x, len1, len2, abs(len1 - len2)))

print(
    matchToString(match,
                  read1,
                  read2,
                  matchAmbiguous=(not args.strict),
                  offsets=offsets))