Beispiel #1
0
    def _getBestFrontBackRecord(self, domFN):
        """Collect the top-scoring primer hits per read end from a phmmer
           DOM file.

           Records whose sid ends with '_front' or '_back' are stored in the
           matching table, keyed by the sid with that suffix removed (the
           record's own sid attribute is rewritten in place).  For each
           (sid, pid) pair only the highest-scoring record is kept; on equal
           scores the record read first stays.

           Returns (best_of_front, best_of_back), each mapping
               sequence id ---> primer id ---> record
           Both are defaultdicts that give None for a sid with no stored hit.

           Raises ClassifierException when a record that passed the position
           filter has a sid with neither suffix.
        """
        logging.info("Get the best front & back primer hits.")
        best_of_front = defaultdict(lambda: None)
        best_of_back = defaultdict(lambda: None)
        # sid suffix --> table collecting the hits for that read end
        tables = (('_front', best_of_front), ('_back', best_of_back))

        for rec in DOMReader(domFN):
            # Ignore hits whose sStart or pStart lies beyond position 48.
            if rec.sStart > 48 or rec.pStart > 48:
                continue

            for suffix, table in tables:
                if rec.sid.endswith(suffix):
                    rec.sid = rec.sid[:-len(suffix)]
                    break
            else:
                # Neither suffix matched, so the read end is unknown.
                raise ClassifierException(
                    "Unable to parse a read {r} in phmmer dom file {f}.".
                    format(r=rec.sid, f=domFN))

            hits = table.setdefault(rec.sid, {})
            best = hits.get(rec.pid)
            # Strict '<' keeps the earlier record when scores tie.
            if best is None or best.score < rec.score:
                hits[rec.pid] = rec
        return (best_of_front, best_of_back)
Beispiel #2
0
 def test_Reader(self):
     """Check the first two records DOMReader yields for the test file."""
     dom_path = op.join(self.testDir, "data/test_DOMReader.dom")
     records = list(DOMReader(dom_path))

     # Both expected records refer to the same read.
     read_name = \
         "m131018_081703_42161_c100585152550000001823088404281404_s1_p0/45/ccs"
     want_first = DOMRecord("F1", read_name,
                            23.7, 0, 31, 31, 2170, 2201, 3931)
     want_second = DOMRecord("R1", read_name,
                             16.2, 0, 25, 25, 3906, 3931, 3931)

     self.assertEqual(records[0], want_first)
     self.assertEqual(records[1], want_second)
Beispiel #3
0
 def _getChimeraRecord(self, domFN, opts):
     """Gather primer hits that suggest a chimeric read.

        Reads every record from the phmmer DOM file domFN and keeps those
        that lie strictly more than opts.min_dist_from_end away from both
        ends of the sequence (sStart / sEnd against sLen) and whose score
        is strictly greater than opts.min_score.

        Returns a defaultdict mapping
            sid --> list of matching records, in file order
        A sid without any match gives an empty list on lookup.
     """
     logging.info("Identify chimera records from {f}.".format(f=domFN))
     hits_by_read = defaultdict(list)
     for rec in DOMReader(domFN):
         # Hit must sit in the interior of the sequence ...
         in_middle = (rec.sStart > opts.min_dist_from_end and
                      rec.sEnd < rec.sLen - opts.min_dist_from_end)
         # ... and score above the threshold.
         if in_middle and rec.score > opts.min_score:
             hits_by_read[rec.sid].append(rec)
     return hits_by_read
Beispiel #4
0
    def _getBestFrontBackRecord(self, domFN, min_score):
        """Collect the top-scoring primer hits per read end from a phmmer
           DOM file, ignoring hits that score below min_score.

           Records whose sid ends with '_front' or '_back' are stored in the
           matching table, keyed by the sid with that suffix removed (the
           record's own sid attribute is rewritten in place).  For each
           (sid, pid) pair only the highest-scoring record is kept; on equal
           scores the record read first stays.

           Returns (best_of_front, best_of_back), each mapping
               sequence id ---> primer id ---> record
           Both are defaultdicts that give None for a sid with no stored hit.

           Raises ClassifierException when a record that passed the filters
           has a sid with neither suffix.
        """
        logging.info("Get the best front & back primer hits.")
        best_of_front = defaultdict(lambda: None)
        best_of_back = defaultdict(lambda: None)

        for hit in DOMReader(domFN):
            # Drop hits starting beyond position 48 (sStart or pStart) and
            # hits scoring below the requested minimum.
            if hit.sStart > 48 or hit.pStart > 48 or hit.score < min_score:
                continue

            # ex: sid m160213_091647_42134_c100957952550000001823213806221633_s1_p0/54497/ccs_front
            # ex: pid f_G11
            name = hit.sid
            if name.endswith('_front'):
                table = best_of_front
                hit.sid = name[:-6]   # strip '_front'
            elif name.endswith('_back'):
                table = best_of_back
                hit.sid = name[:-5]   # strip '_back'
            else:
                raise ClassifierException(
                    "Unable to parse a read {r} in phmmer dom file {f}.".
                    format(r=name, f=domFN))

            per_primer = table.setdefault(hit.sid, {})
            incumbent = per_primer.get(hit.pid)
            # Strict '<' keeps the earlier record when scores tie.
            if incumbent is None or incumbent.score < hit.score:
                per_primer[hit.pid] = hit
        return (best_of_front, best_of_back)