def _getBestFrontBackRecord(self, domFN):
    """Collect the top-scoring primer hit per read from a phmmer DOM file.

    Records are read with DOMReader(domFN). A record is skipped when its
    sStart or pStart is greater than 48. The sequence id of every
    remaining record must end in '_front' or '_back'; that suffix is
    stripped from the record's sid IN PLACE and selects the table the
    record is filed under.

    Returns a tuple (best_of_front, best_of_back). Each one is a
    defaultdict (looking up a missing sequence id yields None) laid out as
        sequence id --> primer id --> DOMRecord with the highest score
    On equal scores the record seen first is kept.

    Raises ClassifierException when a sequence id carries neither suffix.
    """
    logging.info("Get the best front & back primer hits.")

    best_of_front = defaultdict(lambda: None)
    best_of_back = defaultdict(lambda: None)
    # Suffix on the sequence id --> table collecting hits for that end.
    suffix_tables = (('_front', best_of_front), ('_back', best_of_back))

    for rec in DOMReader(domFN):
        # Ignore hits that start beyond position 48 on either the
        # sequence or the primer (original note: "allow missing adapter").
        if rec.sStart > 48 or rec.pStart > 48:
            continue

        for suffix, table in suffix_tables:
            if rec.sid.endswith(suffix):
                # Drop the suffix so both tables share the same read id.
                rec.sid = rec.sid[:-len(suffix)]
                break
        else:
            raise ClassifierException(
                "Unable to parse a read {r} in phmmer dom file {f}.".
                format(r=rec.sid, f=domFN))

        hits_by_primer = table.setdefault(rec.sid, {})
        current = hits_by_primer.get(rec.pid)
        # Only a strictly better score replaces an existing entry.
        if current is None or current.score < rec.score:
            hits_by_primer[rec.pid] = rec

    return (best_of_front, best_of_back)
def test_Reader(self):
    """Check the first two records DOMReader yields for the test DOM file."""
    # Both expected records refer to the same read.
    read_id = (
        "m131018_081703_42161_c100585152550000001823088404281404_s1_p0/45/ccs")

    dom_path = op.join(self.testDir, "data/test_DOMReader.dom")
    records = list(DOMReader(dom_path))

    expected = [
        DOMRecord("F1", read_id, 23.7, 0, 31, 31, 2170, 2201, 3931),
        DOMRecord("R1", read_id, 16.2, 0, 25, 25, 3906, 3931, 3931),
    ]
    # Compare position by position; any extra records are not inspected.
    for idx, want in enumerate(expected):
        self.assertEqual(records[idx], want)
def _getChimeraRecord(self, domFN, opts):
    """Gather primer hits that fall in the MIDDLE of a sequence.

    Reads phmmer DOM output (produced from trimmed reads) with
    DOMReader(domFN) and keeps every record that satisfies all of:
        sStart > opts.min_dist_from_end
        sEnd   < sLen - opts.min_dist_from_end
        score  > opts.min_score
    Such hits mark the read as a suspicious chimera.

    Returns a defaultdict: sequence id --> list of matching DOMRecords,
    in the order the reader produced them.
    """
    logging.info("Identify chimera records from {f}.".format(f=domFN))

    def _is_internal_hit(hit):
        # Far enough from both ends of the sequence, and a decent score.
        return (hit.sStart > opts.min_dist_from_end and
                hit.sEnd < hit.sLen - opts.min_dist_from_end and
                hit.score > opts.min_score)

    hits_by_sid = defaultdict(list)
    for hit in DOMReader(domFN):
        if _is_internal_hit(hit):
            hits_by_sid[hit.sid].append(hit)
    return hits_by_sid
def _getBestFrontBackRecord(self, domFN, min_score):
    """Collect the top-scoring primer hit per read from a phmmer DOM file.

    Records are read with DOMReader(domFN). A record is skipped when
      * its sStart or pStart is greater than 48, or
      * its score is below min_score.
    The sequence id of every remaining record must end in '_front' or
    '_back'; that suffix is stripped from the record's sid IN PLACE and
    selects the table the record is filed under.

    Example ids (from the original notes):
        sid: m160213_091647_42134_c100957952550000001823213806221633_s1_p0/54497/ccs_front
        pid: f_G11

    Returns a tuple (best_of_front, best_of_back). Each one is a
    defaultdict (looking up a missing sequence id yields None) laid out as
        sequence id --> primer id --> DOMRecord with the highest score
    On equal scores the record seen first is kept.

    Raises ClassifierException when a sequence id carries neither suffix.
    """
    logging.info("Get the best front & back primer hits.")

    best_of_front = defaultdict(lambda: None)
    best_of_back = defaultdict(lambda: None)
    # Suffix on the sequence id --> table collecting hits for that end.
    suffix_tables = (('_front', best_of_front), ('_back', best_of_back))

    for rec in DOMReader(domFN):
        # Ignore hits that start beyond position 48 on either the
        # sequence or the primer (original note: "allow missing adapter").
        if rec.sStart > 48 or rec.pStart > 48:
            continue
        # Ignore hits scoring under the caller's threshold.
        if rec.score < min_score:
            continue

        for suffix, table in suffix_tables:
            if rec.sid.endswith(suffix):
                # Drop the suffix so both tables share the same read id.
                rec.sid = rec.sid[:-len(suffix)]
                break
        else:
            raise ClassifierException(
                "Unable to parse a read {r} in phmmer dom file {f}.".
                format(r=rec.sid, f=domFN))

        hits_by_primer = table.setdefault(rec.sid, {})
        current = hits_by_primer.get(rec.pid)
        # Only a strictly better score replaces an existing entry.
        if current is None or current.score < rec.score:
            hits_by_primer[rec.pid] = rec

    return (best_of_front, best_of_back)