def test_getBestFrontBackRecord(self):
    """Test function _getBestFrontBackRecord().

    Parses data/test_parseHmmDom.dom, dumps the returned front and back
    mappings to out/test_parseHmmDom_dFront.txt and
    out/test_parseHmmDom_dBack.txt, and requires each dump to compare
    equal (filecmp.cmp) to the file of the same name under stdout/.
    """
    obj = Classifier()
    domFN = op.join(self.testDir, "data/test_parseHmmDom.dom")
    front, back = obj._getBestFrontBackRecord(domFN)

    # In the following, verify the front and back are equivalent
    # to stdout/test_parseHmmDom_dFront/Back.txt
    def prettystr(d):
        """Return a pretty-printed string for a front or back mapping.

        Each outer key is written as "key:" followed by one "k:v" line
        per entry of its inner mapping.
        """
        return "\n".join(
            [key + ":\n" + "\n".join(
                [k + ":" + str(v) for k, v in val.iteritems()])
             for key, val in d.iteritems()])

    frontFN = op.join(self.testDir, "out/test_parseHmmDom_dFront.txt")
    backFN = op.join(self.testDir, "out/test_parseHmmDom_dBack.txt")

    # Context managers close (and therefore flush) the handles before
    # filecmp reads the files, even if write() raises part-way through.
    with open(frontFN, 'w') as f:
        f.write(prettystr(front))
    with open(backFN, 'w') as f:
        f.write(prettystr(back))

    stdoutFrontFN = op.join(self.testDir,
                            "stdout/test_parseHmmDom_dFront.txt")
    stdoutBackFN = op.join(self.testDir,
                           "stdout/test_parseHmmDom_dBack.txt")
    self.assertTrue(filecmp.cmp(frontFN, stdoutFrontFN))
    self.assertTrue(filecmp.cmp(backFN, stdoutBackFN))
def test_chunkReads(self):
    """Check _chunkReads(readsFN, chunkSize, chunkedReadsFNs).

    Calls _chunkReads on data/test_chunkReads_1.fa with a chunk size
    argument of 10 and a single output path, then requires the output
    under out/ to compare equal to the reference file under stdout/.
    """
    classifier = Classifier()
    source_fa = op.join(self.testDir, "data/test_chunkReads_1.fa")
    produced_fa = op.join(self.testDir, "out/test_chunkReads_1.fa")

    # Remove any output left by an earlier run before regenerating it.
    if op.exists(produced_fa):
        os.remove(produced_fa)

    reference_fa = op.join(self.testDir, "stdout/test_chunkReads_1.fa")
    classifier._chunkReads(source_fa, 10, [produced_fa])

    matches = filecmp.cmp(produced_fa, reference_fa)
    self.assertTrue(matches)
def test_processPrimers(self):
    """Check _processPrimers() against three reference outputs.

    Every scenario runs with window_size=50 and compares the file written
    under out/ with the file of the same name under stdout/.
    """
    # (input primers, produced file, reference file, revcmp_primers)
    scenarios = (
        # Artificial example.
        ("data/test_primers_in.fa",
         "out/test_primers_out.fa",
         "stdout/test_primers_out.fa",
         False),
        # Real PacBio primers.fa, output used for primer detection.
        ("data/primers.fa",
         "out/test_primers_out_2.fa",
         "stdout/test_primers_out_2.fa",
         False),
        # Real PacBio primers.fa, output used for chimera detection.
        ("data/primers.fa",
         "out/test_primers_out_3.fa",
         "stdout/test_primers_out_3.fa",
         True),
    )

    classifier = Classifier()
    for primers_rel, produced_rel, reference_rel, revcmp in scenarios:
        primers_path = op.join(self.testDir, primers_rel)
        produced_path = op.join(self.testDir, produced_rel)
        reference_path = op.join(self.testDir, reference_rel)

        classifier._processPrimers(primer_fn=primers_path,
                                   window_size=50,
                                   primer_out_fn=produced_path,
                                   revcmp_primers=revcmp)
        self.assertTrue(filecmp.cmp(produced_path, reference_path))
def test_findPolyA(self):
    """Check _findPolyA(seq) on three fixed sequences.

    The first two sequences contain a long run of A's and are expected to
    yield 188 and 196; the third has no such run and is expected to
    yield -1.
    """
    with_polya_1 = (
        "GTGAAGTAGGTGTCCCGCACCAAGGCACGGAGCCAGAGAGGTGTGGGTGC"
        "TAAAAGCCACCCGTTAGGACCCAGAGCAGCTGAAGCTGGATGCGAAAGGA"
        "TACAGGCTTAGTAGCCATGGAGACCAAACTGGAACAAATGCCGACTGGAA"
        "AGTGTATCTTATAACTTATTAAATAAAATGTTTGCTCCACGAAAAAAAAA"
        "AAAAAAAAAAAAAAGTACTCTGCGTTGATACCACTGCTT")
    with_polya_2 = (
        "TGGTTGGTCGGCGTTTAGCTTTGTGAGGCTCCCTGAACAGAAACACTGTT"
        "GGAAGAAGAGTCCCCTGACATCACCCAGCGTCAAGTGGGAGTTAGCCTCT"
        "GAAGTTCAGTGTATCACGTTAATGCTAATATGCTTTGTGGTGGCAGAATT"
        "TATTTTGGCTTTTTGTCATTTAGCCAAATTAAAGGCAAACGCGTTTCTAA"
        "AAAAAAAAAAAAAAAAAAAAGTAGCTCTGCGTTTGATACCACTGCTT")
    without_polya = "TATTTTGGCTTTTTGTCATTTAGCCAAATTAAAGGCAAACGCGTTTCTAA"

    classifier = Classifier()
    expectations = (
        (with_polya_1, 188),
        (with_polya_2, 196),
        (without_polya, -1),
    )
    for sequence, expected in expectations:
        self.assertEqual(classifier._findPolyA(sequence), expected)
def test_pickBestPrimerCombo(self):
    """Test function _pickBestPrimerCombo().

    Loads the front/back records from data/test_parseHmmDom.dom and checks
    the combo picked for the reads of ZMW 43, 45 and 54 of one movie.
    """
    classifier = Classifier()
    dom_path = op.join(self.testDir, "data/test_parseHmmDom.dom")
    front, back = classifier._getBestFrontBackRecord(dom_path)

    movie = "m131018_081703_42161_c100585152550000001823088404281404_s1_p0"
    rid_43, rid_45, rid_54 = [movie + "/" + str(zmw) + "/ccs"
                              for zmw in [43, 45, 54]]

    def pick(rid):
        """Pick the best primer combo for one read id."""
        return classifier._pickBestPrimerCombo(
            front[rid], back[rid], [0, 1], 10)

    # ZMW 43: neither record is reported.
    combo = pick(rid_43)
    self.assertTrue(combo[2] is None)
    self.assertTrue(combo[3] is None)

    # ZMW 45: both records are reported.
    combo = pick(rid_45)
    expected_fw = DOMRecord("F1", movie + "/45/ccs",
                            33.0, 0, 30, 31, 0, 30, 100)
    expected_rc = DOMRecord("R1", movie + "/45/ccs",
                            27.2, 0, 25, 25, 0, 25, 100)
    self.assertEqual(combo[0], 1)
    self.assertEqual(combo[1], "+")
    # Records are compared through their string form.
    self.assertTrue(str(expected_fw) == str(combo[2]))
    self.assertTrue(str(expected_rc) == str(combo[3]))

    # ZMW 54: only the second record is reported.
    combo = pick(rid_54)
    expected_rc = DOMRecord("R1", movie + "/54/ccs",
                            22.3, 0, 25, 25, 0, 27, 100)
    self.assertEqual(combo[0], 1)
    self.assertEqual(combo[1], "+")
    self.assertTrue(combo[2] is None)
    self.assertTrue(str(combo[3]) == str(expected_rc))
def run(self):
    """Run the classify, cluster or subset sub-command.

    The sub-command is read from self.args.subCommand; the matching worker
    object is built from the parsed arguments and its run() is called.

    Returns:
        0 on success, 1 if any exception was raised while building or
        running the worker (the exception is logged with its traceback).
    """
    args = self.args
    cmd = args.subCommand

    def pick(*names):
        """Collect the named attributes of args into a keyword dict."""
        return {name: getattr(args, name) for name in names}

    try:
        if cmd == 'classify':
            chimera_opts = ChimeraDetectionOptions(**pick(
                'min_seq_len', 'min_score', 'min_dist_from_end',
                'max_adjacent_hit_dist', 'primer_search_window',
                'detect_chimera_nfl'))
            job = Classifier(reads_fn=args.readsFN,
                             out_dir=args.outDir,
                             out_reads_fn=args.outReadsFN,
                             primer_fn=args.primerFN,
                             primer_report_fn=args.primerReportFN,
                             summary_fn=args.summary_fn,
                             cpus=args.cpus,
                             change_read_id=True,
                             opts=chimera_opts,
                             out_flnc_fn=args.flnc_fa,
                             out_nfl_fn=args.nfl_fa,
                             ignore_polyA=args.ignore_polyA,
                             reuse_dom=args.reuse_dom)
        elif cmd == 'cluster':
            ice = IceOptions(**pick('cDNA_size', 'quiver', 'use_finer_qv'))
            sge = SgeOptions(**pick(
                'unique_id', 'use_sge', 'max_sge_jobs',
                'blasr_nproc', 'quiver_nproc'))
            ipq = IceQuiverHQLQOptions(**pick(
                'qv_trim_5', 'qv_trim_3', 'hq_quiver_min_accuracy',
                'hq_isoforms_fa', 'hq_isoforms_fq',
                'lq_isoforms_fa', 'lq_isoforms_fq'))
            job = Cluster(root_dir=args.root_dir,
                          flnc_fa=args.flnc_fa,
                          nfl_fa=args.nfl_fa,
                          bas_fofn=args.bas_fofn,
                          ccs_fofn=args.ccs_fofn,
                          fasta_fofn=args.fasta_fofn,
                          out_fa=args.consensusFa,
                          sge_opts=sge,
                          ice_opts=ice,
                          ipq_opts=ipq,
                          report_fn=args.report_fn,
                          summary_fn=args.summary_fn,
                          nfl_reads_per_split=args.nfl_reads_per_split)
        elif cmd == 'subset':
            subset_rules = SubsetRules(**pick('FL', 'nonChimeric'))
            job = ReadsSubsetExtractor(
                inFN=args.readsFN,
                outFN=args.outFN,
                rules=subset_rules,
                ignore_polyA=args.ignore_polyA,
                printReadLengthOnly=args.printReadLengthOnly)
        else:
            raise PBTranscriptException(
                cmd,
                "Unknown command passed to pbtranscript.py:" + args.subName)
        job.run()
    except Exception:
        # Top-level boundary: report the failure through the exit code.
        logging.exception("Exiting pbtranscript with return code 1.")
        return 1
    return 0
def run(self):
    """Run the classify, cluster or subset sub-command.

    The sub-command is read from self.args.subCommand; the matching worker
    object is built from the parsed arguments and its run() is called.

    Returns:
        0 on success, 1 if any exception was raised while building or
        running the worker.
    """
    cmd = self.args.subCommand
    try:
        if cmd == 'classify':
            opts = ChimeraDetectionOptions(
                min_seq_len=self.args.min_seq_len,
                min_score=self.args.min_score,
                min_dist_from_end=self.args.min_dist_from_end,
                max_adjacent_hit_dist=self.args.max_adjacent_hit_dist,
                primer_search_window=self.args.primer_search_window)
            obj = Classifier(reads_fn=self.args.readsFN,
                             out_dir=self.args.outDir,
                             out_reads_fn=self.args.outReadsFN,
                             primer_fn=self.args.primerFN,
                             primer_report_fn=self.args.primerReportFN,
                             summary_fn=self.args.summary_fn,
                             cpus=self.args.cpus,
                             change_read_id=True,
                             opts=opts,
                             out_flnc_fn=self.args.flnc_fa,
                             out_nfl_fn=self.args.nfl_fa,
                             ignore_polyA=self.args.ignore_polyA)
            obj.run()
        elif cmd == 'cluster':
            ice_opts = IceOptions(cDNA_size=self.args.cDNA_size,
                                  quiver=self.args.quiver)
            sge_opts = SgeOptions(unique_id=self.args.unique_id,
                                  use_sge=self.args.use_sge,
                                  max_sge_jobs=self.args.max_sge_jobs,
                                  blasr_nproc=self.args.blasr_nproc,
                                  quiver_nproc=self.args.quiver_nproc)
            obj = Cluster(root_dir=self.args.root_dir,
                          flnc_fa=self.args.flnc_fa,
                          nfl_fa=self.args.nfl_fa,
                          bas_fofn=self.args.bas_fofn,
                          ccs_fofn=self.args.ccs_fofn,
                          out_fa=self.args.consensusFa,
                          sge_opts=sge_opts,
                          ice_opts=ice_opts,
                          hq_isoforms_fa=self.args.hq_isoforms_fa,
                          hq_isoforms_fq=self.args.hq_isoforms_fq,
                          lq_isoforms_fa=self.args.lq_isoforms_fa,
                          lq_isoforms_fq=self.args.lq_isoforms_fq,
                          report_fn=self.args.report_fn,
                          summary_fn=self.args.summary_fn)
            obj.run()
        elif cmd == 'subset':
            rules = SubsetRules(FL=self.args.FL,
                                nonChimeric=self.args.nonChimeric)
            obj = ReadsSubsetExtractor(
                inFN=self.args.readsFN,
                outFN=self.args.outFN,
                rules=rules,
                ignore_polyA=self.args.ignore_polyA,
                printReadLengthOnly=self.args.printReadLengthOnly)
            obj.run()
        else:
            raise PBTranscriptException(
                cmd,
                "Unknown command passed to pbtranscript.py:" +
                self.args.subName)
    except Exception as err:
        # Top-level boundary: turn any failure into exit code 1.
        # logging.exception keeps the original message and level (ERROR)
        # but also records the exception type and traceback, which a bare
        # logging.error(str(err)) discards.
        logging.exception(str(err))
        return 1
    return 0
def run(self):
    """Run the classify, cluster or subset sub-command.

    The sub-command is read from self.args.subCommand; the matching worker
    object is built from the parsed arguments and its run() is called.

    Returns:
        0 on success, 1 if any exception was raised while building or
        running the worker (the exception is logged with its traceback).
    """
    args = self.args
    cmd = args.subCommand

    def pick(*names):
        """Collect the named attributes of args into a keyword dict."""
        return {name: getattr(args, name) for name in names}

    try:
        if cmd == 'classify':
            chimera_opts = ChimeraDetectionOptions(**pick(
                'min_seq_len', 'min_score', 'min_dist_from_end',
                'max_adjacent_hit_dist', 'primer_search_window',
                'detect_chimera_nfl'))
            job = Classifier(reads_fn=args.readsFN,
                             out_dir=args.outDir,
                             out_reads_fn=args.outReadsFN,
                             primer_fn=args.primerFN,
                             primer_report_fn=args.primerReportFN,
                             summary_fn=args.summary_fn,
                             cpus=args.cpus,
                             change_read_id=True,
                             opts=chimera_opts,
                             out_flnc_fn=args.flnc_fa,
                             out_nfl_fn=args.nfl_fa,
                             ignore_polyA=args.ignore_polyA,
                             keep_primer=args.keep_primer,
                             reuse_dom=args.reuse_dom)
        elif cmd == 'cluster':
            ice = IceOptions(**pick(
                'quiver', 'use_finer_qv', 'targeted_isoseq',
                'ece_penalty', 'ece_min_len'))
            sge = SgeOptions(**pick(
                'unique_id', 'use_sge', 'max_sge_jobs', 'blasr_nproc',
                'quiver_nproc', 'gcon_nproc', 'sge_env_name', 'sge_queue'))
            ipq = IceQuiverHQLQOptions(**pick(
                'qv_trim_5', 'qv_trim_3', 'hq_quiver_min_accuracy',
                'hq_isoforms_fa', 'hq_isoforms_fq',
                'lq_isoforms_fa', 'lq_isoforms_fq'))
            job = Cluster(root_dir=args.root_dir,
                          flnc_fa=args.flnc_fa,
                          nfl_fa=args.nfl_fa,
                          bas_fofn=args.bas_fofn,
                          ccs_fofn=args.ccs_fofn,
                          fasta_fofn=args.fasta_fofn,
                          out_fa=args.consensusFa,
                          sge_opts=sge,
                          ice_opts=ice,
                          ipq_opts=ipq,
                          report_fn=args.report_fn,
                          summary_fn=args.summary_fn,
                          nfl_reads_per_split=args.nfl_reads_per_split)
        elif cmd == 'subset':
            subset_rules = SubsetRules(**pick('FL', 'nonChimeric'))
            job = ReadsSubsetExtractor(
                inFN=args.readsFN,
                outFN=args.outFN,
                rules=subset_rules,
                ignore_polyA=args.ignore_polyA,
                printReadLengthOnly=args.printReadLengthOnly)
        else:
            raise PBTranscriptException(
                cmd,
                "Unknown command passed to pbtranscript.py:" + args.subName)
        job.run()
    except Exception:
        # Top-level boundary: report the failure through the exit code.
        logging.exception("Exiting pbtranscript with return code 1.")
        return 1
    return 0
def run(self):
    """Run the classify, cluster or subset sub-command.

    The sub-command is read from self.args.subCommand; the matching worker
    object is built from the parsed arguments and its run() is called.

    Returns:
        0 on success, 1 if any exception was raised while building or
        running the worker.
    """
    cmd = self.args.subCommand
    try:
        if cmd == 'classify':
            opts = ChimeraDetectionOptions(
                min_seq_len=self.args.min_seq_len,
                min_score=self.args.min_score,
                min_dist_from_end=self.args.min_dist_from_end,
                max_adjacent_hit_dist=self.args.max_adjacent_hit_dist,
                primer_search_window=self.args.primer_search_window)
            obj = Classifier(reads_fn=self.args.readsFN,
                             out_dir=self.args.outDir,
                             out_reads_fn=self.args.outReadsFN,
                             primer_fn=self.args.primerFN,
                             primer_report_fn=self.args.primerReportFN,
                             summary_fn=self.args.summary_fn,
                             cpus=self.args.cpus,
                             change_read_id=True,
                             opts=opts,
                             out_flnc_fn=self.args.flnc_fa,
                             out_nfl_fn=self.args.nfl_fa,
                             ignore_polyA=self.args.ignore_polyA)
            obj.run()
        elif cmd == 'cluster':
            ice_opts = IceOptions(cDNA_size=self.args.cDNA_size,
                                  quiver=self.args.quiver)
            sge_opts = SgeOptions(unique_id=self.args.unique_id,
                                  use_sge=self.args.use_sge,
                                  max_sge_jobs=self.args.max_sge_jobs,
                                  blasr_nproc=self.args.blasr_nproc,
                                  quiver_nproc=self.args.quiver_nproc)
            obj = Cluster(root_dir=self.args.root_dir,
                          flnc_fa=self.args.flnc_fa,
                          nfl_fa=self.args.nfl_fa,
                          bas_fofn=self.args.bas_fofn,
                          ccs_fofn=self.args.ccs_fofn,
                          out_fa=self.args.consensusFa,
                          sge_opts=sge_opts,
                          ice_opts=ice_opts,
                          hq_isoforms_fa=self.args.hq_isoforms_fa,
                          hq_isoforms_fq=self.args.hq_isoforms_fq,
                          lq_isoforms_fa=self.args.lq_isoforms_fa,
                          lq_isoforms_fq=self.args.lq_isoforms_fq,
                          report_fn=self.args.report_fn,
                          summary_fn=self.args.summary_fn)
            obj.run()
        elif cmd == 'subset':
            rules = SubsetRules(FL=self.args.FL,
                                nonChimeric=self.args.nonChimeric)
            obj = ReadsSubsetExtractor(
                inFN=self.args.readsFN,
                outFN=self.args.outFN,
                rules=rules,
                ignore_polyA=self.args.ignore_polyA,
                printReadLengthOnly=self.args.printReadLengthOnly)
            obj.run()
        else:
            raise PBTranscriptException(
                cmd,
                "Unknown command passed to pbtranscript.py:" +
                self.args.subName)
    except Exception as err:
        # Top-level boundary: turn any failure into exit code 1.
        # logging.exception keeps the original message and level (ERROR)
        # but also records the exception type and traceback, which a bare
        # logging.error(str(err)) discards.
        logging.exception(str(err))
        return 1
    return 0