def pickup_best_clusters(self, fq_filenames):
    """Pick up high QV clusters.

    Reads every record of the quivered FASTQ files, classifies each
    cluster as good or bad from its trimmed quality values, and writes
    the records to the good/bad FASTA and FASTQ outputs.

    A cluster is good when the sum of ``phred_to_qv`` over its quality
    values, ignoring the first ``self.qv_trim_5`` and the last
    ``self.qv_trim_3`` positions, is at most ``self.qv_max_err``.

    Parameters:
        fq_filenames: sequence of quivered FASTQ file paths. When a
            cluster id occurs more than once, the last record read wins.

    Returns:
        None. Results are written to ``self.quivered_good_fa``,
        ``self.quivered_good_fq``, ``self.quivered_bad_fa`` and
        ``self.quivered_bad_fq``.
    """
    # Log text (including its spelling) is kept verbatim in case log
    # consumers match on it.
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(fq_filenames)
        )
    )

    # NOTE(review): `load` is presumably pickle's loader -- only feed it
    # trusted files. The handle is closed as soon as the data is read.
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)["uc"]

    # Map integer cluster id -> quivered record.
    quivered = {}
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # Keep the part before '|', drop an optional "_ref" suffix,
            # then strip the one-character prefix and parse the id.
            cid = r.name.split("|")[0]
            if cid.endswith("_ref"):
                cid = cid[:-4]
            cid = int(cid[1:])
            quivered[cid] = r

    # A set keeps the membership test in the write loop O(1).
    good = set()
    for cid, r in quivered.items():
        # presumably phred_to_qv yields a per-base error value -- confirm
        q = [phred_to_qv(x) for x in r.quality]
        # Use an explicit end index: a slice ending at `-self.qv_trim_3`
        # is empty when qv_trim_3 == 0 (since -0 == 0), which would make
        # every cluster pass the threshold.
        trimmed_end = max(0, len(q) - self.qv_trim_3)
        if sum(q[self.qv_trim_5:trimmed_end]) <= self.qv_max_err:
            good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)["partial_uc"]
    # Clusters without an entry in partial_uc count as having no members.
    partial_uc2 = defaultdict(list)
    partial_uc2.update(partial_uc)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.items():
            # Output name: c<cid>/f<len(uc[cid])>p<len(partial_uc2[cid])>/<length>
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".format(
                cid=cid,
                flnc_num=len(uc[cid]),
                nfl_num=len(partial_uc2[cid]),
                read_len=len(r.sequence),
            )
            if cid in good:
                self.add_log("processing quivered cluster {c} --> good.".format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log("processing quivered cluster {c} --> bad.".format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log(
        "High-quality Quivered consensus written "
        + "to:\n{0}\n{1}\n".format(self.quivered_good_fa, self.quivered_good_fq)
    )
    self.add_log(
        "Low-qulality Quivered consensus written "
        + "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq)
    )
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high QV clusters.

    Reads every record of the quivered FASTQ files, classifies each
    cluster as good or bad, optionally writes a report, and writes the
    records to the good/bad FASTA and FASTQ outputs.

    A cluster is good when both hold:
      * ``1 - err_sum / qv_len >= self.hq_quiver_min_accuracy``, where
        ``err_sum`` is the sum of ``phred_to_qv`` over the quality values
        left after ignoring the first ``self.qv_trim_5`` and the last
        ``self.qv_trim_3`` positions, and ``qv_len`` is how many are left;
      * ``uc[cid]`` has at least 2 members.
    Records with nothing left after trimming are always bad.

    Parameters:
        fq_filenames: sequence of quivered FASTQ file paths. When a
            cluster id occurs more than once, the last record read wins.

    Returns:
        None. Results are written to ``self.quivered_good_fa``,
        ``self.quivered_good_fq``, ``self.quivered_bad_fa`` and
        ``self.quivered_bad_fq`` (plus ``self.report_fn`` when set).
    """
    # Log text (including its spelling) is kept verbatim in case log
    # consumers match on it.
    self.add_log("Picking up the best clusters according to QVs from {fs}.".
                 format(fs=", ".join(fq_filenames)))

    # NOTE(review): `load` is presumably pickle's loader -- only feed it
    # trusted files. The handle is closed as soon as the data is read.
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    # Map integer cluster id -> quivered record.
    quivered = {}
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID: c0/0_1611|quiver
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            cid = int(cid[1:])
            quivered[cid] = r

    # A set keeps the membership test in the write loop O(1).
    good = set()
    for cid, r in quivered.items():
        qv_len = max(0, len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
        if qv_len != 0:
            # presumably phred_to_qv yields a per-base error value -- confirm
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: a slice ending at
            # `-self.qv_trim_3` is empty when qv_trim_3 == 0 (-0 == 0),
            # which would give err_sum == 0 and a perfect accuracy.
            trimmed_end = max(0, len(q) - self.qv_trim_3)
            err_sum = sum(q[self.qv_trim_5:trimmed_end])
            # LIZ HACK: definitely of HQ must include # of FL >= 2 !!!
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy and \
                    len(uc[cid]) >= 2:
                good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    # Clusters without an entry in partial_uc count as having no members.
    partial_uc2 = defaultdict(list)
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc, partial_uc=partial_uc2)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".
                 format(f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".
                 format(f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.items():
            # Base name: c<cid>/f<len(uc[cid])>p<len(partial_uc2[cid])>/<length>,
            # then passed through cid_with_annotation.
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))
            newname = cid_with_annotation(newname)

            if cid in good:
                self.add_log("processing quivered cluster {c} --> good.".
                             format(c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log("processing quivered cluster {c} --> bad.".
                             format(c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                        self.quivered_good_fq))
    self.add_log("Low-qulality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_bad_fa,
                                        self.quivered_bad_fq))
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high QV clusters.

    Reads every record of the quivered FASTQ files, classifies each
    cluster as good or bad, optionally writes a report, and writes the
    records to the good/bad FASTA and FASTQ outputs.

    A cluster is good when
    ``1 - err_sum / qv_len >= self.hq_quiver_min_accuracy``, where
    ``err_sum`` is the sum of ``phred_to_qv`` over the quality values
    left after ignoring the first ``self.qv_trim_5`` and the last
    ``self.qv_trim_3`` positions, and ``qv_len`` is how many are left.
    Records with nothing left after trimming are always bad.

    Parameters:
        fq_filenames: sequence of quivered FASTQ file paths. When a
            cluster id occurs more than once, the last record read wins.

    Returns:
        None. Results are written to ``self.quivered_good_fa``,
        ``self.quivered_good_fq``, ``self.quivered_bad_fa`` and
        ``self.quivered_bad_fq`` (plus ``self.report_fn`` when set).
    """
    # Log text (including its spelling) is kept verbatim in case log
    # consumers match on it.
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(fq_filenames)))

    # NOTE(review): `load` is presumably pickle's loader -- only feed it
    # trusted files. The handle is closed as soon as the data is read.
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    # Map integer cluster id -> quivered record.
    quivered = {}
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # possible ID: c0/0_1611|quiver
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            i = cid.find('/')
            if i > 0:
                cid = cid[:i]
            cid = int(cid[1:])
            quivered[cid] = r

    # A set keeps the membership test in the write loop O(1).
    good = set()
    for cid, r in quivered.items():
        qv_len = max(0, len(r.quality) - self.qv_trim_5 - self.qv_trim_3)
        if qv_len != 0:
            # presumably phred_to_qv yields a per-base error value -- confirm
            q = [phred_to_qv(x) for x in r.quality]
            # Use an explicit end index: a slice ending at
            # `-self.qv_trim_3` is empty when qv_trim_3 == 0 (-0 == 0),
            # which would give err_sum == 0 and a perfect accuracy.
            trimmed_end = max(0, len(q) - self.qv_trim_3)
            err_sum = sum(q[self.qv_trim_5:trimmed_end])
            if 1.0 - (err_sum / float(qv_len)) >= self.hq_quiver_min_accuracy:
                good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    # Clusters without an entry in partial_uc count as having no members.
    partial_uc2 = defaultdict(list)
    partial_uc2.update(partial_uc)

    if self.report_fn is not None:
        self.write_report(report_fn=self.report_fn,
                          uc=uc,
                          partial_uc=partial_uc2)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
        f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.items():
            # Base name: c<cid>/f<len(uc[cid])>p<len(partial_uc2[cid])>/<length>,
            # then passed through cid_with_annotation.
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))
            newname = cid_with_annotation(newname)

            if cid in good:
                self.add_log(
                    "processing quivered cluster {c} --> good.".format(
                        c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log(
                    "processing quivered cluster {c} --> bad.".format(
                        c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}".format(self.quivered_good_fa,
                                        self.quivered_good_fq))
    self.add_log(
        "Low-qulality Quivered consensus written " +
        "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq))
    self.add_log("-" * 60, level=logging.INFO)
def pickup_best_clusters(self, fq_filenames):
    """Pick up high QV clusters.

    Reads every record of the quivered FASTQ files, classifies each
    cluster as good or bad from its trimmed quality values, and writes
    the records to the good/bad FASTA and FASTQ outputs.

    A cluster is good when the sum of ``phred_to_qv`` over its quality
    values, ignoring the first ``self.qv_trim_5`` and the last
    ``self.qv_trim_3`` positions, is at most ``self.qv_max_err``.

    Parameters:
        fq_filenames: sequence of quivered FASTQ file paths. When a
            cluster id occurs more than once, the last record read wins.

    Returns:
        None. Results are written to ``self.quivered_good_fa``,
        ``self.quivered_good_fq``, ``self.quivered_bad_fa`` and
        ``self.quivered_bad_fq``.
    """
    # Log text (including its spelling) is kept verbatim in case log
    # consumers match on it.
    self.add_log(
        "Picking up the best clusters according to QVs from {fs}.".format(
            fs=", ".join(fq_filenames)))

    # NOTE(review): `load` is presumably pickle's loader -- only feed it
    # trusted files. The handle is closed as soon as the data is read.
    with open(self.final_pickle_fn) as handle:
        uc = load(handle)['uc']

    # Map integer cluster id -> quivered record.
    quivered = {}
    for fq in fq_filenames:
        self.add_log("Looking at quivered fq {f}".format(f=fq))
        for r in FastqReader(fq):
            # Keep the part before '|', drop an optional "_ref" suffix,
            # then strip the one-character prefix and parse the id.
            cid = r.name.split('|')[0]
            if cid.endswith('_ref'):
                cid = cid[:-4]
            cid = int(cid[1:])
            quivered[cid] = r

    # A set keeps the membership test in the write loop O(1).
    good = set()
    for cid, r in quivered.items():
        # presumably phred_to_qv yields a per-base error value -- confirm
        q = [phred_to_qv(x) for x in r.quality]
        # Use an explicit end index: a slice ending at `-self.qv_trim_3`
        # is empty when qv_trim_3 == 0 (since -0 == 0), which would make
        # every cluster pass the threshold.
        trimmed_end = max(0, len(q) - self.qv_trim_3)
        if sum(q[self.qv_trim_5:trimmed_end]) <= self.qv_max_err:
            good.add(cid)

    with open(self.nfl_all_pickle_fn) as handle:
        partial_uc = load(handle)['partial_uc']
    # Clusters without an entry in partial_uc count as having no members.
    partial_uc2 = defaultdict(list)
    partial_uc2.update(partial_uc)

    self.add_log("Writing hiqh-quality isoforms to {f}|fq".format(
        f=self.quivered_good_fa))
    self.add_log("Writing low-quality isoforms to {f}|fq".format(
        f=self.quivered_bad_fa))
    with FastaWriter(self.quivered_good_fa) as good_fa_writer, \
            FastaWriter(self.quivered_bad_fa) as bad_fa_writer, \
            FastqWriter(self.quivered_good_fq) as good_fq_writer, \
            FastqWriter(self.quivered_bad_fq) as bad_fq_writer:
        for cid, r in quivered.items():
            # Output name: c<cid>/f<len(uc[cid])>p<len(partial_uc2[cid])>/<length>
            newname = "c{cid}/f{flnc_num}p{nfl_num}/{read_len}".\
                format(cid=cid,
                       flnc_num=len(uc[cid]),
                       nfl_num=len(partial_uc2[cid]),
                       read_len=len(r.sequence))

            if cid in good:
                self.add_log(
                    "processing quivered cluster {c} --> good.".format(
                        c=cid))
                good_fa_writer.writeRecord(newname, r.sequence)
                good_fq_writer.writeRecord(newname, r.sequence, r.quality)
            else:
                self.add_log(
                    "processing quivered cluster {c} --> bad.".format(
                        c=cid))
                bad_fa_writer.writeRecord(newname, r.sequence)
                bad_fq_writer.writeRecord(newname, r.sequence, r.quality)

    self.add_log("-" * 60, level=logging.INFO)
    self.add_log("High-quality Quivered consensus written " +
                 "to:\n{0}\n{1}\n".format(self.quivered_good_fa,
                                          self.quivered_good_fq))
    self.add_log(
        "Low-qulality Quivered consensus written " +
        "to:\n{0}\n{1}".format(self.quivered_bad_fa, self.quivered_bad_fq))
    self.add_log("-" * 60, level=logging.INFO)