def _progress_indicator(rec_nr, n_discarded, handle=None):
    # A priming call with a file handle stores the indicator's state as
    # attributes on the function object; later calls only report progress.
    if handle is not None:
        if isinstance(handle, gzip.GzipFile):
            # Track progress through the compressed file.
            _progress_indicator.endpos = os.path.getsize(handle.name)
            _progress_indicator.get_pos = handle.fileobj.tell
        else:
            try:
                _progress_indicator.endpos = filesize(handle)
                _progress_indicator.get_pos = handle.tell
            except IOError:
                # Unseekable stream (e.g. a pipe): progress is unknown.
                _progress_indicator.endpos = None
                _progress_indicator.get_pos = lambda: None
        _progress_indicator.prev_time = time()
        return

    get_pos = _progress_indicator.get_pos
    endpos = _progress_indicator.endpos
    frac = None if endpos is None else float(get_pos()) / endpos
    # Redraw at most twice per second, or when the file is fully processed.
    if time() - _progress_indicator.prev_time > .5 or frac == 1.0:
        _progress_indicator.prev_time = time()
        perc_str = '?%' if frac is None else '%.2f%%' % (frac * 100)
        stdout.write(term.EL(2) + term.CHA(0) +
                     'Processed %s records (%s)' % (rec_nr + 1, perc_str))
        stdout.flush()
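# The indicator above keeps its state (endpos, get_pos, prev_time) as
# attributes on the function object: one priming call with a file handle
# initializes it, and subsequent calls only report progress. A minimal,
# self-contained sketch of the same pattern (illustrative names, standard
# library only; not part of the module above):

import sys
from time import time

def progress(rec_nr, total=None):
    if total is not None:
        # priming call: stash state on the function object
        progress.total = total
        progress.prev_time = time()
        return
    if time() - progress.prev_time > .5:  # redraw at most twice per second
        progress.prev_time = time()
        frac = float(rec_nr + 1) / progress.total
        sys.stdout.write('\rProcessed %s records (%.2f%%)' %
                         (rec_nr + 1, frac * 100))
        sys.stdout.flush()

# usage: progress(0, total=1000) once, then progress(i) inside the loop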
def prog_checkout(args):
    search_rc = args.reverse_complement
    barcodes_fn = args.barcodes
    adapters = list(BarcodeFormat.records_in(open(barcodes_fn, 'r')))
    outfiles = {sample_id: open("%s.fastq" % sample_id, 'w')
                for (sample_id, master, slave) in adapters}
    fq1_fn = args.i
    fq2_fn = args.i2
    max_mm = args.max_mm

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell
    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = None

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if time() - prev_time > .5:
            prev_time = time()
            frac = float(fq1_filepos()) / fq1_filesize
            stdout.write(term.EL(2) + term.CHA(0) +
                         "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                             i + 1, frac * 100, n_accepted,
                             100 * float(n_accepted) / (i + 1)))
            stdout.flush()
        if fq2:
            r2 = next(fq2)
            assert r1.id == r2.id

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)
            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for the master adapter on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found, or no full-length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)
                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)
                if not master_match2 and not master_match:
                    # master found on neither r1 nor r2
                    continue
                if not master_match or (master_match2 and
                                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently the strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                    len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc", desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc", desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # The master adapter has been found; retrieve its UMI (if any)
            master_umi = ("", "")
            if master.has_UMI():
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve the UMI from the master adapter
                    continue

            # Look for the slave adapter
            slave_match = None
            slave_umi = ("", "")
            if slave:  # has slave adapter
                slave_matches, slave_matches_rc = slave.locate_in(
                    r2.seq, max_mm, search_rc=False)
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)
                if slave.has_UMI():
                    if not slave_match:
                        # no slave hit, so its UMI cannot be retrieved
                        continue
                    slave_umi = slave.get_UMI(r2.seq, r2.qual_str,
                                              slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            if not best_match or best_match[0][0] > master_match[0]:
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, sample_id, umi)

        if best_match:
            master_match, sample_id, umi = best_match
            out = outfiles[sample_id]
            out.write("@%s UMI:%s:%s\n" % (r1.id, umi[0], umi[1]))
            out.write("%s\n+\n%s\n" % (r1.seq, r1.qual_str))
            n_accepted += 1
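# prog_checkout() above depends on Adapter.locate_in() and get_best_match(),
# which are defined elsewhere in the package. As a rough sketch of what a
# mismatch-tolerant adapter search can look like, here is a hypothetical
# helper based on a naive Hamming-distance scan; the (score, start) tuple
# layout mirrors the master_match[0] / master_match[1] indexing above, but
# the project's actual scoring may differ:

def locate_with_mismatches(adapter, seq, max_mm):
    """Return (n_matching_bases, start) for every window within max_mm."""
    matches = []
    for start in range(len(seq) - len(adapter) + 1):
        window = seq[start:start + len(adapter)]
        mm = sum(a != b for a, b in zip(adapter, window))
        if mm <= max_mm:
            matches.append((len(adapter) - mm, start))
    return matches

# locate_with_mismatches("ACGT", "TTACGAACGTTT", max_mm=1)
# -> [(3, 2), (4, 6)]: a one-mismatch hit at position 2 and a full-length
#    match at position 6.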
def run(self):
    if self._alignments_fn is None or self._alignments_fn == "":
        output_alignments = False
    else:
        output_alignments = True

    reading_alignments = False
    if output_alignments and os.path.isfile(self._alignments_fn):
        logger.info("SKIPPING creation of %s" % self._alignments_fn)
        output_alignments = False
        reading_alignments = True
        alignment_file = zopen(self._alignments_fn, 'r')
        vj_recs = SAMFormat.records_in(alignment_file)
        # Get two alignments (rows) at a time from vj_recs: every V record
        # is immediately followed by its J record.
        alns = ((rec, next(vj_recs)) for rec in vj_recs)
        self._listener.notify("Reading alignments from %s" %
                              self._alignments_fn)
    else:
        alns = get_vj_alignments(self._ref, self._reads,
                                 self._cmd_build_index,
                                 self._args_build_index,
                                 self._cmd_align, self._args_align_v,
                                 self._args_align_j,
                                 phred_encoding=self._phred_encoding,
                                 n_threads=self._n_threads)
        self._listener.notify("Aligning reference sequences to reads")

    # Keep track of the quality scores of the bases that went into the
    # sequences of the clones.
    Q_counts = {}

    # Build clones and use alignments to count mismatches and indels
    cs = CloneSet()
    alnstats = {"V": {}, "J": {}}
    if self._include_cysphe:
        v_refpos_offset = -3
        j_refpos_offset = 3
    else:
        v_refpos_offset = 0
        j_refpos_offset = 0
    try:
        if output_alignments:
            out = zopen(self._alignments_fn, 'w')
        # Progress is measured on the file that is actually being consumed:
        # the pre-existing alignment file when reading, the reads otherwise.
        infile = alignment_file if reading_alignments else self._reads
        prev_time = time()
        if isinstance(infile, gzip.GzipFile):
            infile_size = os.path.getsize(infile.name)
            infile_pos = infile.fileobj.tell
        else:
            infile_size = filesize(infile)
            infile_pos = infile.tell
        self._listener.notify(("PROGRESSBAR", "Alignments", "start"))
        for v_rec, j_rec in alns:
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            if time() - prev_time >= self._update_interval:
                prev_time = time()
                if not infile.closed:
                    pos = infile_pos()
                else:
                    # assuming a closed infile means the entire infile has
                    # been processed.
                    pos = infile_size
                frac = float(pos) / infile_size
                self._listener.notify(("PROGRESSBAR", "Alignments", frac))
            if output_alignments:
                out.write("\t".join(map(str, v_rec)) + "\n" +
                          "\t".join(map(str, j_rec)) + "\n")

            clone = build_clone(self._ref, v_rec, j_rec,
                                self._include_cysphe, self._clone_classname)
            if clone is None:
                continue
            seqlen = len(clone.seq)
            if seqlen < self._min_seqlen:
                continue

            # Count base qualities in the clone (which is possible because
            # at this point the clone is based on a single read)
            lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
            for i in xrange(clone.v.end, clone.j.start):
                lenfam_Q_counts[clone.qual[i]] += 1

            cs.add(clone, merge=True)

            v_allele = self._ref[v_rec.RNAME]
            j_allele = self._ref[j_rec.RNAME]
            # Count errors in the alignments
            for (rec, r_roi_start, r_roi_end) in (
                    (v_rec, v_allele.refpos + v_refpos_offset, 0),
                    (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                allele = self._ref[rec.RNAME]
                lenfam_alnstats = alnstats[allele.region].setdefault(
                    seqlen,
                    {"n": 0, "mm": 0, "ins": 0, "dels": 0,
                     "Q_mm": [0] * 42, "Q_n": [0] * 42})
                n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                    rec, allele.seq, lenfam_alnstats["Q_mm"],
                    lenfam_alnstats["Q_n"], r_roi_start, r_roi_end)
                lenfam_alnstats["n"] += n
                lenfam_alnstats["mm"] += mm
                lenfam_alnstats["ins"] += ins
                lenfam_alnstats["dels"] += dels
    finally:
        if output_alignments:
            out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

    if len(cs) == 0:
        msg = ("No clones found in alignments. "
               "Was the correct germline reference used?")
        logger.error(msg)
        raise Exception(msg)

    if self._alignment_stats_fn is not None and \
            self._alignment_stats_fn != "":
        logger.info("Writing alignment stats to \"%s\"" %
                    self._alignment_stats_fn)
        with zopen(self._alignment_stats_fn, 'w') as out:
            out.write("seqlen,region,n,mm,ins,dels\n")
            for region in alnstats:
                for seqlen, lenfam_alnstats in alnstats[region].iteritems():
                    out.write(",".join(map(str, [
                        seqlen, region,
                        lenfam_alnstats["n"], lenfam_alnstats["mm"],
                        lenfam_alnstats["ins"],
                        lenfam_alnstats["dels"]])) + "\n")

    self._save_cloneset(cs, "r")

    # Sum all the counts in the V and J regions separately, and calculate
    # average error rates
    tot_err = {"V": {}, "J": {}}
    for region in ("V", "J"):
        region_stats = alnstats[region]
        x = tot_err[region]
        x["n"] = sum([y["n"] for y in region_stats.itervalues()])
        x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
        x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
        x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])
        n = x["n"]
        if n > 0:
            x["mmr"] = float(x["mm"]) / n
            x["insr"] = float(x["ins"]) / n
            x["delsr"] = float(x["dels"]) / n
        else:
            x["mmr"] = 0
            x["insr"] = 0
            x["delsr"] = 0
    global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
    global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
    global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
    logger.info("global error rates: mmr: %(global_mmr)s, "
                "insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

    # Calculate observed error rates for Phred scores
    Q_mm_stats = {"V": {}, "J": {}}
    for region, region_stats in alnstats.iteritems():
        Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
        Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
        for lenfam_alnstats in region_stats.itervalues():
            for i in xrange(42):
                Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                Q_n[i] += lenfam_alnstats["Q_n"][i]

    if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
        with zopen(self._Q_mm_stats_fn, 'w') as out:
            out.write("region,Q,n,mm\n")
            for region in Q_mm_stats:
                for Q, (mm, n) in enumerate(izip(Q_mm_stats[region]["Q_mm"],
                                                 Q_mm_stats[region]["Q_n"])):
                    out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

    # Calculate the ratio between the base quality score assigned by the
    # sequencer and the observed base quality (based on alignments with the
    # germline reference).
    sum_ratios = 0
    n_ratios = 0
    for region in Q_mm_stats:
        Q_mm = Q_mm_stats[region]["Q_mm"]
        Q_n = Q_mm_stats[region]["Q_n"]
        for q in xrange(42):
            mm = Q_mm[q]
            n = Q_n[q]
            if mm > 0 and n > 0:
                q_obs = p2q(float(mm) / n)
                if q_obs > 0:
                    # float() guards against integer division (Python 2)
                    # in case p2q returns an int.
                    sum_ratios += (float(q) / q_obs) * n
                    n_ratios += n
    if n_ratios > 0:
        alpha = float(sum_ratios) / n_ratios
    else:
        logger.warning('No instances found of a Phred score associated '
                       'with mismatches.')
        alpha = 1.0
    logger.info("Ratio between base quality and observed quality: %s" %
                alpha)

    if self._Q_mm_stats_plot_fn is not None and \
            self._Q_mm_stats_plot_fn != "":
        plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

    # Get the median quality score
    Q_n = [0] * 42  # count the number of bases for every Q score
    for lenfam_Q_counts in Q_counts.itervalues():
        for q, count in enumerate(lenfam_Q_counts):
            Q_n[q] += count
    i = ((sum(Q_n) + 1) // 2) - 1  # index of the median element in Q_n
    j = 0
    for max_Q, count in enumerate(Q_n):
        j += count
        if j > i:
            break
    logger.info("max_Q = %s" % max_Q)

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    by_seqlen = lambda clone: len(clone.seq)
    confidence = self._confidence
    for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
        if self.stopped():
            logger.warning("Pipeline stopped")
            return
        cs2 = CloneSet(clones)

        # Calculate the expected number of errors based on Q scores
        lenfam_Q_counts = Q_counts[seqlen]
        # total number of bases between the V and J regions
        n_o = sum(lenfam_Q_counts)
        mm_o = 0
        for q, count in enumerate(lenfam_Q_counts):
            q /= alpha  # correct the reported quality by the observed ratio
            mm_o += q2p(q) * count
        mm_v = alnstats["V"][seqlen]["mm"]
        n_v = alnstats["V"][seqlen]["n"]
        mm_j = alnstats["J"][seqlen]["mm"]
        n_j = alnstats["J"][seqlen]["n"]
        mm_tot = mm_v + mm_o + mm_j
        n_tot = n_v + n_o + n_j
        logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s), "
                    "mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" % (
                        seqlen,
                        mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                        mm_o, float(mm_o) / n_o if n_o > 0 else 0,
                        mm_j, float(mm_j) / n_j if n_j > 0 else 0,
                        mm_tot, float(mm_tot) / n_tot if n_tot > 0 else 0))
        local_mmr = float(mm_tot) / n_tot
        mmr = max(local_mmr, global_mmr)
        logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: "
                    "%(mmr)s, confidence: %(confidence)s, "
                    "max_Q: %(max_Q)s" % locals())
        pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

    self._listener.notify("Running QMerge and IMerge on bins.")
    self.run_pool(pool, desc='QMerge, IMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
    self._save_cloneset(cloneset, "rqi")

    self._listener.notify("Running LMerge")
    cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                          confidence)
    self._save_cloneset(cloneset, "rqil")

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
                                  by_seqlen):
        cs2 = CloneSet(clones)
        pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2,))
    self._listener.notify("Running NMerge on bins.")
    self.run_pool(pool, desc='NMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable(results))
    self._save_cloneset(cloneset, "rqiln")

    ########################
    # Write clones to file #
    ########################
    self._listener.notify("Writing clones")
    with open(self._output_fn, 'w') as res_ok:
        with open("discarded_clones.tsv", 'w') as res_not_ok:
            header = self._output_hdr
            res_ok.write(header)
            res_not_ok.write(header)
            n_discarded = 0
            for clone in sorted(cloneset,
                                key=lambda clone: (-clone.count, clone.seq)):
                min_phred = min(clone.qual)
                aa_seq = nt2aa(clone.seq)
                n_stop_codons = sum([aa == '*' for aa in aa_seq])
                frame = len(clone.seq) % 3
                # Discard low-quality, stop-codon, or out-of-frame clones.
                if min_phred < self._min_phred_threshold \
                        or n_stop_codons > 0 or frame != 0:
                    n_discarded += 1
                    out = res_not_ok
                else:
                    out = res_ok
                out.write(clone2str(clone, fmt=self._output_fmt))
    self._listener.notify("Discarded %s clones" % n_discarded)
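# run() above converts between Phred scores and error probabilities through
# q2p() and p2q(), imported elsewhere. The standard Phred relation is
# P = 10 ** (-Q / 10); a sketch of the conventional formulas (assumed, not
# necessarily the project's exact implementation, which may round or clamp):

import math

def q2p(q):
    """Phred score -> error probability, e.g. q2p(30) -> 0.001."""
    return 10.0 ** (-q / 10.0)

def p2q(p):
    """Error probability -> Phred score, e.g. p2q(0.001) -> 30.0."""
    return -10.0 * math.log10(p)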
def checkout(fq1_fn, fq2_fn, adapters, max_mm, search_rc, paired=False):
    assert fq1_fn is not None
    # In paired mode both adapters are located on the same read, so a
    # separate mate file must not be given.
    assert not (paired and fq2_fn is not None)
    print 'Handling file(s): %s' % ''.join(
        [fq1_fn, '' if fq2_fn is None else ', %s' % fq2_fn])

    fq1_file = zopen(fq1_fn, 'r')
    if isinstance(fq1_file, gzip.GzipFile):
        fq1_filesize = os.path.getsize(fq1_file.name)
        fq1_filepos = fq1_file.fileobj.tell
    else:
        fq1_filesize = filesize(fq1_file)
        fq1_filepos = fq1_file.tell
    fq1 = FastqFormat.records_in(fq1_file, encoding=None)
    if fq2_fn is not None:
        fq2 = FastqFormat.records_in(zopen(fq2_fn, 'r'), encoding=None)
    else:
        fq2 = None

    outfiles = {}
    for (sample_id, master, slave) in adapters:
        outfiles[sample_id] = {
            "out1": (open("%s_R1.fastq" % sample_id, 'w'), 'R1')
                    if not paired else
                    (open("%s_R12.fastq" % sample_id, 'w'), 'R12'),
            "out2": (None, None) if fq2 is None else
                    (open("%s_R2.fastq" % sample_id, 'w'), 'R2')}

    n_accepted = 0
    prev_time = time()
    for i, r1 in enumerate(fq1):
        if fq2:
            r2 = next(fq2)
            assert r1.id == r2.id
        else:
            r2 = None

        # Demultiplex
        best_match = None
        for (sample_id, master, slave) in adapters:
            matches, matches_rc = master.locate_in(r1.seq, max_mm, search_rc)
            master_match, is_rc = get_best_match(matches, matches_rc)

            # look for the master adapter on the mate
            if (not master_match or master_match[0] < len(master.seq)) and \
                    fq2:
                # master not found, or no full-length match found
                matches2, matches_rc2 = master.locate_in(
                    r2.seq, max_mm, search_rc)
                master_match2, is_rc2 = get_best_match(matches2, matches_rc2)
                if not master_match2 and not master_match:
                    # master found on neither r1 nor r2
                    continue
                if not master_match or (master_match2 and
                                        master_match2[0] < master_match[0]):
                    master_match = master_match2
                    is_rc = is_rc2
                    # apparently the strands are swapped
                    r1, r2 = r2, r1

            if master_match is None:
                continue

            if is_rc:
                master_match = list(master_match)
                master_match[1] = \
                    len(r1.seq) - (master_match[1] + len(master.seq))
                master_match = tuple(master_match)
                r1 = FastqRecord(id=r1.id + " rc", desc=r1.desc,
                                 seq=revcomp(r1.seq),
                                 qual_str=r1.qual_str[::-1])
                if fq2:
                    r2 = FastqRecord(id=r2.id + " rc", desc=r2.desc,
                                     seq=revcomp(r2.seq),
                                     qual_str=r2.qual_str[::-1])

            # The master adapter has been found; retrieve its UMI (if any)
            master_umi = ("", "")
            if master.has_UMI():
                master_umi = master.get_UMI(r1.seq, r1.qual_str,
                                            master_match[1])
                if master.UMI_length != len(master_umi[0]):
                    # failed to retrieve the UMI from the master adapter
                    continue

            # Look for the slave adapter
            slave_match = None
            slave_umi = ("", "")
            if slave:  # has slave adapter
                # In paired mode the slave is on the same read as the
                # master; otherwise it is on the mate.
                if paired:
                    r = r1
                else:
                    r = r2
                slave_matches, slave_matches_rc = slave.locate_in(
                    r.seq, max_mm, search_rc=search_rc)
                slave_match, slave_is_rc = get_best_match(
                    slave_matches, slave_matches_rc)
                if not slave_match:
                    # No slave found
                    continue
                if slave.has_UMI():
                    if slave_is_rc:
                        slave_umi_start = \
                            len(r.seq) - (slave_match[1] + len(slave.seq))
                        slave_umi = slave.get_UMI(revcomp(r.seq),
                                                  r.qual_str[::-1],
                                                  slave_umi_start)
                    else:
                        slave_umi = slave.get_UMI(r.seq, r.qual_str,
                                                  slave_match[1])
                    if slave.UMI_length != len(slave_umi[0]):
                        continue

            # Keep the candidate with the best master match; break ties on
            # the slave match.
            if not best_match or best_match[0][0] > master_match[0] or \
                    (best_match[0][0] == master_match[0] and
                     slave_match and
                     (not best_match[1] or not best_match[1][0] or
                      best_match[1][0] > slave_match[0])):
                umi = [x + y for x, y in zip(master_umi, slave_umi)]
                best_match = (master_match, slave_match, sample_id, umi)

        if best_match:
            master_match, slave_match, sample_id, umi = best_match
            for (r, (out, typename)) in (
                    (r1, outfiles[sample_id]["out1"]),
                    (r2, outfiles[sample_id]["out2"])):
                if not out:
                    continue
                out.write("@%s UMI:%s:%s:%s\n" %
                          (r.id, typename, umi[0], umi[1]))
                out.write("%s\n+\n%s\n" % (r.seq, r.qual_str))
            n_accepted += 1

        frac = float(fq1_filepos()) / fq1_filesize
        if time() - prev_time > .5 or frac == 1.0:
            prev_time = time()
            stdout.write(term.EL(2) + term.CHA(0) +
                         "Processed %s records (%.2f%%), accepted %s (%.2f%%)" % (
                             i + 1, frac * 100, n_accepted,
                             100 * float(n_accepted) / (i + 1)))
            stdout.flush()
    stdout.write('\n')
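# A minimal driver for checkout(), assuming barcode records are read with
# BarcodeFormat.records_in() as in prog_checkout() above (the file names
# here are illustrative):

if __name__ == '__main__':
    adapters = list(BarcodeFormat.records_in(open('barcodes.txt', 'r')))
    # Paired-end run: the slave adapter is searched on the mate (R2);
    # accepted reads go to <sample>_R1.fastq and <sample>_R2.fastq.
    checkout('sample_R1.fastq.gz', 'sample_R2.fastq.gz', adapters,
             max_mm=2, search_rc=True)
    # Single-file run with both adapters on the same read: paired=True
    # requires fq2_fn to be None (see the assert above) and writes
    # <sample>_R12.fastq.
    # checkout('sample_R12.fastq.gz', None, adapters, max_mm=2,
    #          search_rc=True, paired=True)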
def run(self):
    if self._alignments_fn is None or self._alignments_fn == "":
        output_alignments = False
    else:
        output_alignments = True

    reading_alignments = False
    if output_alignments and os.path.isfile(self._alignments_fn):
        logger.info("SKIPPING creation of %s" % self._alignments_fn)
        output_alignments = False
        reading_alignments = True
        alignment_file = zopen(self._alignments_fn, 'r')
        vj_recs = SAMFormat.records_in(alignment_file)
        # Get two alignments (rows) at a time from vj_recs: every V record
        # is immediately followed by its J record.
        alns = ((rec, next(vj_recs)) for rec in vj_recs)
        self._listener.notify("Reading alignments from %s" %
                              self._alignments_fn)
    else:
        alns = get_vj_alignments(self._ref, self._reads,
                                 self._cmd_build_index,
                                 self._args_build_index,
                                 self._cmd_align, self._args_align_v,
                                 self._args_align_j,
                                 phred_encoding=self._phred_encoding,
                                 n_threads=self._n_threads)
        self._listener.notify("Aligning reference sequences to reads")

    # Keep track of the quality scores of the bases that went into the
    # sequences of the clones.
    Q_counts = {}

    # Build clones and use alignments to count mismatches and indels
    cs = CloneSet()
    alnstats = {"V": {}, "J": {}}
    v_refpos_offset = -3
    j_refpos_offset = 3
    try:
        if output_alignments:
            out = zopen(self._alignments_fn, 'w')
        # Progress is measured on the file that is actually being consumed:
        # the pre-existing alignment file when reading, the reads otherwise.
        infile = alignment_file if reading_alignments else self._reads
        prev_time = time()
        if isinstance(infile, gzip.GzipFile):
            infile_size = os.path.getsize(infile.name)
            infile_pos = infile.fileobj.tell
        else:
            infile_size = filesize(infile)
            infile_pos = infile.tell
        self._listener.notify(("PROGRESSBAR", "Alignments", "start"))
        for v_rec, j_rec in alns:
            if self.stopped():
                logger.warning("Pipeline stopped")
                return
            if time() - prev_time >= self._update_interval:
                prev_time = time()
                if not infile.closed:
                    pos = infile_pos()
                else:
                    # assuming a closed infile means the entire infile has
                    # been processed.
                    pos = infile_size
                frac = float(pos) / infile_size
                self._listener.notify(("PROGRESSBAR", "Alignments", frac))
            if output_alignments:
                out.write("\t".join(map(str, v_rec)) + "\n" +
                          "\t".join(map(str, j_rec)) + "\n")

            clone = build_clone(self._ref, v_rec, j_rec,
                                self._clone_classname)
            if clone is None:
                continue
            seqlen = len(clone.seq)
            if seqlen < self._min_seqlen:
                continue

            # Count base qualities in the clone (which is possible because
            # at this point the clone is based on a single read)
            lenfam_Q_counts = Q_counts.setdefault(seqlen, [0] * 42)
            for i in xrange(clone.v.end, clone.j.start):
                lenfam_Q_counts[clone.qual[i]] += 1

            cs.add(clone, merge=True)

            v_allele = self._ref[v_rec.RNAME]
            j_allele = self._ref[j_rec.RNAME]
            # Count errors in the alignments
            for (rec, r_roi_start, r_roi_end) in (
                    (v_rec, v_allele.refpos + v_refpos_offset, 0),
                    (j_rec, 0, j_allele.refpos + j_refpos_offset)):
                allele = self._ref[rec.RNAME]
                lenfam_alnstats = alnstats[allele.region].setdefault(
                    seqlen,
                    {"n": 0, "mm": 0, "ins": 0, "dels": 0,
                     "Q_mm": [0] * 42, "Q_n": [0] * 42})
                n, mm, ins, dels, r_roi_as, r_roi_ae = get_error_stats(
                    rec, allele.seq, lenfam_alnstats["Q_mm"],
                    lenfam_alnstats["Q_n"], r_roi_start, r_roi_end)
                lenfam_alnstats["n"] += n
                lenfam_alnstats["mm"] += mm
                lenfam_alnstats["ins"] += ins
                lenfam_alnstats["dels"] += dels
    finally:
        if output_alignments:
            out.close()
        self._listener.notify(("PROGRESSBAR", "Alignments", "end"))

    if len(cs) == 0:
        msg = ("No clones found in alignments. "
               "Was the correct germline reference used?")
        logger.error(msg)
        raise Exception(msg)

    if self._alignment_stats_fn is not None and \
            self._alignment_stats_fn != "":
        logger.info("Writing alignment stats to \"%s\"" %
                    self._alignment_stats_fn)
        with zopen(self._alignment_stats_fn, 'w') as out:
            out.write("seqlen,region,n,mm,ins,dels\n")
            for region in alnstats:
                for seqlen, lenfam_alnstats in alnstats[region].iteritems():
                    out.write(",".join(map(str, [
                        seqlen, region,
                        lenfam_alnstats["n"], lenfam_alnstats["mm"],
                        lenfam_alnstats["ins"],
                        lenfam_alnstats["dels"]])) + "\n")

    self._save_cloneset(cs, "r")

    # Sum all the counts in the V and J regions separately, and calculate
    # average error rates
    tot_err = {"V": {}, "J": {}}
    for region in ("V", "J"):
        region_stats = alnstats[region]
        x = tot_err[region]
        x["n"] = sum([y["n"] for y in region_stats.itervalues()])
        x["mm"] = sum([y["mm"] for y in region_stats.itervalues()])
        x["ins"] = sum([y["ins"] for y in region_stats.itervalues()])
        x["dels"] = sum([y["dels"] for y in region_stats.itervalues()])
        n = x["n"]
        if n > 0:
            x["mmr"] = float(x["mm"]) / n
            x["insr"] = float(x["ins"]) / n
            x["delsr"] = float(x["dels"]) / n
        else:
            x["mmr"] = 0
            x["insr"] = 0
            x["delsr"] = 0
    global_mmr = max(tot_err["V"]["mmr"], tot_err["J"]["mmr"])
    global_insr = max(tot_err["V"]["insr"], tot_err["J"]["insr"])
    global_delsr = max(tot_err["V"]["delsr"], tot_err["J"]["delsr"])
    logger.info("global error rates: mmr: %(global_mmr)s, "
                "insr: %(global_insr)s, delsr: %(global_delsr)s" % locals())

    # Calculate observed error rates for Phred scores
    Q_mm_stats = {"V": {}, "J": {}}
    for region, region_stats in alnstats.iteritems():
        Q_mm = Q_mm_stats[region].setdefault("Q_mm", [0] * 42)
        Q_n = Q_mm_stats[region].setdefault("Q_n", [0] * 42)
        for lenfam_alnstats in region_stats.itervalues():
            for i in xrange(42):
                Q_mm[i] += lenfam_alnstats["Q_mm"][i]
                Q_n[i] += lenfam_alnstats["Q_n"][i]

    if self._Q_mm_stats_fn is not None and self._Q_mm_stats_fn != "":
        with zopen(self._Q_mm_stats_fn, 'w') as out:
            out.write("region,Q,n,mm\n")
            for region in Q_mm_stats:
                for Q, (mm, n) in enumerate(izip(Q_mm_stats[region]["Q_mm"],
                                                 Q_mm_stats[region]["Q_n"])):
                    out.write("%s,%s,%s,%s\n" % (region, Q, n, mm))

    # Calculate the ratio between the base quality score assigned by the
    # sequencer and the observed base quality (based on alignments with the
    # germline reference).
    sum_ratios = 0
    n_ratios = 0
    for region in Q_mm_stats:
        Q_mm = Q_mm_stats[region]["Q_mm"]
        Q_n = Q_mm_stats[region]["Q_n"]
        for q in xrange(42):
            mm = Q_mm[q]
            n = Q_n[q]
            if mm > 0 and n > 0:
                q_obs = p2q(float(mm) / n)
                if q_obs > 0:
                    # float() guards against integer division (Python 2)
                    # in case p2q returns an int.
                    sum_ratios += (float(q) / q_obs) * n
                    n_ratios += n
    if n_ratios > 0:
        alpha = float(sum_ratios) / n_ratios
    else:
        logger.warning('No instances found of a Phred score associated '
                       'with mismatches.')
        alpha = 1.0
    logger.info("Ratio between base quality and observed quality: %s" %
                alpha)

    if self._Q_mm_stats_plot_fn is not None and \
            self._Q_mm_stats_plot_fn != "":
        plot_Q_mm_stats(Q_mm_stats, self._Q_mm_stats_plot_fn)

    # Get the median quality score
    Q_n = [0] * 42  # count the number of bases for every Q score
    for lenfam_Q_counts in Q_counts.itervalues():
        for q, count in enumerate(lenfam_Q_counts):
            Q_n[q] += count
    i = ((sum(Q_n) + 1) // 2) - 1  # index of the median element in Q_n
    j = 0
    for max_Q, count in enumerate(Q_n):
        j += count
        if j > i:
            break
    logger.info("max_Q = %s" % max_Q)

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    by_seqlen = lambda clone: len(clone.seq)
    confidence = self._confidence
    for seqlen, clones in groupby(sorted(cs, key=by_seqlen), by_seqlen):
        if self.stopped():
            logger.warning("Pipeline stopped")
            return
        cs2 = CloneSet(clones)

        # Calculate the expected number of errors based on Q scores
        lenfam_Q_counts = Q_counts[seqlen]
        # total number of bases between the V and J regions
        n_o = sum(lenfam_Q_counts)
        mm_o = 0
        for q, count in enumerate(lenfam_Q_counts):
            q /= alpha  # correct the reported quality by the observed ratio
            mm_o += q2p(q) * count
        mm_v = alnstats["V"][seqlen]["mm"]
        n_v = alnstats["V"][seqlen]["n"]
        mm_j = alnstats["J"][seqlen]["mm"]
        n_j = alnstats["J"][seqlen]["n"]
        mm_tot = mm_v + mm_o + mm_j
        n_tot = n_v + n_o + n_j
        logger.info("Mismatch stats for seqlen %s: mm_v (%s, %s), "
                    "mm_o (%s, %s), mm_j (%s, %s), mm_tot (%s, %s)" % (
                        seqlen,
                        mm_v, float(mm_v) / n_v if n_v > 0 else 0,
                        mm_o, float(mm_o) / n_o if n_o > 0 else 0,
                        mm_j, float(mm_j) / n_j if n_j > 0 else 0,
                        mm_tot, float(mm_tot) / n_tot if n_tot > 0 else 0))
        local_mmr = float(mm_tot) / n_tot
        mmr = max(local_mmr, global_mmr)
        logger.info("Adding task: seqlen: %(seqlen)s, mismatch_rate: "
                    "%(mmr)s, confidence: %(confidence)s, "
                    "max_Q: %(max_Q)s" % locals())
        pool.add_task(run_ec_on_bin, (cs2, mmr, confidence, max_Q))

    self._listener.notify("Running QMerge and IMerge on bins.")
    self.run_pool(pool, desc='QMerge, IMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable([x[0] for x in results]))
    self._save_cloneset(cloneset, "rqi")

    self._listener.notify("Running LMerge")
    cloneset = run_lmerge(cloneset, global_mmr, global_insr, global_delsr,
                          confidence)
    self._save_cloneset(cloneset, "rqil")

    pool = ConnectedConsumerPool(n_consumers=self._n_threads)
    for seqlen, clones in groupby(sorted(cloneset, key=by_seqlen),
                                  by_seqlen):
        cs2 = CloneSet(clones)
        pool.add_task(wrapper_run_nmerge_on_bin, args=(cs2,))
    self._listener.notify("Running NMerge on bins.")
    self.run_pool(pool, desc='NMerge')
    results = pool.results
    cloneset = CloneSet(chain.from_iterable(results))
    self._save_cloneset(cloneset, "rqiln")

    ########################
    # Write clones to file #
    ########################
    self._listener.notify("Writing clones")
    sequence_id = 0
    with open(self._output_fn, 'w') as res_ok:
        with open(self._output_not_ok_fn, 'w') as res_not_ok:
            # Assumes clone2AIRRDict returns an ordered mapping, so the
            # header and the row values line up.
            header = '\t'.join(
                clone2AIRRDict(clone=None, ref=None).keys()) + '\n'
            res_ok.write(header)
            res_not_ok.write(header)
            n_discarded = 0
            for clone in sorted(cloneset,
                                key=lambda clone: (-clone.count, clone.seq)):
                record = clone2AIRRDict(clone=clone, ref=self._ref)
                min_phred = int(record['junction_minimum_quality_score'])
                # Discard low-quality, stop-codon, or out-of-frame clones.
                if min_phred < self._min_phred_threshold \
                        or record['stop_codon'] == 'T' \
                        or record['vj_in_frame'] == 'F':
                    n_discarded += 1
                    out = res_not_ok
                else:
                    out = res_ok
                sequence_id += 1
                record['sequence_id'] = str(sequence_id)
                out.write('\t'.join([v for k, v in record.iteritems()]) +
                          '\n')
    self._listener.notify("Discarded %s clones" % n_discarded)
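# Both run() variants take the median Phred score from a histogram
# (Q_n[q] = number of bases observed with score q) rather than sorting the
# scores themselves. A self-contained sketch of that histogram-median step,
# with a tiny worked example (illustrative numbers):

def median_from_histogram(counts):
    """Value of the (lower) median element described by a count histogram."""
    i = ((sum(counts) + 1) // 2) - 1  # 0-based index of the median element
    seen = 0
    for value, count in enumerate(counts):
        seen += count
        if seen > i:
            return value
    raise ValueError("empty histogram")

# Example: counts [1, 2, 4, 1] describe the multiset
# [0, 1, 1, 2, 2, 2, 2, 3]; the median element (index 3) has value 2.
assert median_from_histogram([1, 2, 4, 1]) == 2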