def write(self, output_prefix):
    """
    Write every cluster in self.cluster_by_otu to four files.

    <output_prefix>.fq
        one record per cluster, written through FastqWriter; the file
        behind the writer is compressed with gzip once it is closed
    <output_prefix>.fasta
        >{cluster_id}
        {sequence here}
    <output_prefix>.otu.txt
        <cluster_id> <tab> <tab delimited seq IDs>
    <output_prefix>.abundance.txt
        <cluster_id> <tab> <cluster size>

    The cluster id is the first entry of the cluster's 'cids' list.

    NOTE: every cluster dict is modified in place: 'ID' is set and 'qual'
    is replaced by its string form (each value + 33, turned into a
    character). Calling write() again on the same clusters therefore
    fails, because 'qual' no longer holds integers.

    Raises OSError if the gzip executable cannot be started.
    """
    import subprocess

    fq_writer = FastqWriter(output_prefix + '.fq')
    opened = [fq_writer]
    try:
        fasta_out = open(output_prefix + '.fasta', 'w')
        opened.append(fasta_out)
        otu_out = open(output_prefix + '.otu.txt', 'w')
        opened.append(otu_out)
        abundance_out = open(output_prefix + '.abundance.txt', 'w')
        opened.append(abundance_out)

        for cluster in self.cluster_by_otu.values():
            for o in cluster.values():
                # need to massage qual for fq writing
                o['qual'] = "".join(
                    chr(o['qual'][i] + 33) for i in range(o['len']))
                o['ID'] = o['cids'][0]
                fq_writer.write(o)
                fasta_out.write(">{0}\n{1}\n".format(o['ID'], o['seq']))
                otu_out.write(
                    "{0}\t{1}\n".format(o['ID'], "\t".join(o['cids'])))
                abundance_out.write("{0}\t{1}\n".format(o['ID'], o['size']))
    finally:
        # close whatever was opened, even when writing failed half-way
        for handle in opened:
            handle.close()

    # argument list instead of a shell string: safe for paths that contain
    # spaces or shell metacharacters
    subprocess.call(['gzip', fq_writer.f.name])
def combine_RF(fotu, rotu, ffastq, rfastq, output_prefix):
    """
    Pair forward and reverse OTUs through the sequence IDs they share.

    fotu, rotu     -- OTU files (forward, reverse); each line is an OTU id
                      followed by whitespace-separated member seq IDs.
                      A trailing /1 or /2 on a member ID is ignored.
    ffastq, rfastq -- FASTQ files (forward, reverse) read with FastqReader;
                      records are looked up by their 'ID', which must be
                      the OTU ids used in fotu / rotu
    output_prefix  -- prefix of the files written:
        <output_prefix>.combined.1.fq           forward record per pair
        <output_prefix>.combined.2.fq           reverse record per pair
        <output_prefix>.combined.abundance.txt  <fwd id>_<rev id> <tab> count

    Members of a reverse OTU that never appeared in the forward file are
    reported on stderr and skipped. Blank lines are skipped.

    Returns: forward otu cid --> reverse otu cid --> abundance

    Raises KeyError if a reverse OTU id has no record in rfastq.
    """
    def strip_mate(seqid):
        # drop a trailing /1 or /2 so both mates map to the same key
        if seqid.endswith(('/1', '/2')):
            return seqid[:-2]
        return seqid

    seqid2otu = {}
    combo = {}
    with open(fotu) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            otu = fields[0]
            combo[otu] = defaultdict(int)
            for seqid in fields[1:]:
                seqid2otu[strip_mate(seqid)] = otu

    with open(rotu) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            otu2 = fields[0]
            for seqid in fields[1:]:
                seqid = strip_mate(seqid)
                if seqid not in seqid2otu:
                    sys.stderr.write(
                        "{0} is missing in forward, ignore\n".format(seqid))
                    continue
                combo[seqid2otu[seqid]][otu2] += 1

    # now write this out as <output_prefix>.combined.{1|2}.fq
    # the reverse records are held in memory so they can be fetched by id
    seqdict = {}
    for r in FastqReader(rfastq):
        seqdict[r['ID']] = r

    fqw1 = FastqWriter(output_prefix + '.combined.1.fq')
    opened = [fqw1]
    try:
        fqw2 = FastqWriter(output_prefix + '.combined.2.fq')
        opened.append(fqw2)
        fout = open(output_prefix + '.combined.abundance.txt', 'w')
        opened.append(fout)

        for r in FastqReader(ffastq):
            if r['ID'] not in combo:
                continue
            for id2, abundance in combo[r['ID']].items():
                newid = "{0}_{1}".format(r['ID'], id2)
                fqw1.write(r, id=newid + '/1')
                fqw2.write(seqdict[id2], id=newid + '/2')
                fout.write("{0}\t{1}\n".format(newid, abundance))
    finally:
        # close whatever was opened, even when writing failed half-way
        for handle in opened:
            handle.close()

    return combo
def main(fq1, fq2, output_prefix, abundance_filename):
    """
    Run find_overlap on every read pair of fq1 / fq2 and log how many
    pairs overlapped.

    fq1, fq2           -- paired FASTQ files, read with FastqReaderPaired
    output_prefix      -- prefix of the files written (see below)
    abundance_filename -- optional file of <id> <tab> <count> lines; when
                          None every read counts once. The id is the part
                          of the read ID before the first '/'.

    Files created:
        <output_prefix>.overlap.aligned      (BowTieWriter)
        <output_prefix>.overlap.1.unaligned  (FastqWriter)
        <output_prefix>.overlap.2.unaligned  (FastqWriter)
        <output_prefix>.overlap.log           counts per read pair
        <output_prefix>.overlap.log_expanded  counts weighted by abundance
    The three writers are handed to find_overlap, which decides what goes
    into them.

    Raises KeyError if an abundance file is given and a read id is not
    listed in it.
    """
    def write_log(path, n_total, n_aligned):
        # one summary file; percentages are 0 when nothing was processed
        if n_total:
            pct_aligned = n_aligned * 100. / n_total
            pct_failed = 100 - pct_aligned
        else:
            pct_aligned = pct_failed = 0.
        with open(path, 'w') as f:
            f.write("# reads processed: {0}\n".format(n_total))
            f.write(
                "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
                format(n_aligned, pct_aligned))
            f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
                n_total - n_aligned, pct_failed))
            f.write(
                "Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(n_aligned))

    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        abundance = {}
        with open(abundance_filename) as f:
            for line in f:
                if not line.strip():
                    continue
                _id, _count = line.strip().split('\t')
                abundance[_id] = int(_count)

    total = 0
    total_expanded = 0
    aligned = 0
    aligned_expanded = 0

    matchf = BowTieWriter(output_prefix + '.overlap.aligned')
    opened = [matchf]
    try:
        unf1 = FastqWriter(output_prefix + '.overlap.1.unaligned')
        opened.append(unf1)
        unf2 = FastqWriter(output_prefix + '.overlap.2.unaligned')
        opened.append(unf2)

        for r1, r2 in FastqReaderPaired(fq1, fq2):
            # everything before the first '/'; the whole ID when there is
            # no '/' (slicing with find() would drop the last character)
            realid = r1['ID'].split('/', 1)[0]
            weight = abundance[realid]
            total += 1
            total_expanded += weight
            if find_overlap(r1, r2, matchf, unf1, unf2):  # overlap found
                aligned += 1
                aligned_expanded += weight
    finally:
        # close whatever was opened, even when processing failed half-way
        for handle in opened:
            handle.close()

    write_log(output_prefix + '.overlap.log', total, aligned)
    write_log(output_prefix + '.overlap.log_expanded', total_expanded,
              aligned_expanded)
def consolidate_corrected_clusters(dir, output_prefix):
    """
    In each cluster directory <dir>/<cluster_index> find the corrected
    <cluster_index>.errcor.otu.txt and <cluster_index>.errcor.fq.gz and
    consolidate them into <output_prefix>.otu.txt and <output_prefix>.fq.

    The new seq ids are renamed to <cluster_index>_<otu_index>: every OTU
    line and every FASTQ record ID gets "<cluster_index>_" prepended.

    Entries of <dir> that are not directories are skipped. Clusters are
    processed in sorted (string) order of their names so that the output
    order is reproducible.

    Raises IOError/OSError if a cluster directory lacks its .errcor files.
    """
    fqw = FastqWriter(output_prefix + '.fq')
    opened = [fqw]
    try:
        otuw = open(output_prefix + '.otu.txt', 'w')
        opened.append(otuw)

        for cid in sorted(os.listdir(dir)):
            d2 = os.path.join(dir, cid)
            if not os.path.isdir(d2):
                # stray file next to the cluster directories
                continue
            with open(os.path.join(d2, cid + '.errcor.otu.txt')) as f:
                for line in f:
                    # line keeps its own newline
                    otuw.write("{cid}_{rest}".format(cid=cid, rest=line))
            for r in FastqReader(os.path.join(d2, cid + '.errcor.fq.gz')):
                r['ID'] = cid + '_' + r['ID']
                fqw.write(r)
    finally:
        # close whatever was opened, even when consolidation failed
        for handle in reversed(opened):
            handle.close()
def write(self, output_prefix):
    """
    Write every cluster in self.cluster_by_otu to four files.

    <output_prefix>.fq
        one record per cluster, written through FastqWriter; the file
        behind the writer is compressed with gzip once it is closed
    <output_prefix>.fasta
        >{cluster_id}
        {sequence here}
    <output_prefix>.otu.txt
        <cluster_id> <tab> <tab delimited seq IDs>
    <output_prefix>.abundance.txt
        <cluster_id> <tab> <cluster size>

    The cluster id is the first entry of the cluster's 'cids' list.

    NOTE: every cluster dict is modified in place: 'ID' is set and 'qual'
    is replaced by its string form (each value + 33, turned into a
    character). Calling write() again on the same clusters therefore
    fails, because 'qual' no longer holds integers.

    Raises OSError if the gzip executable cannot be started.
    """
    import subprocess

    fq_writer = FastqWriter(output_prefix + '.fq')
    opened = [fq_writer]
    try:
        fasta_out = open(output_prefix + '.fasta', 'w')
        opened.append(fasta_out)
        otu_out = open(output_prefix + '.otu.txt', 'w')
        opened.append(otu_out)
        abundance_out = open(output_prefix + '.abundance.txt', 'w')
        opened.append(abundance_out)

        for cluster in self.cluster_by_otu.values():
            for o in cluster.values():
                # need to massage qual for fq writing
                o['qual'] = "".join(
                    chr(o['qual'][i] + 33) for i in range(o['len']))
                o['ID'] = o['cids'][0]
                fq_writer.write(o)
                fasta_out.write(">{0}\n{1}\n".format(o['ID'], o['seq']))
                otu_out.write(
                    "{0}\t{1}\n".format(o['ID'], "\t".join(o['cids'])))
                abundance_out.write("{0}\t{1}\n".format(o['ID'], o['size']))
    finally:
        # close whatever was opened, even when writing failed half-way
        for handle in opened:
            handle.close()

    # argument list instead of a shell string: safe for paths that contain
    # spaces or shell metacharacters
    subprocess.call(['gzip', fq_writer.f.name])
def bowtie2fastq(input, revcomp=False):
    """
    Copy the records of a BowTie-format file into a FASTQ file.

    input   -- path handed to BowTieReader; the output is written with
               FastqWriter to <input>.fq
    revcomp -- when true, each record's 'seq' is reverse-complemented and
               its 'qual' reversed before it is written
    """
    output = input + '.fq'
    f = FastqWriter(output)
    try:
        for r in BowTieReader(input, False):
            if revcomp:
                # str() instead of the legacy Seq.tostring() spelling
                r['seq'] = str(Seq(r['seq']).reverse_complement())
                r['qual'] = r['qual'][::-1]
            f.write(r)
    finally:
        # close the writer even when reading or writing failed
        f.close()
def main(fq1, fq2, output_prefix, abundance_filename):
    """
    Run find_overlap on every read pair of fq1 / fq2 and log how many
    pairs overlapped.

    fq1, fq2           -- paired FASTQ files, read with FastqReaderPaired
    output_prefix      -- prefix of the files written (see below)
    abundance_filename -- optional file of <id> <tab> <count> lines; when
                          None every read counts once. The id is the part
                          of the read ID before the first '/'.

    Files created:
        <output_prefix>.overlap.aligned      (BowTieWriter)
        <output_prefix>.overlap.1.unaligned  (FastqWriter)
        <output_prefix>.overlap.2.unaligned  (FastqWriter)
        <output_prefix>.overlap.log           counts per read pair
        <output_prefix>.overlap.log_expanded  counts weighted by abundance
    The three writers are handed to find_overlap, which decides what goes
    into them.

    Raises KeyError if an abundance file is given and a read id is not
    listed in it.
    """
    def write_log(path, n_total, n_aligned):
        # one summary file; percentages are 0 when nothing was processed
        if n_total:
            pct_aligned = n_aligned * 100. / n_total
            pct_failed = 100 - pct_aligned
        else:
            pct_aligned = pct_failed = 0.
        with open(path, 'w') as f:
            f.write("# reads processed: {0}\n".format(n_total))
            f.write(
                "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
                format(n_aligned, pct_aligned))
            f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
                n_total - n_aligned, pct_failed))
            f.write(
                "Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(n_aligned))

    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        abundance = {}
        with open(abundance_filename) as f:
            for line in f:
                if not line.strip():
                    continue
                _id, _count = line.strip().split('\t')
                abundance[_id] = int(_count)

    total = 0
    total_expanded = 0
    aligned = 0
    aligned_expanded = 0

    matchf = BowTieWriter(output_prefix + '.overlap.aligned')
    opened = [matchf]
    try:
        unf1 = FastqWriter(output_prefix + '.overlap.1.unaligned')
        opened.append(unf1)
        unf2 = FastqWriter(output_prefix + '.overlap.2.unaligned')
        opened.append(unf2)

        for r1, r2 in FastqReaderPaired(fq1, fq2):
            # everything before the first '/'; the whole ID when there is
            # no '/' (slicing with find() would drop the last character)
            realid = r1['ID'].split('/', 1)[0]
            weight = abundance[realid]
            total += 1
            total_expanded += weight
            if find_overlap(r1, r2, matchf, unf1, unf2):  # overlap found
                aligned += 1
                aligned_expanded += weight
    finally:
        # close whatever was opened, even when processing failed half-way
        for handle in opened:
            handle.close()

    write_log(output_prefix + '.overlap.log', total, aligned)
    write_log(output_prefix + '.overlap.log_expanded', total_expanded,
              aligned_expanded)
def consolidate_corrected_clusters(dir, output_prefix):
    """
    In each cluster directory <dir>/<cluster_index> find the corrected
    <cluster_index>.errcor.otu.txt and <cluster_index>.errcor.fq.gz and
    consolidate them into <output_prefix>.otu.txt and <output_prefix>.fq.

    The new seq ids are renamed to <cluster_index>_<otu_index>: every OTU
    line and every FASTQ record ID gets "<cluster_index>_" prepended.

    Entries of <dir> that are not directories are skipped. Clusters are
    processed in sorted (string) order of their names so that the output
    order is reproducible.

    Raises IOError/OSError if a cluster directory lacks its .errcor files.
    """
    fqw = FastqWriter(output_prefix + '.fq')
    opened = [fqw]
    try:
        otuw = open(output_prefix + '.otu.txt', 'w')
        opened.append(otuw)

        for cid in sorted(os.listdir(dir)):
            d2 = os.path.join(dir, cid)
            if not os.path.isdir(d2):
                # stray file next to the cluster directories
                continue
            with open(os.path.join(d2, cid + '.errcor.otu.txt')) as f:
                for line in f:
                    # line keeps its own newline
                    otuw.write("{cid}_{rest}".format(cid=cid, rest=line))
            for r in FastqReader(os.path.join(d2, cid + '.errcor.fq.gz')):
                r['ID'] = cid + '_' + r['ID']
                fqw.write(r)
    finally:
        # close whatever was opened, even when consolidation failed
        for handle in reversed(opened):
            handle.close()
def detect_primers_PE(input1, input2, output_prefix, f_primer, r_primer,
                      min_match_len, max_mm, max_de, max_in):
    """
    NOTE: this is for paired end reads that come in two separate files
    ex: DS19342_CTTGTA_L006_R1_001.fastq.gz and DS19342_CTTGTA_L006_R2_001.fastq.gz

    Given a pair of reads from input1, input2:
    1. Detect that F primer exists in one read and R primer in the other.
       match_primer_len is tried first; if it does not resolve the pair,
       PrimerMatch (which is also given max_in / max_de) is tried.
    2. If both reads pass primer detection, strip the primers and output
    3. Otherwise, write the unmodified pair to the primer_bad files

    Output (all but the log are gzipped at the end):
    <output_prefix>.{F|R}primer_good   reads with the primer removed
    <output_prefix>.primer_bad.{1|2}   unresolved pairs
    <output_prefix>.primer.verbose     <read ID> <match_len> <miss> for
                                       pairs resolved by PrimerMatch
    <output_prefix>.primer.log         parameters, timing and read counts

    Existing files matching <output_prefix>.*primer_* are removed first.
    Every read pair ends up in either the good or the bad files, so the
    counts in the log add up to the number of pairs read.

    Raises OSError if the gzip executable cannot be started.
    """
    import glob
    import subprocess
    import sys

    def process_primer(r, match_len, is_reverse):
        # get record into miscBowTie.BowTieReader format
        # strip away primers from seq & qual, properly rev comp!
        r['offset'] = match_len
        r['seq'] = r['seq'][match_len:]
        r['qual'] = r['qual'][match_len:]
        r['ref'] = 'NA'
        if is_reverse:
            r['seq'] = str(Seq(r['seq']).reverse_complement())
            r['qual'] = r['qual'][::-1]

    def fuzzy_match(matcher, seq):
        # run one PrimerMatch search and hand back its result (None = no hit)
        matcher.make_suffix(seq)
        matcher.match(min_match_len, max_mm, max_in, max_de)
        return matcher.match_result

    def keep_pair(fwd, fwd_len, rev, rev_len, fwd_hit=None, rev_hit=None):
        # trim and write a resolved pair; the F read always goes first.
        # A verbose line is written only for PrimerMatch hits.
        process_primer(fwd, fwd_len, False)
        Fgood.write(fwd)
        if fwd_hit is not None:
            hverbose.write("{0}\t{1}\t{2}\n".format(
                fwd['ID'], fwd_hit.match_len, fwd_hit.miss))
        process_primer(rev, rev_len, False)
        Rgood.write(rev)
        if rev_hit is not None:
            hverbose.write("{0}\t{1}\t{2}\n".format(
                rev['ID'], rev_hit.match_len, rev_hit.miss))

    # best-effort removal of leftovers from an earlier run
    # NOTE(review): the pattern does not cover <output_prefix>.primer.verbose.gz,
    # so gzip may refuse to overwrite a stale copy of it -- confirm
    for stale in glob.glob(output_prefix + '.*primer_*'):
        try:
            os.remove(stale)
        except OSError as err:
            sys.stderr.write(
                "could not remove {0}: {1}\n".format(stale, err))

    # izip where it exists (Python 2) so the inputs are streamed, not loaded
    pair_reads = getattr(itertools, 'izip', zip)

    opened = []
    try:
        Fgood = BowTieWriter(output_prefix + '.Fprimer_good')
        opened.append(Fgood)
        Rgood = BowTieWriter(output_prefix + '.Rprimer_good')
        opened.append(Rgood)
        hbad1 = FastqWriter(output_prefix + '.primer_bad.1')
        opened.append(hbad1)
        hbad2 = FastqWriter(output_prefix + '.primer_bad.2')
        opened.append(hbad2)
        hverbose = open(output_prefix + '.primer.verbose', 'w')
        opened.append(hverbose)
        hlog = open(output_prefix + '.primer.log', 'w')
        opened.append(hlog)

        start_t = time.time()
        good, bad = 0, 0
        pmF = PrimerMatch(f_primer)
        pmR = PrimerMatch(r_primer)

        for r1, r2 in pair_reads(FastqReader(input1), FastqReader(input2)):
            # NOTE: in the case of PE reads
            # regardless of whether we're matching for F or R primer
            # they would all appear at the 5' end of the read
            # which is why we call match_primer_len with is_reverse = False
            match_f_len1, _ = match_primer_len(
                r1['seq'], f_primer, max_mm, min_match_len, False)
            match_r_len1, _ = match_primer_len(
                r1['seq'], r_primer, max_mm, min_match_len, False)
            match_f_len2, _ = match_primer_len(
                r2['seq'], f_primer, max_mm, min_match_len, False)
            match_r_len2, _ = match_primer_len(
                r2['seq'], r_primer, max_mm, min_match_len, False)

            if match_f_len1 > 0 and match_r_len2 > 0:
                # case 1, read 1 is F, read 2 is R
                good += 1
                keep_pair(r1, match_f_len1, r2, match_r_len2)
                continue
            if match_f_len2 > 0 and match_r_len1 > 0:
                # case 2, read 1 is R, read 2 is F
                good += 1
                keep_pair(r2, match_f_len2, r1, match_r_len1)
                continue

            # fall back to PrimerMatch: try read 1 as F first, then as R
            fwd_read, rev_read = r1, r2
            rev_hit = None
            fwd_hit = fuzzy_match(pmF, r1['seq'])
            if fwd_hit is not None:
                rev_hit = fuzzy_match(pmR, r2['seq'])
            else:
                rev_hit = fuzzy_match(pmR, r1['seq'])
                if rev_hit is not None:
                    fwd_read, rev_read = r2, r1
                    fwd_hit = fuzzy_match(pmF, r2['seq'])

            if fwd_hit is not None and rev_hit is not None:
                good += 1
                keep_pair(fwd_read, fwd_hit.match_len,
                          rev_read, rev_hit.match_len,
                          fwd_hit, rev_hit)
            else:
                # case 3: unresolved, bad read pair -- this includes pairs
                # where read 1 matches neither primer
                hbad1.write(r1)
                hbad2.write(r2)
                bad += 1

        n_pairs = good + bad
        # fractions are reported as 0 when no read pair was seen
        frac_bad = bad * 1. / n_pairs if n_pairs else 0.
        frac_good = good * 1. / n_pairs if n_pairs else 0.

        hlog.write("Input 1: {0}\nInput 2: {1}\n".format(input1, input2))
        hlog.write("F primer: {0}\nR primer: {1}\n".format(f_primer, r_primer))
        hlog.write("Min match len: {0}\n".format(min_match_len))
        hlog.write("Max mismatch: {0}\n".format(max_mm))
        hlog.write("Max deletion: {0}\n".format(max_de))
        hlog.write("Max insertion: {0}\n".format(max_in))
        hlog.write("Primer detection and removal took {0} sec.\n".format(
            time.time() - start_t))
        hlog.write("# of original reads: {0}\n".format(n_pairs))
        hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, frac_bad))
        hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(
            good, frac_good))
    finally:
        # close whatever was opened, even when processing failed half-way
        for handle in opened:
            handle.close()

    # argument lists instead of shell strings: safe for paths that contain
    # spaces or shell metacharacters
    for path in (Fgood.f.name, Rgood.f.name, hbad1.f.name, hbad2.f.name,
                 hverbose.name):
        subprocess.call(['gzip', path])
def combine_RF(fotu, rotu, ffastq, rfastq, output_prefix):
    """
    Pair forward and reverse OTUs through the sequence IDs they share.

    fotu, rotu     -- OTU files (forward, reverse); each line is an OTU id
                      followed by whitespace-separated member seq IDs.
                      A trailing /1 or /2 on a member ID is ignored.
    ffastq, rfastq -- FASTQ files (forward, reverse) read with FastqReader;
                      records are looked up by their 'ID', which must be
                      the OTU ids used in fotu / rotu
    output_prefix  -- prefix of the files written:
        <output_prefix>.combined.1.fq           forward record per pair
        <output_prefix>.combined.2.fq           reverse record per pair
        <output_prefix>.combined.abundance.txt  <fwd id>_<rev id> <tab> count

    Members of a reverse OTU that never appeared in the forward file are
    reported on stderr and skipped. Blank lines are skipped.

    Returns: forward otu cid --> reverse otu cid --> abundance

    Raises KeyError if a reverse OTU id has no record in rfastq.
    """
    def strip_mate(seqid):
        # drop a trailing /1 or /2 so both mates map to the same key
        if seqid.endswith(('/1', '/2')):
            return seqid[:-2]
        return seqid

    seqid2otu = {}
    combo = {}
    with open(fotu) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            otu = fields[0]
            combo[otu] = defaultdict(int)
            for seqid in fields[1:]:
                seqid2otu[strip_mate(seqid)] = otu

    with open(rotu) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue
            otu2 = fields[0]
            for seqid in fields[1:]:
                seqid = strip_mate(seqid)
                if seqid not in seqid2otu:
                    sys.stderr.write(
                        "{0} is missing in forward, ignore\n".format(seqid))
                    continue
                combo[seqid2otu[seqid]][otu2] += 1

    # now write this out as <output_prefix>.combined.{1|2}.fq
    # the reverse records are held in memory so they can be fetched by id
    seqdict = {}
    for r in FastqReader(rfastq):
        seqdict[r['ID']] = r

    fqw1 = FastqWriter(output_prefix + '.combined.1.fq')
    opened = [fqw1]
    try:
        fqw2 = FastqWriter(output_prefix + '.combined.2.fq')
        opened.append(fqw2)
        fout = open(output_prefix + '.combined.abundance.txt', 'w')
        opened.append(fout)

        for r in FastqReader(ffastq):
            if r['ID'] not in combo:
                continue
            for id2, abundance in combo[r['ID']].items():
                newid = "{0}_{1}".format(r['ID'], id2)
                fqw1.write(r, id=newid + '/1')
                fqw2.write(seqdict[id2], id=newid + '/2')
                fout.write("{0}\t{1}\n".format(newid, abundance))
    finally:
        # close whatever was opened, even when writing failed half-way
        for handle in opened:
            handle.close()

    return combo