def bowtie2fasta(input, name, trimid):
    output = input + '.fasta'
    f = open(output, 'w')
    if name is not None:
        fmap = open(output + '.map', 'w')
        for r in BowTieReader(input, False):
            id = r['ID'].split()[0] if trimid else r['ID']
            f.write(">{name}_{id}\n{seq}\n".format(id=id, seq=r['seq'], name=name))
            fmap.write("{name}_{id}\t{name}\n".format(name=name, id=id))
    else:
        for r in BowTieReader(input, False):
            id = r['ID'].split()[0] if trimid else r['ID']
            f.write(">{id}\n{seq}\n".format(id=id, seq=r['seq']))
    f.close()
    if name is not None:
        fmap.close()

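# Hedged sketch (an assumption, not part of the original source): throughout
# this module, BowTieReader is expected to yield dict-like records with at
# least the keys 'ID', 'seq' and 'qual'; aligned records also appear to carry
# 'strand', 'ref' and 'offset'. A minimal stand-in record, useful for
# exercising functions like bowtie2fasta by hand, might look like this
# (all field values below are made up):
def _example_bowtie_record():
    return {'ID': 'HWUSI-EAS1600:1:1:1:1000/1', 'strand': '+',
            'ref': 'ecoli_16S', 'offset': '17',
            'seq': 'ACGTACGT', 'qual': 'IIIIIIII'}
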
def filter_low_qual_seqs(gz_filename, phred_offset, phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retains only seqs that have every base phred >= <cutoff>
    Outputs: .phred<cutoff>_passed for both files
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    bad = 0
    good = 0
    start_t = time.time()
    f = BowTieWriter(gz_filename + ".phred{0}_passed".format(phred_cutoff), 'w')
    for r in BowTieReader(gz_filename, False):
        if all(ord(x) - phred_offset >= phred_cutoff for x in r['qual']):
            good += 1
            f.write(r)
        else:
            bad += 1
    f.close()  # close the writer before reusing the name f for the log handle
    with open(gz_filename + ".phred{0}_passed.log".format(phred_cutoff), 'w') as f:
        f.write("Running filter_low_qual_seq took {0} secs\n".format(time.time() - start_t))
        f.write("Input: " + gz_filename + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')

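# Hedged worked example of the per-base test above (assuming the common
# Phred+33 encoding): a quality character maps to a score via
# ord(char) - offset, so '#' is Phred 2 and 'I' is Phred 40.
def _passes_phred_cutoff(qual, phred_offset=33, phred_cutoff=20):
    # _passes_phred_cutoff('II#I') -> False (the '#' base is Phred 2 < 20)
    # _passes_phred_cutoff('IIII') -> True  (all bases are Phred 40)
    return all(ord(x) - phred_offset >= phred_cutoff for x in qual)
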
def parse_blast_xml_for_training(xml_filename, bowtie_filename, output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format: Phred Cycle B2 B1 B0 Class
    """
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename))
    f = open(output_filename, 'w')
    f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
    for blastout in NCBIXML.parse(open(xml_filename)):
        if len(blastout.alignments) == 0:  # no match was found!
            continue
        hsp = blastout.alignments[0].hsps[0]
        record = fa_dict[blastout.query]
        primer_offset = int(record['offset'])
        for i in xrange(2, len(hsp.match)):  # TODO: allow for i < 2 and still get B2, B1
            # global position is i + (query_start-1) + primer_offset
            if hsp.match[i] == " " and hsp.query[i] != '-' and hsp.sbjct[i] != '-':
                # is a mismatch!
                f.write(str(ord(record['qual'][i + hsp.query_start - 1]) - 33) + '\t')
                f.write(str(i + hsp.query_start - 1 + primer_offset) + '\t')
                f.write(hsp.query[i - 2] + '\t')
                f.write(hsp.query[i - 1] + '\t')
                f.write(hsp.query[i] + '\t')
                f.write('-\n')
    f.close()

def bowtie2fastq(input, revcomp=False):
    output = input + '.fq'
    f = FastqWriter(output)
    for r in BowTieReader(input, False):
        if revcomp:
            r['seq'] = Seq(r['seq']).reverse_complement().tostring()
            r['qual'] = r['qual'][::-1]
        f.write(r)
    f.close()

def remove_high_expected_error_PE(file1, file2, max_expected_error):
    """
    Remove all reads where the expected error (sum of err probs from
    phred scores) exceeds <max_expected_error>
    """
    assert os.path.exists(file1) and os.path.exists(file2)
    os.system("rm {0}.experror_*".format(file1))
    os.system("rm {0}.experror_*".format(file2))
    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')
    start_t = time.time()
    good, bad = 0, 0
    for r1, r2 in itertools.izip(BowTieReader(file1, False), BowTieReader(file2, False)):
        if sum(10 ** -((ord(x) - 33) / 10.) for x in r1['qual']) <= max_expected_error and \
           sum(10 ** -((ord(x) - 33) / 10.) for x in r2['qual']) <= max_expected_error:
            hgood1.write(r1)
            hgood2.write(r2)
            good += 1
        else:
            hbad1.write(r1)
            hbad2.write(r2)
            bad += 1
    hlog.write("Expected error filtering took {0} sec.\n".format(time.time() - start_t))
    hlog.write("Max allowed expected error: {0}\n".format(max_expected_error))
    hlog.write("# of original reads: {0}\n".format(good + bad))
    hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad, bad * 1. / (good + bad)))
    hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good, good * 1. / (good + bad)))
    hgood1.close()
    hgood2.close()
    hbad1.close()
    hbad2.close()
    hlog.close()
    os.system("gzip " + hgood1.f.name)
    os.system("gzip " + hgood2.f.name)
    os.system("gzip " + hbad1.f.name)
    os.system("gzip " + hbad2.f.name)

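# Hedged worked example of the expected-error sum used above: a Phred score Q
# encodes an error probability of 10**(-Q/10), and a read's expected number of
# errors is the sum of those probabilities over its bases.
def _expected_error(qual, phred_offset=33):
    # Three Q40 bases ('III') give 3 * 1e-4 = 0.0003 expected errors,
    # comfortably under a typical max_expected_error such as 1.0.
    return sum(10 ** -((ord(x) - phred_offset) / 10.) for x in qual)
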
def tally_overlap_size(composite_gz_filename):
    tally = defaultdict(lambda: 0)
    for r in BowTieReader(composite_gz_filename, False):
        overlap = int(r['ID'].split()[-1][len('COMPOSED/'):])
        tally[overlap] += 1
    _min = min(tally)
    _max = max(tally)
    print("OVERLAP," + ",".join(map(str, xrange(_min, _max + 1))))
    print("COUNT," + ",".join(str(tally[i]) for i in xrange(_min, _max + 1)))
    return tally

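# Hedged example (an assumption about the composite ID format): the slice in
# tally_overlap_size implies the last whitespace token of a composed read's ID
# looks like 'COMPOSED/<overlap>', so the overlap is recovered like this:
def _parse_overlap(record_id):
    # _parse_overlap('read1/1 COMPOSED/37') -> 37
    return int(record_id.split()[-1][len('COMPOSED/'):])
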
def parse_blast_xml_for_training(xml_filename, bowtie_filename, output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format: Phred Cycle B2 B1 B0 Class
    """
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename, False))
    f = open(output_filename, 'w')
    f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
    for blastout in NCBIXML.parse(open(xml_filename)):
        if len(blastout.alignments) == 0:  # no match was found!
            continue
        hsp = blastout.alignments[0].hsps[0]
        record = fa_dict[blastout.query]
        primer_offset = int(record['offset'])
        gap_offset = 0
        for i in xrange(2, len(hsp.match)):  # TODO: allow for i < 2 and still get B2, B1
            gap_offset += hsp.query[i] == '-'
            # global position is i + (query_start-1) + primer_offset - gap_offset
            doit = False
            if hsp.match[i] == " " and hsp.query[i] != '-' and hsp.sbjct[i] != '-':
                doit = True
                _class = '-'
            elif hsp.match[i] == '|' and hsp.query[i] == hsp.sbjct[i] and random.random() <= 1e-3:
                doit = True
                _class = '+'
            if doit:  # record this position (mismatch '-' or sampled match '+')
                record_i = i + (hsp.query_start - 1) - gap_offset
                assert hsp.query[i] == record['seq'][record_i]
                f.write(str(ord(record['qual'][record_i]) - 33) + '\t')
                f.write(str(record_i + primer_offset) + '\t')
                # walk left past gap columns to find the two preceding bases
                b2, b1 = None, None
                j = i - 1
                while hsp.query[j] == '-':
                    j -= 1
                b1 = hsp.query[j]
                j -= 1
                while hsp.query[j] == '-':
                    j -= 1
                b2 = hsp.query[j]
                assert b1 == record['seq'][record_i - 1] and b2 == record['seq'][record_i - 2]
                f.write(b2 + '\t')
                f.write(b1 + '\t')
                f.write(hsp.query[i] + '\t')
                f.write(_class + '\n')
    f.close()

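# Hedged sketch of the coordinate bookkeeping above (helper name is
# illustrative): an HSP alignment column i maps back into the read through the
# 1-based BLAST query start and the number of query gap columns seen so far,
# and then to a global sequencing cycle via the primer offset that was trimmed
# before alignment.
def _hsp_column_to_positions(i, query_start, gap_offset, primer_offset):
    record_i = i + (query_start - 1) - gap_offset  # index into record['seq']
    cycle = record_i + primer_offset               # global cycle written out
    return record_i, cycle
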
def tally_qual_scores_gz_bowtie(gz_filename, output, strand, reverse_pos):
    quals_at = defaultdict(lambda: defaultdict(lambda: 0))
    f = BowTieReader(gz_filename, False)
    max_phred_seen = 42
    count = 0
    for r in f:
        if strand is not None and r['strand'] != strand:
            continue
        count += 1
        r_qual = r['qual']
        if reverse_pos:
            r_qual = r_qual[::-1]
        for pos, q in enumerate(r_qual):
            assert ord(q) - 33 >= 0
            # for combined reads it is possible to go above 41
            quals_at[pos][ord(q) - 33] += 1
            max_phred_seen = max(max_phred_seen, ord(q) - 33)
    # sanity check: tallies per position cannot exceed the reads used
    # (== count when all reads have equal length)
    for pos in quals_at:
        assert sum(quals_at[pos].itervalues()) <= count
    poses = quals_at.keys()
    poses.sort()
    print >> sys.stderr, "{0} reads used".format(count)
    with open(output, 'w') as f:
        # include the max observed phred as a column, hence the +1
        f.write("POS," + ",".join([str(x) for x in xrange(max_phred_seen + 1)]) + '\n')
        for pos in poses:
            f.write(str(pos) + ',' + ",".join([str(quals_at[pos][x]) for x in xrange(max_phred_seen + 1)]) + '\n')

def filter_low_qual_seqs(gz_filename, phred_offset, phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retains only seqs that have every base phred >= <cutoff>
    also uniquifies the sequences
    Outputs:
        .phred<cutoff>_passed.unique.fasta
        .phred<cutoff>_passed.unique.count
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    seen = {}  # seq --> {'ids': list of IDs, 'index': associated cluster index}
    index = 0  # for tracking unique clusters
    bad = 0
    good = 0
    start_t = time.time()
    f = open(gz_filename + ".phred{0}_passed.unique.fasta".format(phred_cutoff), 'w')
    for r in BowTieReader(gz_filename, False):
        if all(ord(x) - phred_offset >= phred_cutoff for x in r['qual']):
            good += 1
            if r['seq'] in seen:
                seen[r['seq']]['ids'].append(r['ID'])
            else:
                seen[r['seq']] = {'index': index, 'ids': [r['ID']]}
                f.write('>' + str(index) + '\n')
                f.write(r['seq'] + '\n')
                index += 1
        else:
            bad += 1
    f.close()
    with open(gz_filename + ".phred{0}_passed.unique.count".format(phred_cutoff), 'w') as f:
        for d in seen.itervalues():
            f.write("{0}\t{1}\n".format(d['index'], "\t".join(d['ids'])))
    with open(gz_filename + ".phred{0}_passed.unique.log".format(phred_cutoff), 'w') as f:
        f.write("Running filter_low_qual_seq took {0} secs\n".format(time.time() - start_t))
        f.write("Input: " + gz_filename + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
        f.write("RemainingUnique: " + str(len(seen)) + '\n')

def split_fasta_by_otu(fasta_filename, bowtie_gz_filename, otu_filename, output_dir):
    """
    For each OTU, create a subdir <output_dir>/<cluster_index>
    and put in it the OTU's fasta and bowtie (gzipped) files
    """
    otu = {}
    fa_d = {}
    bw_d = {}
    cids = set()
    with open(otu_filename) as f:
        for line in f:
            raw = line.strip().split()
            cid = raw[0]
            cids.add(cid)
            if not os.path.exists(os.path.join(output_dir, cid)):
                os.mkdir(os.path.join(output_dir, cid))
            for seqid in raw[1:]:
                otu[seqid] = cid
    print >> sys.stderr, "finished reading", otu_filename
    for cid in cids:
        fa_d[cid] = open(os.path.join(output_dir, cid, cid + '.fasta'), 'w')
        bw_d[cid] = BowTieWriter(os.path.join(output_dir, cid, cid + '.bowtie'), mode='w')
    for r in SeqIO.parse(open(fasta_filename), 'fasta'):
        if r.id not in otu:
            continue
        cid = otu[r.id]
        fa_d[cid].write(">{0}\n{1}\n".format(r.id, r.seq))
    for r in BowTieReader(bowtie_gz_filename, False):
        try:
            cid = otu[r['ID'].split()[0]]
        except KeyError:
            continue
        bw_d[cid].write(r)
    for handle in fa_d.itervalues():
        handle.close()
    for handle in bw_d.itervalues():
        handle.close()
        # gzip each per-OTU bowtie file
        os.system("gzip " + handle.f.name)

def create_clusters_from_bowtie(self):
    """
    The 'offset' field is actually 'abundance'
    The 'ref' field is actually 'cycle' offset
    """
    with open(self.otu_txt) as f:
        for line in f:
            otuid, rest = line.strip().split(None, 1)
            for x in rest.split():
                self.otu_info[x] = otuid
            self.cluster_by_otu[otuid] = {}
    for r in BowTieReader(self.input_bowtie, False):
        cid = r['ID']
        otuid = self.otu_info[r['ID']]
        self.cluster_by_otu[otuid][cid] = {
            'dirty': True,
            'cids': [cid],
            'len': len(r['seq']),
            'seq': MutableSeq(r['seq']),
            'size': int(r['offset']),
            'qual': [ord(x) - 33 for x in r['qual']],
            'cycle': range(int(r['ref']), int(r['ref']) + len(r['seq']))}

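# Hedged illustration (field meanings inferred from the docstring above): each
# OTU maps cluster ids to per-read dicts shaped roughly like
#   {'dirty': True, 'cids': [cid], 'len': 8,
#    'seq': MutableSeq('ACGTACGT'),
#    'size': 12,              # read abundance, repurposed from 'offset'
#    'qual': [40, 40, ...],   # Phred scores decoded from the +33 chars
#    'cycle': [17, 18, ...]}  # per-base cycle, starting at the 'ref' value
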
def main(input_filename):
    f = open(input_filename + '.summary', 'w')
    f.write('ID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_globalPos\n')
    for r in BowTieReader(input_filename, False):
        mm1 = diff(r['seq'], int(r['offset']), ecoli1)
        mm2 = diff(r['seq'], int(r['offset']), ecoli2)
        mm = mm1 if len(mm1) < len(mm2) else mm2
        if len(mm) > 5:
            print >> sys.stderr, "MORE than 5 errors. Discard!!!!"
            continue
        f.write(r['ID'] + '\t')
        f.write('F\t' + str(r['offset']) + '\tECOLI\t')
        f.write(r['seq'] + '\t')
        f.write(r['qual'] + '\t')
        f.write(",".join(mm) + '\n')
    f.close()

def uniquify_bowtie_output_to_fastq(filename):
    """
    <filename> is in BowTie format (either pre- or post-composite), gzipped
    read through it, ignore the qual scores, and simply output:
    (1) <filename>.unique.fasta.gz
    (2) <filename>.unique.count.gz
    """
    seen_seq = {}  # sequence --> list of ids
    for r in BowTieReader(filename, False):
        if r['seq'] in seen_seq:
            seen_seq[r['seq']].append(r['ID'])
        else:
            seen_seq[r['seq']] = [r['ID']]
    f1 = gzip.open(filename + '.unique.fasta.gz', 'w')
    f2 = gzip.open(filename + '.unique.count.gz', 'w')
    items = seen_seq.items()
    items.sort(key=lambda x: len(x[1]), reverse=True)  # most abundant first
    for seq, ids in items:
        f1.write(">{0}\n{1}\n".format(ids[0], seq))
        f2.write("{0}\t{1}\t{2}\n".format(ids[0], len(ids), ",".join(ids)))
    f1.close()
    f2.close()

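# Hedged illustration of the two outputs above: if seen_seq were
# {'ACGT': ['r1', 'r3'], 'GGGG': ['r2']}, each unique sequence is named after
# its first member, and the count file records abundance plus the member list:
#
#   <filename>.unique.fasta.gz    <filename>.unique.count.gz
#   >r1                           r1<TAB>2<TAB>r1,r3
#   ACGT                          r2<TAB>1<TAB>r2
#   >r2
#   GGGG
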
def main(input_filename, abundance_filename, output):
    abundance = {}
    with open(abundance_filename) as f:
        for line in f:
            _id, _count = line.strip().split('\t')
            abundance[_id] = int(_count)
    total = sum(abundance.itervalues())
    aligned = 0
    for r1, r2 in BowTieReader(input_filename, True):
        realid = r1['ID'][:r1['ID'].find('/')]
        aligned += abundance[realid]
    with open(output, 'w') as f:
        p = aligned * 100. / total
        f.write("# reads processed: {0}\n".format(total))
        f.write("# reads with at least one reported alignment: {0} ({1:.2f}%)\n".format(aligned, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(total - aligned, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".format(aligned))

def filter_low_count_low_qual_seqs(gz_filename):
    """
    Reads in a bowtie gzip file (ex: DS19187.aligned.composite.gz)
    which must have a corresponding .unique.count.gz file

    Outputs a text file denoting seqs that should be REMOVED (ex: via Qiime's
    filter_seqs.py) because they have (1) only 1 count and (2) 1 or more
    phred-2 bases ('#')

    Output is written to .unique.count1phred2.filter.txt
    """
    failed = {}
    for r in BowTieReader(gz_filename, False):
        if r['qual'].count('#') >= 1:
            failed[r['ID']] = 1
    print >> sys.stderr, "finished reading gz"
    with open(gz_filename + '.unique.count1phred2.filter.txt', 'w') as f:
        for line in gziplines(gz_filename + '.unique.count.gz'):
            a = line.strip().split('\t')
            if int(a[1]) == 1 and a[0] in failed:
                del failed[a[0]]
                f.write(a[0] + '\n')

deino = 'TAGGAATCTTCCACAATGGGCGCAAGCCTGATGGAGCGACGCCGCGTGAGGGATGAAGGTTTTCGGATCGTAAACCTCTGAATCTGGGACGAAAGAGCCTTAGGGCAGATGACGGTACCAGAGTAATAGCACCGGCTAACTCC'
myco_r = Seq(myco).reverse_complement().tostring()
deino_r = Seq(deino).reverse_complement().tostring()

input1 = sys.argv[1]  # ex: DSXXXXX.aligned.composite.gz.primer_good.gz, BowTie format, gzipped
input2 = sys.argv[2]
sample = sys.argv[3]  # ex: DSXXXXX

h3 = open(sample + '.alien.summary', 'w')
h3.write('Sample\tID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_localPos\n')
for r1, r2 in itertools.izip(BowTieReader(input1, False), BowTieReader(input2, False)):
    match = None
    if check_seq(r1, myco) and check_seq(r2, myco_r):
        mm1 = diff_seq(r1['seq'], 0, myco)
        mm2 = diff_seq(r2['seq'], 0, myco_r)
        if len(mm1) * 1. / len(r1['seq']) < .1 and len(mm2) * 1. / len(r2['seq']) < .1:
            match = 'MP'
    elif check_seq(r1, deino) and check_seq(r2, deino_r):
        mm1 = diff_seq(r1['seq'], 0, deino)
        mm2 = diff_seq(r2['seq'], 0, deino_r)
        if len(mm1) * 1. / len(r1['seq']) < .1 and len(mm2) * 1. / len(r2['seq']) < .1:
            match = 'DR'
    if match is not None:

        qual += chr(int(-10 * log10(e) + 33))
    seq += r2['seq'][N - delta:]
    qual += r2['qual'][N - delta:]
    return seq, qual, N - delta


if __name__ == "__main__":
    from miscBowTie import BowTieReader, BowTieWriter
    from cPickle import *
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("--input", dest="input", help="Input bowtie aligned file (gzipped)")
    parser.add_option("--output", dest="output", help="Output composite read filename")
    options, args = parser.parse_args()

    reader = BowTieReader(options.input, is_paired=True)
    print >> sys.stderr, "calculating base frequencies"
    base_freq_pickle = options.input + ".base_freq.pickle"
    if os.path.exists(base_freq_pickle):
        base_freq = load(open(base_freq_pickle))
    else:
        base_freq = reader.get_base_frequency()
        with open(options.input + ".base_freq.pickle", 'w') as f:
            dump(base_freq, f)
    print >> sys.stderr, "reading bowtie aligned file..."
    writer = BowTieWriter(options.output)
    for r1, r2 in reader:
        seq, qual, overlap = compose2(r1, r2, base_freq)
        writer.write_composite(r1, r2, seq, qual, overlap)

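# Hedged worked example of the quality re-encoding in compose2 above: a
# consensus error probability e is turned back into a Phred+33 character via
# chr(int(-10*log10(e) + 33)).
from math import log10

def _prob_to_phred_char(e):
    # _prob_to_phred_char(1e-4) -> 'I' (Phred 40)
    # _prob_to_phred_char(1e-3) -> '?' (Phred 30)
    return chr(int(-10 * log10(e) + 33))
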
    return mm[:-1]


d = {}  # SEQID --> {MATCH, MISMATCHES}
for x in DictReader(open(SUMMARY), delimiter='\t'):
    id = x['SEQID']
    id = id[:id.find('/')]
    d[id] = x

f = open(OUTPUT, 'w')
f.write("ID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_globalPos\n")
for x1, x2 in BowTieReader(ALIGNED_GZ, True):
    id = x1['ID'][:-2]
    if id in d:
        if d[id]['MATCH'] == 'MP':
            real, real_r = myco, myco_r
        else:
            real, real_r = deino, deino_r
        i = find_primer(x1['seq'], f_primer, 2, 10, False)
        f.write("{id}\tF\t{primer}\t{match}\t{seq}\t{qual}\t{mm}\n".format(
            id=id, primer=i, match=d[id]['MATCH'], seq=x1['seq'][i:],
            qual=x1['qual'][i:], mm=list_mm(real, x1['seq'][i:], i)))
        i = find_primer(x2['seq'], r_primer, 2, 10, True)
        x2_r = Seq.Seq(x2['seq'][:-i]).reverse_complement().tostring()
        q2_r = x2['qual'][:-i]
        q2_r = q2_r[::-1]