Esempio n. 1
0
def bowtie2fasta(input, name, trimid):
    """
    Convert a BowTie-format file to FASTA, written to <input>.fasta.

    input  -- path handed to BowTieReader (read as unpaired records)
    name   -- optional sample name. When not None, every FASTA header
              becomes <name>_<id> and a tab-separated <input>.fasta.map
              file with the columns (<name>_<id>, <name>) is also written.
    trimid -- if true, keep only the first whitespace-delimited token of
              each record ID

    Both output files are closed even if reading or writing fails.
    """
    output = input + '.fasta'
    fmap = None
    f = open(output, 'w')
    try:
        if name is not None:
            fmap = open(output + '.map', 'w')
        for r in BowTieReader(input, False):
            read_id = r['ID'].split()[0] if trimid else r['ID']
            if fmap is None:
                f.write(">{id}\n{seq}\n".format(id=read_id, seq=r['seq']))
            else:
                f.write(">{name}_{id}\n{seq}\n".format(
                    id=read_id, seq=r['seq'], name=name))
                fmap.write("{name}_{id}\t{name}\n".format(
                    name=name, id=read_id))
    finally:
        f.close()
        if fmap is not None:
            fmap.close()
Esempio n. 2
0
def filter_low_qual_seqs(gz_filename, phred_offset, phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retain only seqs that have every base phred >= <cutoff>

    gz_filename  -- input path handed to BowTieReader (unpaired records)
    phred_offset -- value subtracted from ord(char) to decode a quality char
    phred_cutoff -- minimum decoded quality every base must reach

    Outputs:
        <gz_filename>.phred<cutoff>_passed      records that passed
        <gz_filename>.phred<cutoff>_passed.log  run time and pass/fail counts
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    passed_filename = gz_filename + ".phred{0}_passed".format(phred_cutoff)
    bad = 0
    good = 0
    start_t = time.time()
    writer = BowTieWriter(passed_filename, 'w')
    try:
        for r in BowTieReader(gz_filename, False):
            if all(ord(x) - phred_offset >= phred_cutoff for x in r['qual']):
                good += 1
                writer.write(r)
            else:
                bad += 1
    finally:
        # The writer used to be left open (its name was rebound by the log
        # file below); close it explicitly so the output is flushed.
        writer.close()

    with open(passed_filename + ".log", 'w') as log:
        log.write(
            "Running filter_low_qual_seq took {0} secs\n".format(time.time() -
                                                                 start_t))
        log.write("Input: " + gz_filename + '\n')
        log.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        log.write("RemovedDueToLowQual: " + str(bad) + '\n')
        log.write("RemainingTotal: " + str(good) + '\n')
Esempio n. 3
0
def parse_blast_xml_for_training(xml_filename, bowtie_filename,
                                 output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format:

    Phred   Cycle   B2  B1  B0  Class

    One tab-separated row is written for every mismatch column of the first
    HSP (match line is blank, neither query nor subject is a gap), starting
    at alignment column 2. Class is always '-'.

    Phred -- quality char of the mismatching read base, decoded with offset 33
    Cycle -- 0-based read position of that base plus the record's 'offset'
    B2, B1, B0 -- aligned-query characters at columns i-2, i-1 and i
    """
    # Read as unpaired records, like the other single-read callers in this file
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename, False))

    with open(output_filename, 'w') as f:
        f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
        with open(xml_filename) as xml_handle:
            for blastout in NCBIXML.parse(xml_handle):
                if len(blastout.alignments) == 0:  # no match was found!
                    continue
                hsp = blastout.alignments[0].hsps[0]
                record = fa_dict[blastout.query]
                primer_offset = int(record['offset'])
                # Gap columns in the aligned query do not consume a read
                # base, so they must be subtracted when mapping an alignment
                # column back to a position in the read.
                gaps_before = hsp.query[:2].count('-')
                # TODO: allow for i<2 and still get B2, B1
                for i in xrange(2, len(hsp.match)):
                    if hsp.query[i] == '-':
                        gaps_before += 1
                        continue
                    if hsp.match[i] != " " or hsp.sbjct[i] == '-':
                        continue
                    # is a mismatch!
                    record_i = i + (hsp.query_start - 1) - gaps_before
                    # NOTE(review): B2/B1 are raw alignment columns and can
                    # therefore be '-' when a gap precedes the mismatch.
                    row = (str(ord(record['qual'][record_i]) - 33),
                           str(record_i + primer_offset),
                           hsp.query[i - 2],
                           hsp.query[i - 1],
                           hsp.query[i],
                           '-')
                    f.write("\t".join(row) + '\n')
Esempio n. 4
0
def bowtie2fastq(input, revcomp=False):
    """
    Convert a BowTie-format file to FASTQ, written to <input>.fq.

    When revcomp is true each record's sequence is reverse-complemented and
    its quality string reversed before being written.
    """
    writer = FastqWriter(input + '.fq')
    for record in BowTieReader(input, False):
        if revcomp:
            flipped = Seq(record['seq']).reverse_complement()
            record['seq'] = flipped.tostring()
            record['qual'] = record['qual'][::-1]
        writer.write(record)
    writer.close()
def remove_high_expected_error_PE(file1, file2, max_expected_error):
    """
    Remove all read pairs where the expected error (sum of err probs from
    phred scores, decoded with offset 33) of either mate exceeds
    <max_expected_error>

    file1, file2 -- BowTie-format files read in lockstep as mate 1 / mate 2

    Outputs (per input file, gzipped afterwards with the external `gzip`):
        <file>.experror_good   pairs where both mates passed
        <file>.experror_bad    pairs where at least one mate failed
    plus <file1>.experror.log with timing and counts.

    Existing <file>.experror_* files are deleted first.
    """
    # Imported here so the block does not depend on the file-level imports
    import glob
    import subprocess

    def expected_error(qual):
        # A base with Phred score Q has error probability 10^(-Q/10)
        return sum(10**-((ord(x) - 33) / 10.) for x in qual)

    assert os.path.exists(file1) and os.path.exists(file2)
    # Remove leftovers of a previous run without going through a shell
    for prefix in (file1, file2):
        for stale in glob.glob(prefix + '.experror_*'):
            if os.path.isfile(stale):
                os.remove(stale)

    hgood1 = BowTieWriter(file1 + '.experror_good')
    hgood2 = BowTieWriter(file2 + '.experror_good')
    hbad1 = BowTieWriter(file1 + '.experror_bad')
    hbad2 = BowTieWriter(file2 + '.experror_bad')
    hlog = open(file1 + '.experror.log', 'w')
    start_t = time.time()
    good, bad = 0, 0
    try:
        for r1, r2 in itertools.izip(BowTieReader(file1, False),
                                     BowTieReader(file2, False)):
            if expected_error(r1['qual']) <= max_expected_error and \
               expected_error(r2['qual']) <= max_expected_error:
                hgood1.write(r1)
                hgood2.write(r2)
                good += 1
            else:
                hbad1.write(r1)
                hbad2.write(r2)
                bad += 1

        total = good + bad
        # Empty input: report 0.00 instead of dividing by zero
        frac_bad = bad * 1. / total if total else 0.
        frac_good = good * 1. / total if total else 0.
        hlog.write("Expected error filtering took {0} sec.\n".format(
            time.time() - start_t))
        hlog.write("Max allowed expected error: {0}\n".format(
            max_expected_error))
        hlog.write("# of original reads: {0}\n".format(total))
        hlog.write("# of reads removed: {0} ({1:.2f})\n".format(
            bad, frac_bad))
        hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(
            good, frac_good))
    finally:
        hgood1.close()
        hgood2.close()
        hbad1.close()
        hbad2.close()
        hlog.close()

    # Argument lists (no shell) so unusual characters in paths are safe
    for writer in (hgood1, hgood2, hbad1, hbad2):
        subprocess.call(["gzip", writer.f.name])
Esempio n. 6
0
def tally_overlap_size(composite_gz_filename):
    """
    Count how many records have each overlap size and print the result as
    two comma-separated lines (OVERLAP header, COUNT values) covering every
    size from the smallest to the largest one seen.

    The overlap is parsed as an integer from the last whitespace-delimited
    token of the record ID, after skipping len('COMPOSED/') characters.

    Returns the overlap --> count mapping.
    """
    prefix_len = len('COMPOSED/')
    tally = defaultdict(lambda: 0)
    for rec in BowTieReader(composite_gz_filename, False):
        last_token = rec['ID'].split()[-1]
        tally[int(last_token[prefix_len:])] += 1
    smallest, largest = min(tally), max(tally)
    sizes = xrange(smallest, largest + 1)
    print("OVERLAP," + ",".join(str(size) for size in sizes))
    print("COUNT," + ",".join(str(tally[size]) for size in sizes))
    return tally
def parse_blast_xml_for_training(xml_filename, bowtie_filename,
                                 output_filename):
    """
    Parse the XML output, looking only at the 1st alignment for each query
    Write out in format:

    Phred   Cycle   B2  B1  B0  Class

    Starting at alignment column 2 of the first HSP, one tab-separated row
    is written for
      * every mismatch column (blank match line, no gap on either side),
        with Class '-', and
      * a random sample of matching columns (each kept when
        random.random() <= 1e-3), with Class '+'.

    Phred -- quality char of the read base, decoded with offset 33
    Cycle -- 0-based read position of that base plus the record's 'offset'
    B2, B1 -- the two nearest non-gap query characters to the left
    B0 -- the query character at the column itself

    Raises AssertionError when the alignment cannot be reconciled with the
    read sequence stored in the BowTie record.
    """
    fa_dict = dict((r['ID'], r) for r in BowTieReader(bowtie_filename, False))

    with open(output_filename, 'w') as f:
        f.write("Phred\tCycle\tB2\tB1\tB0\tClass\n")
        with open(xml_filename) as xml_handle:
            for blastout in NCBIXML.parse(xml_handle):
                if len(blastout.alignments) == 0:  # no match was found!
                    continue
                hsp = blastout.alignments[0].hsps[0]
                record = fa_dict[blastout.query]
                primer_offset = int(record['offset'])
                # Number of query gap columns seen so far (counted from
                # column 2 on); gaps do not consume a read base.
                gap_offset = 0
                # TODO: allow for i<2 and still get B2, B1
                for i in xrange(2, len(hsp.match)):
                    gap_offset += hsp.query[i] == '-'
                    if (hsp.match[i] == " " and hsp.query[i] != '-'
                            and hsp.sbjct[i] != '-'):
                        _class = '-'  # mismatch
                    elif (hsp.match[i] == '|'
                          and hsp.query[i] == hsp.sbjct[i]
                          and random.random() <= 1e-3):
                        _class = '+'  # sampled match
                    else:
                        continue

                    # position in read is i + (query_start-1) - gap_offset
                    record_i = i + (hsp.query_start - 1) - gap_offset
                    assert hsp.query[i] == record['seq'][record_i]

                    # Walk left over gap columns to find the two preceding
                    # read bases.
                    j = i - 1
                    while hsp.query[j] == '-':
                        j -= 1
                    b1 = hsp.query[j]
                    j -= 1
                    while hsp.query[j] == '-':
                        j -= 1
                    b2 = hsp.query[j]
                    assert b1 == record['seq'][
                        record_i - 1] and b2 == record['seq'][record_i - 2]

                    row = (str(ord(record['qual'][record_i]) - 33),
                           str(record_i + primer_offset),
                           b2,
                           b1,
                           hsp.query[i],
                           _class)
                    f.write("\t".join(row) + '\n')
Esempio n. 8
0
def tally_qual_scores_gz_bowtie(gz_filename, output, strand, reverse_pos):
    """
    Tally, per read position, how often each Phred score (offset 33) occurs
    and write the table to <output> as CSV.

    gz_filename -- input path handed to BowTieReader (unpaired records)
    output      -- CSV path; header is POS,0,1,...; one row per position
    strand      -- when not None, only records whose 'strand' field equals
                   this value are counted
    reverse_pos -- if true, the quality string is reversed first, so
                   position 0 is the last base of the read

    Columns always cover Phred 0..41 and are extended to include the highest
    score actually seen. The number of reads used is reported on stderr.
    """
    quals_at = defaultdict(lambda: defaultdict(int))
    # Number of Phred columns to report: 0..41 at minimum
    num_phred_cols = 42
    count = 0
    for r in BowTieReader(gz_filename, False):
        if strand is not None and r['strand'] != strand:
            continue
        count += 1
        r_qual = r['qual']
        if reverse_pos:
            r_qual = r_qual[::-1]
        for pos, q in enumerate(r_qual):
            phred = ord(q) - 33
            # for combined reads it is possible to go above 41
            assert phred >= 0
            quals_at[pos][phred] += 1
            # +1 so the column for the highest score seen is not dropped
            num_phred_cols = max(num_phred_cols, phred + 1)

    # sanity check: a read contributes at most one score per position
    for pos in quals_at:
        assert sum(quals_at[pos].values()) <= count

    print >> sys.stderr, "{0} reads used".format(count)

    with open(output, 'w') as f:
        f.write("POS," + ",".join(str(x) for x in xrange(num_phred_cols)) +
                '\n')
        for pos in sorted(quals_at):
            counts = quals_at[pos]
            f.write(str(pos) + ',' + ",".join(
                str(counts.get(x, 0)) for x in xrange(num_phred_cols)) + '\n')
def filter_low_qual_seqs(gz_filename, phred_offset, phred_cutoff):
    """
    Takes a BowTie-style gzipped file (ex: .aligned.composite.gz)
    and retain only seqs that have every base phred >= <cutoff>
    also uniquifies the sequences

    gz_filename  -- input path handed to BowTieReader (unpaired records)
    phred_offset -- value subtracted from ord(char) to decode a quality char
    phred_cutoff -- minimum decoded quality every base must reach

    Outputs:
        .phred<cutoff>_passed.unique.fasta  one entry per distinct sequence,
                                            named by its cluster index
        .phred<cutoff>_passed.unique.count  cluster index followed by the
                                            IDs of its reads, tab-separated,
                                            in cluster index order
        .phred<cutoff>_passed.unique.log    run time and counts
    """
    assert phred_offset >= 0
    assert phred_cutoff >= 0
    prefix = gz_filename + ".phred{0}_passed.unique".format(phred_cutoff)
    seen = {}  # seq --> {'ids':list of IDs, 'index':associated cluster index}
    index = 0  # for tracking unique clusters
    bad = 0
    good = 0
    start_t = time.time()
    with open(prefix + ".fasta", 'w') as f:
        for r in BowTieReader(gz_filename, False):
            if not all(
                    ord(x) - phred_offset >= phred_cutoff for x in r['qual']):
                bad += 1
                continue
            good += 1
            if r['seq'] in seen:
                seen[r['seq']]['ids'].append(r['ID'])
            else:
                seen[r['seq']] = {'index': index, 'ids': [r['ID']]}
                f.write('>' + str(index) + '\n')
                f.write(r['seq'] + '\n')
                index += 1
    with open(prefix + ".count", 'w') as f:
        # Sort by cluster index so the file is deterministic and lines up
        # with the order of the FASTA entries.
        for d in sorted(seen.itervalues(), key=lambda c: c['index']):
            f.write("{0}\t{1}\n".format(d['index'], "\t".join(d['ids'])))
    with open(prefix + ".log", 'w') as f:
        f.write(
            "Running filter_low_qual_seq took {0} secs\n".format(time.time() -
                                                                 start_t))
        f.write("Input: " + gz_filename + '\n')
        f.write("PhredCutoff: " + str(phred_cutoff) + '\n')
        f.write("RemovedDueToLowQual: " + str(bad) + '\n')
        f.write("RemainingTotal: " + str(good) + '\n')
        f.write("RemainingUnique: " + str(len(seen)) + '\n')
Esempio n. 10
0
def split_fasta_by_otu(fasta_filename, bowtie_gz_filename, otu_filename,
                       output_dir):
    """
    For each OTU, create a subdir <output_dir>/<cluster_index> and
    put it in the OTU's fasta and bowtie (gzipped)

    fasta_filename     -- FASTA file parsed with SeqIO; records whose id is
                          not listed in the OTU file are skipped
    bowtie_gz_filename -- BowTie-format file; a record is matched by the
                          first whitespace-delimited token of its ID
    otu_filename       -- one OTU per line: cluster id followed by its
                          member sequence ids, whitespace-separated
    output_dir         -- parent directory for the per-OTU subdirectories
                          (created if missing)

    NOTE: one FASTA handle and one BowTie writer stay open per OTU for the
    whole run.
    """
    # Imported here so the block does not depend on the file-level imports
    import subprocess

    otu = {}
    cids = set()
    with open(otu_filename) as f:
        for line in f:
            raw = line.strip().split()
            if not raw:  # tolerate blank lines
                continue
            cid = raw[0]
            cids.add(cid)
            cluster_dir = os.path.join(output_dir, cid)
            if not os.path.exists(cluster_dir):
                os.makedirs(cluster_dir)
            for seqid in raw[1:]:
                otu[seqid] = cid
    print >> sys.stderr, "finished reading", otu_filename

    fa_d = {}
    bw_d = {}
    try:
        for cid in cids:
            fa_d[cid] = open(
                os.path.join(output_dir, cid, cid + '.fasta'), 'w')
            bw_d[cid] = BowTieWriter(os.path.join(output_dir, cid,
                                                  cid + '.bowtie'),
                                     mode='w')

        with open(fasta_filename) as fasta_handle:
            for r in SeqIO.parse(fasta_handle, 'fasta'):
                cid = otu.get(r.id)
                if cid is None:
                    continue
                fa_d[cid].write(">{0}\n{1}\n".format(r.id, r.seq))

        for r in BowTieReader(bowtie_gz_filename, False):
            cid = otu.get(r['ID'].split()[0])
            if cid is None:
                continue
            bw_d[cid].write(r)
    finally:
        for handle in fa_d.itervalues():
            handle.close()
        for handle in bw_d.itervalues():
            handle.close()

    # Argument list (no shell) so unusual characters in paths are safe
    for handle in bw_d.itervalues():
        subprocess.call(["gzip", handle.f.name])
Esempio n. 11
0
    def create_clusters_from_bowtie(self):
        """
        Populate self.otu_info and self.cluster_by_otu.

        self.otu_txt is read line by line: the first token is the OTU id,
        the remaining tokens are its member ids. Every BowTie record in
        self.input_bowtie then becomes a single-member cluster stored under
        its OTU.

        The 'offset' field is actually 'abundance'
        The 'ref' field is actually 'cycle' offset
        """
        with open(self.otu_txt) as otu_handle:
            for otu_line in otu_handle:
                otuid, members = otu_line.strip().split(None, 1)
                for member in members.split():
                    self.otu_info[member] = otuid
                self.cluster_by_otu[otuid] = {}

        for rec in BowTieReader(self.input_bowtie, False):
            cid = rec['ID']
            otuid = self.otu_info[cid]
            seq_len = len(rec['seq'])
            cluster = {
                'dirty': True,
                'cids': [cid],
                'len': seq_len,
                'seq': MutableSeq(rec['seq']),
                'size': int(rec['offset']),
                'qual': [ord(ch) - 33 for ch in rec['qual']],
                'cycle': range(int(rec['ref']), int(rec['ref']) + seq_len),
            }
            self.cluster_by_otu[otuid][cid] = cluster
Esempio n. 12
0
def main(input_filename):
    """
    Write <input_filename>.summary, a tab-separated table with one row per
    BowTie record.

    Each read is compared against ecoli1 and ecoli2 with diff(); the
    shorter mismatch list is kept (ecoli2's on a tie). Reads with more than
    5 mismatches are reported on stderr and skipped.
    """
    out = open(input_filename + '.summary', 'w')
    out.write(
        'ID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_globalPos\n'
    )
    for rec in BowTieReader(input_filename, False):
        mm1 = diff(rec['seq'], int(rec['offset']), ecoli1)
        mm2 = diff(rec['seq'], int(rec['offset']), ecoli2)
        if len(mm1) < len(mm2):
            mm = mm1
        else:
            mm = mm2
        if len(mm) > 5:
            print >> sys.stderr, "MORE than 5 errors. Discard!!!!"
            continue
        columns = [
            rec['ID'],
            'F',
            str(rec['offset']),
            'ECOLI',
            rec['seq'],
            rec['qual'],
            ",".join(mm),
        ]
        out.write("\t".join(columns) + '\n')
    out.close()
Esempio n. 13
0
def uniquify_bowtie_output_to_fastq(filename):
    """
    <filename> is in BowTie format (either pre- or post-composite), gzipped
    read through it, ignore the qual scores, and simply output:
    (1) <filename>.unique.fasta.gz  -- each distinct sequence once, named
        after the first read ID that carried it
    (2) <filename>.unique.count.gz  -- that ID, the number of reads sharing
        the sequence, and the comma-joined list of their IDs

    Entries are ordered from most to least abundant.
    """
    ids_by_seq = {}  # sequence --> list of ids
    for rec in BowTieReader(filename, False):
        ids_by_seq.setdefault(rec['seq'], []).append(rec['ID'])

    fasta_out = gzip.open(filename + '.unique.fasta.gz', 'w')
    count_out = gzip.open(filename + '.unique.count.gz', 'w')
    by_abundance = sorted(ids_by_seq.items(),
                          key=lambda pair: len(pair[1]),
                          reverse=True)
    for seq, ids in by_abundance:
        representative = ids[0]
        fasta_out.write(">{0}\n{1}\n".format(representative, seq))
        count_out.write("{0}\t{1}\t{2}\n".format(representative, len(ids),
                                                 ",".join(ids)))
    fasta_out.close()
    count_out.close()
def main(input_filename, abundance_filename, output):
    """
    Write an alignment summary that weights every aligned pair by the
    abundance of the read it represents.

    input_filename     -- BowTie-format file read as paired records
    abundance_filename -- tab-separated file with two columns: id, count
    output             -- path of the four-line text summary to write

    The id of a pair is the ID of its first mate up to (not including) the
    first '/'. Raises KeyError if that id is missing from the abundance
    file.
    """
    abundance = {}
    with open(abundance_filename) as f:
        for line in f:
            line = line.strip()
            if not line:  # tolerate blank lines
                continue
            _id, _count = line.split('\t')
            abundance[_id] = int(_count)

    total = sum(abundance.itervalues())
    aligned = 0
    for r1, _r2 in BowTieReader(input_filename, True):
        # split() keeps the whole ID when there is no '/', whereas slicing
        # with find() == -1 would silently drop the last character
        realid = r1['ID'].split('/', 1)[0]
        aligned += abundance[realid]

    if total:
        p = aligned * 100. / total
        p_failed = 100 - p
    else:
        # Empty abundance file: report zeros instead of dividing by zero
        p = p_failed = 0.

    with open(output, 'w') as f:
        f.write("# reads processed: {0}\n".format(total))
        f.write(
            "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
            format(aligned, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
            total - aligned, p_failed))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(aligned))
def filter_low_count_low_qual_seqs(gz_filename):
    """
    Reads in a bowtie gzip file (ex: DS19187.aligned.composite.gz)
    which must have a corresponding .unique.count.gz file

    Outputs a text file which denotes seqs that should be REMOVED
    (ex: via Qiime's filter_seqs.py) because it has
    (1) only 1 count
    and
    (2) has 1 or more phred-2 bases ('#')

    Output is written to .unique.count1phred2.filter.txt, one ID per line;
    each ID is written at most once.
    """
    low_qual_ids = set()
    for rec in BowTieReader(gz_filename, False):
        if '#' in rec['qual']:
            low_qual_ids.add(rec['ID'])
    print >> sys.stderr, "finished reading gz"

    with open(gz_filename + '.unique.count1phred2.filter.txt', 'w') as out:
        for line in gziplines(gz_filename + '.unique.count.gz'):
            fields = line.strip().split('\t')
            if int(fields[1]) != 1 or fields[0] not in low_qual_ids:
                continue
            # Forget the ID once reported so it cannot be written twice
            low_qual_ids.remove(fields[0])
            out.write(fields[0] + '\n')
Esempio n. 16
0
# Second reference sequence (`myco`, the first one, is defined above this
# excerpt).
deino = 'TAGGAATCTTCCACAATGGGCGCAAGCCTGATGGAGCGACGCCGCGTGAGGGATGAAGGTTTTCGGATCGTAAACCTCTGAATCTGGGACGAAAGAGCCTTAGGGCAGATGACGGTACCAGAGTAATAGCACCGGCTAACTCC'

# Reverse complements of both references; the loop below checks the second
# read of each pair against these.
myco_r = Seq(myco).reverse_complement().tostring()
deino_r = Seq(deino).reverse_complement().tostring()

# Command line: <input1> <input2> <sample>
input1 = sys.argv[
    1]  # ex: DSXXXXX.aligned.composite.gz.primer_good.gz, BowTie format, gzipped
input2 = sys.argv[2]
sample = sys.argv[3]  # ex: DSXXXXX

# Tab-separated summary, written to <sample>.alien.summary; header row first
h3 = open(sample + '.alien.summary', 'w')
h3.write(
    'Sample\tID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_localPos\n'
)

for r1, r2 in itertools.izip(BowTieReader(input1, False),
                             BowTieReader(input2, False)):
    match = None
    if check_seq(r1, myco) and check_seq(r2, myco_r):
        mm1 = diff_seq(r1['seq'], 0, myco)
        mm2 = diff_seq(r2['seq'], 0, myco_r)
        if len(mm1) * 1. / len(r1['seq']) < .1 and len(mm2) * 1. / len(
                r2['seq']) < .1:
            match = 'MP'
    elif check_seq(r1, deino) and check_seq(r2, deino_r):
        mm1 = diff_seq(r1['seq'], 0, deino)
        mm2 = diff_seq(r2['seq'], 0, deino_r)
        if len(mm1) * 1. / len(r1['seq']) < .1 and len(mm2) * 1. / len(
                r2['seq']) < .1:
            match = 'DR'
    if match is not None:
Esempio n. 17
0
			qual += chr(int(-10*log10(e)+33))
		seq += r2['seq'][N-delta:]
		qual += r2['qual'][N-delta:]
	return seq, qual, N-delta

if __name__ == "__main__":
    # Command-line entry point: compose every read pair of a paired BowTie
    # file into one composite read and write it to --output.
    from miscBowTie import BowTieReader, BowTieWriter
    from cPickle import *
    from optparse import OptionParser

    parser = OptionParser()
    parser.add_option("--input", dest="input", help="Input bowtie aligned file (gzipped)")
    parser.add_option("--output", dest="output", help="Output composite read filename")

    options, args = parser.parse_args()
    if options.input is None or options.output is None:
        # Fail with a usage message instead of a TypeError further down
        parser.error("both --input and --output are required")

    reader = BowTieReader(options.input, is_paired=True)
    print >> sys.stderr, "calculating base frequencies"
    # Base frequencies are cached next to the input and reused on later runs
    base_freq_pickle = options.input + ".base_freq.pickle"
    if os.path.exists(base_freq_pickle):
        # NOTE: unpickling runs arbitrary code; only reuse cache files that
        # were produced by this script.
        with open(base_freq_pickle) as f:
            base_freq = load(f)
    else:
        base_freq = reader.get_base_frequency()
        with open(base_freq_pickle, 'w') as f:
            dump(base_freq, f)
    print >> sys.stderr, "reading bowtie aligned file..."
    writer = BowTieWriter(options.output)
    try:
        for r1, r2 in reader:
            seq, qual, overlap = compose2(r1, r2, base_freq)
            writer.write_composite(r1, r2, seq, qual, overlap)
    finally:
        # Close explicitly so the output is flushed even on failure
        writer.close()
Esempio n. 18
0
    return mm[:-1]


d = {}  # SEQID --> {MATCH, MISMATCHES}

# Index the rows of the tab-separated SUMMARY file by SEQID, truncated just
# before its first '/'.
# NOTE(review): when a SEQID contains no '/', find() returns -1 and the slice
# drops the last character instead - confirm every SEQID has a '/' suffix.
for x in DictReader(open(SUMMARY), delimiter='\t'):
    id = x['SEQID']
    id = id[:id.find('/')]
    d[id] = x

# Output table; the header row is written first, data rows follow below.
f = open(OUTPUT, 'w')
f.write(
    "ID\tForR\tPRIMERlen\tMATCH\tSEQ_sansPrimer\tQUAL_sansPrimer\tMISMATCHES_globalPos\n"
)

for x1, x2 in BowTieReader(ALIGNED_GZ, True):
    id = x1['ID'][:-2]
    if id in d:
        if d[id]['MATCH'] == 'MP':
            real, real_r = myco, myco_r
        else:
            real, real_r = deino, deino_r
        i = find_primer(x1['seq'], f_primer, 2, 10, False)
        f.write("{id}\tF\t{primer}\t{match}\t{seq}\t{qual}\t{mm}\n".format(\
          id=id, primer=i, match=d[id]['MATCH'], seq=x1['seq'][i:],\
          qual=x1['qual'][i:], mm=list_mm(real, x1['seq'][i:], i)))

        i = find_primer(x2['seq'], r_primer, 2, 10, True)
        x2_r = Seq.Seq(x2['seq'][:-i]).reverse_complement().tostring()
        q2_r = x2['qual'][:-i]
        q2_r = q2_r[::-1]