Code example #1
	def write(self, output_prefix):
		"""
		Output to clusters to a fasta file <output_prefix>.fasta
		>{cluster_index}
		{sequence here}
		
		And to a otu-style file <output_prefix>.otu.txt
		<cluster_index> \t <tab delimited seq IDs>
		
		"""
		w = FastqWriter(output_prefix + '.fq')
		f = open(output_prefix + '.fasta', 'w')
		h = open(output_prefix + '.otu.txt', 'w')
		a = open(output_prefix + '.abundance.txt', 'w')
		for cluster in self.cluster_by_otu.itervalues():
			for o in cluster.itervalues():
				# convert the numeric quality scores to Phred+33 ASCII for FASTQ writing
				o['qual'] = "".join(chr(o['qual'][i]+33) for i in xrange(o['len']))
				o['ID'] = o['cids'][0]
				w.write(o)
				f.write(">{0}\n{1}\n".format(o['ID'], o['seq']))
				h.write("{0}\t{1}\n".format(o['ID'], "\t".join(o['cids'])))
				a.write("{0}\t{1}\n".format(o['ID'], o['size']))
		w.close()
		f.close()
		h.close()
		a.close()
		os.system("gzip " + w.f.name)	
Code example #2
def combine_RF(fotu, rotu, ffastq, rfastq, output_prefix):
	"""
	Reads two OTU files, one for the forward reads and one for the reverse.
	Returns: forward otu cid --> reverse otu cid --> abundance
	"""
	seqid2otu = {}
	combo = {}
	with open(fotu) as f:
		for line in f:
			otu, rest = line.strip().split(None, 1)
			combo[otu] = defaultdict(int)
			for seqid in rest.split():
				if seqid.endswith('/1') or seqid.endswith('/2'):
					seqid = seqid[:-2]
				seqid2otu[seqid] = otu
	with open(rotu) as f:
		for line in f:
			otu2, rest = line.strip().split(None, 1)
			for seqid in rest.split():
				if seqid.endswith('/1') or seqid.endswith('/2'):
					seqid = seqid[:-2]
				if seqid not in seqid2otu:
					print >> sys.stderr, "{0} is missing in forward, ignoring".format(seqid)
					continue
				otu1 = seqid2otu[seqid]
				combo[otu1][otu2] += 1

	# now write this out as <output_prefix>.combined.{1|2}.fq
	seqdict = {}
	for r in FastqReader(rfastq):
		seqdict[r['ID']] = r

	fqw1 = FastqWriter(output_prefix + '.combined.1.fq')
	fqw2 = FastqWriter(output_prefix + '.combined.2.fq')
	fout = open(output_prefix + '.combined.abundance.txt','w')
	for r in FastqReader(ffastq):
		if r['ID'] in combo:
			for id2, abundance in combo[r['ID']].iteritems():
				newid = "{0}_{1}".format(r['ID'], id2)
				fqw1.write(r, id=newid+'/1') 
				fqw2.write(seqdict[id2], id=newid+'/2') 
				#fqw1.write(">{id}\n{seq}\n".format(seq=r.seq, id=newid+'/1'))
				#fqw2.write(">{id}\n{seq}\n".format(seq=seqdict[id2].seq, id=newid+'/2'))
				fout.write("{0}\t{1}\n".format(newid, abundance))
	fqw1.close()
	fqw2.close()
	fout.close()
	return combo
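
To make the returned structure concrete, here is the counting step of combine_RF replayed on two invented OTU tables (the data is made up purely for illustration):

from collections import defaultdict

forward_otus = {'f0': ['read1/1', 'read2/1']}          # forward OTU -> member read IDs
reverse_otus = {'r0': ['read1/2'], 'r7': ['read2/2']}  # reverse OTU -> member read IDs

seqid2otu = {}
combo = {}
for otu, members in forward_otus.items():
    combo[otu] = defaultdict(int)
    for seqid in members:
        seqid2otu[seqid[:-2]] = otu                    # strip the /1 pair suffix
for otu2, members in reverse_otus.items():
    for seqid in members:
        combo[seqid2otu[seqid[:-2]]][otu2] += 1        # strip /2, credit the pairing
assert dict(combo['f0']) == {'r0': 1, 'r7': 1}
# i.e. forward OTU f0 pairs once with reverse OTU r0 and once with r7;
# combine_RF would emit pairs named f0_r0/1, f0_r0/2, f0_r7/1, f0_r7/2.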
Code example #3
def main(fq1, fq2, output_prefix, abundance_filename):
    abundance = {}
    if abundance_filename is None:
        abundance = defaultdict(lambda: 1)
    else:
        with open(abundance_filename) as f:
            for line in f:
                _id, _count = line.strip().split('\t')
                abundance[_id] = int(_count)

    matchf = BowTieWriter(output_prefix + '.overlap.aligned')
    unf1 = FastqWriter(output_prefix + '.overlap.1.unaligned')
    unf2 = FastqWriter(output_prefix + '.overlap.2.unaligned')

    total = 0
    total_expanded = 0
    aligned = 0
    aligned_expanded = 0
    for r1, r2 in FastqReaderPaired(fq1, fq2):
        realid = r1['ID'].split('/')[0]  # strip the /1 or /2 pair suffix safely
        total += 1
        total_expanded += abundance[realid]
        if find_overlap(r1, r2, matchf, unf1, unf2):  #overlap found
            aligned += 1
            aligned_expanded += abundance[realid]

    with open(output_prefix + '.overlap.log', 'w') as f:
        p = aligned * 100. / total
        f.write("# reads processed: {0}\n".format(total))
        f.write(
            "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
            format(aligned, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
            total - aligned, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(aligned))

    with open(output_prefix + '.overlap.log_expanded', 'w') as f:
        p = aligned_expanded * 100. / total_expanded
        f.write("# reads processed: {0}\n".format(total_expanded))
        f.write(
            "# reads with at least one reported alignment: {0} ({1:.2f}%)\n".
            format(aligned_expanded, p))
        f.write("# reads that failed to align: {0} ({1:.2f}%)\n".format(
            total_expanded - aligned_expanded, 100 - p))
        f.write("Reported {0} paired-end alignments to 1 output stream(s)\n".
                format(aligned_expanded))

    matchf.close()
    unf1.close()
    unf2.close()
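
The two logs apply the same arithmetic at different weights: the plain log counts each dereplicated read pair once, while the _expanded log weights each pair by its cluster abundance from the abundance file. A worked example with invented numbers shows how far the two rates can diverge:

abundance = {'a': 1, 'b': 1, 'c': 1, 'd': 97}   # read pair d stands for 97 duplicates
overlapped = ['a', 'b', 'c']                    # d's pair failed to overlap

p_plain = len(overlapped) * 100. / len(abundance)          # 3/4   = 75.00%
p_expanded = (sum(abundance[x] for x in overlapped) * 100.
              / sum(abundance.values()))                   # 3/100 =  3.00%
assert (p_plain, p_expanded) == (75.0, 3.0)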
Code example #4
def consolidate_corrected_clusters(dir, output_prefix):
	"""
	in each cluster <dir>/<cluster_index>
	find the corrected <cluster_index>.errcor.{fq|otu.txt}
	and consolate them into one file
	rename the new seq ids to <cluster_index>_<otu_index>
	"""
	fqw = FastqWriter(output_prefix+'.fq')
	otuw = open(output_prefix+'.otu.txt', 'w')

	for cid in os.listdir(dir):
		d2 = os.path.join(dir, cid)
		with open(os.path.join(d2, cid+'.errcor.otu.txt')) as f:
			for line in f:
				otuw.write("{cid}_{rest}".format(cid=cid, rest=line))
		for r in FastqReader(os.path.join(d2, cid+'.errcor.fq.gz')):
			r['ID'] = cid + '_' + r['ID']
			fqw.write(r)
	
	otuw.close()
	fqw.close()
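
The directory layout the function expects, reconstructed from the paths it builds (the cluster indices 0 and 1 below are placeholders):

    <dir>/
        0/0.errcor.fq.gz        corrected reads for cluster 0
        0/0.errcor.otu.txt      OTU membership for cluster 0
        1/1.errcor.fq.gz
        1/1.errcor.otu.txt
        ...

After consolidation, a read with ID 7 inside cluster 0 is written to <output_prefix>.fq as 0_7, and every line of 0.errcor.otu.txt gains the same 0_ prefix in <output_prefix>.otu.txt.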
Code example #5
def bowtie2fastq(input, revcomp=False):
    output = input + '.fq'
    f = FastqWriter(output)
    for r in BowTieReader(input, False):
        if revcomp:
            r['seq'] = Seq(r['seq']).reverse_complement().tostring()
            r['qual'] = r['qual'][::-1]
        f.write(r)
    f.close()
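
The flip itself is standard Biopython: Seq(...).reverse_complement() is a real Bio.Seq method, and on newer Biopython releases str(seq) replaces the deprecated .tostring() used above. A self-contained illustration with an invented toy record:

from Bio.Seq import Seq

r = {'seq': 'ACGTT', 'qual': 'IIIH#'}
r['seq'] = str(Seq(r['seq']).reverse_complement())   # 'AACGT'
r['qual'] = r['qual'][::-1]                          # quals must flip with the bases
assert (r['seq'], r['qual']) == ('AACGT', '#HIII')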
Code example #6
File: remove_primers.py  Project: cwt1/IlluminaPE
def detect_primers_PE(input1, input2, output_prefix, f_primer, r_primer, min_match_len, max_mm, max_de, max_in):
	"""
	NOTE: this is for paired-end reads that come in two separate files
	ex: DS19342_CTTGTA_L006_R1_001.fastq.gz and DS19342_CTTGTA_L006_R2_001.fastq.gz
	
	Given a pair of reads from input1, input2:
	1. Detect that F primer exists in one read and R primer in the other
	2. If both reads pass primer detection, output
	3. Otherwise, discard
	
	Output:  <output_prefix>.{F|R}primer_good
	         <output_prefix>.primer_bad.{1|2}
	         <output_prefix>.primer.verbose
	         <output_prefix>.primer.log
	"""
	def process_primer(r, match_len, is_reverse):
		# get record into miscBowTie.BowTieReader format 
		# strip away primers from seq & qual, properly rev comp!
		r['offset'] = match_len
		r['seq'] = r['seq'][match_len:]
		r['qual'] = r['qual'][match_len:]
		r['ref'] = 'NA'
		if is_reverse:
			r['seq'] = Seq(r['seq']).reverse_complement().tostring()
			r['qual'] = r['qual'][::-1]
	
	os.system("rm {0}.*primer_*".format(output_prefix))
	Fgood = BowTieWriter(output_prefix + '.Fprimer_good')
	Rgood = BowTieWriter(output_prefix + '.Rprimer_good')
	hbad1 = FastqWriter(output_prefix + '.primer_bad.1')
	hbad2 = FastqWriter(output_prefix + '.primer_bad.2')
	hverbose = open(output_prefix + '.primer.verbose', 'w')
	hlog = open(output_prefix + '.primer.log', 'w')
	start_t = time.time()
	good, bad = 0,0
	
	pmF = PrimerMatch(f_primer)
	pmR = PrimerMatch(r_primer)

	for r1, r2 in itertools.izip(FastqReader(input1), FastqReader(input2)):
		# NOTE: in the case of PE reads
		#       regardless of whether we're matching for F or R primer
		#       they would all appear at the 5' end of the read
		#       which is why we call match_primer_len with is_reverse = False
		match_f_len1, mmf1 = match_primer_len(r1['seq'], f_primer, max_mm, min_match_len, False)
		match_r_len1, mmr1 = match_primer_len(r1['seq'], r_primer, max_mm, min_match_len, False)
		match_f_len2, mmf2 = match_primer_len(r2['seq'], f_primer, max_mm, min_match_len, False)
		match_r_len2, mmr2 = match_primer_len(r2['seq'], r_primer, max_mm, min_match_len, False)
		#match_f_len1 = match_f_len2 =match_r_len1=match_r_len2=0
		if match_f_len1 > 0 and match_r_len2 > 0:
			# case 1, read 1 is F, read 2 is R
			good += 1
			process_primer(r1, match_f_len1, False)
			Fgood.write(r1)
			process_primer(r2, match_r_len2, False)
			Rgood.write(r2)
		elif match_f_len2 > 0 and match_r_len1 > 0:
			# case 2, read 1 is R, read 2 is F
			good += 1
			process_primer(r2, match_f_len2, False)
			Fgood.write(r2)
			process_primer(r1, match_r_len1, False)
			Rgood.write(r1)
		else:
			pmF.make_suffix(r1['seq'])
			pmF.match(min_match_len, max_mm, max_in, max_de)
			if pmF.match_result is not None: 
				pmR.make_suffix(r2['seq'])
				pmR.match(min_match_len, max_mm, max_in, max_de)
				if pmR.match_result is not None:  # case 1, read 1 is F, read 2 is R
					good += 1
					process_primer(r1, pmF.match_result.match_len, False)
					Fgood.write(r1)
					hverbose.write("{0}\t{1}\t{2}\n".format(r1['ID'], pmF.match_result.match_len, pmF.match_result.miss))
					process_primer(r2, pmR.match_result.match_len, False)
					Rgood.write(r2)
					hverbose.write("{0}\t{1}\t{2}\n".format(r2['ID'], pmR.match_result.match_len, pmR.match_result.miss))
				else:
					hbad1.write(r1)
					hbad2.write(r2)
					bad += 1
			else:
				pmR.make_suffix(r1['seq'])
				pmR.match(min_match_len, max_mm, max_in, max_de)
				if pmR.match_result is not None:
					pmF.make_suffix(r2['seq'])
					pmF.match(min_match_len, max_mm, max_in, max_de)
					if pmF.match_result is not None:
						good += 1
						# case 2, read 1 is R, read 2 is F
						process_primer(r2, pmF.match_result.match_len, False)
						hverbose.write("{0}\t{1}\t{2}\n".format(r2['ID'], pmF.match_result.match_len, pmF.match_result.miss))
						Fgood.write(r2)
						process_primer(r1, pmR.match_result.match_len, False)
						Rgood.write(r1)
						hverbose.write("{0}\t{1}\t{2}\n".format(r1['ID'], pmR.match_result.match_len, pmR.match_result.miss))
					else:
						# case 3: unresolved, bad read pair
						hbad1.write(r1)
						hbad2.write(r2)
						bad += 1
				else:
					# neither primer matched read 1: unresolved, bad read pair
					hbad1.write(r1)
					hbad2.write(r2)
					bad += 1

	hlog.write("Input 1: {0}\nInput 2: {1}\n".format(input1, input2))
	hlog.write("F primer: {0}\nR primer: {1}\n".format(f_primer, r_primer))
	hlog.write("Min match len: {0}\n".format(min_match_len))
	hlog.write("Max mismatch: {0}\n".format(max_mm))
	hlog.write("Max deletion: {0}\n".format(max_de))
	hlog.write("Max insertion: {0}\n".format(max_in))
	hlog.write("Primer detection and removal took {0} sec.\n".format(time.time()-start_t))
	hlog.write("# of original reads: {0}\n".format(good+bad))
	hlog.write("# of reads removed: {0} ({1:.2f})\n".format(bad,bad*1./(good+bad)))
	hlog.write("# of reads remaining: {0} ({1:.2f})\n".format(good,good*1./(good+bad)))


	Fgood.close()
	Rgood.close()
	hbad1.close()
	hbad2.close()
	hlog.close()
	hverbose.close()
	os.system("gzip " + Fgood.f.name)
	os.system("gzip " + Rgood.f.name)
	os.system("gzip " + hbad1.f.name)
	os.system("gzip " + hbad2.f.name)
	os.system("gzip " + hverbose.name)