# Example no. 1
# 0
def best_match(SEQ1, LIST, MAX = float("inf"), IGNORE_N = 0, PRINT = 0 ):
	"""Return the index in LIST of the sequence that best matches SEQ1.

	An exact match (``SEQ1 == LIST[i]``) wins immediately and the index of
	the first one is returned. Otherwise every entry is scored with
	``BC.mismatches`` and the index of the entry with the fewest mismatches
	is returned; on a tie the earliest entry wins.

	MAX is the mismatch cutoff: only entries scoring strictly below MAX are
	candidates. The running best score is handed to ``BC.mismatches`` as its
	third argument (presumably so it can stop counting early -- lowering MAX
	is said to increase performance).
	IGNORE_N is passed through to ``BC.mismatches``; IGNORE_N = 1 is meant to
	ignore mismatches with N.
	PRINT = 1 prints SEQ1, the best matching sequence, and its index and
	mismatch count when a non-exact match is found.

	Returns -1 when no entry scores below MAX (including an empty LIST).
	"""
	# first search for an exact match; this never needs BC.mismatches
	for i, candidate in enumerate(LIST):
		if SEQ1 == candidate:
			return i

	# no exact match: keep the entry with the strictly lowest mismatch count
	best_index = -1
	fewest = MAX
	for i, candidate in enumerate(LIST):
		count = BC.mismatches(SEQ1, candidate, fewest, IGNORE_N)
		if count < fewest:
			fewest = count
			best_index = i
		if count == 0:
			# cannot do better than zero mismatches
			break

	if best_index > -1 and PRINT == 1:
		print(SEQ1)
		print(LIST[best_index])
		print(str(best_index) + " " + str(fewest))
	return best_index
# Example no. 2
# 0
def _clip_lintag(tag, front_clipper, rear_clipper):
	"""Return tag trimmed to the region between the two clipper matches.

	front_clipper and rear_clipper are compiled regular expressions.
	Everything up to and including the first front_clipper match is dropped.
	rear_clipper is searched for in the REVERSED tag, and everything from the
	start of that match (in forward orientation) to the end of the tag is
	dropped.

	Raises AttributeError if either pattern is not found.
	"""
	start = front_clipper.search(tag).span()[1]
	# the match end in the reversed string, negated, is a slice end counted
	# from the back of the forward string
	end = -rear_clipper.search(tag[::-1]).span()[1]
	if end == 0:
		end = len(tag)
	return tag[start:end]


def parse_fastq_by_multitag(directory, f_gzipped_fastqfile, r_gzipped_fastqfile,
		q = "fastq",
		f_seqtag_length = 8,
		r_seqtag_length = 8,
		f_multitag_length = 6,
		r_multitag_length = 6,
		f_lintag_length = 38,
		r_lintag_length = 38,
		f_spacer_length = 43, #distance to first barcode in forward read (ignoring the length of the multitag and the seqtag)
		r_spacer_length = 29, #distance to second barcode in reverse read (ignoring the length of the multitag and the seqtag)
		min_qs = 30, #the minimum average quality score for both lineage tags
		lintag_grep_filter1 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #first barcode
		lintag_grep_filter2 = r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*', #second barcode
		clip_ends = 1, #logical of whether or not to clip the front and back ends off of lintag1 and lintag2
		lintag1_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag1 after this sequence
		lintag2_front_clipper = '(.ACC|T.CC|TA.C|TAC.)', #only report lintag2 after this sequence
		lintag1_rear_clipper = '(.TAA|A.AA|AT.A|ATA.)', #only report lintag1 before this sequence; it is searched for in the REVERSED tag
		lintag2_rear_clipper = '(.TAA|A.AA|AT.A|ATA.)', #only report lintag2 before this sequence; it is searched for in the REVERSED tag
		multitags = ["TAGCTTGCGTAC", "CGATGTGAGACG"], #concatenated multiplexing tags from the first and second reads that uniquely identify a sample, currently must have 2 or more multitags
		write_multitags = False): #write multitags to file

	"""
	Parse paired forward and reverse gzipped FastQ files and sort the reads
	by multiplexing tag.

	Each read is sliced by position as
	[seqtag][multitag][spacer][lintag], using the f_*/r_* length arguments.
	The forward and reverse reads are walked in lockstep. A read pair is kept
	only if

	* the mean phred quality over each lineage tag window is greater than
	  min_qs, and
	* BC.grep accepts the forward window with lintag_grep_filter1 and the
	  reverse window with lintag_grep_filter2.

	For a kept pair the forward and reverse multitags are concatenated and
	looked up with BC.best_match. The pair is assigned to that multitag when
	BC.mismatches reports fewer than (combined multitag length + 1) // 4
	mismatches; otherwise it is treated as unmatched.

	Output, one line per kept read pair, written inside directory:

	* <multitag>_seqtag.txt, <multitag>_lintag1.txt, <multitag>_lintag2.txt
	  (and <multitag>_multitag.txt when write_multitags is true) for each
	  entry of multitags;
	* unmatched_seqtag.txt, unmatched_lintag1.txt, unmatched_lintag2.txt and
	  unmatched_multitag.txt for pairs that match no multitag.

	When clip_ends is true each lineage tag is trimmed with the corresponding
	front/rear clipper patterns before it is written.

	Side effects: changes the working directory to directory and prints
	progress messages. multitags is only read, never modified.

	Raises AttributeError if clip_ends is true and a clipper pattern is not
	found in a lineage tag that passed the filters.
	"""

	from Bio import SeqIO
	import os
	import gzip
	import numpy
	import BC
	import re
	from itertools import izip

	# Resolve before chdir so a relative directory is not applied twice when
	# the file paths below are built from it.
	directory = os.path.abspath(directory)
	os.chdir(directory)
	print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
	print( "Saving the combined forward and reverse sequencing tags as seqtag.txt")
	print( "Saving the combined forward and reverse multiplexing tags  as multitag.txt")
	print( "Saving the first lineage tag as lintag1.txt")
	print( "Saving the second lineage tag as lintag2.txt")

	#assign boundaries: [seqtag][multitag][spacer][lintag]
	f_seqtag = slice(0, f_seqtag_length)
	f_multitag = slice(f_seqtag.stop, f_seqtag.stop + f_multitag_length)
	f_lintag_start = f_multitag.stop + f_spacer_length
	f_lintag = slice(f_lintag_start, f_lintag_start + f_lintag_length)
	r_seqtag = slice(0, r_seqtag_length)
	r_multitag = slice(r_seqtag.stop, r_seqtag.stop + r_multitag_length)
	r_lintag_start = r_multitag.stop + r_spacer_length
	r_lintag = slice(r_lintag_start, r_lintag_start + r_lintag_length)

	# Mismatch limits scale with the combined multitag length. Floor division
	# keeps the integer thresholds explicit.
	multitag_length = f_multitag_length + r_multitag_length
	search_cutoff = (multitag_length + 1) // 3
	accept_cutoff = (multitag_length + 1) // 4

	# compile the clipper patterns once instead of once per read
	lintag1_front = re.compile(lintag1_front_clipper)
	lintag1_rear = re.compile(lintag1_rear_clipper)
	lintag2_front = re.compile(lintag2_front_clipper)
	lintag2_rear = re.compile(lintag2_rear_clipper)

	handles = [] # every file opened below, so all of them get closed on any exit

	def open_output(name):
		handle = open(os.path.join(directory, name), 'w')
		handles.append(handle)
		return handle

	quality_reads = 0
	total_reads = 0
	try:
		#open files for writing
		#reads that sort to a multiplexing tag
		matched = {}
		for tag in multitags:
			outputs = {
				'seqtag': open_output(tag + '_seqtag.txt'),
				'lintag1': open_output(tag + '_lintag1.txt'),
				'lintag2': open_output(tag + '_lintag2.txt'),
			}
			if write_multitags:
				outputs['multitag'] = open_output(tag + '_multitag.txt')
			matched[tag] = outputs

		#reads that do not sort to a multiplexing tag
		unmatched = {
			'seqtag': open_output('unmatched_seqtag.txt'),
			'lintag1': open_output('unmatched_lintag1.txt'),
			'lintag2': open_output('unmatched_lintag2.txt'),
			'multitag': open_output('unmatched_multitag.txt'),
		}

		#open files for reading by SeqIO
		f_handle = gzip.open(os.path.join(directory, f_gzipped_fastqfile), "rU")
		handles.append(f_handle)
		r_handle = gzip.open(os.path.join(directory, r_gzipped_fastqfile), "rU")
		handles.append(r_handle)

		#eliminate low quality reads and reads that don't pass the grep filter,
		#optionally clip off ends of lintags, then sort by multiplexing tag
		for f, r in izip(SeqIO.parse(f_handle, q), SeqIO.parse(r_handle, q)):
			total_reads = total_reads + 1
			fq = f.letter_annotations["phred_quality"]
			rq = r.letter_annotations["phred_quality"]
			#checks that the quality scores of forward and reverse lintags are OK
			if not (numpy.mean(fq[f_lintag]) > min_qs and numpy.mean(rq[r_lintag]) > min_qs):
				continue

			fr = str(f.seq)
			rr = str(r.seq)
			ftag = fr[f_lintag]
			rtag = rr[r_lintag]
			#checks that both lineage tags meet the regular expression filter
			if not (BC.grep(ftag, lintag_grep_filter1) and BC.grep(rtag, lintag_grep_filter2)):
				continue
			quality_reads = quality_reads + 1 #both lintags passed the quality and grep filters

			#next, find the closest matching multitag
			m = fr[f_multitag] + rr[r_multitag] #the concatenated multiplexing tag
			j = BC.best_match(m, multitags, MAX = search_cutoff) #best matched multiplexing tag
			# best_match may signal "nothing found" with -1 or None
			if j is not None and j > -1 and BC.mismatches(m, multitags[j]) < accept_cutoff:
				outputs = matched[multitags[j]]
			else:
				outputs = unmatched

			if clip_ends:
				ftag = _clip_lintag(ftag, lintag1_front, lintag1_rear)
				rtag = _clip_lintag(rtag, lintag2_front, lintag2_rear)

			outputs['lintag1'].write(ftag + '\n')
			outputs['lintag2'].write(rtag + '\n')
			outputs['seqtag'].write(fr[f_seqtag] + rr[r_seqtag] + '\n')
			# always present for unmatched reads, optional for matched ones
			if 'multitag' in outputs:
				outputs['multitag'].write(m + '\n')

		print ( str(quality_reads) + " out of " + str(total_reads) +" reads passed grep and quality filters")
	finally:
		for handle in handles:
			handle.close()