def best_match(SEQ1, LIST, MAX=float("inf"), IGNORE_N=0, PRINT=0):
    """Return the index of the entry in LIST that best matches SEQ1.

    An exact match is looked for first; the index of the first one found is
    returned immediately.  Otherwise every entry is scored with
    ``BC.mismatches`` and the index of the lowest-scoring entry is returned
    (the earliest one when several entries tie).

    Args:
        SEQ1: The query sequence.
        LIST: Indexable collection of candidate sequences.
        MAX: Mismatch ceiling.  Only candidates scoring strictly below it
            can be returned.  It is handed to ``BC.mismatches`` as its third
            argument and tightened every time a better candidate is found;
            lowering MAX increases performance.
        IGNORE_N: Forwarded unchanged to ``BC.mismatches``; 1 ignores
            mismatches with N.
        PRINT: When equal to 1, print the query, the matched sequence and
            the "index mismatches" pair before returning a fuzzy match.

    Returns:
        The index into LIST of the best match, or -1 when no entry scores
        below MAX (including when LIST is empty).
    """
    # An exact hit wins outright; take the first one in list order.
    for index, candidate in enumerate(LIST):
        if SEQ1 == candidate:
            return index

    # No exact hit: scan for the candidate with the fewest mismatches.
    # NOTE(review): ``BC`` is expected to be bound at module level, as in
    # the original code -- confirm the file imports it.
    best_index = -1
    best_count = MAX
    for index, candidate in enumerate(LIST):
        count = BC.mismatches(SEQ1, candidate, best_count, IGNORE_N)
        if count < best_count:
            # Strict comparison keeps the earliest candidate on ties.
            best_index = index
            best_count = count
        if count == 0:
            break  # cannot be improved upon

    if best_index < 0:
        return -1

    if PRINT == 1:
        print(SEQ1)
        print(LIST[best_index])
        print("%s %s" % (best_index, best_count))
    return best_index
def _clip_lintag(tag, front_clipper, rear_clipper):
    """Trim a lineage tag to the span between its front and rear anchors.

    Everything up to and including the first ``front_clipper`` match is
    dropped.  ``rear_clipper`` is searched in the REVERSED tag, and
    everything from the far edge of that match to the end of the tag is
    dropped.

    Raises:
        AttributeError: if either pattern does not match (``re.search``
            returns None), exactly as the inline code this replaces did.
    """
    import re

    start = re.search(front_clipper, tag).span()[1]
    # The match end in the reversed tag is the number of bases to cut off
    # the right-hand side, hence the negative slice bound.
    end = -re.search(rear_clipper, tag[::-1]).span()[1]
    if end == 0:
        end = len(tag)
    return tag[start:end]


def parse_fastq_by_multitag(directory,
                            f_gzipped_fastqfile,
                            r_gzipped_fastqfile,
                            q="fastq",
                            f_seqtag_length=8,
                            r_seqtag_length=8,
                            f_multitag_length=6,
                            r_multitag_length=6,
                            f_lintag_length=38,
                            r_lintag_length=38,
                            f_spacer_length=43,  # distance to the first barcode in the forward read (ignoring the length of the multitag and the seqtag)
                            r_spacer_length=29,  # distance to the second barcode in the reverse read (ignoring the length of the multitag and the seqtag)
                            min_qs=30,  # the minimum average quality score for both lineage tags
                            lintag_grep_filter1=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?AA\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # first barcode
                            lintag_grep_filter2=r'\D*?(.ACC|T.CC|TA.C|TAC.)\D{4,7}?AA\D{4,7}?TT\D{4,7}?TT\D{4,7}?(.TAA|A.AA|AT.A|ATA.)\D*',  # second barcode
                            clip_ends=1,  # whether to clip the front and back ends off lintag1 and lintag2
                            lintag1_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag1 after this sequence
                            lintag2_front_clipper='(.ACC|T.CC|TA.C|TAC.)',  # only report lintag2 after this sequence
                            lintag1_rear_clipper='(.TAA|A.AA|AT.A|ATA.)',  # only report lintag1 before this sequence; matched against the REVERSED tag
                            lintag2_rear_clipper='(.TAA|A.AA|AT.A|ATA.)',  # only report lintag2 before this sequence; matched against the REVERSED tag
                            multitags=["TAGCTTGCGTAC", "CGATGTGAGACG"],  # concatenated multiplexing tags from the first and second reads that uniquely identify a sample, currently must have 2 or more multitags (never mutated here)
                            write_multitags=False):  # write multitags to file
    """Split paired gzipped FASTQ reads into per-sample tag files.

    The forward and reverse files are read in lockstep.  Each read is laid
    out as seqtag | multitag | spacer | lintag, using the ``f_*``/``r_*``
    lengths.  A read pair is kept only when

    * the mean phred quality over each lineage tag is greater than
      ``min_qs``, and
    * ``BC.grep`` accepts the forward lineage tag against
      ``lintag_grep_filter1`` and the reverse one against
      ``lintag_grep_filter2``.

    For a kept pair the forward and reverse multitags are concatenated and
    looked up with ``BC.best_match``.  When a multitag is found whose
    ``BC.mismatches`` distance is below
    ``(f_multitag_length + r_multitag_length + 1) // 4`` the pair is written
    to that multitag's files, otherwise to the ``unmatched_*`` files.

    Files written inside ``directory`` (one line per read pair):

    * ``<multitag>_seqtag.txt``, ``<multitag>_lintag1.txt``,
      ``<multitag>_lintag2.txt`` and, if ``write_multitags`` is true,
      ``<multitag>_multitag.txt`` for every entry of ``multitags``;
    * ``unmatched_seqtag.txt``, ``unmatched_lintag1.txt``,
      ``unmatched_lintag2.txt`` and ``unmatched_multitag.txt``.

    Side effects: changes the process working directory to ``directory``
    and prints progress messages.  Returns None.

    Raises:
        AttributeError: when ``clip_ends`` is true and a clipper pattern
            does not match a lineage tag that passed the filters.
    """
    from Bio import SeqIO
    import os
    import sys
    import gzip
    import numpy
    import BC
    try:
        from itertools import izip as zip_reads  # Python 2: lazy pairing
    except ImportError:
        zip_reads = zip  # Python 3: zip is already lazy

    os.chdir(directory)
    # Resolve the directory once, after the chdir, so that relative paths
    # and paths without a trailing separator still address files inside it.
    directory = os.getcwd()

    print("Loading " + f_gzipped_fastqfile + " and " + r_gzipped_fastqfile + " and parsing")
    print("Saving the combined forward and reverse sequencing tags as seqtag.txt")
    print("Saving the combined forward and reverse multiplexing tags as multitag.txt")
    print("Saving the first lineage tag as lintag1.txt")
    print("Saving the second lineage tag as lintag2.txt")

    # Slice boundaries within each read: seqtag | multitag | spacer | lintag.
    f_seq_end = f_seqtag_length
    f_multi_end = f_seq_end + f_multitag_length
    f_lin_start = f_multi_end + f_spacer_length
    f_lin_end = f_lin_start + f_lintag_length

    r_seq_end = r_seqtag_length
    r_multi_end = r_seq_end + r_multitag_length
    r_lin_start = r_multi_end + r_spacer_length
    r_lin_end = r_lin_start + r_lintag_length

    # Floor division keeps the integer thresholds the Python 2 "/" produced.
    multitag_length = f_multitag_length + r_multitag_length
    search_limit = (multitag_length + 1) // 3
    accept_limit = (multitag_length + 1) // 4

    # NOTE(review): Bio.SeqIO is expected to want text-mode handles; Python 3
    # spells that "rt" for gzip, Python 2 keeps the historical "rU".
    read_mode = "rt" if sys.version_info[0] >= 3 else "rU"

    open_handles = []  # every handle opened below, closed in ``finally``

    def open_output(name):
        handle = open(os.path.join(directory, name), 'w')
        open_handles.append(handle)
        return handle

    try:
        # Output files for reads that sort to a multiplexing tag.
        sorted_out = {}
        for tag in multitags:
            if tag in sorted_out:
                continue  # a repeated multitag shares one set of files
            files = {
                'seqtag': open_output(tag + '_seqtag.txt'),
                'lintag1': open_output(tag + '_lintag1.txt'),
                'lintag2': open_output(tag + '_lintag2.txt'),
            }
            if write_multitags:
                files['multitag'] = open_output(tag + '_multitag.txt')
            sorted_out[tag] = files

        # Output files for reads that do not sort to a multiplexing tag;
        # the observed multitag is always recorded for these.
        unmatched_out = {
            'seqtag': open_output('unmatched_seqtag.txt'),
            'lintag1': open_output('unmatched_lintag1.txt'),
            'lintag2': open_output('unmatched_lintag2.txt'),
            'multitag': open_output('unmatched_multitag.txt'),
        }

        f_handle = gzip.open(os.path.join(directory, f_gzipped_fastqfile), read_mode)
        open_handles.append(f_handle)
        r_handle = gzip.open(os.path.join(directory, r_gzipped_fastqfile), read_mode)
        open_handles.append(r_handle)
        f_file = SeqIO.parse(f_handle, q)
        r_file = SeqIO.parse(r_handle, q)

        quality_reads = 0
        total_reads = 0
        for f, r in zip_reads(f_file, r_file):
            fq = f.letter_annotations["phred_quality"]
            rq = r.letter_annotations["phred_quality"]
            total_reads = total_reads + 1

            # Both lineage tags must clear the mean quality threshold.
            if not (numpy.mean(fq[f_lin_start:f_lin_end]) > min_qs
                    and numpy.mean(rq[r_lin_start:r_lin_end]) > min_qs):
                continue

            fr = str(f.seq)
            rr = str(r.seq)
            ftag = fr[f_lin_start:f_lin_end]
            rtag = rr[r_lin_start:r_lin_end]

            # Both lineage tags must meet the regular expression filter.
            if not (BC.grep(ftag, lintag_grep_filter1)
                    and BC.grep(rtag, lintag_grep_filter2)):
                continue
            quality_reads = quality_reads + 1

            # Find the closest known multitag for the concatenated tag.
            m = fr[f_seq_end:f_multi_end] + rr[r_seq_end:r_multi_end]
            j = BC.best_match(m, multitags, MAX=search_limit)
            matched = j > -1 and BC.mismatches(m, multitags[j]) < accept_limit

            if clip_ends:
                ftag = _clip_lintag(ftag, lintag1_front_clipper, lintag1_rear_clipper)
                rtag = _clip_lintag(rtag, lintag2_front_clipper, lintag2_rear_clipper)

            out = sorted_out[multitags[j]] if matched else unmatched_out
            out['lintag1'].write(ftag + '\n')
            out['lintag2'].write(rtag + '\n')
            out['seqtag'].write(fr[0:f_seq_end] + rr[0:r_seq_end] + '\n')
            if 'multitag' in out:
                out['multitag'].write(m + '\n')

        print(str(quality_reads) + " out of " + str(total_reads) + " reads passed grep and quality filters")
    finally:
        # Close everything that was opened, even if parsing failed midway.
        for handle in open_handles:
            handle.close()