def primerStrip(file, GoodOut, BadOut, fwdprimer, revprimer):
    """Split a FASTQ file on presence of the forward primer and trim primers.

    A read whose prefix matches ``fwdprimer`` with at most
    ``args.primer_mismatch`` differences (as reported by
    ``primer.MatchPrefix``) has the first ``len(fwdprimer)`` bases and
    qualities removed and is written to ``GoodOut``.  Every other read is
    written unchanged to ``BadOut``.

    When ``revprimer`` is truthy, the reverse complement of ``revprimer`` is
    searched for in the trimmed read with ``primer.BestMatch2``; if a position
    greater than zero is reported, the read is truncated at that position.

    Args:
        file: path of the input FASTQ file.
        GoodOut: path of the FASTQ file that receives primer-trimmed reads.
        BadOut: path of the FASTQ file that receives reads lacking the primer.
        fwdprimer: forward primer sequence.
        revprimer: reverse primer sequence; pass a falsy value to skip the
            reverse-primer search.
    """
    PL = len(fwdprimer)
    # The reverse complement does not depend on the read: compute it once.
    rev_target = revcomp_lib.RevComp(revprimer) if revprimer else None
    fastq_record = "@%s\n%s\n+\n%s\n"
    # The input is opened in the same ``with`` so that its handle is closed
    # too (the outputs are still opened first, as before).
    with open(GoodOut, 'w') as good, open(BadOut, 'w') as bad, \
            open(file) as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            Diffs = primer.MatchPrefix(seq, fwdprimer)
            if Diffs > args.primer_mismatch:
                # forward primer not found: keep the read untouched
                bad.write(fastq_record % (title, seq, qual))
                continue
            Seq = seq[PL:]
            Qual = qual[PL:]
            if revprimer:
                # now need to look for reverse primer
                BestPosRev, BestDiffsRev = primer.BestMatch2(
                    Seq, rev_target, args.primer_mismatch)
                if BestPosRev > 0:  # reverse primer was found
                    Seq = Seq[:BestPosRev]
                    Qual = Qual[:BestPosRev]
            good.write(fastq_record % (title, Seq, Qual))
if Barcode == "": #if not found, move onto next record continue BarcodeLength = len(Barcode) seq = seq[BarcodeLength:] qual = qual[BarcodeLength:] #look for forward primer if args.require_primer != 'off': #means we only want ones with forward primer and or reverse Diffs = primer.MatchPrefix(seq, FwdPrimer) if Diffs > args.primer_mismatch: continue #if found, trim away primer seq = seq[trim:] qual = qual[trim:] if args.require_primer == 'both': #look for reverse primer, strip if found BestPosRev, BestDiffsRev = primer.BestMatch2( seq, ReverseCompRev, args.primer_mismatch) if BestPosRev > 0: seq = seq[:BestPosRev] qual = qual[:BestPosRev] else: continue #check size if len( seq ) < args.min_len: #filter out sequences less than minimum length. continue runningTotal += 1 fileout = os.path.join(args.out, BarcodeLabel) with open(fileout, 'ab') as output: output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) if args.require_primer == 'off':
def processRead(input):
    """Demultiplex one FASTQ file by barcode and trim primers from each read.

    Writes two files into ``tmpdir``, both named after the part of the input
    basename before its first ``.``:

    * ``<base>.demux.fq`` - accepted reads, renamed to
      ``R_<n>;barcodelabel=<label>;``.
    * ``<base>.stats`` - one CSV line with the counters
      ``Total, NoBarcode, NoPrimer, RevPrimerFound, NoRevBarcode, TooShort,
      ValidSeqs``.

    Relies on the module-level names ``FwdPrimer``, ``RevPrimer``,
    ``Barcodes``, ``RevBarcodes``, ``tmpdir`` and ``args``.

    Args:
        input: path of the FASTQ file to process.
    """
    base = os.path.basename(input).split('.')[0]
    PL = len(FwdPrimer)
    RL = len(RevPrimer)
    DemuxOut = os.path.join(tmpdir, base+'.demux.fq')
    StatsOut = os.path.join(tmpdir, base+'.stats')
    Total = 0
    NoBarcode = 0
    NoRevBarcode = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    # The input handle is part of the ``with`` so it is closed on exit.
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out, \
            open(input) as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            Total += 1
            # look for barcode, trim it off
            Barcode, BarcodeLabel = FindBarcode(seq, Barcodes)
            if Barcode == "":
                NoBarcode += 1
                continue
            BarcodeLength = len(Barcode)
            Seq = seq[BarcodeLength:]
            Qual = qual[BarcodeLength:]
            # now search for forward primer directly after the barcode
            Diffs = primer.MatchPrefix(Seq, FwdPrimer)
            if Diffs > args.primer_mismatch:
                NoPrimer += 1
                continue
            ForTrim = PL
            # now search for reverse primer
            BestPosRev, BestDiffsRev = primer.BestMatch2(
                Seq, RevPrimer, args.primer_mismatch)
            if BestPosRev > 0:  # reverse primer was found
                RevPrimerFound += 1
                # location to trim sequences
                RevTrim = BestPosRev
                # determine reverse barcode from what follows the primer
                if args.reverse_barcode:
                    BCcut = BestPosRev + RL
                    CutSeq = Seq[BCcut:]
                    RevBarcode, RevBarcodeLabel = FindBarcode(
                        CutSeq, RevBarcodes)
                    if RevBarcode == "":
                        NoRevBarcode += 1
                        continue
                    BarcodeLabel = BarcodeLabel+'_'+RevBarcodeLabel
                # now trim record: remove forward and reverse primers
                Seq = Seq[ForTrim:RevTrim]
                Qual = Qual[ForTrim:RevTrim]
                # since found reverse primer, now also need to pad/trim
                if not args.full_length:
                    # check minimum length here or primer dimer type
                    # sequences will get padded with Ns
                    if len(Seq) < int(args.min_len):
                        TooShort += 1
                        continue
                    if len(Seq) < args.trim_len and args.pad == 'on':
                        pad = args.trim_len - len(Seq)
                        Seq = Seq + pad*'N'
                        Qual = Qual + pad*'J'
                    else:
                        # slicing is a no-op for reads already short enough
                        Seq = Seq[:args.trim_len]
                        Qual = Qual[:args.trim_len]
            else:
                # did not find reverse primer
                if args.full_length:
                    # if full length then move to next record
                    continue
                # trim away forward primer
                Seq = Seq[ForTrim:]
                Qual = Qual[ForTrim:]
                # throw away if shorter than trim length as it was bad read
                if len(Seq) < args.trim_len:
                    TooShort += 1
                    continue
                Seq = Seq[:args.trim_len]
                Qual = Qual[:args.trim_len]
            # check minimum length
            if len(Seq) < int(args.min_len):
                TooShort += 1
                continue
            ValidSeqs += 1
            # rename header
            Name = 'R_'+str(ValidSeqs)+';barcodelabel='+BarcodeLabel+';'
            out.write("@%s\n%s\n+\n%s\n" % (Name, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i,%i,%i\n' % (
            Total, NoBarcode, NoPrimer, RevPrimerFound, NoRevBarcode,
            TooShort, ValidSeqs))
def _best_fuzzy_barcode(candidates, target, max_mismatch):
    """Return the best ``amptklib.fuzzymatch`` hit among barcode candidates.

    ``candidates`` yields ``(label, barcode)`` pairs.  The result is the list
    ``[label, barcode, a0, a1, a2]`` where ``a0..a2`` are the first three
    items of the winning alignment, chosen as the first candidate with the
    strictly largest ``a0``.  When nothing matches, the label slot is ``None``.
    """
    best = [None, None, 0, None, None]
    for label, barcode in candidates:
        alignment = amptklib.fuzzymatch(barcode, target, max_mismatch)
        if alignment and alignment[0] > best[2]:
            best = [label, barcode, alignment[0], alignment[1], alignment[2]]
    return best


def _pad_record(rec, target_len):
    """Pad ``rec`` in place to ``target_len`` with 'N' bases at quality 40."""
    missing = target_len - len(rec.seq)
    padded = str(rec.seq) + missing * 'N'
    quals = rec.letter_annotations["phred_quality"]
    quals.extend([40] * missing)
    # The quality annotation is removed before the sequence is replaced and
    # re-attached afterwards, in the same order as the original code.
    del rec.letter_annotations["phred_quality"]
    rec.seq = padded
    rec.letter_annotations["phred_quality"] = quals
    return rec


def ProcessReads(records):
    """Yield demultiplexed, primer-trimmed records from ``records``.

    For each record the barcode is looked up with ``FindBarcode`` and, when
    that fails and ``args.barcode_mismatch > 0``, with a fuzzy search.  The
    forward primer must be reported by ``primer.BestMatch2`` at a position
    ``p`` with ``0 < p <= BarcodeLength + 2``; otherwise the record is
    skipped.  The global ``OutCount`` is incremented for every record that
    passes the forward-primer check, whether or not it is finally yielded.

    If the reverse primer is found, the record is trimmed with ``TrimRead``,
    dropped when shorter than ``MinLen`` and, unless ``args.full_length`` is
    set, padded or truncated to ``TrimLen``.  If it is not found, the record
    is only yielded when ``args.full_length`` is unset and the trimmed read
    is at least ``TrimLen`` long (truncated to ``TrimLen``).

    Relies on the module-level names ``Barcodes``, ``RevBarcodes``,
    ``FwdPrimer``, ``RevPrimer``, ``PL``, ``RL``, ``MinLen``, ``TrimLen``,
    ``MAX_PRIMER_MISMATCHES`` and ``args``.
    """
    global OutCount
    for rec in records:
        # convert to string for processing
        Seq = str(rec.seq)

        # look for barcodes
        Barcode, BarcodeLabel = FindBarcode(Seq, Barcodes)
        if Barcode == "":
            # if not found, try to find with mismatches
            if not (args.barcode_mismatch > 0):
                continue
            # Barcodes maps label -> barcode sequence
            hit = _best_fuzzy_barcode(
                Barcodes.items(), Seq, args.barcode_mismatch)
            if hit[0] is None:
                continue
            BarcodeLength = hit[4] - hit[3]  # might be shorter than actual barcode
            BarcodeLabel = hit[0]
        else:
            # barcode was found from dictionary
            BarcodeLength = len(Barcode)

        # now look for primer, if not found, move onto next record
        BestPosFor, BestDiffsFor = primer.BestMatch2(
            Seq, FwdPrimer, MAX_PRIMER_MISMATCHES)
        # if found will be > 0, and should be found after barcode
        if not (BestPosFor > 0 and BestPosFor <= BarcodeLength + 2):
            continue
        ForTrim = BestPosFor + PL

        # counter for numbering reads
        OutCount += 1

        # look for reverse primer
        BestPosRev, BestDiffsRev = primer.BestMatch2(
            Seq, RevPrimer, MAX_PRIMER_MISMATCHES)
        if BestPosRev > 0:  # reverse primer was found
            # location to trim sequences
            RevTrim = BestPosRev

            # determine reverse barcode
            if args.reverse_barcode:
                BCcut = BestPosRev + RL
                CutSeq = Seq[BCcut:]
                if CutSeq in RevBarcodes:
                    BCname = RevBarcodes.get(CutSeq)
                else:
                    if not (args.barcode_mismatch > 0):
                        continue
                    # RevBarcodes maps barcode sequence -> label, so swap
                    # each pair into (label, barcode) order for the helper
                    hit = _best_fuzzy_barcode(
                        ((label, barcode)
                         for barcode, label in RevBarcodes.items()),
                        CutSeq, args.barcode_mismatch)
                    if hit[0] is None:
                        continue
                    BCname = hit[0]
                # update name
                BarcodeLabel = BarcodeLabel + '_' + BCname

            # trim record
            rec = TrimRead(rec, ForTrim, RevTrim, BarcodeLabel, OutCount)

            # check length
            L = len(rec.seq)
            if L < MinLen:
                continue
            if args.full_length:
                yield rec
            elif L < TrimLen:
                # pad if necessary
                yield _pad_record(rec, TrimLen)
            else:
                yield rec[:TrimLen]
        elif not args.full_length:
            # reverse primer not found; in full-length mode the read is dropped
            rec = TrimRead(rec, ForTrim, False, BarcodeLabel, OutCount)
            L = len(rec.seq)
            # remove if shorter than minimum length
            if L < MinLen:
                continue
            # only reads reaching the trim length are kept, truncated to it
            if L >= TrimLen:
                yield rec[:TrimLen]
def OnRec(Label, Seq, Qual):
    """Handle one FASTQ record: strip primers, pad/trim, relabel and write it.

    The record is dropped (and ``FwdPrimerMismatchCount`` incremented) when
    ``MatchesPrimer`` reports more than ``MAX_PRIMER_MISMATCHES`` differences
    against ``FwdPrimer``.  Otherwise the first ``PL`` bases are removed and
    the reverse primer is searched for with ``primer.BestMatch2``:

    * found (position > 0): the read is cut at that position, dropped if
      shorter than ``MinLen``, and padded with ``'N'`` / quality ``'I'`` up
      to ``TrimLen`` if shorter than that;
    * not found: no padding is applied, so a read shorter than ``TrimLen``
      is counted in ``TooShortCount`` and dropped.

    Surviving reads are truncated to ``TrimLen`` and written with
    ``fastq.WriteRec`` to ``out_file`` under the label
    ``<LabelPrefix><OutCount>;barcodelabel=<SampleLabel>;``.

    Args:
        Label: original record label (replaced before writing).
        Seq: read sequence.
        Qual: quality string, same length as ``Seq``.
    """
    global PL, LabelPrefix, SeqCount, OutCount, TooShortCount, PadCount
    global FwdPrimerMismatchCount, RevPrimerStrippedCount
    global FwdPrimer, RevPrimer

    if SeqCount == 0:
        progress.InitFile(fastq.File)

    progress.File("%u reads, %u outupt, %u bad fwd primer, %u rev primer stripped, %u too short. %u padded" %
                  (SeqCount, OutCount, FwdPrimerMismatchCount,
                   RevPrimerStrippedCount, TooShortCount, PadCount))

    SeqCount += 1
    Diffs = MatchesPrimer(Seq, FwdPrimer)
    if Diffs > MAX_PRIMER_MISMATCHES:
        FwdPrimerMismatchCount += 1
        return

    OutCount += 1
    Label = LabelPrefix + str(OutCount) + ";barcodelabel=" + SampleLabel + ";"

    # Strip fwd primer
    Seq = Seq[PL:]
    Qual = Qual[PL:]

    BestPosRev, BestDiffsRev = primer.BestMatch2(
        Seq, RevPrimer, MAX_PRIMER_MISMATCHES)
    if BestPosRev > 0:
        # Strip rev primer
        RevPrimerStrippedCount += 1
        StrippedSeq = Seq[:BestPosRev]
        StrippedQual = Qual[:BestPosRev]

        # correctness checks: the tail must match the primer with the same
        # number of differences that BestMatch2 reported
        Tail = Seq[BestPosRev:]
        Diffs2 = primer.MatchPrefix(Tail, RevPrimer)
        if Diffs2 != BestDiffsRev:
            # sys.stderr.write works on Python 2 and 3, unlike ``print >>``
            sys.stderr.write("\n")
            sys.stderr.write(" Seq=" + Seq + "\n")
            sys.stderr.write("Tail=" + Tail + "\n")
            sys.stderr.write("RevP=" + RevPrimer + "\n")
            die.Die("BestPosRev %u Diffs2 %u BestDiffsRev %u" %
                    (BestPosRev, Diffs2, BestDiffsRev))
        assert StrippedSeq + Tail == Seq

        Seq = StrippedSeq
        Qual = StrippedQual

        L = len(Seq)
        assert len(Qual) == L

        if L < MinLen:
            return

        if L < TrimLen:
            PadCount += 1
            Seq = Seq + (TrimLen - L) * 'N'
            Qual = Qual + (TrimLen - L) * 'I'
            L = len(Seq)
            assert L == TrimLen
            assert len(Qual) == TrimLen

    L = len(Seq)
    if L < TrimLen:
        TooShortCount += 1
        return

    if L > TrimLen:
        Seq = Seq[:TrimLen]
        Qual = Qual[:TrimLen]
        L = len(Seq)

    assert L == TrimLen
    assert len(Qual) == TrimLen

    fastq.WriteRec(out_file, Label, Seq, Qual)
def stripPrimer(records):
    """Yield records with a reformatted taxonomy header and primers removed.

    For every record in ``records``:

    1. Depending on ``args.utax`` (``'unite2utax'`` or ``'rdp2utax'``) the
       description is parsed and ``rec.id`` is rewritten to
       ``<accession>;tax=<comma separated levels>``; ``rec.id`` becomes ``""``
       when no taxonomy level survives the filters.
    2. Unless ``args.trimming`` is set, the forward primer is searched for
       (in the sequence, then in its reverse complement) with
       ``primer.BestMatch2`` and the region between the primers is kept.
    3. The record is dropped when ``args.drop_ns`` is non-zero and the
       sequence contains a run of that many ``N``; it is yielded only when
       ``rec.id`` and ``rec.seq`` are non-empty and the sequence is longer
       than 50.

    Relies on the module-level names ``args``, ``FwdPrimer``, ``RevPrimer``
    and ``fwdLen``.

    NOTE(review): the statement nesting below was reconstructed from a
    whitespace-collapsed listing; confirm the ``else`` pairings in the
    primer-stripping section against the original file.
    """
    for rec in records:
        if args.utax == 'unite2utax':
            # NOTE(review): ``unicode`` is a Python 2 builtin, and
            # 'latin2ascii' is not a standard codec error handler -
            # presumably registered elsewhere; confirm.
            latin = unicode(rec.description, 'utf-8')
            test = latin.encode('ascii', 'latin2ascii')
            # header fields are '|' separated; classify each by its prefix
            fields = test.split("|")
            for i in fields:
                if i.startswith("k__"):
                    tax = i
                elif i.startswith("SH"):
                    unite = i
                elif i.startswith("re"):
                    reps = i
                else:
                    gbID = i
            # NOTE(review): ``tax``/``gbID`` stay unbound (or keep the value
            # from the previous record) when no field matches; ``unite`` and
            # ``reps`` are not used in this function.
            # turn "k__X;p__Y;..." into "k:X,p:Y,..."
            taxonomy = re.sub(";", ",", tax)
            taxonomy = re.sub("__", ":", taxonomy)
            # the seven levels are read by position: k, p, c, o, f, g, s
            tf = taxonomy.split(",")
            k = tf[0]
            k = re.sub('_', ' ', k)
            p = tf[1]
            p = re.sub('_', ' ', p)
            c = tf[2]
            c = re.sub('_', ' ', c)
            o = tf[3]
            o = re.sub('_', ' ', o)
            f = tf[4]
            f = re.sub('_', ' ', f)
            g = tf[5]
            g = re.sub('_', ' ', g)
            s = tf[6]
            # species: drop anything from the first '(' on, underscores, dots
            s = re.sub('[(].*$', '', s)
            s = re.sub('_', ' ', s)
            # NOTE(review): '\.' is a non-raw regex literal; newer Pythons
            # warn about the invalid escape sequence.
            s = re.sub('\.', '', s)
            # a species entry without a space is reduced to the bare prefix
            test_species = s.split(' ')
            if len(test_species) < 2:
                s = 's:'
            reformat_tax = []
            # levels containing any of these substrings are left out
            removal = ("unidentified", "Incertae", "uncultured", "Group",
                       "incertae")
            sp_removal = (" sp", "_sp", "uncultured", "isolate", "mycorrhizae",
                          "vouchered", "fungal", "basidiomycete", "ascomycete",
                          "fungus", "symbiont")
            if not any(x in k for x in removal):
                reformat_tax.append(k)
            if not any(x in p for x in removal):
                reformat_tax.append(p)
            if not any(x in c for x in removal):
                reformat_tax.append(c)
            if not any(x in o for x in removal):
                reformat_tax.append(o)
            if not any(x in f for x in removal):
                reformat_tax.append(f)
            if not any(x in g for x in removal):
                reformat_tax.append(g)
            if not any(x in s for x in sp_removal):
                reformat_tax.append(s)
            rec.id = gbID + ";tax=" + ",".join(reformat_tax)
            # strip a trailing empty species entry
            rec.id = re.sub(",s:$", "", rec.id)
            rec.id = re.sub("=s:$", "=", rec.id)
            if rec.id.endswith( ";tax="): #if there is no taxonomy, get rid of it
                rec.id = ""
            rec.name = ""
            rec.description = ""
        elif args.utax == 'rdp2utax':
            latin = unicode(rec.description, 'utf-8')
            test = latin.encode('ascii', 'latin2ascii')
            # the last tab-separated field holds the lineage; the first holds
            # the accession followed by the organism name
            temp = test.split("\t")
            taxLevels = temp[-1]
            split_temp = temp[0].split(";")
            ID = split_temp[0].split(" ")[0]
            s = "s:" + split_temp[0].split(" ", 1)[-1]
            s = re.sub('[(].*$', '', s)
            s = re.sub(',', '_', s)
            s = re.sub('\.', '', s)
            test_species = s.split(' ')
            if len(test_species) < 2:
                s = 's:'
            # in the lineage each rank name is preceded by its value, hence
            # the ``index(rank) - 1`` lookups below
            split_tax = taxLevels.split(";")
            if "domain" in split_tax:
                ki = split_tax.index("domain") - 1
                k = "k:" + split_tax[ki]
                k = k.replace('"', '')
                k = k.split(" ")[0]
            else:
                k = ""
            if "phylum" in split_tax:
                pi = split_tax.index("phylum") - 1
                p = "p:" + split_tax[pi]
                p = p.replace('"', '')
                p = p.split(" ")[0]
            else:
                p = ""
            if "class" in split_tax:
                ci = split_tax.index("class") - 1
                c = "c:" + split_tax[ci]
                c = c.replace('"', '')
                c = c.split(" ")[0]
            else:
                c = ""
            if "order" in split_tax:
                oi = split_tax.index("order") - 1
                o = "o:" + split_tax[oi]
                o = o.replace('"', '')
                o = o.split(" ")[0]
            else:
                o = ""
            if "family" in split_tax:
                fi = split_tax.index("family") - 1
                f = "f:" + split_tax[fi]
                f = f.replace('"', '')
                f = f.split(" ")[0]
            else:
                f = ""
            if "genus" in split_tax:
                gi = split_tax.index("genus") - 1
                g = "g:" + split_tax[gi]
                g = g.replace('"', '')
                g = g.split(" ")[0]
            else:
                g = ""
            reformat_tax = []
            removal = ("unidentified", "Incertae", "uncultured", "Group",
                       "incertae", "Chloroplast", "unclassified", "Family")
            sp_removal = (" sp", "_sp", "uncultured", "isolate", "mycorrhizae",
                          "vouchered", "fungal", "basidiomycete", "ascomycete",
                          "fungus", "symbiont", "unclassified", "unidentified",
                          "bacterium", "phytoplasma")
            # missing ranks are empty strings and are skipped as well
            if not any(x in k for x in removal) and k != "":
                reformat_tax.append(k)
            if not any(x in p for x in removal) and p != "":
                reformat_tax.append(p)
            if not any(x in c for x in removal) and c != "":
                reformat_tax.append(c)
            if not any(x in o for x in removal) and o != "":
                reformat_tax.append(o)
            if not any(x in f for x in removal) and f != "":
                reformat_tax.append(f)
            if not any(x in g for x in removal) and g != "":
                reformat_tax.append(g)
            if not any(x in s for x in sp_removal):
                reformat_tax.append(s)
            rec.id = ID + ";tax=" + ",".join(reformat_tax)
            rec.id = re.sub(",s:$", "", rec.id)
            if rec.id.endswith( ";tax="): #if there is no taxonomy, get rid of it
                rec.id = ""
            rec.name = ""
            rec.description = ""
        if not args.trimming:
            # primer stripping requested
            Seq = rec.seq
            MAX_PRIMER_MISMATCHES = int(args.primer_mismatch)
            revPrimer = revcomp_lib.RevComp(RevPrimer)
            BestPosFor, BestDiffsFor = primer.BestMatch2(
                Seq, FwdPrimer, MAX_PRIMER_MISMATCHES)
            # NOTE(review): the comparison is strict (<), so a hit with
            # exactly MAX_PRIMER_MISMATCHES differences is treated as not
            # found - confirm this is intended.
            if BestDiffsFor < MAX_PRIMER_MISMATCHES:
                if BestPosFor > 0:
                    # drop everything up to and including the forward primer
                    stripfwdlen = fwdLen + BestPosFor
                    StripSeq = Seq[stripfwdlen:]
                    #now look for reverse
                    BestPosRev, BestDiffsRev = primer.BestMatch2(
                        StripSeq, revPrimer, MAX_PRIMER_MISMATCHES)
                    if BestDiffsRev < MAX_PRIMER_MISMATCHES:
                        StrippedSeq = StripSeq[:BestPosRev]
                    else:
                        StrippedSeq = StripSeq
                    #after stripping primers, check for ambig bases
                    if args.drop_ns != 0 and 'N' * args.drop_ns in StrippedSeq:
                        continue
                    rec.seq = StrippedSeq
                    if rec.id != "" and rec.seq != "" and len(rec.seq) > 50:
                        yield rec
            else: #if can't find forward primer, try to reverse complement and look again
                RevSeq = revcomp_lib.RevComp(Seq)
                BestPosFor, BestDiffsFor = primer.BestMatch2(
                    RevSeq, FwdPrimer, MAX_PRIMER_MISMATCHES)
                if BestDiffsFor < MAX_PRIMER_MISMATCHES:
                    if BestPosFor > 0:
                        stripfwdlen = fwdLen + BestPosFor
                        # NOTE(review): the primer position was located in
                        # ``RevSeq`` but the slice is taken from ``Seq`` -
                        # looks like it should slice ``RevSeq``; confirm.
                        StripSeq = Seq[stripfwdlen:]
                        #now look for reverse
                        BestPosRev, BestDiffsRev = primer.BestMatch2(
                            StripSeq, revPrimer, MAX_PRIMER_MISMATCHES)
                        if BestDiffsRev < MAX_PRIMER_MISMATCHES:
                            StrippedSeq = StripSeq[:BestPosRev]
                        else:
                            StrippedSeq = StripSeq
                        #after stripping primers, check for ambig bases
                        if args.drop_ns != 0 and 'N' * args.drop_ns in StrippedSeq:
                            continue
                        rec.seq = StrippedSeq
                        if rec.id != "" and rec.seq != "" and len(
                                rec.seq) > 50:
                            yield rec
                else:
                    # forward primer found in neither orientation: the
                    # record is only kept when args.keep_all is set
                    if args.keep_all:
                        StripSeq = Seq
                        #now look for reverse
                        BestPosRev, BestDiffsRev = primer.BestMatch2(
                            StripSeq, revPrimer, MAX_PRIMER_MISMATCHES)
                        if BestDiffsRev < MAX_PRIMER_MISMATCHES:
                            StrippedSeq = StripSeq[:BestPosRev]
                        else:
                            StrippedSeq = StripSeq
                        #after stripping primers, check for ambig bases
                        if args.drop_ns != 0 and 'N' * args.drop_ns in StrippedSeq:
                            continue
                        rec.seq = StrippedSeq
                        if rec.id != "" and rec.seq != "" and len(
                                rec.seq) > 50:
                            yield rec
        else:
            # no primer stripping: only the N-run and length filters apply
            #check for ambig bases
            Seq = str(rec.seq)
            if args.drop_ns != 0 and 'N' * args.drop_ns in Seq:
                continue
            if rec.id != "" and rec.seq != "" and len(rec.seq) > 50:
                yield rec
def processRead(input):
    """Trim primers from one single-sample FASTQ file and relabel its reads.

    Writes two files into ``args.out``, both named after the part of the
    input basename before ``.fq``:

    * ``<Name>.demux.fq`` - accepted reads, renamed to
      ``R_<n>;barcodelabel=<Sample>;`` where ``Sample`` is the part of
      ``Name`` before its first ``_``.
    * ``<Name>.stats`` - one CSV line with the counters
      ``Total, NoPrimer, RevPrimerFound, TooShort, ValidSeqs``.

    Relies on the module-level names ``FwdPrimer``, ``RevPrimer``,
    ``ReadLen`` and ``args``.

    Args:
        input: path of the FASTQ file to process.
    """
    Name = os.path.basename(input).split(".fq", -1)[0]
    DemuxOut = os.path.join(args.out, Name + '.demux.fq')
    Sample = Name.split('_')[0]
    StatsOut = os.path.join(args.out, Name+'.stats')
    Total = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    PL = len(FwdPrimer)
    # The input handle is part of the ``with`` so it is closed on exit.
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out, \
            open(input) as handle:
        for title, seq, qual in FastqGeneralIterator(handle):
            Total += 1
            # first thing is look for forward primer, if found trim it off
            Diffs = primer.MatchPrefix(seq, FwdPrimer)
            has_fwd_primer = Diffs <= args.primer_mismatch
            # The primer is only required when the amplicon is longer than
            # the read length; shorter sequences may have lost it during the
            # staggered trim in fastq_mergepairs.
            if args.primer == 'on' and len(seq) > ReadLen:
                if not has_fwd_primer:
                    NoPrimer += 1
                    continue
                Seq = seq[PL:]
                Qual = qual[PL:]
            elif has_fwd_primer:
                Seq = seq[PL:]
                Qual = qual[PL:]
            else:
                # primer missing but not required: count it, keep the read
                NoPrimer += 1
                Seq = seq
                Qual = qual
            # now look for reverse primer
            BestPosRev, BestDiffsRev = primer.BestMatch2(
                Seq, RevPrimer, args.primer_mismatch)
            if BestPosRev > 0:  # reverse primer was found
                RevPrimerFound += 1
                # trim the read at the primer position
                Seq = Seq[:BestPosRev]
                Qual = Qual[:BestPosRev]
            elif args.full_length and len(Seq) > ReadLen:
                # full length requested and no primer found: skip, except
                # when the length is not above the read length
                continue
            # if full_length is passed, then only trim primers
            if not args.full_length:
                # need this check here or primer dimers will get through
                if len(Seq) < args.min_len:
                    TooShort += 1
                    continue
                # pad if too short, trim if too long
                if len(Seq) < args.trim_len and args.pad == 'on':
                    pad = args.trim_len - len(Seq)
                    Seq = Seq + pad*'N'
                    Qual = Qual + pad*'J'
                else:
                    # slicing is a no-op for reads already short enough
                    Seq = Seq[:args.trim_len]
                    Qual = Qual[:args.trim_len]
            # reads are primer trimmed and trimmed/padded, check length
            if len(Seq) < args.min_len:
                TooShort += 1
                continue
            ValidSeqs += 1
            # now fix header
            Title = 'R_'+str(ValidSeqs)+';barcodelabel='+Sample+';'
            # now write to file
            out.write("@%s\n%s\n+\n%s\n" % (Title, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i\n' % (
            Total, NoPrimer, RevPrimerFound, TooShort, ValidSeqs))
def _pad_to_trim_len(rec, target_len):
    """Pad ``rec`` in place to ``target_len`` with 'N' bases at quality 40."""
    missing = target_len - len(rec.seq)
    padded = str(rec.seq) + missing * 'N'
    quals = rec.letter_annotations["phred_quality"]
    quals.extend([40] * missing)
    # The quality annotation is removed before the sequence is replaced and
    # re-attached afterwards, in the same order as the original code.
    del rec.letter_annotations["phred_quality"]
    rec.seq = padded
    rec.letter_annotations["phred_quality"] = quals
    return rec


def ProcessReads(records):
    """Yield relabelled, primer-trimmed records from ``records``.

    Every record is renamed to
    ``<args.prefix><count>;barcodelabel=<name>;`` (the counter advances for
    every input record, including ones later dropped) and its ``name`` and
    ``description`` are cleared.  The forward primer is checked with
    ``MatchesPrimer``; the reverse complement of ``RevPrimer`` is searched
    for with ``primer.BestMatch2``.

    * ``args.primer == "on"``: records with more than the allowed number of
      mismatches are dropped, the rest lose their first ``len(FwdPrimer)``
      positions.
    * ``args.primer == "off"``: the prefix is removed only when the primer
      matches; no record is dropped at this step.

    Reverse primer found: the record is cut there, dropped when shorter than
    ``args.min_len`` and, unless ``args.full_length`` is set, padded or
    truncated to ``args.trim_len``.  Not found: with ``args.full_length`` the
    record is yielded if it reaches the minimum length; otherwise it is
    padded/truncated (primer "off") or kept only when at least
    ``args.trim_len`` long and truncated (primer "on").

    Relies on the module-level names ``args``, ``FwdPrimer``, ``RevPrimer``
    and ``name``.
    """
    OutCount = 0
    MAX_PRIMER_MISMATCHES = int(args.primer_mismatch)
    LabelPrefix = args.prefix
    MinLen = int(args.min_len)
    TrimLen = int(args.trim_len)
    PL = len(FwdPrimer)
    revPrimer = revcomp_lib.RevComp(RevPrimer)

    for rec in records:
        OutCount += 1
        rec.id = LabelPrefix + str(OutCount) + ";barcodelabel=" + name + ";"
        rec.name = ""
        rec.description = ""

        # turn sequence into string for matching
        Diffs = MatchesPrimer(str(rec.seq), FwdPrimer)
        # NOTE(review): "on" accepts Diffs == MAX_PRIMER_MISMATCHES while
        # "off" requires strictly fewer - confirm the asymmetry is intended.
        if args.primer == "on":
            if Diffs > MAX_PRIMER_MISMATCHES:
                continue
            # Strip fwd primer from rec
            rec = rec[PL:]
        elif args.primer == "off":
            if Diffs < MAX_PRIMER_MISMATCHES:
                # Strip fwd primer from rec
                rec = rec[PL:]

        # look for reverse primer in the (possibly trimmed) sequence
        BestPosRev, BestDiffsRev = primer.BestMatch2(
            str(rec.seq), revPrimer, MAX_PRIMER_MISMATCHES)

        if BestPosRev > 0:
            # Strip rev primer from rec.seq
            rec = rec[:BestPosRev]
            L = len(rec.seq)
            if L < MinLen:
                continue
            if args.full_length:
                yield rec
            elif L < TrimLen:
                # pad if necessary
                yield _pad_to_trim_len(rec, TrimLen)
            else:
                yield rec[:TrimLen]
        else:
            L = len(rec.seq)
            if args.full_length:
                if L >= MinLen:
                    yield rec
            elif args.primer == 'off':
                # if custom primer used, then need to pad from end, not only
                # if rev primer found; but for quality control, cull reads
                # that are really short as they are likely garbage
                if L < MinLen:
                    continue
                if L < TrimLen:
                    yield _pad_to_trim_len(rec, TrimLen)
                else:
                    yield rec[:TrimLen]
            elif args.primer == 'on':
                # truncate down to trim length; shorter reads are dropped
                if L >= TrimLen:
                    yield rec[:TrimLen]