def primerStrip(file, GoodOut, BadOut, fwdprimer, revprimer):
    """Split a FASTQ file by whether each read starts with the forward primer.

    Reads whose prefix matches ``fwdprimer`` with at most
    ``args.primer_mismatch`` differences have the primer removed and are
    written to ``GoodOut``; when ``revprimer`` is truthy and its reverse
    complement is located in the remaining sequence, the read is also
    truncated at that position. All other reads are written unmodified to
    ``BadOut``.

    Args:
        file: path of the input FASTQ file.
        GoodOut: path of the FASTQ file receiving primer-trimmed reads.
        BadOut: path of the FASTQ file receiving reads without the primer.
        fwdprimer: forward primer sequence.
        revprimer: reverse primer sequence, or a falsy value to skip the
            reverse-primer search.
    """
    PL = len(fwdprimer)
    # The reverse complement is identical for every read, so build it once.
    rev_target = revcomp_lib.RevComp(revprimer) if revprimer else None
    # Open the input inside the same ``with`` so its handle is closed too.
    with open(GoodOut, 'w') as good, open(BadOut, 'w') as bad, \
            open(file) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            Diffs = primer.MatchPrefix(seq, fwdprimer)
            if Diffs > args.primer_mismatch:
                bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                continue
            # Forward primer accepted: strip it from sequence and qualities.
            Seq = seq[PL:]
            Qual = qual[PL:]
            if rev_target is not None:
                BestPosRev, BestDiffsRev = primer.BestMatch2(
                    Seq, rev_target, args.primer_mismatch)
                if BestPosRev > 0:
                    # Reverse primer was found: cut the read where it starts.
                    Seq = Seq[:BestPosRev]
                    Qual = Qual[:BestPosRev]
            good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
#this will loop through FASTQ file once, splitting those where barcodes are found, and primers trimmed runningTotal = 0 trim = len(FwdPrimer) #print Barcodes with open(args.FASTQ, 'rU') as input: for title, seq, qual in FastqGeneralIterator(input): Barcode, BarcodeLabel = FindBarcode(seq, Barcodes) if Barcode == "": #if not found, move onto next record continue BarcodeLength = len(Barcode) seq = seq[BarcodeLength:] qual = qual[BarcodeLength:] #look for forward primer if args.require_primer != 'off': #means we only want ones with forward primer and or reverse Diffs = primer.MatchPrefix(seq, FwdPrimer) if Diffs > args.primer_mismatch: continue #if found, trim away primer seq = seq[trim:] qual = qual[trim:] if args.require_primer == 'both': #look for reverse primer, strip if found BestPosRev, BestDiffsRev = primer.BestMatch2( seq, ReverseCompRev, args.primer_mismatch) if BestPosRev > 0: seq = seq[:BestPosRev] qual = qual[:BestPosRev] else: continue #check size
def processRead(input):
    """Demultiplex one FASTQ file by barcode and trim primers from each read.

    For every record the barcode is located with ``FindBarcode`` and removed,
    the forward primer (``FwdPrimer``) must match the start of the remaining
    sequence within ``args.primer_mismatch`` differences, and the reverse
    primer (``RevPrimer``) is searched for with ``primer.BestMatch2``.

    * Reverse primer found: the read is cut to the region between the two
      primers. With ``args.reverse_barcode`` a second barcode is looked up in
      the sequence following the reverse primer and appended to the label.
      Unless ``args.full_length`` is set, reads shorter than ``args.min_len``
      are dropped, reads shorter than ``args.trim_len`` are padded with
      ``N``/``J`` when ``args.pad == 'on'``, and everything else is truncated
      to ``args.trim_len``.
    * Reverse primer not found: the read is dropped when ``args.full_length``
      is set or when it is shorter than ``args.trim_len`` after removing the
      forward primer; otherwise it is truncated to ``args.trim_len``.

    Surviving reads are written to ``<tmpdir>/<base>.demux.fq`` with headers
    of the form ``R_<n>;barcodelabel=<label>;``. One CSV line is written to
    ``<tmpdir>/<base>.stats`` with the counters, in order: total, no barcode,
    no forward primer, reverse primer found, no reverse barcode, too short,
    valid.

    Args:
        input: path of the FASTQ file to process.
    """
    base = os.path.basename(input).split('.')[0]
    PL = len(FwdPrimer)
    RL = len(RevPrimer)
    DemuxOut = os.path.join(tmpdir, base + '.demux.fq')
    StatsOut = os.path.join(tmpdir, base + '.stats')
    min_len = int(args.min_len)  # convert once instead of per read
    Total = 0
    NoBarcode = 0
    NoRevBarcode = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    # The input is opened in the ``with`` so its handle is closed as well.
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out, \
            open(input) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            Total += 1
            # look for barcode, trim it off
            Barcode, BarcodeLabel = FindBarcode(seq, Barcodes)
            if Barcode == "":
                NoBarcode += 1
                continue
            BarcodeLength = len(Barcode)
            Seq = seq[BarcodeLength:]
            Qual = qual[BarcodeLength:]
            # now search for forward primer
            Diffs = primer.MatchPrefix(Seq, FwdPrimer)
            if Diffs > args.primer_mismatch:
                NoPrimer += 1
                continue
            # The reverse primer is searched in the sequence that still holds
            # the forward primer, so BestPosRev indexes that same string.
            BestPosRev, BestDiffsRev = primer.BestMatch2(
                Seq, RevPrimer, args.primer_mismatch)
            if BestPosRev > 0:
                # reverse primer was found
                RevPrimerFound += 1
                if args.reverse_barcode:
                    # the reverse barcode is looked up after the reverse primer
                    CutSeq = Seq[BestPosRev + RL:]
                    RevBarcode, RevBarcodeLabel = FindBarcode(
                        CutSeq, RevBarcodes)
                    if RevBarcode == "":
                        NoRevBarcode += 1
                        continue
                    BarcodeLabel = BarcodeLabel + '_' + RevBarcodeLabel
                # keep only the region between forward and reverse primer
                Seq = Seq[PL:BestPosRev]
                Qual = Qual[PL:BestPosRev]
                if not args.full_length:
                    # check minimum length first, or primer-dimer type
                    # sequences would get padded with Ns
                    if len(Seq) < min_len:
                        TooShort += 1
                        continue
                    if len(Seq) < args.trim_len and args.pad == 'on':
                        pad = args.trim_len - len(Seq)
                        Seq = Seq + pad * 'N'
                        Qual = Qual + pad * 'J'
                    else:
                        Seq = Seq[:args.trim_len]
                        Qual = Qual[:args.trim_len]
            else:
                # did not find reverse primer
                if args.full_length:
                    # full-length mode requires both primers
                    continue
                # trim away forward primer
                Seq = Seq[PL:]
                Qual = Qual[PL:]
                # without a reverse primer a short read is treated as bad
                if len(Seq) < args.trim_len:
                    TooShort += 1
                    continue
                Seq = Seq[:args.trim_len]
                Qual = Qual[:args.trim_len]
            # check minimum length
            if len(Seq) < min_len:
                TooShort += 1
                continue
            ValidSeqs += 1
            # rename header
            Name = 'R_' + str(ValidSeqs) + ';barcodelabel=' + BarcodeLabel + ';'
            out.write("@%s\n%s\n+\n%s\n" % (Name, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i,%i,%i\n' % (
            Total, NoBarcode, NoPrimer, RevPrimerFound, NoRevBarcode,
            TooShort, ValidSeqs))
def OnRec(Label, Seq, Qual):
    """Per-record callback: check/strip primers, pad or trim, write the read.

    The record is discarded (and counted) when the forward primer does not
    match within ``MAX_PRIMER_MISMATCHES``. Otherwise the forward primer is
    removed; if the reverse primer is located, the read is cut at that
    position and, when shorter than ``TrimLen`` but at least ``MinLen``,
    padded to ``TrimLen`` with ``N`` bases / ``I`` qualities. Reads still
    shorter than ``TrimLen`` are counted as too short and dropped; longer
    reads are truncated. The surviving record is relabelled
    ``<LabelPrefix><n>;barcodelabel=<SampleLabel>;`` and written with
    ``fastq.WriteRec``.

    Args:
        Label: original record label (replaced before writing).
        Seq: read sequence.
        Qual: quality string, same length as ``Seq``.
    """
    # Only the counters are rebound here; the other module-level names
    # (PL, LabelPrefix, FwdPrimer, RevPrimer, ...) are merely read.
    global SeqCount, OutCount, TooShortCount, PadCount
    global FwdPrimerMismatchCount, RevPrimerStrippedCount

    if SeqCount == 0:
        progress.InitFile(fastq.File)

    progress.File("%u reads, %u output, %u bad fwd primer, %u rev primer stripped, %u too short. %u padded" %
                  (SeqCount, OutCount, FwdPrimerMismatchCount,
                   RevPrimerStrippedCount, TooShortCount, PadCount))

    SeqCount += 1
    Diffs = MatchesPrimer(Seq, FwdPrimer)
    if Diffs > MAX_PRIMER_MISMATCHES:
        FwdPrimerMismatchCount += 1
        return

    OutCount += 1
    Label = LabelPrefix + str(OutCount) + ";barcodelabel=" + SampleLabel + ";"

    # Strip fwd primer
    Seq = Seq[PL:]
    Qual = Qual[PL:]

    BestPosRev, BestDiffsRev = primer.BestMatch2(
        Seq, RevPrimer, MAX_PRIMER_MISMATCHES)
    if BestPosRev > 0:
        # Strip rev primer
        RevPrimerStrippedCount += 1
        StrippedSeq = Seq[:BestPosRev]
        StrippedQual = Qual[:BestPosRev]

        # Correctness check: the tail at the reported position must match
        # the reverse primer with the reported number of differences.
        Tail = Seq[BestPosRev:]
        Diffs2 = primer.MatchPrefix(Tail, RevPrimer)
        if Diffs2 != BestDiffsRev:
            # sys.stderr.write works on both Python 2 and Python 3
            sys.stderr.write("\n")
            sys.stderr.write(" Seq=" + Seq + "\n")
            sys.stderr.write("Tail=" + Tail + "\n")
            sys.stderr.write("RevP=" + RevPrimer + "\n")
            die.Die("BestPosRev %u Diffs2 %u BestDiffsRev %u" %
                    (BestPosRev, Diffs2, BestDiffsRev))
        assert StrippedSeq + Tail == Seq

        Seq = StrippedSeq
        Qual = StrippedQual

        L = len(Seq)
        assert len(Qual) == L

        # NOTE(review): reads dropped here are not added to TooShortCount,
        # and OutCount was already incremented -- behaviour kept as found.
        if L < MinLen:
            return

        if L < TrimLen:
            # Padding is only applied when the reverse primer was found.
            PadCount += 1
            Seq = Seq + (TrimLen - L) * 'N'
            Qual = Qual + (TrimLen - L) * 'I'
            L = len(Seq)
            assert L == TrimLen
            assert len(Qual) == TrimLen

    L = len(Seq)
    if L < TrimLen:
        TooShortCount += 1
        return

    if L > TrimLen:
        Seq = Seq[:TrimLen]
        Qual = Qual[:TrimLen]
        L = len(Seq)

    assert L == TrimLen
    assert len(Qual) == TrimLen

    fastq.WriteRec(out_file, Label, Seq, Qual)
def MatchesPrimer(Seq, Primer):
    """Return the result of ``primer.MatchPrefix(Seq, Primer)`` unchanged.

    Args:
        Seq: read sequence whose prefix is compared.
        Primer: primer sequence to compare against.
    """
    prefix_result = primer.MatchPrefix(Seq, Primer)
    return prefix_result
def processRead(input):
    """Trim primers from one per-sample FASTQ file and write demuxed reads.

    Reads the module-level names ``FwdPrimer``, ``RevPrimer``, ``ReadLen``
    and ``args``. The sample name is the part of the file name (up to
    ``.fq``) before the first underscore.

    Forward primer: removed when it matches the start of the read within
    ``args.primer_mismatch`` differences. A read without it is counted as
    ``NoPrimer``; it is dropped only when ``args.primer == 'on'`` and the
    read is longer than ``ReadLen`` (shorter reads may have lost the primer
    to staggered trimming during pair merging), otherwise kept as is.

    Reverse primer: when found the read is cut at its position. When not
    found, the read is dropped if ``args.full_length`` is set and the read is
    longer than ``ReadLen``.

    Unless ``args.full_length`` is set, reads shorter than ``args.min_len``
    are dropped, reads shorter than ``args.trim_len`` are padded with
    ``N``/``J`` when ``args.pad == 'on'``, and everything else is truncated
    to ``args.trim_len``.

    Output goes to ``<args.out>/<Name>.demux.fq`` with headers of the form
    ``R_<n>;barcodelabel=<Sample>;``; ``<args.out>/<Name>.stats`` receives
    one CSV line: total, no primer, reverse primer found, too short, valid.

    Args:
        input: path of the FASTQ file to process.
    """
    Name = os.path.basename(input).split(".fq", -1)[0]
    DemuxOut = os.path.join(args.out, Name + '.demux.fq')
    Sample = Name.split('_')[0]
    StatsOut = os.path.join(args.out, Name + '.stats')
    Total = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    PL = len(FwdPrimer)
    # The input is opened in the ``with`` so its handle is closed as well.
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out, \
            open(input) as reads:
        for title, seq, qual in FastqGeneralIterator(reads):
            Total += 1
            # first thing is look for forward primer, if found trim it off
            Diffs = primer.MatchPrefix(seq, FwdPrimer)
            if Diffs <= args.primer_mismatch:
                Seq = seq[PL:]
                Qual = qual[PL:]
            else:
                NoPrimer += 1
                # The primer is only enforced for amplicons longer than the
                # read length; shorter ones could have had it trimmed away
                # by the staggered trim in fastq_mergepairs.
                if args.primer == 'on' and len(seq) > ReadLen:
                    continue
                Seq = seq
                Qual = qual
            # now look for reverse primer
            BestPosRev, BestDiffsRev = primer.BestMatch2(
                Seq, RevPrimer, args.primer_mismatch)
            if BestPosRev > 0:
                # reverse primer was found: trim at its location
                RevPrimerFound += 1
                Seq = Seq[:BestPosRev]
                Qual = Qual[:BestPosRev]
            elif args.full_length and len(Seq) > ReadLen:
                # full length requested and no reverse primer found: skip,
                # except when the read is no longer than the read length
                continue
            # if full_length is passed, then only trim primers
            if not args.full_length:
                # this check must come first or primer dimers get through
                if len(Seq) < args.min_len:
                    TooShort += 1
                    continue
                if len(Seq) < args.trim_len and args.pad == 'on':
                    pad = args.trim_len - len(Seq)
                    Seq = Seq + pad * 'N'
                    Qual = Qual + pad * 'J'
                else:
                    Seq = Seq[:args.trim_len]
                    Qual = Qual[:args.trim_len]
            # reads are primer trimmed and trimmed/padded, check length
            if len(Seq) < args.min_len:
                TooShort += 1
                continue
            ValidSeqs += 1
            # now fix header
            Title = 'R_' + str(ValidSeqs) + ';barcodelabel=' + Sample + ';'
            # now write to file
            out.write("@%s\n%s\n+\n%s\n" % (Title, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i\n' % (
            Total, NoPrimer, RevPrimerFound, TooShort, ValidSeqs))