Beispiel #1
0
def primerStrip(file, GoodOut, BadOut, fwdprimer, revprimer):
    """Split a FASTQ file into primer-trimmed reads and rejected reads.

    Every record of ``file`` is tested with ``primer.MatchPrefix`` against
    ``fwdprimer``.  When the returned value is at most
    ``args.primer_mismatch`` the first ``len(fwdprimer)`` characters of the
    sequence and quality strings are removed and the record is written to
    ``GoodOut``.  If ``revprimer`` is truthy, its reverse complement is
    searched for with ``primer.BestMatch2`` and, when the reported position
    is greater than zero, the read is truncated at that position as well.
    Records failing the forward-primer test are written unchanged to
    ``BadOut``.

    Args:
        file: path of the input FASTQ file.
        GoodOut: path of the output FASTQ for accepted reads (overwritten).
        BadOut: path of the output FASTQ for rejected reads (overwritten).
        fwdprimer: forward primer sequence.
        revprimer: reverse primer sequence, or a falsy value to skip the
            reverse-primer search.
    """
    PL = len(fwdprimer)
    # The reverse complement is identical for every read, so compute it once
    # instead of once per matching record.
    RevCompPrimer = revcomp_lib.RevComp(revprimer) if revprimer else None
    with open(GoodOut, 'w') as good:
        with open(BadOut, 'w') as bad:
            # Open the input inside a context manager so the handle is closed
            # when the loop finishes or an exception propagates.
            with open(file) as infile:
                for title, seq, qual in FastqGeneralIterator(infile):
                    Diffs = primer.MatchPrefix(seq, fwdprimer)
                    if Diffs > args.primer_mismatch:
                        # forward primer not matched: keep the record untouched
                        bad.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
                        continue
                    # strip the forward primer from sequence and qualities
                    Seq = seq[PL:]
                    Qual = qual[PL:]
                    if revprimer:
                        BestPosRev, BestDiffsRev = primer.BestMatch2(
                            Seq, RevCompPrimer, args.primer_mismatch)
                        if BestPosRev > 0:  # reverse primer was found
                            Seq = Seq[:BestPosRev]
                            Qual = Qual[:BestPosRev]
                    good.write("@%s\n%s\n+\n%s\n" % (title, Seq, Qual))
Beispiel #2
0
    # Single pass over the FASTQ file named by args.FASTQ: records without a
    # recognised barcode are skipped, the barcode is trimmed, and (depending
    # on args.require_primer) the forward and reverse primers are trimmed.
    runningTotal = 0
    # number of leading characters to drop once the forward primer matches
    trim = len(FwdPrimer)
    # NOTE(review): the 'U' flag in mode 'rU' is deprecated and was removed in
    # Python 3.11 -- confirm which interpreter this script targets.
    with open(args.FASTQ, 'rU') as input:
        for title, seq, qual in FastqGeneralIterator(input):
            Barcode, BarcodeLabel = FindBarcode(seq, Barcodes)
            if Barcode == "":  # no barcode found, move on to the next record
                continue
            # strip the barcode from the start of sequence and qualities
            BarcodeLength = len(Barcode)
            seq = seq[BarcodeLength:]
            qual = qual[BarcodeLength:]
            # look for forward primer
            if args.require_primer != 'off':  # primer matching is required ('both' additionally requires the reverse primer)
                Diffs = primer.MatchPrefix(seq, FwdPrimer)
                if Diffs > args.primer_mismatch:
                    continue
                # forward primer accepted, trim it away
                seq = seq[trim:]
                qual = qual[trim:]
                if args.require_primer == 'both':
                    # look for reverse primer; truncate the read there if it
                    # is found, otherwise discard the record
                    BestPosRev, BestDiffsRev = primer.BestMatch2(
                        seq, ReverseCompRev, args.primer_mismatch)
                    if BestPosRev > 0:
                        seq = seq[:BestPosRev]
                        qual = qual[:BestPosRev]
                    else:
                        continue
            # check size
Beispiel #3
0
def processRead(input):
    """Demultiplex and primer-trim one FASTQ file.

    Reads come from ``input``; accepted reads are written to
    ``<base>.demux.fq`` and one line of counters to ``<base>.stats``, both
    inside ``tmpdir``, where ``<base>`` is the input file name up to its
    first ``.``.

    Per record:
      1. ``FindBarcode(seq, Barcodes)`` must return a non-empty barcode,
         which is trimmed from the start of the read.
      2. ``primer.MatchPrefix`` against ``FwdPrimer`` must return at most
         ``args.primer_mismatch``.
      3. ``primer.BestMatch2`` looks for ``RevPrimer``.  If the reported
         position is greater than zero the read is cut between the forward
         primer and that position (optionally after resolving a reverse
         barcode when ``args.reverse_barcode`` is set), then padded with
         ``N``/``J`` or truncated to ``args.trim_len`` unless
         ``args.full_length`` is set.  Otherwise the read is dropped when
         ``args.full_length`` is set, or required to reach
         ``args.trim_len`` and truncated to it.
      4. Reads shorter than ``args.min_len`` are discarded.

    The stats line holds, comma separated: Total, NoBarcode, NoPrimer,
    RevPrimerFound, NoRevBarcode, TooShort, ValidSeqs.

    Args:
        input: path of the FASTQ file to process.
    """
    base = os.path.basename(input).split('.')[0]
    PL = len(FwdPrimer)
    RL = len(RevPrimer)
    # args.min_len is constant for the run; convert it once, not per read
    MinLen = int(args.min_len)
    DemuxOut = os.path.join(tmpdir, base+'.demux.fq')
    StatsOut = os.path.join(tmpdir, base+'.stats')
    Total = 0
    NoBarcode = 0
    NoRevBarcode = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out:
        # The input is opened in a context manager so its handle is released
        # as soon as iteration ends (or fails).
        with open(input) as infile:
            for title, seq, qual in FastqGeneralIterator(infile):
                Total += 1
                # look for barcode, trim it off
                Barcode, BarcodeLabel = FindBarcode(seq, Barcodes)
                if Barcode == "":
                    NoBarcode += 1
                    continue
                BarcodeLength = len(Barcode)
                Seq = seq[BarcodeLength:]
                Qual = qual[BarcodeLength:]
                # now search for forward primer
                Diffs = primer.MatchPrefix(Seq, FwdPrimer)
                if Diffs > args.primer_mismatch:
                    NoPrimer += 1
                    continue
                # search for the reverse primer; Seq still contains the
                # forward primer here, so BestPosRev is relative to its start
                BestPosRev, BestDiffsRev = primer.BestMatch2(Seq, RevPrimer, args.primer_mismatch)
                if BestPosRev > 0:  # reverse primer was found
                    RevPrimerFound += 1
                    if args.reverse_barcode:
                        # the reverse barcode is looked up in whatever
                        # follows the reverse primer
                        CutSeq = Seq[BestPosRev + RL:]
                        RevBarcode, RevBarcodeLabel = FindBarcode(CutSeq, RevBarcodes)
                        if RevBarcode == "":
                            NoRevBarcode += 1
                            continue
                        BarcodeLabel = BarcodeLabel+'_'+RevBarcodeLabel
                    # keep only what lies between the two primers
                    Seq = Seq[PL:BestPosRev]
                    Qual = Qual[PL:BestPosRev]
                    if not args.full_length:
                        # check minimum length before padding, or primer-dimer
                        # type sequences would get padded with Ns
                        if len(Seq) < MinLen:
                            TooShort += 1
                            continue
                        if len(Seq) < args.trim_len and args.pad == 'on':
                            pad = args.trim_len - len(Seq)
                            Seq = Seq + pad*'N'
                            Qual = Qual + pad*'J'
                        else:
                            # a no-op for reads already at or below trim_len
                            Seq = Seq[:args.trim_len]
                            Qual = Qual[:args.trim_len]
                else:
                    # reverse primer not found
                    if args.full_length:
                        # NOTE(review): these reads are dropped without being
                        # added to any counter -- confirm that is intended.
                        continue
                    # trim away forward primer
                    Seq = Seq[PL:]
                    Qual = Qual[PL:]
                    # a read that cannot reach trim_len is treated as bad
                    if len(Seq) < args.trim_len:
                        TooShort += 1
                        continue
                    Seq = Seq[:args.trim_len]
                    Qual = Qual[:args.trim_len]
                # check minimum length
                if len(Seq) < MinLen:
                    TooShort += 1
                    continue
                ValidSeqs += 1
                # rename header
                Name = 'R_'+str(ValidSeqs)+';barcodelabel='+BarcodeLabel+';'
                out.write("@%s\n%s\n+\n%s\n" % (Name, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i,%i,%i\n' % (Total, NoBarcode, NoPrimer, RevPrimerFound, NoRevBarcode, TooShort, ValidSeqs))
Beispiel #4
0
def OnRec(Label, Seq, Qual):
    """Handle one FASTQ record: check primers, relabel, trim/pad and write.

    The incoming ``Label`` is discarded and replaced by
    ``LabelPrefix + <OutCount> + ";barcodelabel=" + SampleLabel + ";"``.
    A record is dropped when ``MatchesPrimer(Seq, FwdPrimer)`` exceeds
    ``MAX_PRIMER_MISMATCHES``.  Otherwise the first ``PL`` characters are
    removed and ``primer.BestMatch2`` searches for ``RevPrimer``; when the
    reported position is greater than zero the read is truncated there and,
    if shorter than ``TrimLen`` (but at least ``MinLen``), padded with
    ``N`` bases and ``I`` qualities.  Reads that end up shorter than
    ``TrimLen`` are dropped, longer ones are truncated, and the result is
    written with ``fastq.WriteRec`` to ``out_file``.

    Module-level counters (``SeqCount``, ``OutCount``, ``TooShortCount``,
    ``PadCount``, ``FwdPrimerMismatchCount``, ``RevPrimerStrippedCount``)
    are updated in place.

    Args:
        Label: original record label (unused beyond being replaced).
        Seq: read sequence.
        Qual: quality string, same length as ``Seq``.
    """
    global PL, LabelPrefix, SeqCount, OutCount, TooShortCount, PadCount
    global FwdPrimerMismatchCount, RevPrimerStrippedCount
    global FwdPrimer, RevPrimer

    if SeqCount == 0:
        progress.InitFile(fastq.File)

    progress.File("%u reads, %u output, %u bad fwd primer, %u rev primer stripped, %u too short. %u padded" % \
      (SeqCount, OutCount, FwdPrimerMismatchCount, RevPrimerStrippedCount, TooShortCount, PadCount))

    SeqCount += 1
    Diffs = MatchesPrimer(Seq, FwdPrimer)
    if Diffs > MAX_PRIMER_MISMATCHES:
        FwdPrimerMismatchCount += 1
        return

    # OutCount is bumped before the length checks below, so labels of
    # written records are not necessarily consecutive.
    OutCount += 1
    Label = LabelPrefix + str(OutCount) + ";barcodelabel=" + SampleLabel + ";"

    # Strip fwd primer
    Seq = Seq[PL:]
    Qual = Qual[PL:]

    BestPosRev, BestDiffsRev = primer.BestMatch2(Seq, RevPrimer,
                                                 MAX_PRIMER_MISMATCHES)
    if BestPosRev > 0:
        # Strip rev primer
        RevPrimerStrippedCount += 1
        StrippedSeq = Seq[:BestPosRev]
        StrippedQual = Qual[:BestPosRev]

        # Correctness check: re-matching the primer at the reported position
        # must reproduce the reported mismatch count.
        Tail = Seq[BestPosRev:]
        Diffs2 = primer.MatchPrefix(Tail, RevPrimer)
        if Diffs2 != BestDiffsRev:
            # sys.stderr.write emits the same bytes as the former
            # ``print >> sys.stderr`` statements and works on Python 2 and 3.
            sys.stderr.write("\n Seq=%s\nTail=%s\nRevP=%s\n" %
                             (Seq, Tail, RevPrimer))
            die.Die("BestPosRev %u Diffs2 %u BestDiffsRev %u" %
                    (BestPosRev, Diffs2, BestDiffsRev))
        assert StrippedSeq + Tail == Seq

        Seq = StrippedSeq
        Qual = StrippedQual

        L = len(Seq)
        assert len(Qual) == L

        if L < MinLen:
            # NOTE(review): dropped without incrementing TooShortCount --
            # confirm that is intended.
            return

        if L < TrimLen:
            # pad short amplicons up to TrimLen
            PadCount += 1
            Seq = Seq + (TrimLen - L) * 'N'
            Qual = Qual + (TrimLen - L) * 'I'
            L = len(Seq)
            assert L == TrimLen
            assert len(Qual) == TrimLen

    L = len(Seq)
    if L < TrimLen:
        # only reachable when the reverse primer was not found (no padding)
        TooShortCount += 1
        return

    if L > TrimLen:
        Seq = Seq[:TrimLen]
        Qual = Qual[:TrimLen]
        L = len(Seq)

    assert L == TrimLen
    assert len(Qual) == TrimLen

    fastq.WriteRec(out_file, Label, Seq, Qual)
Beispiel #5
0
def MatchesPrimer(Seq, Primer):
    """Return whatever ``primer.MatchPrefix`` reports for ``Seq`` vs ``Primer``.

    Thin convenience wrapper; presumably the result is a mismatch count
    (verify against ``primer.MatchPrefix``).
    """
    diffs = primer.MatchPrefix(Seq, Primer)
    return diffs
def processRead(input):
    """Primer-trim one FASTQ file and write the reads plus a stats line.

    Expects the module-level names ``FwdPrimer``, ``RevPrimer``, ``ReadLen``
    and ``args`` to be defined already.  Output goes to ``args.out``:
    ``<Name>.demux.fq`` with the accepted reads and ``<Name>.stats`` with
    one comma-separated line of Total, NoPrimer, RevPrimerFound, TooShort,
    ValidSeqs.  ``<Name>`` is the input file name up to its first ``.fq``,
    and the part of ``<Name>`` before the first ``_`` is used as the
    ``barcodelabel`` of every read.

    Per record:
      * The forward primer is tested with ``primer.MatchPrefix``.  A read
        that fails is discarded only when ``args.primer == 'on'`` and the
        read is longer than ``ReadLen``; otherwise it is kept untrimmed.
      * ``primer.BestMatch2`` looks for ``RevPrimer``; a position greater
        than zero truncates the read there.  With ``args.full_length`` set,
        reads longer than ``ReadLen`` without a reverse primer are dropped.
      * Unless ``args.full_length`` is set, reads are padded with ``N``/``J``
        (when ``args.pad == 'on'``) or truncated to ``args.trim_len``.
      * Reads shorter than ``args.min_len`` are discarded.

    Args:
        input: path of the FASTQ file to process.
    """
    Name = os.path.basename(input).split(".fq",-1)[0]
    DemuxOut = os.path.join(args.out, Name + '.demux.fq')
    Sample = Name.split('_')[0]
    StatsOut = os.path.join(args.out, Name+'.stats')
    Total = 0
    NoPrimer = 0
    TooShort = 0
    RevPrimerFound = 0
    ValidSeqs = 0
    PL = len(FwdPrimer)
    with open(StatsOut, 'w') as counts, open(DemuxOut, 'w') as out:
        # Open the input in a context manager so the handle is closed once
        # the loop finishes or an exception propagates.
        with open(input) as infile:
            for title, seq, qual in FastqGeneralIterator(infile):
                Total += 1
                # first thing is look for forward primer, if found trim it off
                Diffs = primer.MatchPrefix(seq, FwdPrimer)
                if Diffs <= args.primer_mismatch:
                    Seq = seq[PL:]
                    Qual = qual[PL:]
                else:
                    NoPrimer += 1
                    # The primer is only enforced when the amplicon is longer
                    # than the read length; shorter ones may have lost it via
                    # staggered trimming in fastq_mergepairs.
                    if args.primer == 'on' and len(seq) > ReadLen:
                        continue
                    Seq = seq
                    Qual = qual
                # now look for reverse primer
                BestPosRev, BestDiffsRev = primer.BestMatch2(Seq, RevPrimer, args.primer_mismatch)
                if BestPosRev > 0:  # reverse primer was found
                    RevPrimerFound += 1
                    # truncate the read where the reverse primer starts
                    Seq = Seq[:BestPosRev]
                    Qual = Qual[:BestPosRev]
                elif args.full_length and len(Seq) > ReadLen:
                    # full length requested and no primer found: skip, except
                    # when the read is not longer than the read length
                    continue
                # if full_length is passed, then only trim primers
                if not args.full_length:
                    # need this check before padding or primer dimers will
                    # get through
                    if len(Seq) < args.min_len:
                        TooShort += 1
                        continue
                    if len(Seq) < args.trim_len and args.pad == 'on':
                        pad = args.trim_len - len(Seq)
                        Seq = Seq + pad*'N'
                        Qual = Qual + pad*'J'
                    else:
                        # a no-op for reads already at or below trim_len
                        Seq = Seq[:args.trim_len]
                        Qual = Qual[:args.trim_len]
                # reads are primer trimmed and trimmed/padded; check length
                if len(Seq) < args.min_len:
                    TooShort += 1
                    continue
                ValidSeqs += 1
                # now fix header
                Title = 'R_'+str(ValidSeqs)+';barcodelabel='+Sample+';'
                # now write to file
                out.write("@%s\n%s\n+\n%s\n" % (Title, Seq, Qual))
        counts.write('%i,%i,%i,%i,%i\n' % (Total, NoPrimer, RevPrimerFound, TooShort, ValidSeqs))