# Parse the barcode FASTA into a dict of {output FASTQ filename: barcode sequence}.
Barcodes = {}
# Plain 'r' is used instead of the legacy 'rU' flag: 'U' was removed from
# open() in Python 3.11 (ValueError), and text mode already translates
# newlines on Python 3.
with open(barcode_file, 'r') as handle:
    for line in handle:
        if line.startswith('>'):
            # Strip only the line terminator, so a final header without a
            # trailing newline does not lose its last character.
            header = line[1:].rstrip('\r\n')
            if args.names:
                name = namesDict.get(header)
                if name is None:
                    # Previously this surfaced as an opaque TypeError
                    # (None + str); report which barcode is unmapped instead.
                    amptklib.log.error(
                        "Barcode %s not found in names mapping, exiting" % header)
                    raise SystemExit(1)
                name = name + ".fastq"
            else:
                name = header + ".fastq"
            continue
        sequence = line.strip()
        # Skip blank lines; otherwise a stray empty line would overwrite the
        # barcode just stored for the current header with ''.
        if sequence:
            Barcodes[name] = sequence

#count FASTQ records in input
amptklib.log.info("Loading FASTQ Records")
total = amptklib.countfastq(args.FASTQ)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

#output message depending on primer requirement
# Any other value of args.require_primer logs nothing here.
if args.require_primer == 'off':
    amptklib.log.info("Looking for %i barcodes" % (len(Barcodes)))
elif args.require_primer == 'forward':
    amptklib.log.info(
        "Looking for %i barcodes that must have FwdPrimer: %s" %
        (len(Barcodes), FwdPrimer))
elif args.require_primer == 'both':
    amptklib.log.info(
        "Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s"
        % (len(Barcodes), FwdPrimer, RevPrimer))
# Detected versions: index 0 is reported as R, index 1 as DADA2 (see log calls).
Rversions = amptklib.checkRversion()
# NOTE(review): both minimum-version strings look like masked placeholders;
# confirm the real required versions.
R_pass = '******'
dada2_pass = '******'

# DADA2 is checked first: log the detected versions and carry on when the
# check passes, otherwise point the user at the install guide and stop.
# (gvc presumably compares a version against a minimum; verify in amptklib.)
if amptklib.gvc(Rversions[1], dada2_pass):
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))
else:
    amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                       (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error(
        "See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)

# Count the raw reads and report the input size, then write a cleaned copy
# with 3' N's removed because dada2 can't handle them.
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(args.fastq)
size = amptklib.checkfastqsize(args.fastq)
readablesize = amptklib.convertSize(size)
read_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(read_summary)
no_ns = args.out + '.cleaned_input.fq'
amptklib.fastq_strip_padding(args.fastq, no_ns)

# Quality filter with vsearch: cap expected errors at args.maxee and allow
# no N's in the reads that pass.
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = ['vsearch', '--fastq_filter', no_ns]
filtercmd += ['--fastq_maxee', str(args.maxee)]
filtercmd += ['--fastqout', derep]
filtercmd += ['--fastq_qmax', '55', '--fastq_maxns', '0']
amptklib.runSubprocess(filtercmd, amptklib.log)
total = amptklib.countfastq(derep)
# DADA2 is checked first: log the detected versions and carry on when the
# check passes, otherwise point the user at the install guide and stop.
# Rversions and dada2_pass are defined before this chunk.
if amptklib.gvc(Rversions[1], dada2_pass):
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))
else:
    amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                       (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error(
        "See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)

# Write a cleaned copy with 3' N's removed (dada2 can't handle them), then
# convert it to FASTA with vsearch so the reads can be counted from that file.
amptklib.log.info("Loading FASTQ Records")
no_ns = args.out + '.cleaned_input.fq'
amptklib.fastq_strip_padding(args.fastq, no_ns)
demuxtmp = args.out + '.original.fa'
cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns)]
cmd += ['--fastq_qmax', '55']
cmd += ['--fastaout', demuxtmp]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(demuxtmp)
size = amptklib.checkfastqsize(no_ns)
readablesize = amptklib.convertSize(size)
read_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(read_summary)

# Quality filter with vsearch: cap expected errors at args.maxee and allow
# no N's in the reads that pass.
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = ['vsearch', '--fastq_filter', no_ns]
filtercmd += ['--fastq_maxee', str(args.maxee)]
filtercmd += ['--fastqout', derep]
filtercmd += ['--fastq_qmax', '55', '--fastq_maxns', '0']
amptklib.runSubprocess(filtercmd, amptklib.log)
total = amptklib.countfastq(derep)
passed_summary = '{0:,}'.format(total) + ' reads passed'
amptklib.log.info(passed_summary)

#split into individual files
amptklib.log.info("Splitting FASTQ file by Sample into individual files")
filtfolder = args.out + '_filtered'
if os.path.isdir(filtfolder):
    pass  # the body of this branch lies past the end of the visible chunk
#parse and put into dictionary
# Each reverse barcode is reverse-complemented, stored in RevBarcodes by
# record id, and written to rev_barcode_file as FASTA.
# Plain 'r' is used instead of the legacy 'rU' flag: 'U' was removed from
# open() in Python 3.11 (ValueError), and text mode already translates
# newlines on Python 3.
with open(rev_barcode_file, 'w') as output:
    with open(args.reverse_barcode, 'r') as handle:
        for rec in SeqIO.parse(handle, 'fasta'):
            # A repeated record id is fatal.
            if rec.id in RevBarcodes:
                amptklib.log.error(
                    "Duplicate reverse barcodes detected, exiting")
                sys.exit(1)
            RevSeq = str(rec.seq.reverse_complement())
            RevBarcodes[rec.id] = RevSeq
            output.write('>%s\n%s\n' % (rec.id, RevSeq))

#Count FASTQ records
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(SeqIn)
size = amptklib.checkfastqsize(SeqIn)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

#create tmpdir and split input into n cpus
# The directory name is the text of args.out before its first '.', plus the
# PID. NOTE(review): an args.out such as './run' yields an empty prefix.
tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
if not os.path.exists(tmpdir):
    os.makedirs(tmpdir)

#split fastq file
# NOTE(review): cpus * 2 is presumably the number of chunks; confirm against
# amptklib.split_fastq.
amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2)

#now get file list from tmp folder
file_list = []
for file in os.listdir(tmpdir):
    if file.endswith(".fq"):
        pass  # the body of this branch lies past the end of the visible chunk
# Path passed to DemuxIllumina as its final argument, after cleanR1 (which is
# defined before this chunk).
cleanR2 = os.path.join(tmpdir, 'renamedR2.fastq')
amptklib.DemuxIllumina(
    args.fastq, args.reverse, args.index[0], mapdict,
    args.barcode_mismatch, cleanR1, cleanR2)

#estimate read length
# NOTE(review): the collapsed source does not show how far this branch
# extends; only the read-length selection is nested here. Confirm upstream.
if amptklib.check_valid_file(cleanR1):
    # An explicitly passed read length wins; otherwise measure it from cleanR1.
    ReadLen = args.read_length or amptklib.GuessRL(cleanR1)

#Count FASTQ records
amptklib.log.info("Loading FASTQ Records")
orig_total = amptklib.countfastq(cleanR1)
size = amptklib.checkfastqsize(cleanR1)
readablesize = amptklib.convertSize(size)
read_summary = '{0:,}'.format(orig_total) + ' reads (' + readablesize + ')'
amptklib.log.info(read_summary)

#now we can merge the reads
mergedReads = args.out + '.merged.fastq'
amptklib.log.info("Merging PE reads using VSEARCH and filtering for phiX")
forward_path = os.path.abspath(cleanR1)
reverse_path = os.path.abspath(cleanR2)
amptklib.MergeReads(
    forward_path, reverse_path, tmpdir, mergedReads, ReadLen,
    args.min_len, args.usearch, args.rescue_forward, 'vsearch', '', 1)

if not args.full_length:
    if args.pad == 'off':
        amptklib.log.info("Stripping primers and trim to %s bp" %
                          (args.trim_len))
    else:
        pass  # the body of this branch lies past the end of the visible chunk