def splitter(inputfile, tempdir):
    '''
    Split a FASTQ file into chunk files inside a temporary directory.

    The number of records is obtained from amptklib.countfastq and the
    chunk size is computed as total // (4 * cpus) + 1, where cpus is a
    module-level value. Each batch yielded by amptklib.batch_iterator is
    written to <tempdir>/chunk_<n>.fq, with n starting at 1.

    Parameters:
        inputfile: path of the FASTQ file to split.
        tempdir: directory that receives the chunk files; it is not created
            here, so it has to exist already.

    Returns:
        A list of paths (tempdir joined with the entry name) for every
        entry in tempdir whose name ends with ".fq", in os.listdir order.
        Note that ".fq" files already present in tempdir are included too.
    '''
    total = amptklib.countfastq(inputfile)
    # Floor division keeps the chunk size an integer on Python 3 as well;
    # for integer operands on Python 2 it is identical to the old "/".
    # NOTE(review): assumes countfastq returns an int record count.
    chunks = total // (4 * cpus) + 1
    # The 'U' mode flag was removed in Python 3.11; plain text mode already
    # applies universal-newline handling on Python 3.
    with open(inputfile, 'r') as fastq_in:
        records = SeqIO.parse(fastq_in, 'fastq')
        # The parser is lazy, so the batches have to be consumed while the
        # input handle is still open.
        for i, batch in enumerate(amptklib.batch_iterator(records, chunks)):
            tmpout = os.path.join(tempdir, "chunk_%i.fq" % (i + 1))
            with open(tmpout, 'w') as handle:
                SeqIO.write(batch, handle, "fastq")
    # Collect the chunk files from the temporary directory.
    return [
        os.path.join(tempdir, name)
        for name in os.listdir(tempdir)
        if name.endswith(".fq")
    ]
cpus = multiprocessing.cpu_count() else: cpus = args.cpus #get other values MAX_PRIMER_MISMATCHES = int(args.primer_mismatch) LabelPrefix = args.prefix MinLen = int(args.min_len) TrimLen = int(args.trim_len) PL = len(FwdPrimer) RL = len(RevPrimer) OutCount = 0 #split the input FASTQ file into chunks to process with open(SeqIn, 'rU') as input: SeqCount = amptklib.countfastq(SeqIn) amptklib.log.info('{0:,}'.format(SeqCount) + ' records loaded') SeqRecords = SeqIO.parse(SeqIn, 'fastq') chunks = SeqCount / cpus + 1 amptklib.log.info( "splitting job over %i cpus, this may still take awhile" % cpus) #divide into chunks, store in tmp file pid = os.getpid() folder = 'amptk_tmp_' + str(pid) if not os.path.exists(folder): os.makedirs(folder) for i, batch in enumerate(amptklib.batch_iterator(SeqRecords, chunks)): filename = "chunk_%i.fq" % (i + 1) tmpout = os.path.join(folder, filename) handle = open(tmpout, "w") count = SeqIO.write(batch, handle, "fastq")
for title, seq, qual in FastqGeneralIterator(open(GoodFor)): ID = title.split(' ')[0] if not ID in singleFor: peF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) else: seF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) with open(PErev, 'w') as peR: with open(SErev, 'w') as seR: for title, seq, qual in FastqGeneralIterator(open(GoodRev)): ID = title.split(' ')[0] if not ID in singleRev: peR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) else: seR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) #do some counts of output and cleanup total = amptklib.countfastq(args.input) passed = amptklib.countfastq(PEfor) passedrev = amptklib.countfastq(PErev) if passed != passedrev: print("Error: forward reads %i != reverse reads %i" % (passed, passedrev)) nopaired = len(singleFor) + len(singleRev) failed = len(bothfail) print("-------------------------------------------------------") print("%i total reads" % total) print("%i primer found properly paired: %s, %s" % (passed, PEfor, PErev)) print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev)) print("%i primer not found in either forward or reverse reads" % (failed)) amptklib.removefile(GoodFor) amptklib.removefile(GoodRev) else:
#main start here cpus = multiprocessing.cpu_count() print "----------------------------------" tmpinput = 'amptk_show.tmp' if args.input.endswith('.gz'): amptklib.Funzip(args.input, tmpinput, cpus) else: tmpinput = args.input countBarcodes(tmpinput) print "----------------------------------" getSeqLength(tmpinput) print "----------------------------------" if args.quality_trim: #split the input FASTQ file into chunks to process #split fastq file SeqCount = amptklib.countfastq(tmpinput) pid = os.getpid() folder = 'amptk_tmp_' + str(pid) amptklib.split_fastq(tmpinput, SeqCount, folder, cpus*2) #now get file list from tmp folder file_list = [] for file in os.listdir(folder): if file.endswith(".fq"): file = os.path.join(folder, file) file_list.append(file) p = multiprocessing.Pool(cpus) for f in file_list: #worker(f) p.apply_async(worker, [f]) p.close()
else: name = ID + ".fastq" continue Barcodes[name] = line.strip() #check for compressed input file if args.FASTQ.endswith('.gz'): amptklib.log.info("Gzipped input files detected, uncompressing") FASTQ_IN = args.FASTQ.replace('.gz', '') amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count()) else: FASTQ_IN = args.FASTQ #count FASTQ records in input amptklib.log.info("Loading FASTQ Records") total = amptklib.countfastq(FASTQ_IN) size = amptklib.checkfastqsize(args.FASTQ) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')') #output message depending on primer requirement if args.require_primer == 'off': amptklib.log.info("Looking for %i barcodes" % (len(Barcodes))) elif args.require_primer == 'forward': amptklib.log.info( "Looking for %i barcodes that must have FwdPrimer: %s" % (len(Barcodes), FwdPrimer)) elif args.require_primer == 'both': amptklib.log.info( "Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s" % (len(Barcodes), FwdPrimer, RevPrimer))
#get utax_database if args.db in DataBase: utaxDB = DataBase.get(args.db)[1] else: if not args.closed_ref_only: if args.utax_db: utaxDB = os.path.abspath(args.utax_db) else: amptklib.log.error( "%s not pre-installed DB, must then also specify valid UTAX database via --utax_db" % args.db) sys.exit(1) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") orig_total = amptklib.countfastq(args.FASTQ) size = checkfastqsize(args.FASTQ) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #Expected Errors filtering step and convert to fasta filter_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fq') filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa') orig_fasta = os.path.join(tmp, args.out + '.orig.fa') amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee) cmd = [ 'vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55' ] amptklib.runSubprocess(cmd, amptklib.log)
cleanR1 = os.path.join(tmpdir, 'renamedR1.fastq') cleanR2 = os.path.join(tmpdir, 'renamedR2.fastq') amptklib.DemuxIllumina(args.fastq, args.reverse, args.index[0], mapdict, args.barcode_mismatch, cleanR1, cleanR2) #estimate read length if amptklib.check_valid_file(cleanR1): #if read length explicity passed use it otherwise measure it if args.read_length: ReadLen = args.read_length else: ReadLen = amptklib.GuessRL(cleanR1) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") orig_total = amptklib.countfastq(cleanR1) size = amptklib.checkfastqsize(cleanR1) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #now we can merge the reads mergedReads = args.out + '.merged.fastq' amptklib.log.info("Merging PE reads using VSEARCH and filtering for phiX") amptklib.MergeReads(os.path.abspath(cleanR1), os.path.abspath(cleanR2), tmpdir, mergedReads, ReadLen, args.min_len, args.usearch, args.rescue_forward, 'vsearch', '', 1) if not args.full_length: if args.pad == 'off': amptklib.log.info("Stripping primers and trim to %s bp" % (args.trim_len))