def getAvgLength(input):
    """Summarize the read lengths found in a FASTQ file.

    Args:
        input: Path of the FASTQ file; it is opened in text mode and parsed
            with FastqGeneralIterator.

    Returns:
        A 4-tuple ``(average, minimum, maximum, fifth_percentile)``: the mean
        read length as a float, the shortest and longest read lengths, and
        the 5th-percentile read length truncated to an int.

    Raises:
        ZeroDivisionError: If the file contains no reads.
    """
    # Close the handle deterministically instead of leaving it to the
    # garbage collector.
    with open(input) as handle:
        lengths = [len(seq) for _, seq, _ in FastqGeneralIterator(handle)]
    Average = sum(lengths) / float(len(lengths))
    # NOTE(review): this is the 5th percentile, not the 95th -- presumably it
    # is meant as "95% of reads are at least this long"; confirm with callers.
    fifth_percentile = np.percentile(np.array(lengths), 5)
    return (Average, min(lengths), max(lengths), int(fifth_percentile))


#start every run with a fresh log file
log_name = args.out + '.amptk-dada2.log'
if os.path.isfile(log_name):
    amptklib.removefile(log_name)

amptklib.setupLogging(log_name)
#FNULL is left open on purpose: it is a module-level handle on os.devnull
FNULL = open(os.devnull, 'w')
#record the exact command line in the debug log
cmd_args = " ".join(sys.argv) + '\n'
amptklib.log.debug(cmd_args)
#single-argument parenthesized print emits the same text under Python 2
print("-------------------------------------------------------")
#initialize script, log system info and usearch version
amptklib.SystemInfo()
#Do a version check
usearch = args.usearch
amptklib.versionDependencyChecks(usearch)
#check dependencies
programs = ['Rscript']
amptklib.CheckDependencies(programs)
lines = [line.rstrip('\n') for line in input] remove = remove + lines if args.list: lines = args.list remove = remove + lines #make sure it is a set, faster lookup keep_list = set(remove) count = len(keep_list) #now run filtering keep_count = 0 total_count = 0 #rename to base if args.out.endswith('.gz'): outfile = args.out.replace('.gz', '') else: outfile = args.out #run filtering filter_sample(SeqIn, outfile) #compress and clean if args.out.endswith('.gz'): #compress in place amptklib.Fzip_inplace(outfile) if args.input.endswith('.gz'): amptklib.removefile(SeqIn) print("Removed %i samples" % count) print("Kept %i reads out of %i total reads" % (keep_count, total_count))
) #create OTU phylogeny for downstream processes amptklib.log.info("Generating phylogenetic tree") tree_out = base + '.tree.phy' cmd = [usearch, '-cluster_agg', args.fasta, '-treeout', tree_out] amptklib.runSubprocess(cmd, amptklib.log) #print some summary file locations amptklib.log.info("Taxonomy finished: %s" % taxFinal) if args.otu_table and not args.method == 'blast': amptklib.log.info("Classic OTU table with taxonomy: %s" % taxTable) #output final OTU table in Biom v1.0 (i.e. json format if biom installed) outBiom = base + '.biom' if amptklib.which('biom'): amptklib.removefile(outBiom) cmd = [ 'biom', 'convert', '-i', tmpTable, '-o', outBiom + '.tmp', '--table-type', "OTU table", '--to-json' ] amptklib.runSubprocess(cmd, amptklib.log) if args.mapping_file: cmd = [ 'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom, '--observation-metadata-fp', qiimeTax, '-m', args.mapping_file, '--sc-separated', 'taxonomy', '--output-as-json' ] amptklib.runSubprocess(cmd, amptklib.log) else: cmd = [ 'biom', 'add-metadata', '-i', outBiom + '.tmp', '-o', outBiom,
parser.add_argument('-o', '--out', help='Base output name')
parser.add_argument(
    '--min_reads_otu',
    default=2,
    type=int,
    help='Minimum number of reads per OTU for experiment')
parser.add_argument(
    '-u', '--usearch',
    dest="usearch",
    default='usearch9',
    help='USEARCH9 EXE')
parser.add_argument(
    '--debug',
    action='store_true',
    help='Remove Intermediate Files')
args = parser.parse_args()

#output base name: -o/--out when given, otherwise everything in the OTU table
#path that precedes the first ".otu_table"
base = args.out if args.out else args.otu_table.split(".otu_table")[0]

#start every run with a fresh log file
log_name = base + '.amptk-filter.log'
amptklib.removefile(log_name)
amptklib.setupLogging(log_name)

#module-level handle on os.devnull
FNULL = open(os.devnull, 'w')

#record the exact command line in the debug log
cmd_args = " ".join(sys.argv) + '\n'
amptklib.log.debug(cmd_args)
print("-------------------------------------------------------")

#initialize script, log system info and usearch version
amptklib.SystemInfo()

#Do a version check
usearch = args.usearch
amptklib.versionDependencyChecks(usearch)

#announce which OTU table is about to be loaded
amptklib.log.info("Loading OTU table: %s" % args.otu_table)
if k not in barcodes_found: barcodes_found.append(k) amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts)) if not args.mapping_file: #create a generic mappingfile for downstream processes genericmapfile = args.out + '.mapping_file.txt' amptklib.CreateGenericMappingFile(barcode_file, FwdPrimer, revcomp_lib.RevComp(RevPrimer), Adapter, genericmapfile, barcodes_found) #compress the output to save space FinalDemux = catDemux + '.gz' amptklib.Fzip(catDemux, FinalDemux, cpus) amptklib.removefile(catDemux) if gzip_list: for file in gzip_list: file = file.replace('.gz', '') amptklib.removefile(file) #get file size filesize = os.path.getsize(FinalDemux) readablesize = amptklib.convertSize(filesize) amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize)) amptklib.log.info("Mapping file: %s" % genericmapfile) print "-------------------------------------------------------" if 'win32' in sys.platform: print "\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux) else:
def primer2Strip(file, GoodOut, BadOut, fwdprimer, revprimer): #now run primer strip if args.reverse: if not args.rev_primer: print("ERROR: if reverse reads passed you must provide -r,--rev_primer") sys.exit(1) #first run forwards GoodFor = args.out + '.fwd.stripped.fq' BadFor = args.out + '.fwd.no_primer.fq' primerStrip(args.input, GoodFor, BadFor, args.fwd_primer, args.rev_primer) #now run reverse GoodRev = args.out + '.rev.stripped.fq' BadRev = args.out + '.rev.no_primer.fq' primerStrip(args.reverse, GoodRev, BadRev, args.rev_primer, args.fwd_primer) #now get bad reads into list singleRev = [] singleFor = [] for title, seq, qual in FastqGeneralIterator(open(BadFor)): singleRev.append(title.split(' ')[0]) for title, seq, qual in FastqGeneralIterator(open(BadRev)): singleFor.append(title.split(' ')[0]) bothfail = sorted(set(singleRev) & set(singleFor), key = singleFor.index) #now get PE and singletons PEfor = args.out +'.pe_R1.fastq' PErev = args.out +'.pe_R2.fastq' SEfor = args.out +'.fwd.singletons.fastq' SErev = args.out +'.rev.singletons.fastq' with open(PEfor, 'w') as peF: with open(SEfor, 'w') as seF: for title, seq, qual in FastqGeneralIterator(open(GoodFor)): ID = title.split(' ')[0] if not ID in singleFor: peF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) else: seF.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) with open(PErev, 'w') as peR: with open(SErev, 'w') as seR: for title, seq, qual in FastqGeneralIterator(open(GoodRev)): ID = title.split(' ')[0] if not ID in singleRev: peR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) else: seR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) #do some counts of output and cleanup total = amptklib.countfastq(args.input) passed = amptklib.countfastq(PEfor) passedrev = amptklib.countfastq(PErev) if passed != passedrev: print("Error: forward reads %i != reverse reads %i" % (passed, passedrev)) nopaired = len(singleFor) + len(singleRev) failed = len(bothfail) print("%i total reads" % total) print("%i primer found 
properly paired: %s, %s" % (passed, PEfor, PErev)) print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev)) print("%i primer not found in either forward or reverse reads" % (failed)) amptklib.removefile(GoodFor) amptklib.removefile(GoodRev) else: GoodOut = args.out +'.stripped.fq' BadOut = args.out +'.no_primer_found.fq' primerStrip(args.input, GoodOut, BadOut, args.fwd_primer, False) total = amptklib.countfastq(args.input) passed = amptklib.countfastq(GoodOut) failed = amptklib.countfastq(BadOut) print("%i total reads" % total) print("%i primer found/stripped: %s" % (passed, GoodOut)) print("%i primer not found: %s" % (failed, BadOut))
seR.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) #do some counts of output and cleanup total = amptklib.countfastq(args.input) passed = amptklib.countfastq(PEfor) passedrev = amptklib.countfastq(PErev) if passed != passedrev: print("Error: forward reads %i != reverse reads %i" % (passed, passedrev)) nopaired = len(singleFor) + len(singleRev) failed = len(bothfail) print("-------------------------------------------------------") print("%i total reads" % total) print("%i primer found properly paired: %s, %s" % (passed, PEfor, PErev)) print("%i primer found singletons: %s, %s" % (nopaired, SEfor, SErev)) print("%i primer not found in either forward or reverse reads" % (failed)) amptklib.removefile(GoodFor) amptklib.removefile(GoodRev) else: #setup tmpdir tmpdir = args.out.split('.')[0] + '_' + str(os.getpid()) if not os.path.exists(tmpdir): os.makedirs(tmpdir) GoodOut = args.out + '.stripped.fq' BadOut = args.out + '.no_primer_found.fq' filelist = splitter(args.input, tmpdir) amptklib.runMultiProgress(primerStrip, filelist, cpus) combiner(tmpdir, ".good", GoodOut) combiner(tmpdir, ".bad", BadOut) shutil.rmtree(tmpdir) total = amptklib.countfastq(args.input)
amptklib.runSubprocess(cmd, amptklib.log)

#count OTUs
otu_count = amptklib.countfasta(newOTUs)
amptklib.log.info('{0:,}'.format(otu_count) + ' OTUs remaining')

#count reads mapped
total = amptklib.line_count(uc_out)
orig_total = amptklib.countfasta(tmpReads)
#guard the percentage: a zero read count would otherwise raise
#ZeroDivisionError at the very end of the run
if orig_total:
    pct_mapped = total / float(orig_total) * 100
else:
    pct_mapped = 0.0
amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                  '({0:.0f}%)'.format(pct_mapped))

#Print location of files to STDOUT
#(single-argument parenthesized print emits the same text under Python 2)
print("-------------------------------------------------------")
print("Clustered OTUs: %s" % newOTUs)
print("OTU Table: %s" % newTable)
print("-------------------------------------------------------")

#cleanup
amptklib.removefile(tmpReads)
amptklib.removefile(uc_out)

#show bare file names in the example command; os.path.basename understands
#the platform's separators, whereas splitting on '/' leaves a Windows path
#untouched even though win32 is handled explicitly below
import os  #local import: this section must not rely on an import made elsewhere
otu_print = os.path.basename(newOTUs)
tab_print = os.path.basename(newTable)
if 'win32' in sys.platform:
    print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (
        tab_print, otu_print))
else:
    #'%' binds tighter than '+', so only the last literal is formatted
    print(colr.WARN + "\nExample of next cmd:" + colr.END +
          " amptk filter -i %s -f %s -b <mock barcode>\n" % (
              tab_print, otu_print))
#queue every .fq file found in the working folder
for file in os.listdir(folder):
    if not file.endswith(".fq"):
        continue
    file = os.path.join(folder, file)
    file_list.append(file)

#run worker on each queued file using a pool of `cpus` processes
#NOTE: the AsyncResult objects are discarded, so an exception raised inside
#worker is not reported here
p = multiprocessing.Pool(cpus)
for f in file_list:
    p.apply_async(worker, (f,))
p.close()
p.join()

#concatenate the per-file *.filter.fq results into the requested output
catDemux = args.out
filtered_pattern = os.path.join(folder, '*.filter.fq')
with open(catDemux, 'w') as outfile:
    for filename in glob.glob(filtered_pattern):
        #never copy the output file into itself
        if filename != catDemux:
            with open(filename, 'rU') as readfile:
                shutil.copyfileobj(readfile, outfile)

#NOTE(review): presumably compresses the plain-text output when a .gz name
#was requested -- confirm what amptklib.Fzip_inplace does to the file name
if catDemux.endswith('.gz'):
    amptklib.Fzip_inplace(catDemux)

#the working folder is no longer needed
shutil.rmtree(folder)

print("----------------------------------")
countBarcodes(args.out)
print("----------------------------------")
print("Script finished, output in %s" % args.out)

#drop the temporary input that stands in for a gzipped input file
if args.input.endswith('.gz'):
    amptklib.removefile(tmpinput)
amptklib.log.info("Now Gzipping files") for file in os.listdir(args.out): if file.endswith(".fastq"): file_path = os.path.join(args.out, file) amptklib.Fzip_inplace(file_path) #after all files demuxed into output folder, loop through and create SRA metadata file filelist = [] for file in os.listdir(args.out): if file.endswith(".fastq.gz"): filelist.append(file) amptklib.log.info("Finished: output in %s" % args.out) #clean up if gzipped if args.FASTQ.endswith('.gz'): amptklib.removefile(FASTQ_IN) #check for BioSample meta file if args.biosample: amptklib.log.info( "NCBI BioSample file detected, creating SRA metadata file") #load in BioSample file to dictionary with open(args.biosample, 'rU') as input: reader = csv.reader(input, delimiter='\t') header = next(reader) acc = header.index('Accession') sample = header.index('Sample Name') bio = header.index('BioProject') try: host = header.index('Host') except ValueError:
ID = line.split("=", 1)[-1].split(";")[0] if ID not in BarcodeCount: BarcodeCount[ID] = 1 else: BarcodeCount[ID] += 1 #now let's count the barcodes found and count the number of times they are found. barcode_counts = "%30s: %s" % ('Sample', 'Count') for k, v in natsorted(BarcodeCount.items(), key=lambda (k, v): v, reverse=True): barcode_counts += "\n%30s: %s" % (k, str(BarcodeCount[k])) amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts)) #compress the output to save space FinalDemux = Demux + '.gz' amptklib.Fzip(Demux, FinalDemux, cpus) amptklib.removefile(Demux) #get file size filesize = os.path.getsize(FinalDemux) readablesize = amptklib.convertSize(filesize) amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize)) amptklib.log.info("Mapping file: %s" % args.mapping_file) print "-------------------------------------------------------" if 'win32' in sys.platform: print "\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux) else: print col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % ( FinalDemux)