def _run_makeudb(cmd, log_path, usearch_db):
    """Run a usearch ``makeudb`` command and report whether the DB exists.

    The exact command is written to the debug log, the tool's stdout/stderr
    are captured in ``log_path``, and the return value is True when
    ``usearch_db`` is present on disk after the call.
    """
    # log the command that is really executed (not a hand-written copy of it)
    amptklib.log.debug(' '.join(cmd))
    # start from a fresh log file so it only reflects this run
    if os.path.isfile(log_path):
        os.remove(log_path)
    with open(log_path, 'w') as handle:
        subprocess.call(cmd, stdout=handle, stderr=handle)
    # check if file is actually there
    return os.path.isfile(usearch_db)


def makeDB(input):
    """Create a UTAX or USEARCH database (``<args.out>.udb``) from a FASTA file.

    A one-line description of the database (type, source FASTA, forward and
    reverse primer, number of sequences) is written to ``<args.out>.udb.txt``.
    Depending on ``args.create_db`` the function then runs
    ``usearch -makeudb_utax`` or ``usearch -makeudb_usearch`` and logs whether
    the resulting ``.udb`` file exists. Tool output goes to
    ``<args.out>.utax.log`` or ``<args.out>.usearch.log`` respectively.

    Uses the module-level names ``args``, ``usearch`` and ``amptklib``.
    When ``args.trimming`` is set, both primers are recorded as ``'None'``
    (this overwrites ``args.F_primer`` / ``args.R_primer``).

    Args:
        input: path of the FASTA file to build the database from.
    """
    db_details = args.out + '.udb.txt'
    usearch_db = args.out + '.udb'
    Total = amptklib.countfasta(input)
    if args.trimming:
        args.F_primer = 'None'
        args.R_primer = 'None'
    db_string = ' '.join([args.create_db, args.fasta, args.F_primer,
                          args.R_primer, str(Total)])
    with open(db_details, 'w') as details:
        details.write(db_string)
    report = args.out + '.report.txt'

    if args.create_db == 'utax':
        # create log file for this to troubleshoot
        utax_log = args.out + '.utax.log'
        amptklib.log.info("Creating UTAX Database, this may take awhile")
        cmd = [usearch, '-makeudb_utax', input,
               '-output', usearch_db,
               '-report', report,
               '-utax_trainlevels', args.utax_trainlevels,
               '-utax_splitlevels', args.utax_splitlevels,
               '-notrunclabels']
        if _run_makeudb(cmd, utax_log, usearch_db):
            amptklib.log.info("Database %s created successfully" % usearch_db)
        else:
            amptklib.log.error(
                "There was a problem creating the DB, check the UTAX log file %s"
                % utax_log)

    if args.create_db == 'usearch':
        # create log file for this to troubleshoot
        usearch_log = args.out + '.usearch.log'
        amptklib.log.info("Creating USEARCH Database")
        cmd = [usearch, '-makeudb_usearch', input,
               '-output', usearch_db,
               '-notrunclabels']
        if _run_makeudb(cmd, usearch_log, usearch_db):
            amptklib.log.info("Database %s created successfully" % usearch_db)
        else:
            # report the USEARCH log; utax_log is not defined in this branch
            amptklib.log.error(
                "There was a problem creating the DB, check the log file %s"
                % usearch_log)
continue if taxCount > 5: #species level, not have more than 1 seq per "species" if ID in seenTax: continue out.write('>' + value + '\n' + key + '\n') seenTax.append(tax) seenID.append(ID) FNULL = open(os.devnull, 'w') pid = os.getpid() #reverse complement rev primer ForPrimer = args.fwdprimer RevPrimer = revcomp_lib.RevComp(args.revprimer) print 'Loading ' + '{0:,}'.format(amptklib.countfasta( args.input)) + ' sequence records' print 'Searching for forward primer: %s, and reverse primer: %s' % (ForPrimer, RevPrimer) print 'Requiring reverse primer match with at least %i mismatches' % args.primer_mismatch #loop through seqs, remove primer if found, and truncate to length truncated = 'bold2amptk_' + str(pid) + '.truncate.tmp' with open(truncated, 'w') as output: for record in FastaIterator(open(args.input)): Seq = str(record.seq) StripSeq = '' ForCutPos = amptklib.findFwdPrimer(ForPrimer, Seq, args.primer_mismatch, amptklib.degenNucSimple) RevCutPos = amptklib.findRevPrimer(RevPrimer, Seq, args.primer_mismatch, amptklib.degenNucSimple)
uchime_db = os.path.abspath(args.uchime_ref) else: amptklib.log.error( "%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref) uchime_out = fastaout #now run chimera filtering if all checks out if not os.path.isfile(uchime_out): amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref) cmd = [ 'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db, '--nonchimeras', uchime_out ] amptklib.runSubprocess(cmd, amptklib.log) total = amptklib.countfasta(uchime_out) uchime_chimeras = validSeqs - total amptklib.log.info('{0:,}'.format(total) + ' iSeqs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed') #now reformat OTUs and OTU table, dropping chimeric OTUs from table, sorting the output as well nonchimeras = amptklib.fasta2list(uchime_out) inferredSeqs = SeqIO.index(uchime_out, 'fasta') with open(iSeqs, 'w') as iSeqout: for x in natsorted(nonchimeras): SeqIO.write(inferredSeqs[x], iSeqout, 'fasta') if not args.debug: #clean up chimeras fasta amptklib.removefile(uchime_out) if os.path.isfile(fastaout):
utaxDB = DataBase.get(args.db)[1] else: if not args.closed_ref_only: if args.utax_db: utaxDB = os.path.abspath(args.utax_db) else: amptklib.log.error("%s not pre-installed DB, must then also specify valid UTAX database via --utax_db" % args.db) sys.exit(1) #Count FASTQ records amptklib.log.info("Loading FASTQ Records") #convert to FASTA for mapping orig_fasta = os.path.join(tmp, args.out+'.orig.fa') cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55'] amptklib.runSubprocess(cmd, amptklib.log) orig_total = amptklib.countfasta(orig_fasta) size = amptklib.checkfastqsize(args.FASTQ) readablesize = amptklib.convertSize(size) amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')') #Expected Errors filtering step filter_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fq') filter_fasta = os.path.join(tmp, args.out + '.EE' + args.maxee + '.filter.fa') amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee) cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55'] amptklib.runSubprocess(cmd, amptklib.log) qtrimtotal = amptklib.countfastq(filter_out) amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed') #now run full length dereplication derep_out = os.path.join(tmp, args.out + '.EE' + args.maxee + '.derep.fa') amptklib.log.info("De-replication (remove duplicate reads)")
amptklib.log.error("If using the -b,--barcode option you must specify a fasta file of mock community via the --mc option") sys.exit(1) #get default mock community value if args.mc == "mock3": mock = os.path.join(parentdir, 'DB', 'amptk_mock3.fa') elif args.mc == "mock2": mock = os.path.join(parentdir, 'DB', 'amptk_mock2.fa') elif args.mc == "mock1": mock = os.path.join(parentdir, 'DB', 'amptk_mock1.fa') elif args.mc == "synmock": mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa') else: mock = os.path.abspath(args.mc) #open mock community fasta and count records mock_ref_count = amptklib.countfasta(mock) #load OTU lengths into dictionary SeqLength = amptklib.fastalen2dict(args.fasta) #map OTUs to mock community, this is fast enough, but running twice, first to get only top hit, then amptklib.log.info("Mapping OTUs to Mock Community (USEARCH)") cmd = [usearch, '-usearch_global', mock, '-strand', 'plus', '-id', '0.65', '-db', FastaCounts, '-userout', mock_out, '-userfields', 'query+target+id+ql+tl+alnlen+caln+mism+diffs', '-maxaccepts', '0', '-maxrejects', '0'] amptklib.runSubprocess(cmd, amptklib.log) #generate dictionary for name change ''' If args.calculate is set to all, that means the script is trying to measure a synthetic mock of some kind. if that is the case, then chimeras are < 95% identical to mock members and variants would be hits in between, i.e 95% > but not the best hit. '''
if args.add2db: #means user wants to add sequences to the usearch database on the fly, so we will grab sintax DB here, as not preformatted amptklib.log.info("Adding %s to database" % args.add2db) custom_db = base + '.custom_database.fa' if args.db: #this means that the fasta files are in sintax_db option current_db = sintax_db elif args.fasta_db: current_db = args.fasta_db with open(custom_db, 'wb') as outfile: with open(current_db, 'rU') as infile: shutil.copyfileobj(infile, outfile) with open(args.add2db, 'rU') as infile: shutil.copyfileobj(infile, outfile) #Count records amptklib.log.info("Loading FASTA Records") total = amptklib.countfasta(args.fasta) amptklib.log.info('{0:,}'.format(total) + ' OTUs') #declare output files/variables here blast_out = base + '.blast.txt' rdp_out = base + '.rdp.txt' utax_out = base + '.usearch.txt' usearch_out = base + '.usearch.txt' sintax_out = base + '.sintax.txt' if not args.taxonomy: #start with less common uses, i.e. Blast, rdp if args.method == 'blast': #check if command line blast installed if not amptklib.which('blastn'): amptklib.log.error("BLASTN not found in your PATH, exiting.")
# Working directory for intermediate files, named after the output prefix.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Load the reads: convert the input FASTQ to FASTA (used for mapping),
# then report the record count together with the input file size.
amptklib.log.info("Loading FASTQ Records")
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(orig_fasta)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Expected-errors filtering: both outputs share the
# "<out>.EE<maxee>" file-name stem inside the tmp folder.
ee_prefix = args.out + '.EE' + args.maxee
filter_out = os.path.join(tmp, ee_prefix + '.filter.fq')
filter_fasta = os.path.join(tmp, ee_prefix + '.filter.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
amptklib.runSubprocess(cmd, amptklib.log)
# number of reads that survived the filter
total = amptklib.countfastq(filter_out)
(FwdPrimer, RevPrimer)) else: amptklib.log.info("Working on file: %s" % args.fasta) if not args.cpus: cpus = multiprocessing.cpu_count() else: cpus = args.cpus #create temp directory pid = os.getpid() folder = 'amptk_tmp_' + str(pid) if not os.path.exists(folder): os.makedirs(folder) SeqCount = amptklib.countfasta(args.fasta) amptklib.log.info('{0:,}'.format(SeqCount) + ' records loaded') #if only 1 cpu just process data if cpus == 1: stripPrimer(args.fasta) else: amptklib.log.info("Using %i cpus to process data" % cpus) #now split it into chunks (as many cpus as are queried) amptklib.split_fasta(args.fasta, folder, cpus * 2) #get list of files file_list = [] for file in os.listdir(folder): if file.endswith(".fasta"): file = os.path.join(folder, file)
# Working directory for intermediate files, named after the output prefix.
tmp = args.out + '_tmp'
if not os.path.exists(tmp):
    os.makedirs(tmp)

# Load the reads: convert the input FASTQ to FASTA (used for mapping),
# then report the record count together with the input file size.
amptklib.log.info("Loading FASTQ Records")
orig_fasta = os.path.join(tmp, args.out + '.orig.fa')
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastaout', orig_fasta,
    '--fastq_qmax', '55',
]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(orig_fasta)
size = amptklib.checkfastqsize(args.FASTQ)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Expected-errors filtering: both outputs share the
# "<out>.EE<maxee>" file-name stem inside the tmp folder.
ee_prefix = args.out + '.EE' + args.maxee
filter_out = os.path.join(tmp, ee_prefix + '.filter.fq')
filter_fasta = os.path.join(tmp, ee_prefix + '.filter.fa')
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
cmd = [
    'vsearch',
    '--fastq_filter', args.FASTQ,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', filter_out,
    '--fastaout', filter_fasta,
    '--fastq_qmax', '55',
]
amptklib.runSubprocess(cmd, amptklib.log)
# number of reads that survived the filter
total = amptklib.countfastq(filter_out)
# Minimum DADA2 version accepted by this script.
# NOTE(review): the literal appears masked in this view; confirm the real value.
dada2_pass = '******'

# Check DADA2 first: stop with an install hint when the version check fails,
# otherwise report the detected R and DADA2 versions and carry on.
if not amptklib.gvc(Rversions[1], dada2_pass):
    amptklib.log.error(
        "R v%s; DADA2 v%s detected, need atleast v%s"
        % (Rversions[0], Rversions[1], dada2_pass))
    amptklib.log.error(
        "See: http://benjjneb.github.io/dada2/dada-installation.html")
    sys.exit(1)
amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

# Load the reads. The 3' N padding is stripped first because DADA2 cannot
# handle it; the cleaned FASTQ is then converted to FASTA for counting.
amptklib.log.info("Loading FASTQ Records")
no_ns = args.out + '.cleaned_input.fq'
amptklib.fastq_strip_padding(args.fastq, no_ns)
demuxtmp = args.out + '.original.fa'
cmd = [
    'vsearch',
    '--fastq_filter', os.path.abspath(no_ns),
    '--fastq_qmax', '55',
    '--fastaout', demuxtmp,
]
amptklib.runSubprocess(cmd, amptklib.log)
orig_total = amptklib.countfasta(demuxtmp)
size = amptklib.checkfastqsize(no_ns)
readablesize = amptklib.convertSize(size)
amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

# Quality filter on expected errors; reads containing any N are dropped too
# (--fastq_maxns 0).
amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
derep = args.out + '.qual-filtered.fq'
filtercmd = [
    'vsearch',
    '--fastq_filter', no_ns,
    '--fastq_maxee', str(args.maxee),
    '--fastqout', derep,
    '--fastq_qmax', '55',
    '--fastq_maxns', '0',
]
amptklib.runSubprocess(filtercmd, amptklib.log)
total = amptklib.countfastq(derep)
amptklib.log.info('{0:,}'.format(total) + ' reads passed')

# Next step: split the filtered reads into one file per sample.
amptklib.log.info("Splitting FASTQ file by Sample into individual files")
filtfolder = args.out + '_filtered'
"Error, you must specifiy either list of OTU names or a file containing OTU names, not both" ) sys.exit(1) if args.file: count = amptklib.line_count(args.file) #load in list of names to remove with open(args.file, 'rU') as input: lines = [line.rstrip('\n') for line in input] if args.list: count = len(args.list) lines = args.list #make sure it is a set, faster lookup dropList = set(lines) #load data total = amptklib.countfasta(args.input) amptklib.log.info("Loading %i OTUs" % total) #load in the fasta file, change if in dictionary and output to stdout amptklib.log.info("Dropping %i OTUs" % count) newOTUs = args.out + '.cleaned.otus.fa' with open(newOTUs, 'w') as otus: with open(args.input, 'rU') as fasta: for rec in SeqIO.parse(fasta, 'fasta'): if not rec.id in dropList: SeqIO.write(rec, otus, 'fasta') #now make new OTU table amptklib.log.info("Mapping Reads to OTUs and Building OTU table") newTable = args.out + '.cleaned.otu_table.txt' tmpReads = args.out + '.reads.tmp'