def processReadsPE(input, args=False):
    base = os.path.basename(input)
    forward_reads = os.path.join(tmpdir, base + '_R1.fq')
    reverse_reads = os.path.join(tmpdir, base + '_R2.fq')
    index_reads = os.path.join(tmpdir, base + '_R3.fq')
    trim_forward = os.path.join(tmpdir, base + '_R1.trimmed.fq')
    trim_reverse = os.path.join(tmpdir, base + '_R2.trimmed.fq')
    merged_reads = os.path.join(tmpdir, base + '.merged.fq')
    DemuxOut = os.path.join(tmpdir, base + '.demux.fq')
    Total, BCFound, ForPrimerCount, RevPrimerCount = amptklib.DemuxIllumina(
        forward_reads, reverse_reads, index_reads, Barcodes,
        args.barcode_mismatch, FwdPrimer, RevPrimer, args.primer_mismatch,
        trim_forward, trim_reverse)
    amptklib.MergeReadsSimple(trim_forward, trim_reverse, '.', merged_reads,
                              args.min_len, usearch, args.rescue_forward,
                              args.merge_method)
    MergeCount = amptklib.countfastq(merged_reads)
    amptklib.losslessTrim(merged_reads, FwdPrimer, RevPrimer,
                          args.primer_mismatch, args.trim_len, args.pad,
                          args.min_len, DemuxOut)
    FinalCount = amptklib.countfastq(DemuxOut)
    TooShort = MergeCount - FinalCount
    stats = os.path.join(tmpdir, base + '.stats')
    with open(stats, 'w') as counts:
        counts.write("%i,%i,%i,%i,%i,%i\n" % (Total, BCFound, ForPrimerCount,
                                              RevPrimerCount, TooShort,
                                              FinalCount))
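# Note on the per-chunk stats file written above: it is a single CSV line of
#   Total,BCFound,ForPrimerCount,RevPrimerCount,TooShort,FinalCount
# main() later sums these columns across all chunks into `finalstats` when it
# reports totals for paired-end Illumina demuxing.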
def processPEreads(input, args=False):
    '''
    Function for multiprocessing of the data: takes a (forward, reverse) file
    pair as input; needs the global forward/reverse lists to be available.
    '''
    for_reads, rev_reads = input
    if '_' in os.path.basename(for_reads):
        name = os.path.basename(for_reads).split("_")[0]
    else:
        name = os.path.basename(for_reads)
    amptklib.log.debug('{:}: {:} {:}'.format(name, for_reads, rev_reads))
    #for_reads = os.path.join(args.input, forwardRead)
    #rev_reads = os.path.join(args.input, reverseRead)
    StatsOut = os.path.join(args.out, name + '.stats')
    #if read length explicitly passed use it, otherwise measure it
    if args.read_length:
        read_length = args.read_length
    else:
        read_length = amptklib.GuessRL(for_reads)
    trimR1 = os.path.join(args.out, name + '_R1.fq')
    trimR2 = os.path.join(args.out, name + '_R2.fq')
    mergedReads = os.path.join(args.out, name + '.merged.fq')
    demuxReads = os.path.join(args.out, name + '.demux.fq')
    TotalCount, Written, DropMulti, FindForPrimer, FindRevPrimer = amptklib.stripPrimersPE(
        for_reads, rev_reads, read_length, name, FwdPrimer, RevPrimer,
        args.primer_mismatch, args.primer, args.full_length, trimR1, trimR2)
    MergedCount, PhixCleanedCount = amptklib.MergeReadsSimple(
        trimR1, trimR2, args.out, name + '.merged.fq', args.min_len, usearch,
        args.rescue_forward, args.merge_method)
    amptklib.losslessTrim(mergedReads, FwdPrimer, RevPrimer,
                          args.primer_mismatch, args.trim_len, args.pad,
                          args.min_len, demuxReads)
    FinalCount = amptklib.countfastq(demuxReads)
    TooShort = PhixCleanedCount - FinalCount
    with open(StatsOut, 'w') as counts:
        counts.write("%i,%i,%i,%i,%i,%i\n" % (TotalCount, FindForPrimer,
                                              FindRevPrimer, DropMulti,
                                              TooShort, FinalCount))
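# Note the stats column order here differs from processReadsPE above:
#   TotalCount,FindForPrimer,FindRevPrimer,DropMulti,TooShort,FinalCount
# Any code aggregating .stats files must read the two layouts accordingly.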
def main(args):
    global FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_ion.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trims/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest='fastq', required=True, help='FASTQ R1 file')
    parser.add_argument('--reverse', help='Illumina R2 reverse reads')
    parser.add_argument('-o', '--out', dest="out", default='illumina2', help='Base name for output')
    parser.add_argument('-f', '--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer')
    parser.add_argument('-r', '--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
    parser.add_argument('-m', '--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mismatches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mismatches in barcode')
    parser.add_argument('--barcode_fasta', help='FASTA file containing barcodes (names & sequences)')
    parser.add_argument('--barcode_not_anchored', action='store_true', help='Barcodes (indexes) are not at start of reads')
    parser.add_argument('--reverse_barcode', help="FASTA file containing 3' barcodes")
    parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
    parser.add_argument('-l', '--trim_len', default=300, type=int, help='Trim length for reads')
    parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
    args = parser.parse_args(args)

    args.out = re.sub(r'\W+', '', args.out)
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    FNULL = open(os.devnull, 'w')
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus
    #parse a mapping file or a barcode fasta file, primers, etc.
    #dealing with barcodes: get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)
    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
    else:
        #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
            sys.exit(1)
        else:
            shutil.copyfile(args.barcode_fasta, barcode_file)
            Barcodes = amptklib.fasta2barcodes(barcode_file, False)
            if args.reverse_barcode:
                shutil.copyfile(args.reverse_barcode, rev_barcode_file)
                RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, False)
    #parse primers here so it doesn't conflict with mapping primers
    #look up primer in db, otherwise default to the entry itself
    if args.F_primer in amptklib.primer_db:
        FwdPrimer = amptklib.primer_db.get(args.F_primer)
        amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
    else:
        FwdPrimer = args.F_primer
        amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.F_primer))
    if args.R_primer in amptklib.primer_db:
        RevPrimer = amptklib.primer_db.get(args.R_primer)
        amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
    else:
        RevPrimer = args.R_primer
        amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.R_primer))
    #check if input is compressed
    gzip_list = []
    if args.fastq.endswith('.gz'):
        gzip_list.append(os.path.abspath(args.fastq))
    if args.reverse:
        if args.reverse.endswith('.gz'):
            gzip_list.append(os.path.abspath(args.reverse))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for file in gzip_list:
            file_out = file.replace('.gz', '')
            amptklib.Funzip(file, file_out, cpus)
        args.fastq = args.fastq.replace('.gz', '')
        if args.reverse:
            args.reverse = args.reverse.replace('.gz', '')
    #count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(args.fastq)
    size = amptklib.checkfastqsize(args.fastq)
    readablesize = amptklib.convertSize(size * 2)
    amptklib.log.info('{:,} reads ({:})'.format(orig_total, readablesize))
    #output barcodes/samples
    amptklib.log.info('Searching for {:} forward barcodes and {:} reverse barcodes'.format(len(Barcodes), len(RevBarcodes)))
    #create tmpdir and split input over n cpus
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    #tell user how many cores are being used
    amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
    if args.reverse:
        amptklib.log.info("Demuxing PE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, RevPrimer))
    else:
        amptklib.log.info("Demuxing SE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, amptklib.RevComp(RevPrimer)))
    amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))
    if cpus > 1:
        if args.reverse:
            amptklib.split_fastqPE(args.fastq, args.reverse, orig_total, tmpdir, cpus * 4)
            file_list = []
            for file in os.listdir(tmpdir):
                if file.endswith('.fq'):
                    filepart = os.path.join(tmpdir, file.split('_R')[0])
                    if not filepart in file_list:
                        file_list.append(filepart)
            amptklib.runMultiProgress(processReadsPE, file_list, cpus, args=args)
        else:
            #split fastq file
            amptklib.split_fastq(args.fastq, orig_total, tmpdir, cpus * 4)
            #now get file list from tmp folder
            file_list = []
            for file in os.listdir(tmpdir):
                if file.endswith(".fq"):
                    file = os.path.join(tmpdir, file)
                    file_list.append(file)
            #finally process reads over number of cpus
            amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        if args.reverse:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
            shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
            processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
        else:
            shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk.fq'))
            processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)
    print("-------------------------------------------------------")
    #now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")
    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    if args.reverse:
        #parse the stats
        finalstats = [0, 0, 0, 0, 0, 0]
        for file in os.listdir(tmpdir):
            if file.endswith('.stats'):
                with open(os.path.join(tmpdir, file), 'r') as statsfile:
                    line = statsfile.readline()
                    line = line.rstrip()
                    newstats = line.split(',')
                    newstats = [int(i) for i in newstats]
                    for x, num in enumerate(newstats):
                        finalstats[x] += num
        amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[3]) + ' valid Barcodes')
        amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads (Barcodes and Primers)')
    else:
        #parse the stats
        finalstats = [0, 0, 0, 0, 0, 0, 0]
        for file in os.listdir(tmpdir):
            if file.endswith('.stats'):
                with open(os.path.join(tmpdir, file), 'r') as statsfile:
                    line = statsfile.readline()
                    line = line.rstrip()
                    newstats = line.split(',')
                    newstats = [int(i) for i in newstats]
                    for x, num in enumerate(newstats):
                        finalstats[x] += num
        amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
        if args.reverse_barcode:
            amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2] - finalstats[4]) + ' valid Fwd and Rev Barcodes')
        else:
            amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) + ' valid Barcodes')
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2]) + ' Fwd Primer found, {0:,}'.format(finalstats[3]) + ' Rev Primer found')
        amptklib.log.info('{0:,}'.format(finalstats[5]) + ' discarded too short (< %i bp)' % args.min_len)
        amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads')
    #clean up tmp folder
    amptklib.SafeRemove(tmpdir)
    #last thing is to re-number reads, as they could share names from the multiprocessor split
    catDemux = args.out + '.demux.fq'
    amptklib.fastqreindex(tmpDemux, catDemux)
    amptklib.SafeRemove(tmpDemux)
    #now loop through data and find barcoded samples, counting each one
    BarcodeCount = {}
    with open(catDemux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1
    #now report the barcodes found and the number of times each was found
    barcode_counts = "%22s: %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
        barcode_counts += "\n%22s: %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))
    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        #create a generic mapping file for downstream processes
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)
    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    if gzip_list:
        for file in gzip_list:
            file = file.replace('.gz', '')
            amptklib.removefile(file)
    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
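# Hypothetical invocation sketch (file names made up; flags are those defined
# by the parser above):
#   main(['-i', 'run1_R1.fastq', '--reverse', 'run1_R2.fastq',
#         '-m', 'mapping.txt', '-o', 'run1'])
# would write run1.demux.fq.gz and run1.mapping_file.txt, ready for the
# `amptk cluster -i run1.demux.fq.gz -o out` step printed above.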
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-fastq2sra.py',
        usage="%(prog)s [options] -i folder",
        description='''Script to split FASTQ file from Ion, 454, or Illumina by barcode sequence into separate files for submission to SRA. This script can take the BioSample worksheet from NCBI and create an SRA metadata file for submission.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', dest='FASTQ', required=True, help='Input FASTQ file or folder')
    parser.add_argument('-o', '--out', dest='out', help='Basename for output folder/files')
    parser.add_argument('--min_len', default=50, type=int, help='Minimum length of read to keep')
    parser.add_argument('-b', '--barcode_fasta', help='Multi-fasta file containing barcodes used')
    parser.add_argument('--reverse_barcode', help='Reverse barcode fasta file')
    parser.add_argument('-s', '--biosample', dest='biosample', help='BioSample file from NCBI')
    parser.add_argument('-p', '--platform', dest='platform', default='ion', choices=['ion', 'illumina', '454'], help='Sequencing platform')
    parser.add_argument('-f', '--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer (fITS7)')
    parser.add_argument('-r', '--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer (ITS4)')
    parser.add_argument('-n', '--names', help='CSV mapping file BC,NewName')
    parser.add_argument('-d', '--description', help='Paragraph description for SRA metadata')
    parser.add_argument('-t', '--title', default='Fungal ITS', help='Start of title for SRA submission, name it according to amplicon')
    parser.add_argument('-m', '--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mismatches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mismatches in barcode')
    parser.add_argument('--require_primer', default='off', choices=['forward', 'both', 'off'], help='Require primers to be present')
    parser.add_argument('--force', action='store_true', help='Overwrite existing directory')
    parser.add_argument('-a', '--append', help='Append a name to all sample names for a run, i.e. --append run1 would yield Sample_run1')
    args = parser.parse_args(args)

    #get basename if args.out not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    log_name = base + '.amptk-sra.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    amptklib.SystemInfo()
    amptkversion = amptklib.get_version()

    #create output directory
    if not os.path.exists(base):
        os.makedirs(base)
    else:
        if not args.force:
            amptklib.log.error("Directory %s exists, add --force argument to overwrite" % base)
            sys.exit(1)
        else:
            shutil.rmtree(base)
            os.makedirs(base)

    #parse a mapping file or a barcode fasta file, primers, etc.
    #dealing with barcodes: get ion barcodes or parse the barcode_fasta argument
    barcode_file = os.path.join(base, base + ".barcodes_used.fa")
    rev_barcode_file = os.path.join(base, base + ".revbarcodes_used.fa")
    if os.path.isfile(barcode_file):
        os.remove(barcode_file)
    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
    else:
        if args.barcode_fasta:
            with open(barcode_file, 'w') as barcodeout:
                with open(args.barcode_fasta, 'r') as input:
                    for rec in SeqIO.parse(input, 'fasta'):
                        #name each barcode record by its FASTA ID
                        #(the original referenced an undefined args.multi here)
                        outname = rec.id
                        barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
        if args.reverse_barcode:
            with open(rev_barcode_file, 'w') as barcodeout:
                with open(args.reverse_barcode, 'r') as input:
                    for rec in SeqIO.parse(input, 'fasta'):
                        outname = rec.id
                        barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
    #parse primers here so it doesn't conflict with mapping primers
    #look up primer in db, otherwise default to the entry itself
    if FwdPrimer == '':
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.F_primer))
    if RevPrimer == '':
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.R_primer))
    #then setup barcode dictionary
    if len(Barcodes) < 1 and os.path.isfile(barcode_file):
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)
    #setup for looking for reverse barcode
    if len(RevBarcodes) < 1 and args.reverse_barcode:
        if not os.path.isfile(args.reverse_barcode):
            amptklib.log.info("Reverse barcode is not a valid file, exiting")
            sys.exit(1)
        shutil.copyfile(args.reverse_barcode, rev_barcode_file)
        RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)
    if args.platform != 'illumina':
        if not args.mapping_file and not args.barcode_fasta:
            amptklib.log.error("For ion, 454, or illumina2 datasets you must specify a multi-fasta file containing barcodes with -b/--barcode_fasta or -m/--mapping_file")
            sys.exit(1)
    if args.platform == 'illumina':
        #just need to get the correct .fastq.gz files into a folder by themselves
        #if illumina is selected, verify that the input is a folder
        if not os.path.isdir(args.FASTQ):
            amptklib.log.error("%s is not a folder, for '--platform illumina', -i must be a folder containing raw reads" % (args.FASTQ))
            sys.exit(1)
        rawlist = []
        filelist = []
        for file in os.listdir(args.FASTQ):
            if file.endswith(".fastq.gz") or file.endswith('.fastq') or file.endswith('.fq'):
                rawlist.append(file)
        if len(rawlist) > 0:
            if not '_R2' in sorted(rawlist)[1]:
                amptklib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), base))
                filelist = rawlist
                for file in rawlist:
                    shutil.copyfile(os.path.join(args.FASTQ, file), (os.path.join(base, file)))
            else:
                amptklib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) / 2, base))
                for file in rawlist:
                    shutil.copyfile(os.path.join(args.FASTQ, file), (os.path.join(base, file)))
                    if '_R1' in file:
                        filelist.append(file)
    else:
        #start here to process the reads, first reverse complement the reverse primer
        ReverseCompRev = amptklib.RevComp(RevPrimer)
        #if --names given, load into dictionary
        if args.names:
            amptklib.log.info("Parsing names for output files via %s" % args.names)
            namesDict = {}
            with open(args.names, 'r') as input:
                for line in input:
                    line = line.replace('\n', '')
                    cols = line.split(',')
                    if not cols[0] in namesDict:
                        namesDict[cols[0]] = cols[1]
        #check for compressed input file
        if args.FASTQ.endswith('.gz'):
            amptklib.log.info("Gzipped input files detected, uncompressing")
            FASTQ_IN = args.FASTQ.replace('.gz', '')
            amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count())
        else:
            FASTQ_IN = args.FASTQ
        #count FASTQ records in input
        amptklib.log.info("Loading FASTQ Records")
        total = amptklib.countfastq(FASTQ_IN)
        size = amptklib.checkfastqsize(args.FASTQ)
        readablesize = amptklib.convertSize(size)
        amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')
        #output message depending on primer requirement
        if args.require_primer == 'off':
            amptklib.log.info("Looking for %i barcodes" % (len(Barcodes)))
        elif args.require_primer == 'forward':
            amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s" % (len(Barcodes), FwdPrimer))
        elif args.require_primer == 'both':
            amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s" % (len(Barcodes), FwdPrimer, RevPrimer))
        #this will loop through the FASTQ file once, splitting reads into files where barcodes are found and trimming the barcodes
        runningTotal = 0
        with open(FASTQ_IN, 'r') as input:
            for title, seq, qual in FastqGeneralIterator(input):
                Barcode, BarcodeLabel = amptklib.AlignBarcode(seq, Barcodes, args.barcode_mismatch)
                if Barcode == "":
                    continue
                #trim barcode from sequence
                BarcodeLength = len(Barcode)
                seq = seq[BarcodeLength:]
                qual = qual[BarcodeLength:]
                #look for forward primer
                if args.require_primer != 'off':
                    #means we only keep reads with the forward (and/or reverse) primer, but don't remove it
                    #now search for forward primer
                    foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
                    if foralign["editDistance"] < 0:
                        continue
                    if args.require_primer == 'both':
                        #now search for reverse primer
                        revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
                        if revalign["editDistance"] < 0:  #reverse primer was not found
                            continue
                #check size; filter out sequences less than minimum length
                if len(seq) < args.min_len:
                    continue
                runningTotal += 1
                fileout = os.path.join(base, BarcodeLabel + '.fastq')
                with open(fileout, 'a') as output:
                    output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
        if args.require_primer == 'off':
            amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode')
        elif args.require_primer == 'forward':
            amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and fwd primer')
        elif args.require_primer == 'both':
            amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and both primers')
        amptklib.log.info("Now Gzipping files")
        for file in os.listdir(base):
            if file.endswith(".fastq"):
                file_path = os.path.join(base, file)
                amptklib.Fzip_inplace(file_path)
        #after all files demuxed into output folder, loop through and create SRA metadata file
        filelist = []
        for file in os.listdir(base):
            if file.endswith(".fastq.gz"):
                filelist.append(file)
    amptklib.log.info("Finished: output in %s" % base)
    #clean up if gzipped
    if args.FASTQ.endswith('.gz'):
        amptklib.removefile(FASTQ_IN)
    #check for BioSample meta file
    if args.biosample:
        amptklib.log.info("NCBI BioSample file detected, creating SRA metadata file")
        #load in BioSample file to dictionary
        with open(args.biosample, 'r') as input:
            reader = csv.reader(input, delimiter=str('\t'))
            header = next(reader)
            acc = header.index('Accession')
            sample = header.index('Sample Name')
            bio = header.index('BioProject')
            try:
                host = header.index('Host')
            except ValueError:
                host = header.index('Organism')
            BioDict = {col[sample]: (col[acc], col[bio], col[host]) for col in reader}
        #set some defaults based on the platform
        header = 'bioproject_accession\tbiosample_accession\tlibrary_ID\ttitle\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tplatform\tinstrument_model\tdesign_description\tfiletype\tfilename\tfilename2\tforward_barcode\treverse_barcode\tforward_primer\treverse_primer\n'
        if args.platform == 'ion':
            sequencer = 'ION_TORRENT'
            model = 'Ion Torrent PGM'
            lib_layout = 'single'
        elif args.platform == '454':
            sequencer = '_LS454'
            model = '454 GS FLX Titanium'
            lib_layout = 'single'
        elif args.platform == 'illumina':
            sequencer = 'ILLUMINA'
            model = 'Illumina MiSeq'
            lib_layout = 'paired'
        else:
            amptklib.log.error("You specified a platform that is not supported")
            sys.exit(1)
        lib_strategy = 'AMPLICON'
        lib_source = 'GENOMIC'
        lib_selection = 'RANDOM PCR'
        filetype = 'fastq'
        #now open file for writing, write the header, and then loop through samples
        sub_out = base + '.submission.txt'
        with open(sub_out, 'w') as output:
            output.write(header)
            for file in filelist:
                barcode_for = ''
                barcode_rev = ''
                if not args.description:
                    description = '%s amplicon library was created using a barcoded fusion primer PCR protocol using Pfx50 polymerase (Thermo Fisher Scientific), size selected, and sequenced on the %s platform. Sequence data was minimally processed; sequences were exported directly from the sequencing platform and only the barcode (index sequence) was trimmed prior to SRA submission. SRA submission generated with AMPtk %s' % (args.title, model, amptkversion.split(' ')[-1])
                else:
                    description = args.description
                if args.platform == 'ion' or args.platform == '454':
                    name = file.split(".fastq")[0]
                    if not name in BioDict:
                        #let's try to look a bit harder, i.e. split on _ and - and look again
                        searchname = name.replace('-', '_')
                        searchname = searchname.split('_')[0]
                        if not searchname in BioDict:  #if still not found, then skip
                            continue
                    else:
                        searchname = name
                    bioproject = BioDict.get(searchname)[1]
                    if not bioproject.startswith('PRJNA'):
                        bioproject = 'PRJNA' + bioproject
                    sample_name = BioDict.get(searchname)[0]
                    title = '%s amplicon sequencing of %s: sample %s' % (args.title, BioDict.get(searchname)[2], name)
                    bc_name = file.split(".f")[0]
                    if bc_name in Barcodes:
                        barcode_for = Barcodes.get(bc_name)
                    if bc_name in RevBarcodes:
                        barcode_rev = RevBarcodes.get(bc_name)
                    if args.append:
                        finalname = name + '_' + args.append
                        #also need to change the name of the output file
                        newfile = file.replace(name, finalname)
                        os.rename(os.path.join(base, file), os.path.join(base, newfile))
                    else:
                        finalname = name
                        newfile = file
                    line = [bioproject, sample_name, finalname, title, lib_strategy, lib_source, lib_selection, lib_layout, sequencer, model, description, filetype, newfile, '', barcode_for, barcode_rev, FwdPrimer, RevPrimer]
                elif args.platform == 'illumina':
                    name = file.split("_")[0]
                    if not name in BioDict:
                        amptklib.log.info('{:} not found in BioSample text file'.format(name))
                        continue
                    bioproject = BioDict.get(name)[1]
                    if not bioproject.startswith('PRJNA'):
                        bioproject = 'PRJNA' + bioproject
                    sample_name = BioDict.get(name)[0]
                    title = '%s amplicon sequencing of %s: sample %s' % (args.title, BioDict.get(name)[2], name)
                    file2 = file.replace('_R1', '_R2')
                    #count number of _ in name, which determines the data format
                    fields = file.count("_")
                    if fields > 3:  #this is a full illumina name with dual barcodes
                        dualBC = file.split("_")[1]
                        if '-' in dualBC:
                            barcode_for = dualBC.split('-')[0]
                            barcode_rev = dualBC.split('-')[1]
                    elif fields == 3:  #this is the older reverse barcoded name
                        barcode_for = ''
                        barcode_rev = file.split("_")[1]
                    if args.append:
                        finalname = name + '_' + args.append
                        newfile = file.replace(name, finalname)
                        newfile2 = file2.replace(name, finalname)
                        #also need to change the names of the output files
                        os.rename(os.path.join(base, file), os.path.join(base, newfile))
                        os.rename(os.path.join(base, file2), os.path.join(base, newfile2))
                        file = file.replace(name, finalname)
                    else:
                        finalname = name
                        newfile = file
                        newfile2 = file2
                    line = [bioproject, sample_name, finalname, title, lib_strategy, lib_source, lib_selection, lib_layout, sequencer, model, description, filetype, newfile, newfile2, barcode_for, barcode_rev, FwdPrimer, RevPrimer]
                #write output to file
                output.write('\t'.join(line) + '\n')
        amptklib.log.info("SRA submission file created: %s" % sub_out)
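# The submission.txt written above is tab-delimited, one row per sample, with
# the 18 columns named in `header`: bioproject_accession, biosample_accession,
# library_ID, title, library_strategy, library_source, library_selection,
# library_layout, platform, instrument_model, design_description, filetype,
# filename, filename2, forward_barcode, reverse_barcode, forward_primer,
# reverse_primer. E.g. an ion-platform row begins (made-up placeholder values):
#   PRJNA123456  SAMN00000001  sample1  Fungal ITS amplicon sequencing of ...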
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster_ref.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering. Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value')
    parser.add_argument('-p', '--pct_otu', default='97', help="OTU Clustering Percent")
    parser.add_argument('--id', default='97', help="Threshold for alignment")
    parser.add_argument('-m', '--minsize', default='2', help='Min identical seqs to process')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs')
    parser.add_argument('-d', '--db', required=True, help='Reference Database [ITS,ITS1,ITS2,16S,LSU,COI,custom]')
    parser.add_argument('--utax_db', help='UTAX Reference Database')
    parser.add_argument('--utax_cutoff', default=0.8, type=restricted_float, help='UTAX confidence value threshold.')
    parser.add_argument('--utax_level', default='k', choices=['k', 'p', 'c', 'o', 'f', 'g', 's'], help='UTAX classification level to retain')
    parser.add_argument('--mock', default='synmock', help='Spike-in mock community (fasta)')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    parser.add_argument('--closed_ref_only', action='store_true', help='Only run closed reference clustering')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    #get basename if args.out not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    taxonomyLookup = {
        'k': 'Kingdom', 'p': 'Phylum', 'c': 'Class', 'o': 'Order',
        'f': 'Family', 'g': 'Genus', 's': 'Species'
    }

    #remove logfile if it exists
    log_name = base + '.amptk-cluster_ref.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()
    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    #setup DB locations and names
    DBdir = os.path.join(parentdir, 'DB')
    DataBase = {
        'ITS1': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS1_UTAX.udb')),
        'ITS2': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS2_UTAX.udb')),
        'ITS': (os.path.join(DBdir, 'ITS.udb'), os.path.join(DBdir, 'ITS_UTAX.udb')),
        '16S': (os.path.join(DBdir, '16S.udb'), os.path.join(DBdir, '16S.udb')),
        'LSU': (os.path.join(DBdir, 'LSU.udb'), os.path.join(DBdir, 'LSU_UTAX.udb')),
        'COI': (os.path.join(DBdir, 'COI.udb'), os.path.join(DBdir, 'COI_UTAX.udb'))
    }
    #setup refDB
    amptklib.log.info("Checking Reference Database")
    if args.db in DataBase:
        #need to write fasta from the vsearch UDB
        DB = os.path.join(tmp, args.db + '.extracted.fa')
        cmd = ['vsearch', '--udb2fasta', DataBase.get(args.db)[0], '--output', DB]
        amptklib.runSubprocess(cmd, amptklib.log)
    else:
        DB = os.path.abspath(args.db)
    refDB = os.path.join(tmp, 'reference_DB.fa')
    if args.mock:
        if args.mock == 'synmock':
            mock = os.path.join(parentdir, 'DB', 'amptk_synmock.fa')
        else:
            mock = os.path.abspath(args.mock)
    seen = []
    with open(refDB, 'w') as output:
        if args.mock:
            with open(mock) as input1:
                for rec in SeqIO.parse(input1, 'fasta'):
                    if not rec.id in seen:
                        SeqIO.write(rec, output, 'fasta')
                        seen.append(rec.id)  #track IDs so duplicates are caught
                    else:
                        amptklib.log.error("Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                        sys.exit(1)
        with open(DB) as input2:
            for rec in SeqIO.parse(input2, 'fasta'):
                if not rec.id in seen:
                    SeqIO.write(rec, output, 'fasta')
                    seen.append(rec.id)
                else:
                    amptklib.log.error("Duplicate IDs in Ref DB: %s, exiting" % rec.id)
                    sys.exit(1)
    #get utax database
    if args.db in DataBase:
        utaxDB = DataBase.get(args.db)[1]
    else:
        if not args.closed_ref_only:
            if args.utax_db:
                utaxDB = os.path.abspath(args.utax_db)
            else:
                amptklib.log.error("%s is not a pre-installed DB, so you must also specify a valid UTAX database via --utax_db" % args.db)
                sys.exit(1)
    #count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
    #expected-errors quality filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    qtrimtotal = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(qtrimtotal) + ' reads passed')
    #now run full-length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    #now sort by size
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    amptklib.log.info("Sorting reads by size: removing reads seen less than %s times" % args.minsize)
    cmd = ['vsearch', '--sortbysize', derep_out, '--minsize', args.minsize, '--output', sort_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(sort_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    #chimera detection
    #first run through de novo chimera detection
    amptklib.log.info("De novo chimera detection (VSEARCH)")
    chimera_out = os.path.join(tmp, base + '.EE' + args.maxee + '.chimera_check.fa')
    cmd = ['vsearch', '--uchime_denovo', sort_out, '--relabel', 'Seq', '--sizeout', '--nonchimeras', chimera_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(chimera_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    #now run uchime_ref
    uchime_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
    #now run reference chimera filtering
    amptklib.log.info("Chimera Filtering (VSEARCH)")
    cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', chimera_out, '--db', refDB, '--sizeout', '--nonchimeras', uchime_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uchime_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')
    #now run usearch_global against the reference database
    align_out = os.path.join(tmp, base + '.align.uc')
    pident = int(args.id) * 0.01
    amptklib.log.info("Reference Clustering using Global Alignment, %s%% identity" % args.id)
    cmd = ['vsearch', '--usearch_global', uchime_out, '--db', refDB, '--id', str(pident), '--output_no_hits', '--top_hits_only', '--notrunclabels', '--uc', align_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #parse results
    ref_results = {}
    nohits = []
    with open(align_out, 'r') as alignment:
        for line in alignment:
            line = line.replace('\n', '')
            col = line.split('\t')
            counts = col[8].split(';')
            counts = int(counts[1].replace('size=', ''))
            if col[3] == '*':
                nohits.append(col[8])
                continue
            if float(col[3]) >= float(args.id):
                if not col[8] in ref_results:
                    ref_results[col[8]] = (col[9], col[3], counts)
                else:
                    print("Error: %s duplicated ID" % col[8])
            else:
                nohits.append(col[8])
    #summarize results from first ref clustering
    num_refcluster = len(ref_results)
    seqs_refcluster = 0
    for k, v in list(ref_results.items()):
        seqs_refcluster += v[2]
    amptklib.log.info("%i OTUs classified " % num_refcluster + "({0:.0f}%".format(seqs_refcluster / float(qtrimtotal) * 100) + " of reads)")
    #write ref-clustered hits to file with taxonomy
    ref_clustered = os.path.join(tmp, base + '.ref_clustered.fa')
    with open(ref_clustered, 'w') as refoutput:
        with open(uchime_out, 'r') as input:
            otu_counter = 1
            for rec in SeqIO.parse(input, 'fasta'):
                if rec.id in ref_results:
                    res = ref_results.get(rec.id)
                    pident = res[1]
                    tax = res[0]
                    newID = 'OTU' + str(otu_counter) + ';pident=' + pident + ';' + tax
                    rec.id = newID
                    rec.name = ''
                    rec.description = ''
                    SeqIO.write(rec, refoutput, 'fasta')
                    otu_counter += 1
    if not args.closed_ref_only:
        #write the no-hit sequences to a file for de novo clustering
        utax_ref = os.path.join(tmp, base + '.EE' + args.maxee + '.utax_ref.fa')
        with open(utax_ref, 'w') as output:
            with open(uchime_out, 'r') as input:
                for rec in SeqIO.parse(input, 'fasta'):
                    if rec.id in nohits:
                        SeqIO.write(rec, output, 'fasta')
        #input needs to be sorted, so:
        ref_sort = os.path.join(tmp, base + '.utax_ref.sorted.fa')
        cmd = ['vsearch', '--sortbysize', utax_ref, '--minsize', args.minsize, '--output', ref_sort, '--threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)
        #now run the clustering algorithm on sequences not found in the reference database
        radius = str(100 - int(args.pct_otu))
        otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
        amptklib.log.info("De novo Clustering remaining sequences (UPARSE)")
        cmd = [usearch, '-cluster_otus', ref_sort, '-relabel', 'OTU', '-otu_radius_pct', radius, '-otus', otu_out]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(otu_out)
        amptklib.log.info('{0:,}'.format(total) + ' de novo OTUs')
        #try utax reference clustering
        amptklib.log.info("Reference Clustering de novo OTUs using UTAX")
        cmd = [usearch, '-cluster_otus_utax', otu_out, '-db', utaxDB, '-utax_cutoff', str(args.utax_cutoff), '-utax_level', 's', '-strand', 'plus', '-utaxout', os.path.join(tmp, base + '.utax.out')]
        amptklib.runSubprocess(cmd, amptklib.log)
        #setup tax filtering
        tax_values = ['k', 'p', 'c', 'o', 'f', 'g', 's']
        filter_index = tax_values.index(args.utax_level)
        filt_tax_values = [s + ':' for s in tax_values[filter_index:]]
        #get results from utax
        with open(ref_clustered, 'a') as output:
            seqDict = SeqIO.index(otu_out, 'fasta')
            utaxresults = []
            with open(os.path.join(tmp, base + '.utax.out'), 'r') as utax:
                for line in utax:
                    line = line.replace('\n', '')
                    col = line.split('\t')
                    ID = col[0]
                    tax = col[2]
                    if any(x in tax for x in filt_tax_values):
                        record = seqDict[ID]
                        record.id = 'OTU' + str(otu_counter) + ';UTAX;tax=' + tax
                        record.name = ''
                        record.description = ''
                        SeqIO.write(record, output, 'fasta')
                        otu_counter += 1
        total = amptklib.countfasta(ref_clustered) - num_refcluster
        amptklib.log.info('{0:,}'.format(total) + ' classified to %s' % taxonomyLookup.get(args.utax_level))
    #clean up padded Ns
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.clean.otus.fa')
    amptklib.fasta_strip_padding(ref_clustered, otu_clean)
    total = amptklib.countfasta(otu_clean)
    amptklib.log.info('{0:,}'.format(total) + ' total OTUs')
    #now map reads back to OTUs
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', otu_clean, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))
    #move files around, delete tmp if argument passed
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(otu_clean, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)
    #print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")
    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
    else:
        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
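# Output FASTA headers encode how each OTU was assigned, per the code above
# (IDs and values here are illustrative):
#   >OTU1;pident=99.2;<taxonomy>    closed-reference hit; pident comes from
#                                   column 4 of the vsearch .uc alignment
#   >OTU57;UTAX;tax=<taxonomy>      de novo OTU kept because UTAX classified
#                                   it at --utax_level or below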
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-OTU_cluster.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UPARSE OTU clustering. Requires USEARCH by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value')
    parser.add_argument('-p', '--pct_otu', default='97', help="OTU Clustering Percent")
    parser.add_argument('-m', '--minsize', default='2', help='Min size to keep for clustering')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--uchime_ref', help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered', action='store_true', help='map quality filtered reads back to OTUs')
    parser.add_argument('--unoise', action='store_true', help='Run De-noising (UNOISE)')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    #get basename if args.out not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]
    #remove logfile if it exists
    log_name = base + '.amptk-cluster.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()
    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)
    #count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')
    #expected-errors quality filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee), '--fastqout', filter_out, '--fastaout', filter_fasta, '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    #now run full-length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_fasta, '--sizeout', '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    #optionally run UNOISE
    if args.unoise:
        unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.denoised.fa')
        amptklib.log.info("Denoising Data with UNOISE")
        cmd = [usearch, '-cluster_fast', derep_out, '-centroids', unoise_out, '-id', '0.9', '--maxdiffs', '5', '-abskew', '10', '-sizein', '-sizeout', '-sort', 'size', '-threads', str(cpus)]
        amptklib.runSubprocess(cmd, amptklib.log)
        total = amptklib.countfasta(unoise_out)
        amptklib.log.info('{0:,}'.format(total) + ' reads passed')
    else:
        unoise_out = derep_out
    #now sort by size and remove singletons
    sort_out = os.path.join(tmp, base + '.EE' + args.maxee + '.sort.fa')
    cmd = ['vsearch', '--sortbysize', unoise_out, '--minsize', args.minsize, '--output', sort_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #now run the clustering algorithm
    radius = str(100 - int(args.pct_otu))
    otu_out = os.path.join(tmp, base + '.EE' + args.maxee + '.otus.fa')
    amptklib.log.info("Clustering OTUs (UPARSE)")
    cmd = [usearch, '-cluster_otus', sort_out, '-relabel', 'OTU', '-otu_radius_pct', radius, '-otus', otu_out, '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    numOTUs = amptklib.countfasta(otu_out)
    amptklib.log.info('{0:,}'.format(numOTUs) + ' OTUs')
    #clean up padded Ns
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.otus.fa')
    amptklib.fasta_strip_padding(otu_out, otu_clean)
    #optional UCHIME Ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #check if file is present, remove from previous run if it is
        if os.path.isfile(uchime_out):
            os.remove(uchime_out)
        #R. Edgar now says using the largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error("Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering")
                uchime_out = otu_clean
            #since uchime cannot work with a udb database, need to extract fasta sequences
            if not amptklib.checkfile(os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
                cmd = ['vsearch', '--udb2fasta', os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'), '--output', uchime_db]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error("%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref)
                uchime_out = otu_clean
        #now run chimera filtering if all checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
            cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean, '--db', uchime_db, '--nonchimeras', uchime_out, '--threads', str(cpus)]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            uchime_chimeras = numOTUs - total
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras')
    #filter out OTUs in the wrong orientation
    amptklib.log.info('Validating OTU orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.otus.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, sort_out, uchime_out, passingOTUs)
    amptklib.log.info('{:,} OTUs validated ({:,} dropped)'.format(numKept, numDropped))
    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97', '--db', passingOTUs, '--uc', uc_out, '--otutabout', otu_table, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))
    #move files around, delete tmp if argument passed
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(passingOTUs, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)
    #print location of files to STDOUT
    print("-------------------------------------------------------")
    print("OTU Clustering Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("-------------------------------------------------------")
    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
    else:
        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
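# Note on -otu_radius_pct above: UPARSE takes a radius, not an identity, so
# radius = 100 - pct_otu (e.g. the default --pct_otu 97 gives a radius of 3,
# i.e. OTUs are clustered at ~97% identity).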
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description='''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', required=True, help='Input demuxed FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument('-m', '--min_reads', default=10, type=int, help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l', '--length', type=int, help='Length to truncate reads')
    parser.add_argument('-e', '--maxee', default='1.0', help='MaxEE quality filtering')
    parser.add_argument('-p', '--pct_otu', default='97', help="Biological OTU Clustering Percent")
    parser.add_argument('--platform', default='ion', choices=['ion', 'illumina', '454'], help='Sequencing platform')
    parser.add_argument('--chimera_method', default='consensus', choices=['consensus', 'pooled', 'per-sample'], help='Bimera removal method')
    parser.add_argument('--uchime_ref', help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool', action='store_true', help='Pool all sequences together for DADA2')
    parser.add_argument('--debug', action='store_true', help='Keep all intermediate files')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if args.out not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if it exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    R_pass = '******'
    dada2_pass = '******'
    #check dada2 first; if good move on, otherwise exit with an error
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need at least v%s" % (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error("See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #count FASTQ records and remove 3' N's, as DADA2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        fastqInput = args.fastq.replace('.gz', '')
        amptklib.Funzip(os.path.abspath(args.fastq), os.path.basename(fastqInput), CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
    demuxtmp = base + '.original.fa'
    cmd = ['vsearch', '--fastq_filter', os.path.abspath(no_ns), '--fastq_qmax', '55',
           '--fastaout', demuxtmp, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = ['vsearch', '--fastq_filter', no_ns, '--fastq_maxee', str(args.maxee),
                 '--fastqout', derep, '--fastq_qmax', '55', '--fastq_maxns', '0',
                 '--threads', CORES]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" % (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on the filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooled vs not pooled; default is not pooled
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call(['Rscript', '--vanilla', dada2script, filtfolder, dada2out,
                         args.platform, POOL, CORES, args.chimera_method],
                        stdout=logfile, stderr=logfile)
    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" % dada2log)
        sys.exit(1)

    #now process the output: pull out FASTA, rename, etc.
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as input:
            next(input)  #skip the sample-name header row
            for line in input:
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if ID not in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) + ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present; remove from previous run if it is
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using the largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:  #test if it is one that is set up, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error("Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering")
                iSeqs = fastaout
            #since uchime cannot work with a udb database, need to extract fasta sequences; do this if not already done
            if not amptklib.checkfile(os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
                cmd = ['vsearch', '--udb2fasta',
                       os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                       '--output', uchime_db]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error("%s is not a valid file, skipping reference chimera filtering" % args.uchime_ref)
                iSeqs = fastaout
        #now run chimera filtering if all checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" % args.uchime_ref)
            cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db', uchime_db,
                   '--nonchimeras', iSeqs, '--threads', CORES]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' + '{0:,}'.format(uchime_chimeras) + ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 ASVs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
           '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
           '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" % args.pct_otu)
    radius = float(args.pct_otu) / 100.
    cmd = ['vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id', str(radius),
           '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none', '--usersort',
           '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = ['vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id', str(radius),
           '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))

    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = ['vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id', '0.97',
           '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable, '--threads', CORES]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")
    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
    else:
        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
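# ---------------------------------------------------------------
# Illustration only: the DADA2 CSV parsed in main() above has one row per
# sequence variant, with the sequence itself in the first column and
# per-sample counts in the remaining columns. A minimal, self-contained
# sketch of that parsing step follows; parse_dada2_csv() is a hypothetical
# helper written for this document and is not part of amptklib.
import csv

def parse_dada2_csv(dada2_csv, fasta_out):
    '''Convert a DADA2 sequence-table CSV to FASTA; return {ASV_ID: total count}.'''
    counts = {}
    with open(dada2_csv, newline='') as infile, open(fasta_out, 'w') as fasta:
        reader = csv.reader(infile)
        next(reader)  #skip the sample-name header row
        for i, row in enumerate(reader, 1):
            asv_id = 'ASV%i' % i
            counts[asv_id] = sum(int(x) for x in row[1:])  #total reads across samples
            fasta.write('>%s\n%s\n' % (asv_id, row[0]))
    return counts
# ---------------------------------------------------------------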
def main(args):
    global FwdPrimer, RevPrimer, Barcodes, tmpdir, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_raw.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-f', '--forward', dest='fastq', required=True, help='Illumina FASTQ R1 reads')
    parser.add_argument('-r', '--reverse', required=True, help='Illumina FASTQ R2 reads')
    parser.add_argument('-i', '--index', nargs='+', required=True, help='Illumina FASTQ index reads')
    parser.add_argument('-m', '--mapping_file', help='QIIME-like mapping file')
    parser.add_argument('--read_length', type=int, help='Read length, i.e. 2 x 300 bp = 300')
    parser.add_argument('-o', '--out', dest="out", default='illumina_out', help='Base name for output')
    parser.add_argument('--fwd_primer', dest="F_primer", default='515FB', help='Forward Primer')
    parser.add_argument('--rev_primer', dest="R_primer", default='806RB', help='Reverse Primer')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mismatches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mismatches in barcode')
    parser.add_argument('--barcode_fasta', help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--rescue_forward', default='on', choices=['on', 'off'], help='Rescue non-merged forward reads')
    parser.add_argument('--barcode_rev_comp', action='store_true', help='Reverse complement barcode sequences')
    parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
    parser.add_argument('-l', '--trim_len', default=300, type=int, help='Trim length for reads')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('--cleanup', action='store_true', help='Remove intermediate files')
    parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
    args = parser.parse_args(args)

    args.out = re.sub(r'\W+', '', args.out)
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus
    #create tmpdir
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    #parse a mapping file or a barcode fasta file; barcodes, primers, etc. get set up here
    barcode_file = args.out + ".barcodes_used.fa"
    if os.path.isfile(barcode_file):
        os.remove(barcode_file)
    #check if mapping file passed, use it if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if not args.barcode_fasta:
            amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
            sys.exit(1)
        else:
            shutil.copyfile(args.barcode_fasta, barcode_file)
            Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    if FwdPrimer == '' or RevPrimer == '':
        #parse primers here so they don't conflict with mapping file primers
        #look up primer in db, otherwise default to the entry itself
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.R_primer))

    #if still no primers set, then exit
    if FwdPrimer == '' or RevPrimer == '':
        amptklib.log.error("Please provide primer sequences via --fwd_primer and --rev_primer")
        sys.exit(1)

    #if barcode_rev_comp passed then reverse complement the barcode sequences (values)
    if args.barcode_rev_comp:
        amptklib.log.info("Reverse complementing barcode sequences")
        backupDict = Barcodes
        Barcodes = {}
        for k, v in list(backupDict.items()):
            RCkey = amptklib.RevComp(v)
            Barcodes[k] = RCkey

    amptklib.log.info("Loading %i samples from mapping file" % len(Barcodes))
    amptklib.log.info('FwdPrimer: {:} RevPrimer: {:}'.format(FwdPrimer, RevPrimer))
    amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

    #rename reads according to indexes
    if not amptklib.PEandIndexCheck(args.fastq, args.reverse, args.index[0]):  #check they all have the same number of reads
        amptklib.log.error("FASTQ input malformed, read numbers do not match")
        sys.exit(1)
    amptklib.log.info("Loading FASTQ Records")
    NumSeqs = amptklib.countfastq(args.fastq)
    if cpus > 1:
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastqPEandI(args.fastq, args.reverse, args.index[0], NumSeqs, tmpdir, cpus * 2)
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith('.fq'):
                filepart = os.path.join(tmpdir, file.split('_R')[0])
                if filepart not in file_list:
                    file_list.append(filepart)
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        amptklib.runMultiProgress(safe_run, file_list, cpus, args=args)
    else:
        amptklib.log.info("Mapping indexes to reads and renaming PE reads")
        shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
        shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
        shutil.copyfile(args.index[0], os.path.join(tmpdir, 'chunk_R3.fq'))
        processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)

    print("-------------------------------------------------------")
    #now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")
    tmpDemux = os.path.join(tmpdir, args.out + '.demux.fq')
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    #parse the stats
    finalstats = [0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.replace('\n', '')
                newstats = line.split(',')
                newstats = [int(i) for i in newstats]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #finally reindex the output: renumber reads, since chunks from the multiprocessing split can share names
    Demux = args.out + '.demux.fq'
    amptklib.fastqreindex(tmpDemux, Demux)
    amptklib.SafeRemove(tmpDemux)

    #output stats of the run
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) + ' discarded no index match')
    amptklib.log.info('{0:,}'.format(finalstats[2]) + ' Fwd Primer found, {0:,}'.format(finalstats[3]) + ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[4]) + ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')

    #now loop through the data and count reads per barcoded sample
    BarcodeCount = {}
    with open(Demux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1

    #report how many times each barcode was found
    barcode_counts = "%30s: %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
        barcode_counts += "\n%30s: %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

    #create mapping file if one doesn't exist
    genericmapfile = args.out + '.mapping_file.txt'
    amptklib.CreateGenericMappingFile(Barcodes, {}, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)

    #compress the output to save space
    FinalDemux = Demux + '.gz'
    amptklib.Fzip(Demux, FinalDemux, cpus)
    amptklib.removefile(Demux)
    if args.cleanup:
        amptklib.SafeRemove(tmpdir)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
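# ---------------------------------------------------------------
# Illustration only: after demultiplexing, each FASTQ header carries the
# sample label as 'label=SampleID;' (the format assumed by the BarcodeCount
# loop above), so per-sample read counts need only every 4th line. A
# self-contained sketch of that counting step:
import itertools

def count_samples(demux_fastq):
    '''Count reads per sample in a relabeled, demultiplexed FASTQ.'''
    counts = {}
    with open(demux_fastq) as fh:
        for header in itertools.islice(fh, 0, None, 4):  #headers only
            sample = header.split('=', 1)[-1].split(';')[0]
            counts[sample] = counts.get(sample, 0) + 1
    return counts
# ---------------------------------------------------------------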
def main(args):
    global FwdPrimer, RevPrimer, Barcodes, tmpdir
    parser = argparse.ArgumentParser(
        prog='amptk-process_ion.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', '--sff', '--fasta', '--bam', dest='fastq', required=True, help='BAM/FASTQ/SFF/FASTA file')
    parser.add_argument('-q', '--qual', help='QUAL file (if -i is FASTA)')
    parser.add_argument('-o', '--out', dest="out", default='ion', help='Base name for output')
    parser.add_argument('-f', '--fwd_primer', dest="F_primer", default='fITS7-ion', help='Forward Primer')
    parser.add_argument('-r', '--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
    parser.add_argument('-m', '--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mismatches in primer')
    parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mismatches in barcode')
    parser.add_argument('--barcode_fasta', default='ionxpress', help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--reverse_barcode', help="FASTA file containing 3' barcodes")
    parser.add_argument('-b', '--list_barcodes', dest="barcodes", default='all', help='Enter barcodes used, separated by commas')
    parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
    parser.add_argument('-l', '--trim_len', default=300, type=int, help='Trim length for reads')
    parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--mult_samples', dest="multi", default='False', help='Combine multiple samples (i.e. FACE1)')
    parser.add_argument('--ion', action='store_true', help='Input data is Ion Torrent')
    parser.add_argument('--454', action='store_true', help='Input data is 454')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
    args = parser.parse_args(args)

    args.out = re.sub(r'\W+', '', args.out)
    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    FNULL = open(os.devnull, 'w')
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of CPUs to use
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #parse a mapping file or a barcode fasta file; barcodes, primers, etc. get set up here
    #dealing with barcodes: get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)
    #check if mapping file passed, use it if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
        genericmapfile = args.mapping_file
    else:  #no mapping file, so create dictionaries from barcode fasta files
        if args.barcode_fasta == 'ionxpress':
            #get script path and barcode file name
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__), 'DB', 'ionxpress_barcodes.fa')
        elif args.barcode_fasta == 'ioncode':
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__), 'DB', 'ioncode_barcodes.fa')
        if args.barcode_fasta == 'ionxpress' or args.barcode_fasta == 'ioncode':
            if args.barcodes == "all":
                if args.multi == 'False':
                    shutil.copyfile(pgm_barcodes, barcode_file)
                else:
                    with open(barcode_file, 'w') as barcodeout:
                        with open(pgm_barcodes, 'r') as input:
                            for rec in SeqIO.parse(input, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
            else:
                bc_list = args.barcodes.split(",")
                with open(pgm_barcodes, 'r') as inputSeqFile:
                    SeqRecords = SeqIO.to_dict(SeqIO.parse(inputSeqFile, "fasta"))
                with open(barcode_file, 'w') as outputSeqFile:
                    for rec in bc_list:
                        name = "BC." + rec
                        seq = SeqRecords[name].seq
                        if args.multi != 'False':
                            outname = args.multi + '.' + name
                        else:
                            outname = name
                        outputSeqFile.write(">%s\n%s\n" % (outname, seq))
        else:
            #check for mult_samples and add prefix if necessary
            if args.multi == 'False':
                shutil.copyfile(args.barcode_fasta, barcode_file)
                if args.reverse_barcode:
                    shutil.copyfile(args.reverse_barcode, rev_barcode_file)
            else:
                with open(barcode_file, 'w') as barcodeout:
                    with open(args.barcode_fasta, 'r') as input:
                        for rec in SeqIO.parse(input, 'fasta'):
                            outname = args.multi + '.' + rec.id
                            barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
                if args.reverse_barcode:
                    with open(rev_barcode_file, 'w') as barcodeout:
                        with open(args.reverse_barcode, 'r') as input:
                            for rec in SeqIO.parse(input, 'fasta'):
                                outname = args.multi + '.' + rec.id
                                barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
        #parse primers here so they don't conflict with mapping file primers
        #look up primer in db, otherwise default to the entry itself
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is the actual primer sequence.".format(args.R_primer))

    #check if input is compressed
    gzip_list = []
    if args.fastq.endswith('.gz'):
        gzip_list.append(os.path.abspath(args.fastq))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for file in gzip_list:
            file_out = file.replace('.gz', '')
            amptklib.Funzip(file, file_out, cpus)
        args.fastq = args.fastq.replace('.gz', '')

    #if SFF file passed, convert to FASTQ with biopython
    if args.fastq.endswith(".sff"):
        if args.barcode_fasta == 'ionxpress':
            if not args.mapping_file:
                amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data")
                sys.exit(1)
        amptklib.log.info("SFF input detected, converting to FASTQ")
        SeqIn = args.out + '.sff.extract.fastq'
        SeqIO.convert(args.fastq, "sff-trim", SeqIn, "fastq")
    elif args.fastq.endswith(".fas") or args.fastq.endswith(".fasta") or args.fastq.endswith(".fa"):
        if not args.qual:
            amptklib.log.error("FASTA input detected, however no QUAL file was given. You must have FASTA + QUAL files")
            sys.exit(1)
        else:
            if args.barcode_fasta == 'ionxpress':
                if not args.mapping_file:
                    amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data")
                    sys.exit(1)
            SeqIn = args.out + '.fastq'
            amptklib.log.info("FASTA + QUAL detected, converting to FASTQ")
            amptklib.faqual2fastq(args.fastq, args.qual, SeqIn)
    elif args.fastq.endswith('.bam'):
        #we can convert natively with pybam, but it is ~10X slower than bedtools/samtools;
        #samtools is fastest, so use it if it exists, then bedtools, else default to pybam
        amptklib.log.info("Converting Ion Torrent BAM file to FASTQ")
        SeqIn = args.out + '.fastq'
        if amptklib.which('samtools'):
            cmd = ['samtools', 'fastq', '-@', str(cpus), args.fastq]
            amptklib.runSubprocess2(cmd, amptklib.log, SeqIn)
        else:
            if amptklib.which('bedtools'):
                cmd = ['bedtools', 'bamtofastq', '-i', args.fastq, '-fq', SeqIn]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:  #default to pybam
                amptklib.bam2fastq(args.fastq, SeqIn)
    else:
        SeqIn = args.fastq

    #start processing the reads here; first reverse complement the reverse primer
    catDemux = args.out + '.demux.fq'
    origRevPrimer = RevPrimer
    RevPrimer = amptklib.RevComp(RevPrimer)
    amptklib.log.info("Forward primer: %s, Rev comp'd rev primer: %s" % (FwdPrimer, RevPrimer))

    #then setup barcode dictionary
    if len(Barcodes) < 1:
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    #setup for looking for reverse barcode
    if len(RevBarcodes) < 1 and args.reverse_barcode:
        if not os.path.isfile(args.reverse_barcode):
            amptklib.log.info("Reverse barcode is not a valid file, exiting")
            sys.exit(1)
        shutil.copyfile(args.reverse_barcode, rev_barcode_file)
        RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

    #count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(SeqIn)
    size = amptklib.checkfastqsize(SeqIn)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    #create tmpdir and split input over n cpus
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)
    amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

    if cpus > 1:
        #split fastq file
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2)
        #now get file list from tmp folder
        file_list = []
        for file in os.listdir(tmpdir):
            if file.endswith(".fq"):
                file = os.path.join(tmpdir, file)
                file_list.append(file)
        #finally process reads over the number of cpus
        amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        shutil.copyfile(SeqIn, os.path.join(tmpdir, 'chunk.fq'))
        processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

    print("-------------------------------------------------------")
    #now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")
    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    #parse the stats
    finalstats = [0, 0, 0, 0, 0, 0, 0]
    for file in os.listdir(tmpdir):
        if file.endswith('.stats'):
            with open(os.path.join(tmpdir, file), 'r') as statsfile:
                line = statsfile.readline()
                line = line.rstrip()
                newstats = line.split(',')
                newstats = [int(i) for i in newstats]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #clean up tmp folder
    shutil.rmtree(tmpdir)

    #last step is to renumber the reads, since chunks from the multiprocessing split can share names
    amptklib.fastqreindex(tmpDemux, catDemux)
    os.remove(tmpDemux)

    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    if args.reverse_barcode:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2] - finalstats[4]) + ' valid Fwd and Rev Barcodes')
    else:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) + ' valid Barcode')
    amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] - finalstats[2]) + ' Fwd Primer found, {0:,}'.format(finalstats[3]) + ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads')

    #now loop through the data and count reads per barcoded sample
    BarcodeCount = {}
    with open(catDemux, 'r') as input:
        header = itertools.islice(input, 0, None, 4)
        for line in header:
            ID = line.split("=", 1)[-1].split(";")[0]
            if ID not in BarcodeCount:
                BarcodeCount[ID] = 1
            else:
                BarcodeCount[ID] += 1

    #report how many times each barcode was found
    barcode_counts = "%22s: %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
        barcode_counts += "\n%22s: %s" % (k, str(BarcodeCount[k]))
    amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

    #create a generic mapping file for downstream processes
    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, origRevPrimer, genericmapfile, BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    if gzip_list:
        for file in gzip_list:
            file = file.replace('.gz', '')
            amptklib.removefile(file)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file: %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
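# ---------------------------------------------------------------
# Illustration only: each worker in the script above writes one line of
# comma-separated counters to a .stats file, and the parent sums them
# column-wise. A compact standalone equivalent of that aggregation
# (n_fields matches the number of counters a worker writes, 7 in the
# ion pipeline):
import os

def sum_stats(stats_dir, n_fields=7):
    '''Column-wise sum of single-line, comma-separated .stats files.'''
    totals = [0] * n_fields
    for fname in os.listdir(stats_dir):
        if fname.endswith('.stats'):
            with open(os.path.join(stats_dir, fname)) as fh:
                for i, val in enumerate(fh.readline().rstrip().split(',')):
                    totals[i] += int(val)
    return totals
# ---------------------------------------------------------------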
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-unoise2.py',
        usage="%(prog)s [options] -i file.demux.fq\n%(prog)s -h for help menu",
        description='''Script runs UNOISE2 algorithm. Requires USEARCH9 by Robert C. Edgar: http://drive5.com/usearch''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--fastq', dest="FASTQ", required=True, help='FASTQ file (Required)')
    parser.add_argument('-o', '--out', help='Base output name')
    parser.add_argument('-e', '--maxee', default='1.0', help='Quality trim EE value')
    parser.add_argument('-m', '--minsize', default='8', help='Min size to keep for denoising')
    parser.add_argument('-u', '--usearch', dest="usearch", default='usearch9', help='USEARCH9 EXE')
    parser.add_argument('-p', '--pct_otu', default='97', help="Biological OTU Clustering Percent")
    parser.add_argument('--uchime_ref', help='Run UCHIME2 REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--map_filtered', action='store_true', help='Map quality-filtered reads back to OTUs')
    parser.add_argument('--debug', action='store_true', help='Keep intermediate files')
    parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    #get basename if args.out not passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.FASTQ:
            base = os.path.basename(args.FASTQ).split('.demux')[0]
        else:
            base = os.path.basename(args.FASTQ).split('.f')[0]

    #remove logfile if it exists
    log_name = base + '.amptk-unoise2.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)
    #get number of cpus
    if args.cpus:
        cpus = args.cpus
    else:
        cpus = amptklib.getCPUS()
    #make tmp folder
    tmp = base + '_tmp'
    if not os.path.exists(tmp):
        os.makedirs(tmp)

    #count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    #convert to FASTA for mapping
    orig_fasta = os.path.join(tmp, base + '.orig.fa')
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastaout', orig_fasta,
           '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(orig_fasta)
    size = amptklib.checkfastqsize(args.FASTQ)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize + ')')

    #expected errors filtering step
    filter_out = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fq')
    filter_fasta = os.path.join(tmp, base + '.EE' + args.maxee + '.filter.fa')
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    cmd = ['vsearch', '--fastq_filter', args.FASTQ, '--fastq_maxee', str(args.maxee),
           '--fastqout', filter_out, '--fastaout', filter_fasta,
           '--fastq_qmax', '55', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfastq(filter_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run full-length dereplication
    derep_out = os.path.join(tmp, base + '.EE' + args.maxee + '.derep.fa')
    amptklib.log.info("De-replication (remove duplicate reads)")
    cmd = ['vsearch', '--derep_fulllength', filter_out, '--relabel', 'Read_',
           '--sizeout', '--output', derep_out, '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(derep_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #now run the UNOISE2 denoiser
    amptklib.log.info("Denoising reads with UNOISE2")
    unoise_out = os.path.join(tmp, base + '.EE' + args.maxee + '.unoise.fa')
    cmd = [usearch, '-unoise2', derep_out, '-fastaout', unoise_out,
           '-minampsize', args.minsize, '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(unoise_out)
    amptklib.log.info('{0:,}'.format(total) + ' denoised sequences')

    #strip N's
    amptklib.log.info("Cleaning up padding from OTUs")
    otu_clean = os.path.join(tmp, base + '.EE' + args.maxee + '.clean.fa')
    amptklib.fasta_strip_padding(unoise_out, otu_clean)

    #run optional uchime_ref
    if not args.uchime_ref:
        uchime_out = otu_clean
    else:
        uchime_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uchime.otus.fa')
        #R. Edgar now says using the largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in ['ITS', '16S', 'LSU', 'COI']:  #test if it is one that is set up, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error("Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering")
                uchime_out = otu_clean
            #since uchime cannot work with a udb database, need to extract fasta sequences; do this if not already done
            if not amptklib.checkfile(os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
                cmd = ['vsearch', '--udb2fasta',
                       os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                       '--output', uchime_db]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.extracted.fa')
        else:
            uchime_db = os.path.abspath(args.uchime_ref)
        #now run chimera filtering if all checks out
        if not os.path.isfile(uchime_out):
            amptklib.log.info("Chimera Filtering (VSEARCH)")
            cmd = ['vsearch', '--mindiv', '1.0', '--uchime_ref', otu_clean, '--db', uchime_db,
                   '--nonchimeras', uchime_out, '--threads', str(cpus)]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(uchime_out)
            amptklib.log.info('{0:,}'.format(total) + ' OTUs passed')

    #inferred sequences
    iSeqs = base + '.ASVs.fa'
    amptklib.fastarename(uchime_out, 'ASV', iSeqs)

    #filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    passingOTUs = os.path.join(tmp, base + '.passed.asvs.fa')
    numKept, numDropped = amptklib.validateorientation(tmp, derep_out, uchime_out, passingOTUs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(numKept, numDropped))

    #build OTU table with ASVs
    uc_iSeq_out = os.path.join(tmp, base + '.EE' + args.maxee + '.mapping.uc')
    iSeq_otu_table = base + '.otu_table.txt'
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to ASVs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97',
           '--db', passingOTUs, '--uc', uc_iSeq_out, '--otutabout', iSeq_otu_table,
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #count reads mapped
    total = amptklib.line_count2(uc_iSeq_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #now cluster to biological OTUs with UCLUST
    radius = float(args.pct_otu) / 100.
    amptklib.log.info("Clustering denoised sequences into biological OTUs at %s%%" % args.pct_otu)
    uclust_out = os.path.join(tmp, base + '.EE' + args.maxee + '.uclust.fa')
    cmd = ['vsearch', '--cluster_smallmem', passingOTUs, '--centroids', uclust_out,
           '--id', str(radius), '--strand', 'plus', '--relabel', 'OTU',
           '--qmask', 'none', '--usersort', '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(uclust_out)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where denoised sequences clustered
    ClusterComp = base + '.ASVs2clusters.txt'
    iSeqmap = base + '.unoise_map.uc'
    cmd = [usearch, '-usearch_global', passingOTUs, '-db', uclust_out, '-id', str(radius),
           '-uc', iSeqmap, '-strand', 'plus', '-threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if OTU not in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))

    #now map reads back to OTUs and build OTU table
    uc_out = os.path.join(tmp, base + '.EE' + args.maxee + '.cluster.mapping.uc')
    otu_table = os.path.join(tmp, base + '.EE' + args.maxee + '.cluster.otu_table.txt')
    #setup reads to map
    if args.map_filtered:
        reads = filter_fasta
    else:
        reads = orig_fasta
    amptklib.log.info("Mapping Reads to OTUs and Building OTU table")
    cmd = ['vsearch', '--usearch_global', reads, '--strand', 'plus', '--id', '0.97',
           '--db', uclust_out, '--uc', uc_out, '--otutabout', otu_table,
           '--threads', str(cpus)]
    amptklib.runSubprocess(cmd, amptklib.log)
    #count reads mapped
    total = amptklib.line_count2(uc_out)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' + '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #move files around; delete tmp if --debug not passed
    currentdir = os.getcwd()
    final_otu = os.path.join(currentdir, base + '.cluster.otus.fa')
    shutil.copyfile(uclust_out, final_otu)
    final_otu_table = os.path.join(currentdir, base + '.cluster.otu_table.txt')
    shutil.copyfile(otu_table, final_otu_table)
    if not args.debug:
        shutil.rmtree(tmp)

    #print location of files to STDOUT
    print("-------------------------------------------------------")
    print("UNOISE2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % tmp)
    print("Amplicon sequence variants: %s" % passingOTUs)
    print("ASV OTU Table: %s" % iSeq_otu_table)
    print("Clustered OTUs: %s" % os.path.basename(final_otu))
    print("OTU Table: %s" % os.path.basename(final_otu_table))
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")
    otu_print = final_otu.split('/')[-1]
    tab_print = final_otu_table.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END + " amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
    else:
        print("\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n" % (tab_print, otu_print))
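# ---------------------------------------------------------------
# Illustration only: the .uc files written by vsearch/usearch are
# tab-separated, with the query label in column 9 and the target (centroid)
# in column 10 (0-based indices 8 and 9), which is how the ASVs2clusters
# table is built above. Note that non-hit records carry '*' in the target
# column; like the pipeline code above, this sketch does not filter them.
from natsort import natsorted

def uc_to_clusters(uc_file):
    '''Map centroid/OTU -> list of member labels from a .uc mapping file.'''
    clusters = {}
    with open(uc_file) as fh:
        for line in fh:
            cols = line.rstrip('\n').split('\t')
            clusters.setdefault(cols[9], []).append(cols[8])
    return dict(natsorted(clusters.items()))
# ---------------------------------------------------------------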
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-get_barcode_counts.py',
        description='''Script loops through a demuxed FASTQ file counting occurrences of barcodes; can optionally quality trim and recount.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True, help='Input demuxed FASTQ')
    parser.add_argument('--quality_trim', action='store_true', help='Quality trim data')
    parser.add_argument('-e', '--maxee', default=1.0, type=float, help='MaxEE Q-trim threshold')
    parser.add_argument('-l', '--trunclen', default=250, type=int, help='Read truncation length')
    parser.add_argument('-o', '--out', help='Output for quality trimmed data')
    args = parser.parse_args(args)

    if args.quality_trim and not args.out:
        print("Error, to run quality trimming you must provide -o, --out")
        sys.exit(1)

    #main starts here
    cpus = multiprocessing.cpu_count()
    print("----------------------------------")
    tmpinput = 'amptk_show.tmp'
    if args.input.endswith('.gz'):
        amptklib.Funzip(args.input, tmpinput, cpus)
    else:
        tmpinput = args.input
    countBarcodes(tmpinput)
    print("----------------------------------")
    getSeqLength(tmpinput)
    print("----------------------------------")
    if args.quality_trim:
        #split the input FASTQ file into chunks to process
        SeqCount = amptklib.countfastq(tmpinput)
        pid = os.getpid()
        folder = 'amptk_tmp_' + str(pid)
        amptklib.split_fastq(tmpinput, SeqCount, folder, cpus * 2)
        #now get file list from tmp folder
        file_list = []
        for file in os.listdir(folder):
            if file.endswith(".fq"):
                file = os.path.join(folder, file)
                file_list.append(file)
        #process the chunks in parallel
        p = multiprocessing.Pool(cpus)
        for f in file_list:
            p.apply_async(worker, [f])
        p.close()
        p.join()
        #gather the filtered results
        catDemux = args.out
        with open(catDemux, 'w') as outfile:
            for filename in glob.glob(os.path.join(folder, '*.filter.fq')):
                if filename == catDemux:
                    continue
                with open(filename, 'r') as readfile:
                    shutil.copyfileobj(readfile, outfile)
        if catDemux.endswith('.gz'):
            amptklib.Fzip_inplace(catDemux)
        shutil.rmtree(folder)
        print("----------------------------------")
        countBarcodes(args.out)
        print("----------------------------------")
        print("Script finished, output in %s" % args.out)
    if args.input.endswith('.gz'):
        amptklib.removefile(tmpinput)
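# ---------------------------------------------------------------
# Illustration only: the quality-trim branch above fans chunks out with
# Pool.apply_async() and never inspects the results, so an exception in a
# worker fails silently. A minimal variant of the same pattern that surfaces
# worker errors; do_work() is a hypothetical stand-in for the per-chunk
# filter, not a function from this script.
import multiprocessing

def do_work(chunk):
    #placeholder: quality-filter one FASTQ chunk, return its path
    return chunk

def run_chunks(file_list, cpus):
    with multiprocessing.Pool(cpus) as pool:
        results = [pool.apply_async(do_work, [f]) for f in file_list]
        for r in results:
            r.get()  #re-raises any exception raised inside the worker
# ---------------------------------------------------------------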