Ejemplo n.º 1
0
def _strip_gz(path):
	"""Return *path* without a trailing ``.gz`` suffix (unchanged if it has none)."""
	return path[:-len('.gz')] if path.endswith('.gz') else path


def _sum_stats(folder, nfields):
	"""Sum, column by column, the first line of every ``*.stats`` file in *folder*.

	Each stats file is expected to hold one comma-separated line of integers.
	Returns a list of *nfields* totals.
	"""
	totals = [0] * nfields
	for fname in os.listdir(folder):
		if not fname.endswith('.stats'):
			continue
		with open(os.path.join(folder, fname), 'r') as statsfile:
			fields = statsfile.readline().rstrip().split(',')
		for idx, num in enumerate([int(i) for i in fields]):
			totals[idx] += num
	return totals


def main(args):
	"""Demultiplex a barcoded FASTQ file (optionally paired-end) into one relabeled FASTQ.

	Steps, in order: parse the command line, set up logging, load barcodes and
	primers (from a mapping file or from --barcode_fasta plus the primer
	options), uncompress gzipped input, split the reads into chunks in a
	temporary folder, run processRead/processReadsPE on the chunks,
	concatenate and re-index the demuxed chunks, count reads per sample,
	write a mapping file, and gzip the final output.

	Args:
		args: list of command-line tokens handed to argparse.

	Side effects:
		Sets the module-level globals named in the ``global`` statement below,
		writes ``<out>.amptk-demux.log``, ``<out>.demux.fq.gz`` and
		``<out>.mapping_file.txt``, and calls sys.exit(1) when neither a valid
		mapping file nor a barcode FASTA is supplied.
	"""
	#these are module globals because they are set here and not passed to the worker functions
	#NOTE(review): presumably processRead/processReadsPE read them -- confirm against their definitions
	global FwdPrimer, RevPrimer, SampleData, Barcodes, RevBarcodes, tmpdir, usearch
	parser=argparse.ArgumentParser(prog='amptk-process_ion.py', usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
		description='''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
		epilog="""Written by Jon Palmer (2015) [email protected]""",
		formatter_class=MyFormatter)

	parser.add_argument('-i','--fastq', dest='fastq', required=True, help='FASTQ R1 file')
	parser.add_argument('--reverse', help='Illumina R2 reverse reads')
	parser.add_argument('-o','--out', dest="out", default='illumina2', help='Base name for output')
	parser.add_argument('-f','--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer')
	parser.add_argument('-r','--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer')
	parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
	parser.add_argument('-p','--pad', default='off', choices=['on', 'off'], help='Pad with Ns to a set length')
	parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
	parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
	parser.add_argument('--barcode_fasta', help='FASTA file containing Barcodes (Names & Sequences)')
	parser.add_argument('--barcode_not_anchored', action='store_true', help='Barcodes (indexes) are not at start of reads')
	parser.add_argument('--reverse_barcode', help='FASTA file containing 3 prime Barocdes')
	parser.add_argument('--min_len', default=100, type=int, help='Minimum read length to keep')
	parser.add_argument('-l','--trim_len', default=300, type=int, help='Trim length for reads')
	parser.add_argument('--full_length', action='store_true', help='Keep only full length reads (no trimming/padding)')
	parser.add_argument('--merge_method', default='usearch', choices=['usearch', 'vsearch'], help='Software to use for PE read merging')
	parser.add_argument('--cpus', type=int, help="Number of CPUs. Default: auto")
	parser.add_argument('-u','--usearch', dest="usearch", default='usearch9', help='USEARCH EXE')
	args=parser.parse_args(args)

	#keep only word characters in the output basename
	args.out = re.sub(r'\W+', '', args.out)

	#start from a fresh log file
	log_name = args.out + '.amptk-demux.log'
	if os.path.isfile(log_name):
		os.remove(log_name)
	amptklib.setupLogging(log_name)
	cmd_args = " ".join(sys.argv)+'\n'
	amptklib.log.debug(cmd_args)
	print("-------------------------------------------------------")

	#initialize script, log system info and usearch version
	amptklib.SystemInfo()
	#Do a version check
	usearch = args.usearch
	amptklib.versionDependencyChecks(usearch)

	#get number of CPUs to use (a missing or zero --cpus means use every core)
	cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

	#parse a mapping file or a barcode fasta file, primers, etc get setup
	#dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
	barcode_file = args.out + ".barcodes_used.fa"
	rev_barcode_file = args.out + '.revbarcodes_used.fa'
	amptklib.SafeRemove(barcode_file)
	amptklib.SafeRemove(rev_barcode_file)

	#check if mapping file passed, use this if present, otherwise use command line arguments
	SampleData = {}
	Barcodes = {}
	RevBarcodes = {}
	FwdPrimer = ''
	RevPrimer = ''
	if args.mapping_file:
		if not os.path.isfile(args.mapping_file):
			amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
			sys.exit(1)
		SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
	else: #no mapping file, so create dictionaries from barcode fasta files
		if not args.barcode_fasta:
			amptklib.log.error("You did not specify a --barcode_fasta or --mapping_file, one is required")
			sys.exit(1)
		shutil.copyfile(args.barcode_fasta, barcode_file)
		Barcodes = amptklib.fasta2barcodes(barcode_file, False)
		if args.reverse_barcode:
			shutil.copyfile(args.reverse_barcode, rev_barcode_file)
			RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, False)

		#parse primers here so doesn't conflict with mapping primers
		#look up primer db otherwise default to entry
		if args.F_primer in amptklib.primer_db:
			FwdPrimer = amptklib.primer_db.get(args.F_primer)
			amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
		else:
			FwdPrimer = args.F_primer
			amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.F_primer))
		if args.R_primer in amptklib.primer_db:
			RevPrimer = amptklib.primer_db.get(args.R_primer)
			amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
		else:
			RevPrimer = args.R_primer
			amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.R_primer))

	#check if input is compressed
	gzip_list = []
	if args.fastq.endswith('.gz'):
		gzip_list.append(os.path.abspath(args.fastq))
	if args.reverse and args.reverse.endswith('.gz'):
		gzip_list.append(os.path.abspath(args.reverse))
	if gzip_list:
		amptklib.log.info("Gzipped input files detected, uncompressing")
		for gzfile in gzip_list:
			amptklib.Funzip(gzfile, _strip_gz(gzfile), cpus)
		#only the trailing .gz is dropped, so a '.gz' elsewhere in the path is left alone
		args.fastq = _strip_gz(args.fastq)
		if args.reverse:
			args.reverse = _strip_gz(args.reverse)

	#Count FASTQ records
	amptklib.log.info("Loading FASTQ Records")
	orig_total = amptklib.countfastq(args.fastq)
	size = amptklib.checkfastqsize(args.fastq)
	readablesize = amptklib.convertSize(size*2)
	amptklib.log.info('{:,} reads ({:})'.format(orig_total, readablesize))

	#output barcodes/samples
	amptklib.log.info('Searching for {:} forward barcodes and {:} reverse barcodes'.format(len(Barcodes), len(RevBarcodes)))

	#create tmpdir and split input into n cpus
	tmpdir = args.out.split('.')[0]+'_'+str(os.getpid())
	if not os.path.exists(tmpdir):
		os.makedirs(tmpdir)

	#tell user about number of cores using
	amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))

	if args.reverse:
		amptklib.log.info("Demuxing PE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, RevPrimer))
	else:
		amptklib.log.info("Demuxing SE Illumina reads; FwdPrimer: {:} RevPrimer: {:}".format(FwdPrimer, amptklib.RevComp(RevPrimer)))

	amptklib.log.info('Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'.format(args.min_len, args.trim_len))

	if cpus > 1:
		if args.reverse:
			amptklib.split_fastqPE(args.fastq, args.reverse, orig_total, tmpdir, cpus*4)
			#each PE chunk is two files; keep one entry per chunk (the part of the name before '_R')
			file_list = []
			for chunk in os.listdir(tmpdir):
				if chunk.endswith('.fq'):
					filepart = os.path.join(tmpdir, chunk.split('_R')[0])
					if filepart not in file_list:
						file_list.append(filepart)
			amptklib.runMultiProgress(processReadsPE, file_list, cpus, args=args)
		else:
			#split fastq file
			amptklib.split_fastq(args.fastq, orig_total, tmpdir, cpus*4)
			#now get file list from tmp folder
			file_list = [os.path.join(tmpdir, chunk) for chunk in os.listdir(tmpdir) if chunk.endswith(".fq")]
			#finally process reads over number of cpus
			amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
	else:
		#single cpu: process the whole input as one chunk
		if args.reverse:
			shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk_R1.fq'))
			shutil.copyfile(args.reverse, os.path.join(tmpdir, 'chunk_R2.fq'))
			processReadsPE(os.path.join(tmpdir, 'chunk'), args=args)
		else:
			shutil.copyfile(args.fastq, os.path.join(tmpdir, 'chunk.fq'))
			processRead(os.path.join(tmpdir, 'chunk.fq'), args=args)

	print("-------------------------------------------------------")
	#Now concatenate all of the demuxed files together
	amptklib.log.info("Concatenating Demuxed Files")

	tmpDemux = args.out + '.tmp.demux.fq'
	with open(tmpDemux, 'w') as outfile:
		for filename in glob.glob(os.path.join(tmpdir,'*.demux.fq')):
			if filename == tmpDemux:
				continue
			with open(filename, 'r') as readfile:
				shutil.copyfileobj(readfile, outfile)

	#parse the stats written next to each chunk and report the totals
	if args.reverse:
		finalstats = _sum_stats(tmpdir, 6)
		amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
		amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[3])+' valid Barcodes')
		amptklib.log.info('{0:,}'.format(finalstats[5])+' valid output reads (Barcodes and Primers)')
	else:
		finalstats = _sum_stats(tmpdir, 7)
		amptklib.log.info('{0:,}'.format(finalstats[0])+' total reads')
		if args.reverse_barcode:
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2]-finalstats[4])+' valid Fwd and Rev Barcodes')
		else:
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1])+' valid Barcode')
			amptklib.log.info('{0:,}'.format(finalstats[0]-finalstats[1]-finalstats[2])+' Fwd Primer found, {0:,}'.format(finalstats[3])+ ' Rev Primer found')
		amptklib.log.info('{0:,}'.format(finalstats[5])+' discarded too short (< %i bp)' % args.min_len)
		amptklib.log.info('{0:,}'.format(finalstats[6])+' valid output reads')

	#clean up tmp folder
	amptklib.SafeRemove(tmpdir)

	#last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
	catDemux = args.out+'.demux.fq'
	amptklib.fastqreindex(tmpDemux, catDemux)
	amptklib.SafeRemove(tmpDemux)

	#now loop through data and find barcoded samples, counting each.....
	#every 4th line is a FASTQ header; the sample ID is the text after the first '=' up to the next ';'
	BarcodeCount = {}
	with open(catDemux, 'r') as demuxed:
		for line in itertools.islice(demuxed, 0, None, 4):
			ID = line.split("=",1)[-1].split(";")[0]
			BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

	#now let's count the barcodes found and count the number of times they are found.
	barcode_counts = "%22s:  %s" % ('Sample', 'Count')
	for k,v in natsorted(list(BarcodeCount.items()), key=lambda k_v: k_v[1], reverse=True):
		barcode_counts += "\n%22s:  %s" % (k, str(v))
	amptklib.log.info("Found %i barcoded samples\n%s" % (len(BarcodeCount), barcode_counts))

	genericmapfile = args.out + '.mapping_file.txt'
	if not args.mapping_file:
		#create a generic mappingfile for downstream processes
		amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer, RevPrimer, genericmapfile, BarcodeCount)
	else:
		amptklib.updateMappingFile(args.mapping_file, BarcodeCount, genericmapfile)
	#compress the output to save space
	FinalDemux = catDemux+'.gz'
	amptklib.Fzip(catDemux, FinalDemux, cpus)
	amptklib.removefile(catDemux)
	#remove the uncompressed copies of gzipped inputs that were created above
	for gzfile in gzip_list:
		amptklib.removefile(_strip_gz(gzfile))

	#get file size
	filesize = os.path.getsize(FinalDemux)
	readablesize = amptklib.convertSize(filesize)
	amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
	amptklib.log.info("Mapping file: %s" % genericmapfile)

	print("-------------------------------------------------------")
	if 'darwin' in sys.platform:
		print(col.WARN + "\nExample of next cmd: " + col.END + "amptk cluster -i %s -o out\n" % (FinalDemux))
	else:
		print("\nExample of next cmd: amptk cluster -i %s -o out\n" % (FinalDemux))
Ejemplo n.º 2
0
def main(args):
	"""Split a FASTQ file (or copy an Illumina folder) per sample and optionally write SRA metadata.

	For ``--platform ion`` or ``454`` the input FASTQ is scanned once; reads
	whose barcode is found (and, depending on --require_primer, whose primers
	are found) are appended to ``<out>/<barcode label>.fastq`` with the barcode
	trimmed, and the per-sample files are gzipped afterwards.  For
	``--platform illumina`` the FASTQ files in the input folder are copied into
	the output folder.  When --biosample is given, a tab-delimited
	``<out>.submission.txt`` is written with one row per output file whose
	sample name is found in the BioSample table.

	Args:
		args: list of command-line tokens handed to argparse.

	Side effects:
		Creates (or with --force recreates) the output folder, writes
		``<out>.amptk-sra.log``, and calls sys.exit(1) on invalid input.
	"""
	parser=argparse.ArgumentParser(prog='amptk-fastq2sra.py', usage="%(prog)s [options] -i folder",
		description='''Script to split FASTQ file from Ion, 454, or Illumina by barcode sequence into separate files for submission to SRA.  This script can take the BioSample worksheet from NCBI and create an SRA metadata file for submission.''',
		epilog="""Written by Jon Palmer (2015) [email protected]""",
		formatter_class=MyFormatter)
	parser.add_argument('-i','--input', dest='FASTQ', required=True, help='Input FASTQ file or folder')
	parser.add_argument('-o','--out', dest='out', help='Basename for output folder/files')
	parser.add_argument('--min_len', default=50, type=int, help='Minimum length of read to keep')
	parser.add_argument('-b','--barcode_fasta', help='Multi-fasta file containing barcodes used')
	parser.add_argument('--reverse_barcode', help='Reverse barcode fasta file')
	parser.add_argument('-s','--biosample', dest='biosample', help='BioSample file from NCBI')
	parser.add_argument('-p','--platform', dest='platform', default='ion', choices=['ion', 'illumina', '454'], help='Sequencing platform')
	parser.add_argument('-f','--fwd_primer', dest="F_primer", default='fITS7', help='Forward Primer (fITS7)')
	parser.add_argument('-r','--rev_primer', dest="R_primer", default='ITS4', help='Reverse Primer (ITS4)')
	parser.add_argument('-n', '--names', help='CSV mapping file BC,NewName')
	parser.add_argument('-d', '--description', help='Paragraph description for SRA metadata')
	parser.add_argument('-t','--title', default='Fungal ITS', help='Start of title for SRA submission, name it according to amplicon')
	parser.add_argument('-m','--mapping_file', help='Mapping file: QIIME format can have extra meta data columns')
	parser.add_argument('--primer_mismatch', default=2, type=int, help='Number of mis-matches in primer')
	parser.add_argument('--barcode_mismatch', default=0, type=int, help='Number of mis-matches in barcode')
	parser.add_argument('--require_primer', default='off', choices=['forward', 'both', 'off'], help='Require Primers to be present')
	parser.add_argument('--force', action='store_true', help='Overwrite existing directory')
	parser.add_argument('-a','--append', help='Append a name to all sample names for a run, i.e. --append run1 would yield Sample_run1')
	args=parser.parse_args(args)

	#get basename if not args.out passed
	if args.out:
		base = args.out
	elif 'demux' in args.FASTQ:
		base = os.path.basename(args.FASTQ).split('.demux')[0]
	else:
		base = os.path.basename(args.FASTQ).split('.f')[0]

	#start from a fresh log file
	log_name = base + '.amptk-sra.log'
	if os.path.isfile(log_name):
		os.remove(log_name)

	amptklib.setupLogging(log_name)
	cmd_args = " ".join(sys.argv)+'\n'
	amptklib.log.debug(cmd_args)
	print("-------------------------------------------------------")
	amptklib.SystemInfo()

	amptkversion = amptklib.get_version()

	#create output directory
	if not os.path.exists(base):
		os.makedirs(base)
	elif not args.force:
		amptklib.log.error("Directory %s exists, add --force argument to overwrite" % base)
		sys.exit(1)
	else:
		shutil.rmtree(base)
		os.makedirs(base)

	#parse a mapping file or a barcode fasta file, primers, etc get setup
	#dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
	barcode_file = os.path.join(base, base + ".barcodes_used.fa")
	rev_barcode_file = os.path.join(base, base + ".revbarcodes_used.fa")
	if os.path.isfile(barcode_file):
		os.remove(barcode_file)

	#check if mapping file passed, use this if present, otherwise use command line arguments
	SampleData = {}
	Barcodes = {}
	RevBarcodes = {}
	FwdPrimer = ''
	RevPrimer = ''
	if args.mapping_file:
		if not os.path.isfile(args.mapping_file):
			amptklib.log.error("Mapping file not found: %s" % args.mapping_file)
			sys.exit(1)
		SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(args.mapping_file)
	elif args.barcode_fasta:
		#this parser defines no 'multi' option, so only add a name prefix when the
		#namespace actually carries one; otherwise keep the barcode names as they are
		prefix = getattr(args, 'multi', None)
		with open(barcode_file, 'w') as barcodeout:
			with open(args.barcode_fasta, 'r') as handle:
				for rec in SeqIO.parse(handle, 'fasta'):
					outname = prefix+'.'+rec.id if prefix else rec.id
					barcodeout.write(">%s\n%s\n" % (outname, rec.seq))
	#the reverse barcode file is not written here: whenever --reverse_barcode is used
	#without a mapping file, the copy made further down always replaces it

	#parse primers here so doesn't conflict with mapping primers
	#look up primer db otherwise default to entry
	if FwdPrimer == '':
		if args.F_primer in amptklib.primer_db:
			FwdPrimer = amptklib.primer_db.get(args.F_primer)
			amptklib.log.info("{:} fwd primer found in AMPtk primer db, setting to: {:}".format(args.F_primer, FwdPrimer))
		else:
			FwdPrimer = args.F_primer
			amptklib.log.info("{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.F_primer))
	if RevPrimer == '':
		if args.R_primer in amptklib.primer_db:
			RevPrimer = amptklib.primer_db.get(args.R_primer)
			amptklib.log.info("{:} rev primer found in AMPtk primer db, setting to: {:}".format(args.R_primer, RevPrimer))
		else:
			RevPrimer = args.R_primer
			amptklib.log.info("{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence.".format(args.R_primer))

	#then setup barcode dictionary
	if len(Barcodes) < 1 and os.path.isfile(barcode_file):
		Barcodes = amptklib.fasta2barcodes(barcode_file, False)

	#setup for looking for reverse barcode
	if len(RevBarcodes) < 1 and args.reverse_barcode:
		if not os.path.isfile(args.reverse_barcode):
			amptklib.log.info("Reverse barcode is not a valid file, exiting")
			sys.exit(1)
		shutil.copyfile(args.reverse_barcode, rev_barcode_file)
		RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

	if args.platform != 'illumina':
		if not args.mapping_file and not args.barcode_fasta:
			amptklib.log.error("For ion, 454, or illumina2 datasets you must specificy a multi-fasta file containing barcodes with -b, --barcode_fasta, or -m/--mapping_file")
			sys.exit(1)

	#path of the temporary uncompressed copy of a gzipped input, if one gets created
	uncompressed = None

	if args.platform == 'illumina':
		#just need to get the correct .fastq.gz files into a folder by themselves
		#if illumina is selected, verify that args.fastq is a folder
		if not os.path.isdir(args.FASTQ):
			amptklib.log.error("%s is not a folder, for '--platform illumina', -i must be a folder containing raw reads" % (args.FASTQ))
			sys.exit(1)
		filelist = []
		rawlist = [f for f in os.listdir(args.FASTQ) if f.endswith(".fastq.gz") or f.endswith('.fastq') or f.endswith('.fq')]
		if rawlist:
			#paired-end is recognised by '_R2' in the second file name in sorted order;
			#a folder with a single file is treated as single-end
			ordered = sorted(rawlist)
			paired = len(ordered) > 1 and '_R2' in ordered[1]
			if not paired:
				amptklib.log.info("Found %i single files, copying to %s folder" % (len(rawlist), base))
				filelist = rawlist
				for fname in rawlist:
					shutil.copyfile(os.path.join(args.FASTQ,fname),(os.path.join(base,fname)))
			else:
				amptklib.log.info("Found %i paired-end files, copying to %s folder" % (len(rawlist) // 2, base))
				for fname in rawlist:
					shutil.copyfile(os.path.join(args.FASTQ,fname),(os.path.join(base,fname)))
					if '_R1' in fname:
						filelist.append(fname)

	else:
		#start here to process the reads, first reverse complement the reverse primer
		ReverseCompRev = amptklib.RevComp(RevPrimer)

		#if --names given, load into dictonary
		#NOTE(review): namesDict is filled here but not read anywhere else in this function
		if args.names:
			amptklib.log.info("Parsing names for output files via %s" % args.names)
			namesDict = {}
			with open(args.names, 'r') as handle:
				for line in handle:
					cols = line.replace('\n', '').split(',')
					#skip blank lines and lines without a second column
					if len(cols) < 2:
						continue
					if cols[0] not in namesDict:
						namesDict[cols[0]] = cols[1]

		#check for compressed input file
		if args.FASTQ.endswith('.gz'):
			amptklib.log.info("Gzipped input files detected, uncompressing")
			#drop only the trailing .gz so a '.gz' elsewhere in the path is left alone
			FASTQ_IN = args.FASTQ[:-len('.gz')]
			amptklib.Funzip(args.FASTQ, FASTQ_IN, multiprocessing.cpu_count())
			uncompressed = FASTQ_IN
		else:
			FASTQ_IN = args.FASTQ

		#count FASTQ records in input
		amptklib.log.info("Loading FASTQ Records")
		total = amptklib.countfastq(FASTQ_IN)
		size = amptklib.checkfastqsize(args.FASTQ)
		readablesize = amptklib.convertSize(size)
		amptklib.log.info('{0:,}'.format(total) + ' reads (' + readablesize + ')')

		#output message depending on primer requirement
		if args.require_primer == 'off':
			amptklib.log.info("Looking for %i barcodes" % (len(Barcodes)))
		elif args.require_primer == 'forward':
			amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s" % (len(Barcodes), FwdPrimer))
		elif args.require_primer == 'both':
			amptklib.log.info("Looking for %i barcodes that must have FwdPrimer: %s and RevPrimer: %s" % (len(Barcodes), FwdPrimer, RevPrimer))

		#this will loop through FASTQ file once, splitting those where barcodes are found, and primers trimmed
		runningTotal = 0
		with open(FASTQ_IN, 'r') as handle:
			for title, seq, qual in FastqGeneralIterator(handle):
				Barcode, BarcodeLabel = amptklib.AlignBarcode(seq, Barcodes, args.barcode_mismatch)
				if Barcode == "":
					continue
				#trim barcode from sequence
				BarcodeLength = len(Barcode)
				seq = seq[BarcodeLength:]
				qual = qual[BarcodeLength:]
				#look for forward primer
				if args.require_primer != 'off': #means we only want ones with forward primer and or reverse, but don't remove them
					#now search for forward primer; a negative editDistance is treated as "not found"
					foralign = edlib.align(FwdPrimer, seq, mode="HW", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
					if foralign["editDistance"] < 0:
						continue
					if args.require_primer == 'both':
						#now search for reverse primer
						revalign = edlib.align(ReverseCompRev, seq, mode="HW", task="locations", k=args.primer_mismatch, additionalEqualities=amptklib.degenNuc)
						if revalign["editDistance"] < 0:  #reverse primer was not found
							continue
				#check size
				if len(seq) < args.min_len: #filter out sequences less than minimum length.
					continue
				runningTotal += 1
				fileout = os.path.join(base, BarcodeLabel+'.fastq')
				with open(fileout, 'a') as output:
					output.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))

		if args.require_primer == 'off':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode')
		elif args.require_primer == 'forward':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and fwd primer')
		elif args.require_primer == 'both':
			amptklib.log.info('{0:,}'.format(runningTotal) + ' total reads with valid barcode and both primers')

		amptklib.log.info("Now Gzipping files")
		for fname in os.listdir(base):
			if fname.endswith(".fastq"):
				amptklib.Fzip_inplace(os.path.join(base, fname))

		#after all files demuxed into output folder, loop through and create SRA metadata file
		filelist = [fname for fname in os.listdir(base) if fname.endswith(".fastq.gz")]

	amptklib.log.info("Finished: output in %s" % base)
	#clean up if gzipped (only when an uncompressed copy was actually created above)
	if uncompressed:
		amptklib.removefile(uncompressed)

	#check for BioSample meta file
	if args.biosample:
		amptklib.log.info("NCBI BioSample file detected, creating SRA metadata file")
		#load in BioSample file to dictionary
		with open(args.biosample, 'r') as handle:
			reader = csv.reader(handle, delimiter=str('\t'))
			header = next(reader)
			acc = header.index('Accession')
			sample = header.index('Sample Name')
			bio = header.index('BioProject')
			try:
				host = header.index('Host')
			except ValueError:
				host = header.index('Organism')
			#rows too short to hold every needed column (e.g. blank lines) are skipped
			needed = max(acc, sample, bio, host)
			BioDict = {row[sample]:(row[acc],row[bio],row[host]) for row in reader if len(row) > needed}
		#set some defaults based on the platform
		header = 'bioproject_accession\tbiosample_accession\tlibrary_ID\ttitle\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tplatform\tinstrument_model\tdesign_description\tfiletype\tfilename\tfilename2\tforward_barcode\treverse_barcode\tforward_primer\treverse_primer\n'
		if args.platform == 'ion':
			sequencer = 'ION_TORRENT'
			model = 'Ion Torrent PGM'
			lib_layout = 'single'
		elif args.platform == '454':
			sequencer = '_LS454'
			model = '454 GS FLX Titanium'
			lib_layout = 'single'
		elif args.platform == 'illumina':
			sequencer = 'ILLUMINA'
			model = 'Illumina MiSeq'
			lib_layout = 'paired'
		else:
			amptklib.log.error("You specified a platform that is not supported")
			sys.exit(1)
		lib_strategy = 'AMPLICON'
		lib_source = 'GENOMIC'
		lib_selection = 'RANDOM PCR'
		filetype = 'fastq'

		#the description is the same for every row, so build it once
		if not args.description:
			description = '%s amplicon library was created using a barcoded fusion primer PCR protocol using Pfx50 polymerase (Thermo Fisher Scientific), size selected, and sequenced on the %s platform.  Sequence data was minimally processed, sequences were exported directly from the sequencing platform and only the barcode (index sequence) was trimmed prior to SRA submission. SRA submission generated with AMPtk %s' % (args.title, model, amptkversion.split(' ')[-1])
		else:
			description = args.description

		#now open file for writing, input header and then loop through samples
		sub_out = base + '.submission.txt'
		with open(sub_out, 'w') as output:
			output.write(header)
			for file in filelist:
				barcode_for = ''
				barcode_rev = ''
				if args.platform == 'ion' or args.platform == '454':
					name = file.split(".fastq")[0]
					if name not in BioDict: #lets try to look a bit harder, i.e. split on _ and - and look again
						searchname = name.replace('-', '_')
						searchname = searchname.split('_')[0]
						if searchname not in BioDict: #if still not found, then skip
							continue
					else:
						searchname = name
					#every lookup uses searchname, the key that is known to be in BioDict
					biosample = BioDict[searchname]
					bioproject = biosample[1]
					if not bioproject.startswith('PRJNA'):
						bioproject = 'PRJNA'+bioproject
					sample_name = biosample[0]
					title = '%s amplicon sequencing of %s: sample %s' % (args.title, biosample[2], name)
					bc_name = file.split(".f")[0]
					if bc_name in Barcodes:
						barcode_for = Barcodes.get(bc_name)
					if bc_name in RevBarcodes:
						barcode_rev = RevBarcodes.get(bc_name)
					if args.append:
						finalname = name+'_'+args.append
						#also need to change the name for output files
						newfile = file.replace(name, finalname)
						os.rename(os.path.join(base, file), os.path.join(base, newfile))
					else:
						finalname = name
						newfile = file
					line = [bioproject,sample_name,finalname,title,lib_strategy,lib_source,lib_selection,lib_layout,sequencer,model,description,filetype,newfile,'',barcode_for,barcode_rev,FwdPrimer,RevPrimer]
				elif args.platform == 'illumina':
					name = file.split("_")[0]
					if name not in BioDict:
						amptklib.log.info('{:} not found in BioSample text file'.format(name))
						continue
					biosample = BioDict[name]
					bioproject = biosample[1]
					if not bioproject.startswith('PRJNA'):
						bioproject = 'PRJNA'+bioproject
					sample_name = biosample[0]
					title = '%s amplicon sequencing of %s: sample %s' % (args.title, biosample[2], name)
					file2 = file.replace('_R1', '_R2')
					#count number of _ in name, determines the dataformat
					fields = file.count("_")
					if fields > 3: #this is full illumina name with dual barcodes
						dualBC = file.split("_")[1]
						if '-' in dualBC:
							barcode_for = dualBC.split('-')[0]
							barcode_rev = dualBC.split('-')[1]
					elif fields == 3: #this is older reverse barcoded name
						barcode_for = ''
						barcode_rev = file.split("_")[1]
					if args.append:
						finalname = name+'_'+args.append
						newfile = file.replace(name, finalname)
						newfile2 = file2.replace(name, finalname)
						#also need to change the name for output files
						os.rename(os.path.join(base, file), os.path.join(base, newfile))
						#when the name has no '_R1' file2 is the same file, which was just renamed
						if file2 != file:
							os.rename(os.path.join(base, file2), os.path.join(base, newfile2))
					else:
						finalname = name
						newfile = file
						newfile2 = file2
					line = [bioproject,sample_name,finalname,title,lib_strategy,lib_source,lib_selection,lib_layout,sequencer,model,description,filetype,newfile,newfile2,barcode_for,barcode_rev,FwdPrimer,RevPrimer]
				#write output to file
				output.write('\t'.join(line)+'\n')
		amptklib.log.info("SRA submission file created: %s" % sub_out)
Ejemplo n.º 3
0
def main(args):
    """Process a folder of de-multiplexed Illumina FASTQ files for AMPtk.

    Steps performed, in order:

    1. Parse the command line and set up logging / dependency checks.
    2. Gunzip any ``*.fastq.gz`` files found in the input folder.
    3. Select the FASTQ files to process (restricted to the SampleIDs of the
       mapping file when one is given) and resolve the primer sequences.
    4. Run the per-sample workers (``safe_run`` for paired reads,
       ``safe_run2`` for forward-only / SRA reads) through
       ``amptklib.runMultiProgress``.
    5. Concatenate the per-sample ``*.demux.fq`` files, sum the ``*.stats``
       files, count reads per sample, write a mapping file and gzip the
       concatenated output.

    Args:
        args: list of command-line tokens (without the program name) that is
            handed to ``argparse``.

    Side effects:
        Sets the module-level globals ``FwdPrimer``, ``RevPrimer`` and
        ``usearch``; creates ``<out>/``, ``<out>.amptk-demux.log``,
        ``<out>.filenames.txt`` (non-SRA mode), ``<out>.mapping_file.txt`` and
        ``<out>.demux.fq.gz``. Calls ``sys.exit(1)`` on invalid input.
    """
    global FwdPrimer, RevPrimer, usearch
    parser = argparse.ArgumentParser(
        prog='amptk-process_illumina_folder.py',
        usage="%(prog)s [options] -i folder",
        description=
        '''Script that takes De-mulitplexed Illumina data from a folder and processes it for amptk (merge PE reads, strip primers, trim/pad to set length.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i', '--input', dest='input', required=True,
                        help='Folder of Illumina Data')
    parser.add_argument('-o', '--out', dest="out", default='amptk-illumina',
                        help='Name for output folder')
    parser.add_argument(
        '-m', '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('--reads', dest="reads", default='paired',
                        choices=['paired', 'forward'],
                        help='PE or forward reads')
    parser.add_argument('--read_length', type=int,
                        help='Read length, i.e. 2 x 300 bp = 300')
    parser.add_argument('-f', '--fwd_primer', dest="F_primer",
                        default='fITS7', help='Forward Primer (fITS7)')
    parser.add_argument('-r', '--rev_primer', dest="R_primer",
                        default='ITS4', help='Reverse Primer (ITS4)')
    parser.add_argument('--require_primer', dest="primer", default='on',
                        choices=['on', 'off'],
                        help='Require Fwd primer to be present')
    parser.add_argument('--primer_mismatch', default=2, type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch', default=1, type=int,
                        help='Number of mis-matches allowed in index')
    parser.add_argument('--rescue_forward', default='on',
                        choices=['on', 'off'],
                        help='Rescue Not-merged forward reads')
    parser.add_argument('--min_len', default=100, type=int,
                        help='Minimum read length to keep')
    parser.add_argument('--merge_method', default='usearch',
                        choices=['usearch', 'vsearch'],
                        help='Software to use for PE read merging')
    parser.add_argument('-l', '--trim_len', default=300, type=int,
                        help='Trim length for reads')
    parser.add_argument('--cpus', type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument(
        '--full_length', action='store_true',
        help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('-p', '--pad', default='off', choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('-u', '--usearch', dest="usearch",
                        default='usearch9', help='USEARCH executable')
    parser.add_argument(
        '--sra', action='store_true',
        help='Input files are from NCBI SRA not direct from illumina')
    parser.add_argument('--cleanup', action='store_true',
                        help='Delete all intermediate files')
    args = parser.parse_args(args)

    #sometimes people add slashes in the output directory, this could be bad, try to fix it
    args.out = re.sub(r'\W+', '', args.out)

    #create directory and check for existing logfile
    if not os.path.exists(args.out):
        os.makedirs(args.out)

    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)

    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #get version of amptk
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #Now all the data is in folder args.out that needs to be de-multiplexed
    if not args.cpus:
        cpus = multiprocessing.cpu_count()
    else:
        cpus = args.cpus

    #check folder if files are gzipped, then gunzip them
    gzip_list = [
        name for name in os.listdir(args.input) if name.endswith(".fastq.gz")
    ]
    #remember exactly which files were written so cleanup removes those and
    #nothing else
    unzipped_paths = []
    if gzip_list:
        amptklib.log.info("Gzipped files detected, uncompressing")
        for file in gzip_list:
            amptklib.log.debug("Uncompressing %s" % file)
            OutName = os.path.join(args.input, os.path.splitext(file)[0])
            amptklib.Funzip(os.path.join(args.input, file), OutName, cpus)
            unzipped_paths.append(OutName)

    #check for mapping file, if exists, then use names from first column only for filenames
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    FwdPrimer = ''
    RevPrimer = ''
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            #report through the logger like every other error in this script
            amptklib.log.error("Mapping file is not valid: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
        #NOTE(review): the result is not used below; the call is kept in case
        #the parser validates or logs the mapping file -- confirm and drop
        mapdata = amptklib.parseMappingFileIllumina(args.mapping_file)
        sample_names = tuple(SampleData.keys())
        #keep only the uncompressed FASTQ files whose name starts with a SampleID
        filenames = [
            name for name in os.listdir(args.input)
            if name.startswith(sample_names) and name.endswith('.fastq')
        ]

        if len(filenames) < 1:
            amptklib.log.error(
                "Found 0 valid files from mapping file. Mapping file SampleID must match start of filenames"
            )
            sys.exit(1)

    else:  #if not then search through and find all the files you can in the folder
        #Illumina file names look like the following:
        #<sample name>_<i5>-<i7>_L<lane (0-padded to 3 digits)>_R<read number>_<set number (0-padded to 3 digits>.fastq.gz
        filenames = [
            name for name in os.listdir(args.input) if name.endswith(".fastq")
        ]
        #look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #if files are from SRA, then do something different as they are already merged
    if args.sra:
        #copy each file into the output folder as <name>.fq and process as forward reads
        sampleDict = {}
        fastq_for = []
        for x in filenames:
            rename = os.path.basename(x).split(".f", -1)[0]
            sampleDict[rename] = 'unknown'
            shutil.copyfile(os.path.join(args.input, x),
                            os.path.join(args.out, rename + '.fq'))
            fastq_for.append(os.path.join(args.out, rename + '.fq'))
        args.reads = 'forward'
    else:
        if len(filenames) % 2 != 0:
            print(
                "Check your input files, they do not seem to be properly paired"
            )
            sys.exit(1)
        #check list for files, i.e. they need to have _R1 and _R2 in the filenames, otherwise throw exception
        if not any('_R1' in x for x in filenames):
            amptklib.log.error(
                "Did not find valid FASTQ files.  Your files must have _R1 and _R2 in filename, rename your files and restart script."
            )
            sys.exit(1)
        seen_names = set()
        fastq_for = []
        fastq_rev = []
        sampleDict = {}
        filename_map = args.out + '.filenames.txt'
        with open(filename_map, 'w') as map_file:
            map_file.write("Name\t[i5]\t[i7]\tLane\tSet_num\n")
            for item in sorted(filenames):
                if '_R1' in item:
                    fastq_for.append(os.path.join(args.input, item))
                if '_R2' in item:
                    fastq_rev.append(os.path.join(args.input, item))
                column = item.split("_")
                #only the first file seen for a sample is recorded
                if column[0] in seen_names:
                    continue
                seen_names.add(column[0])
                if "-" in column[1]:
                    #looking here for the linker between i5 and i7 seqs
                    i5, i7 = column[1].split("-")[:2]
                else:
                    i5, i7 = column[1], "None"
                try:
                    map_file.write("%s\t%s\t%s\t%s\t%s\n" %
                                   (column[0], i5, i7, column[2],
                                    column[4].split(".", 1)[0]))
                except IndexError:
                    #fewer underscore-separated fields than a full Illumina name
                    amptklib.log.debug(
                        "Non-standard names detected, skipping mapping file"
                    )
                if i7 != "None":
                    sampleDict[column[0]] = i5 + '-' + i7
                else:
                    sampleDict[column[0]] = i5

    if args.full_length and args.primer == 'off':
        amptklib.log.info(
            '--full_length is not compatible with --require_primer off, turning --full_length off'
        )
        args.full_length = False

    #tell user about number of cores using
    amptklib.log.info('Demuxing data using {:} cpus'.format(cpus))
    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    #zip read lists into a single list of tuples
    if args.reads == 'paired':
        amptklib.log.info(
            "Strip Primers and Merge PE reads. FwdPrimer: {:} RevPrimer: {:}".
            format(FwdPrimer, RevPrimer))
        readList = list(zip(fastq_for, fastq_rev))
        amptklib.runMultiProgress(safe_run, readList, cpus, args=args)
    else:
        amptklib.log.info(
            "Strip Primers. FwdPrimer: {:} RevPrimer: {:}".format(
                FwdPrimer, RevPrimer))
        amptklib.runMultiProgress(safe_run2, fastq_for, cpus, args=args)

    #cleanup to save space: remove the files that were decompressed above
    for path in unzipped_paths:
        amptklib.removefile(path)
    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    catDemux = args.out + '.demux.fq'
    with open(catDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(args.out, '*.demux.fq')):
            if filename == catDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)

    #parse the stats
    #(Total, ForPrimerFound, RevPrimerFound, multiHits, TooShort, ValidSeqs))
    finalstats = [0, 0, 0, 0, 0, 0]
    for file in os.listdir(args.out):
        if not file.endswith('.stats'):
            continue
        with open(os.path.join(args.out, file), 'r') as statsfile:
            #first line holds the comma-separated integer counters
            line = statsfile.readline().replace('\n', '')
        for x, num in enumerate(int(i) for i in line.split(',')):
            finalstats[x] += num
    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    amptklib.log.info('{0:,}'.format(finalstats[1]) +
                      ' Fwd Primer found, {0:,}'.format(finalstats[2]) +
                      ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[3]) +
                      ' discarded Primer incompatibility')
    amptklib.log.info('{0:,}'.format(finalstats[4]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[5]) + ' valid output reads')

    #now loop through data and find barcoded samples, counting each.....
    BarcodeCount = {}
    with open(catDemux, 'r') as demux_handle:
        #every 4th line of a FASTQ file is a record header
        for line in itertools.islice(demux_handle, 0, None, 4):
            ID = line.split("=")[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    barcode_counts = "%30s:  %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        barcode_counts += "\n%30s:  %s" % (k, str(v))
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        #create a generic mappingfile for downstream processes
        amptklib.CreateGenericMappingFileIllumina(sampleDict, FwdPrimer,
                                                  RevPrimer, genericmapfile,
                                                  BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount,
                                   genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)
    if args.cleanup:
        shutil.rmtree(args.out)
    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Ejemplo n.º 4
0
def main(args):
    """Remove whole samples (barcode labels) from an AMPtk demuxed FASTQ file.

    The set of sample names to drop is the union of: samples whose read count
    is less than or equal to ``--threshold``, names listed one per line in
    ``--file``, and names given directly with ``--list``. The actual filtering
    is delegated to ``filter_sample``.

    Args:
        args: list of command-line tokens (without the program name) that is
            handed to ``argparse``.

    Side effects:
        Writes the filtered output file (gzip-compressed in place when
        ``--out`` ends in ``.gz``), removes the temporary decompressed input
        when ``--input`` was gzipped, and prints a summary to stdout.
    """
    # filter_sample() only receives the input and output paths, so the removal
    # set and the read counters must live at module level to reach it.
    # NOTE(review): assumes filter_sample reads/updates these globals -- confirm.
    global keep_list, keep_count, total_count
    parser = argparse.ArgumentParser(
        prog='amptk-remove_samples.py',
        description='''Script parses AMPtk de-multiplexed FASTQ file and keeps those sequences with barocde names in list ''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True, help='Input AMPtk demux FASTQ')
    parser.add_argument('-l', '--list', nargs='+', help='Input list of (BC) names to remove')
    parser.add_argument('-t', '--threshold', type=int, help='Keep samples with more reads than threshold')
    parser.add_argument('-f', '--file', help='File containing list of names to remove')
    parser.add_argument('-o', '--out', required=True, help='Output name')
    parser.add_argument('--format', default='fastq', choices=['fastq', 'fasta'], help='format of output file')
    args = parser.parse_args(args)

    # decompress the input first if needed; strip only the trailing ".gz"
    if args.input.endswith('.gz'):
        SeqIn = args.input[:-len('.gz')]
        amptklib.Funzip(args.input, SeqIn, multiprocessing.cpu_count())
    else:
        SeqIn = args.input

    remove = []
    if args.threshold:
        print("Finding samples with less than %i reads" % args.threshold)
        BC_counts = countBarcodes(SeqIn)
        # the comparison is inclusive: samples with exactly `threshold` reads
        # are dropped too ("keep samples with more reads than threshold")
        remove = [k for k, v in BC_counts.items() if int(v) <= args.threshold]
        print("Removing samples: %s" % ','.join(remove))

    if args.file:
        # load the list of sample names to remove, one per line; surrounding
        # whitespace (including "\r" from Windows files) and blank lines are
        # ignored so they cannot silently fail to match
        with open(args.file, 'r') as names_handle:
            lines = [line.strip() for line in names_handle if line.strip()]
        print("Removing samples from file: %s" % ','.join(lines))
        remove = remove + lines

    if args.list:
        lines = args.list
        print("Removing samples from list: %s" % ','.join(lines))
        remove = remove + lines

    # a set gives fast membership tests; despite its name it holds the sample
    # names that are going to be removed
    keep_list = set(remove)
    count = len(keep_list)

    # reset the counters before filtering
    keep_count = 0
    total_count = 0

    # write to the uncompressed name, compress afterwards if requested
    if args.out.endswith('.gz'):
        outfile = args.out[:-len('.gz')]
    else:
        outfile = args.out
    # run filtering
    filter_sample(SeqIn, outfile)
    # compress and clean
    if args.out.endswith('.gz'):  # compress in place
        amptklib.Fzip_inplace(outfile)
    if args.input.endswith('.gz'):
        amptklib.removefile(SeqIn)

    print("Removed %i samples" % count)
    print("Kept %i reads out of %i total reads" % (keep_count, total_count))
Ejemplo n.º 5
0
def main(args):
    """Randomly sub-sample each sample (barcode) down to at most N reads.

    Reads are grouped by the barcode label parsed from their header
    (the text after the last ``=`` up to the next ``;``). Any sample holding
    more than ``--num_reads`` reads is reduced with ``random.sample``; smaller
    samples are kept whole. The selected reads are written with
    ``filterSeqs``.

    Relies on the module-level ``SeqIndex`` and ``BarcodeCount`` objects.
    NOTE(review): these are presumably populated by ``IndexSeqs`` and
    ``countBarcodes`` respectively -- confirm against those helpers.

    Args:
        args: list of command-line tokens (without the program name) that is
            handed to ``argparse``.

    Side effects:
        Writes the sub-sampled output file (gzip-compressed in place when
        ``--out`` ends in ``.gz``), removes the temporary decompressed input
        when ``--input`` was gzipped, and prints progress to stdout.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-barcode_rarify.py',
        description=
        '''Script to sub-sample reads down to the same number for each sample (barcode)''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i', '--input', required=True, help='Input FASTQ')
    parser.add_argument('-n',
                        '--num_reads',
                        required=True,
                        type=int,
                        help='Number of reads to rarify down to')
    parser.add_argument('-o', '--out', required=True, help='Output name')
    args = parser.parse_args(args)

    #decompress the input if needed; strip only the trailing ".gz"
    if args.input.endswith('.gz'):
        SeqIn = args.input[:-len('.gz')]
        amptklib.Funzip(args.input, SeqIn, multiprocessing.cpu_count())
    else:
        SeqIn = args.input
    if args.out.endswith('.gz'):
        outfile = args.out[:-len('.gz')]
    else:
        outfile = args.out

    IndexSeqs(SeqIn)
    countBarcodes(SeqIn)
    print("----------------------------------")
    print("Now sub-sampling reads down to a max of %s per sample" %
          args.num_reads)
    #group the read ids by barcode label in a single pass over the index
    #(instead of rescanning every read once per barcode)
    by_sample = {key: [] for key in BarcodeCount}
    for rec in SeqIndex:
        ID = rec.split("=")[-1].split(";")[0]
        bucket = by_sample.get(ID)
        if bucket is not None:
            bucket.append(rec)
    #keep the samples in BarcodeCount order so random.sample is called in
    #the same sequence as before
    Reads = [by_sample[key] for key in BarcodeCount]
    print("Finished indexing reads, split up by barcodelabel")

    Subsample = []
    for sample in Reads:
        if len(sample) > args.num_reads:
            sample = random.sample(sample, args.num_reads)
        Subsample.extend(sample)

    #convert list to set for faster lookup
    Lookup = set(Subsample)

    print("Finished randomly sampling reads, now writing %i sequences to %s" %
          (len(Lookup), outfile))
    filterSeqs(SeqIn, Lookup, outfile)
    print("----------------------------------")
    countBarcodes(outfile)
    #compress and clean
    if args.out.endswith('.gz'):  #compress in place
        amptklib.Fzip_inplace(outfile)
    if args.input.endswith('.gz'):
        amptklib.removefile(SeqIn)
    print("----------------------------------")
    print("Sub-sampling done: %s" % args.out)
Ejemplo n.º 6
0
def main(args):
    parser = argparse.ArgumentParser(
        prog='amptk-dada2.py',
        description=
        '''Script takes output from amptk pre-processing and runs DADA2''',
        epilog="""Written by Jon Palmer (2016) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        required=True,
                        help='Input Demuxed containing FASTQ')
    parser.add_argument('-o', '--out', help='Output Basename')
    parser.add_argument(
        '-m',
        '--min_reads',
        default=10,
        type=int,
        help="Minimum number of reads after Q filtering to run DADA2 on")
    parser.add_argument('-l',
                        '--length',
                        type=int,
                        help='Length to truncate reads')
    parser.add_argument('-e',
                        '--maxee',
                        default='1.0',
                        help='MaxEE quality filtering')
    parser.add_argument('-p',
                        '--pct_otu',
                        default='97',
                        help="Biological OTU Clustering Percent")
    parser.add_argument('--platform',
                        default='ion',
                        choices=['ion', 'illumina', '454'],
                        help='Sequencing platform')
    parser.add_argument('--chimera_method',
                        default='consensus',
                        choices=['consensus', 'pooled', 'per-sample'],
                        help='bimera removal method')
    parser.add_argument('--uchime_ref',
                        help='Run UCHIME REF [ITS,16S,LSU,COI,custom]')
    parser.add_argument('--pool',
                        action='store_true',
                        help='Pool all sequences together for DADA2')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Keep all intermediate files')
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH9 EXE')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    args = parser.parse_args(args)

    parentdir = os.path.join(os.path.dirname(amptklib.__file__))
    dada2script = os.path.join(parentdir, 'dada2_pipeline_nofilt.R')

    #get basename if not args.out passed
    if args.out:
        base = args.out
    else:
        if 'demux' in args.fastq:
            base = os.path.basename(args.fastq).split('.demux')[0]
        else:
            base = os.path.basename(args.fastq).split('.f')[0]

    #remove logfile if exists
    log_name = base + '.amptk-dada2.log'
    if os.path.isfile(log_name):
        amptklib.removefile(log_name)

    amptklib.setupLogging(log_name)
    FNULL = open(os.devnull, 'w')
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")
    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of cores
    if args.cpus:
        CORES = str(args.cpus)
    else:
        CORES = str(amptklib.getCPUS())

    #check dependencies
    programs = ['Rscript']
    amptklib.CheckDependencies(programs)
    Rversions = amptklib.checkRversion()
    R_pass = '******'
    dada2_pass = '******'

    #check dada2 first, if good move on, otherwise issue warning
    if not amptklib.gvc(Rversions[1], dada2_pass):
        amptklib.log.error("R v%s; DADA2 v%s detected, need atleast v%s" %
                           (Rversions[0], Rversions[1], dada2_pass))
        amptklib.log.error(
            "See: http://benjjneb.github.io/dada2/dada-installation.html")
        sys.exit(1)
    amptklib.log.info("R v%s; DADA2 v%s" % (Rversions[0], Rversions[1]))

    #Count FASTQ records and remove 3' N's as dada2 can't handle them
    amptklib.log.info("Loading FASTQ Records")
    no_ns = base + '.cleaned_input.fq'
    if args.fastq.endswith('.gz'):
        fastqInput = args.fastq.replace('.gz', '')
        amptklib.Funzip(os.path.abspath(args.fastq),
                        os.path.basename(fastqInput), CORES)
    else:
        fastqInput = os.path.abspath(args.fastq)
    amptklib.fastq_strip_padding(os.path.basename(fastqInput), no_ns)
    demuxtmp = base + '.original.fa'
    cmd = [
        'vsearch', '--fastq_filter',
        os.path.abspath(no_ns), '--fastq_qmax', '55', '--fastaout', demuxtmp,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    orig_total = amptklib.countfasta(demuxtmp)
    size = amptklib.checkfastqsize(no_ns)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #quality filter
    amptklib.log.info("Quality Filtering, expected errors < %s" % args.maxee)
    derep = base + '.qual-filtered.fq'
    filtercmd = [
        'vsearch', '--fastq_filter', no_ns, '--fastq_maxee',
        str(args.maxee), '--fastqout', derep, '--fastq_qmax', '55',
        '--fastq_maxns', '0', '--threads', CORES
    ]
    amptklib.runSubprocess(filtercmd, amptklib.log)
    total = amptklib.countfastq(derep)
    amptklib.log.info('{0:,}'.format(total) + ' reads passed')

    #split into individual files
    amptklib.log.info("Splitting FASTQ file by Sample into individual files")
    filtfolder = base + '_filtered'
    if os.path.isdir(filtfolder):
        shutil.rmtree(filtfolder)
    os.makedirs(filtfolder)
    splitDemux2(derep, filtfolder, args=args)

    #check for minimum number of reads in each sample
    remove = []
    files = [i for i in os.listdir(filtfolder) if i.endswith('.fastq')]
    for x in files:
        if amptklib.countfastq(os.path.join(filtfolder, x)) < args.min_reads:
            remove.append(x)
    if len(remove) > 0:
        amptklib.log.info("Dropping %s as fewer than %i reads" %
                          (', '.join(remove), args.min_reads))
        for y in remove:
            os.remove(os.path.join(filtfolder, y))

    #now run DADA2 on filtered folder
    amptklib.log.info("Running DADA2 pipeline")
    dada2log = base + '.dada2.Rscript.log'
    dada2out = base + '.dada2.csv'
    #check pooling vs notpooled, default is not pooled.
    if args.pool:
        POOL = 'TRUE'
    else:
        POOL = 'FALSE'
    with open(dada2log, 'w') as logfile:
        subprocess.call([
            'Rscript', '--vanilla', dada2script, filtfolder, dada2out,
            args.platform, POOL, CORES, args.chimera_method
        ],
                        stdout=logfile,
                        stderr=logfile)

    #check for results
    if not os.path.isfile(dada2out):
        amptklib.log.error("DADA2 run failed, please check %s logfile" %
                           dada2log)
        sys.exit(1)

    #now process the output, pull out fasta, rename, etc
    fastaout = base + '.otus.tmp'
    OTUCounts = {}
    counter = 1
    with open(fastaout, 'w') as writefasta:
        with open(dada2out, 'r') as input:
            next(input)
            for line in input:
                line = line.replace('\n', '')
                line = line.replace('"', '')
                cols = line.split(',')
                Seq = cols[0]
                countList = [int(x) for x in cols[1:]]
                counts = sum(countList)
                ID = 'ASV' + str(counter)
                if not ID in OTUCounts:
                    OTUCounts[ID] = counts
                writefasta.write(">%s\n%s\n" % (ID, Seq))
                counter += 1

    #get number of bimeras from logfile
    with open(dada2log, 'r') as bimeracheck:
        for line in bimeracheck:
            if line.startswith('Identified '):
                bimeraline = line.split(' ')
                bimeras = int(bimeraline[1])
                totalSeqs = int(bimeraline[5])
    validSeqs = totalSeqs - bimeras
    amptklib.log.info('{0:,}'.format(totalSeqs) +
                      ' total amplicon sequence variants (ASVs)')
    amptklib.log.info('{0:,}'.format(bimeras) + ' denovo chimeras removed')
    amptklib.log.info('{0:,}'.format(validSeqs) + ' valid ASVs')

    #optional UCHIME Ref
    uchime_out = base + '.nonchimeras.fa'
    chimeraFreeTable = base + '.otu_table.txt'
    iSeqs = base + '.ASVs.fa'
    if not args.uchime_ref:
        os.rename(fastaout, iSeqs)
    else:
        #check if file is present, remove from previous run if it is.
        if os.path.isfile(iSeqs):
            amptklib.removefile(iSeqs)
        #R. Edgar now says using largest DB is better for UCHIME, so use the one distributed with taxonomy
        if args.uchime_ref in [
                'ITS', '16S', 'LSU', 'COI'
        ]:  #test if it is one that is setup, otherwise default to full path
            uchime_db = os.path.join(parentdir, 'DB', args.uchime_ref + '.udb')
            if not os.path.isfile(uchime_db):
                amptklib.log.error(
                    "Database not properly configured, run `amptk install` to setup DB, skipping chimera filtering"
                )
                uchime_out = fastaout
            #since uchime cannot work with udb database, need to extract fasta sequences, do this if
            if not amptklib.checkfile(
                    os.path.join(parentdir, 'DB',
                                 args.uchime_ref + '.extracted.fa')):
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
                cmd = [
                    'vsearch', '--udb2fasta',
                    os.path.join(parentdir, 'DB', args.uchime_ref + '.udb'),
                    '--output', uchime_db
                ]
                amptklib.runSubprocess(cmd, amptklib.log)
            else:
                uchime_db = os.path.join(parentdir, 'DB',
                                         args.uchime_ref + '.extracted.fa')
        else:
            if os.path.isfile(args.uchime_ref):
                uchime_db = os.path.abspath(args.uchime_ref)
            else:
                amptklib.log.error(
                    "%s is not a valid file, skipping reference chimera filtering"
                    % args.uchime_ref)
                iSeqs = fastaout
        #now run chimera filtering if all checks out
        if not os.path.isfile(iSeqs):
            amptklib.log.info("Chimera Filtering (VSEARCH) using %s DB" %
                              args.uchime_ref)
            cmd = [
                'vsearch', '--mindiv', '1.0', '--uchime_ref', fastaout, '--db',
                uchime_db, '--nonchimeras', iSeqs, '--threads', CORES
            ]
            amptklib.runSubprocess(cmd, amptklib.log)
            total = amptklib.countfasta(iSeqs)
            uchime_chimeras = validSeqs - total
            amptklib.log.info('{0:,}'.format(total) + ' ASVs passed, ' +
                              '{0:,}'.format(uchime_chimeras) +
                              ' ref chimeras removed')
            if os.path.isfile(fastaout):
                amptklib.removefile(fastaout)

    #setup output files
    dadademux = base + '.dada2.map.uc'
    bioSeqs = base + '.cluster.otus.fa'
    bioTable = base + '.cluster.otu_table.txt'
    uctmp = base + '.map.uc'
    ClusterComp = base + '.ASVs2clusters.txt'

    #Filter out ASVs in wrong orientation
    amptklib.log.info('Validating ASV orientation')
    os.rename(iSeqs, iSeqs + '.bak')
    numKept, numDropped = amptklib.validateorientationDADA2(
        OTUCounts, iSeqs + '.bak', iSeqs)
    amptklib.log.info('{:,} ASVs validated ({:,} dropped)'.format(
        numKept, numDropped))
    amptklib.SafeRemove(iSeqs + '.bak')

    #map reads to DADA2 OTUs
    amptklib.log.info("Mapping reads to DADA2 ASVs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', iSeqs, '--id', '0.97',
        '--uc', dadademux, '--strand', 'plus', '--otutabout', chimeraFreeTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(dadademux)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to ASVs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    #cluster
    amptklib.log.info("Clustering ASVs at %s%% to generate biological OTUs" %
                      args.pct_otu)
    radius = float(args.pct_otu) / 100.
    cmd = [
        'vsearch', '--cluster_smallmem', iSeqs, '--centroids', bioSeqs, '--id',
        str(radius), '--strand', 'plus', '--relabel', 'OTU', '--qmask', 'none',
        '--usersort', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.countfasta(bioSeqs)
    amptklib.log.info('{0:,}'.format(total) + ' OTUs generated')

    #determine where iSeqs clustered
    iSeqmap = base + '.ASV_map.uc'
    cmd = [
        'vsearch', '--usearch_global', iSeqs, '--db', bioSeqs, '--id',
        str(radius), '--uc', iSeqmap, '--strand', 'plus', '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    iSeqMapped = {}
    with open(iSeqmap, 'r') as mapping:
        for line in mapping:
            line = line.replace('\n', '')
            cols = line.split('\t')
            OTU = cols[9]
            Hit = cols[8]
            if not OTU in iSeqMapped:
                iSeqMapped[OTU] = [Hit]
            else:
                iSeqMapped[OTU].append(Hit)
    with open(ClusterComp, 'w') as clusters:
        clusters.write('OTU\tASVs\n')
        for k, v in natsorted(list(iSeqMapped.items())):
            clusters.write('%s\t%s\n' % (k, ', '.join(v)))
    #create OTU table
    amptklib.log.info("Mapping reads to OTUs")
    cmd = [
        'vsearch', '--usearch_global', demuxtmp, '--db', bioSeqs, '--id',
        '0.97', '--uc', uctmp, '--strand', 'plus', '--otutabout', bioTable,
        '--threads', CORES
    ]
    amptklib.runSubprocess(cmd, amptklib.log)
    total = amptklib.line_count2(uctmp)
    amptklib.log.info('{0:,}'.format(total) + ' reads mapped to OTUs ' +
                      '({0:.0f}%)'.format(total / float(orig_total) * 100))

    if not args.debug:
        amptklib.removefile(no_ns)
        shutil.rmtree(filtfolder)
        amptklib.removefile(dada2out)
        amptklib.removefile(derep)
        amptklib.removefile(demuxtmp)
        amptklib.removefile(uctmp)
        amptklib.removefile(iSeqmap)
        amptklib.removefile(dadademux)

    #Print location of files to STDOUT
    print("-------------------------------------------------------")
    print("DADA2 Script has Finished Successfully")
    print("-------------------------------------------------------")
    if args.debug:
        print("Tmp Folder of files: %s" % filtfolder)
    print("Amplicon sequence variants: %s" % iSeqs)
    print("ASV OTU Table: %s" % chimeraFreeTable)
    print("Clustered OTUs: %s" % bioSeqs)
    print("OTU Table: %s" % bioTable)
    print("ASVs 2 OTUs: %s" % ClusterComp)
    print("-------------------------------------------------------")

    otu_print = bioSeqs.split('/')[-1]
    tab_print = bioTable.split('/')[-1]
    if 'darwin' in sys.platform:
        print(colr.WARN + "\nExample of next cmd:" + colr.END +
              " amptk filter -i %s -f %s -b <mock barcode>\n" %
              (tab_print, otu_print))
    else:
        print(
            "\nExample of next cmd: amptk filter -i %s -f %s -b <mock barcode>\n"
            % (tab_print, otu_print))
Ejemplo n.º 7
0
def main(args):
    """Demultiplex Ion Torrent / 454 reads into a single labelled FASTQ.

    Steps, in order: parse the command line, set up barcodes and primers
    (from a mapping file or from barcode FASTA files), convert the input
    (gz / SFF / FASTA+QUAL / BAM) to FASTQ, run ``processRead`` over the
    reads (split across CPUs when more than one is available), concatenate
    and re-index the demuxed chunks, log summary statistics, write a
    mapping file and gzip the final output.

    Args:
        args: list of command-line argument strings handed to argparse.

    Side effects:
        Sets the module globals ``FwdPrimer``, ``RevPrimer``, ``Barcodes``
        and ``tmpdir``; writes ``<out>.*`` files in the current directory;
        exits the process with status 1 on invalid input.
    """
    global FwdPrimer, RevPrimer, Barcodes, tmpdir
    parser = argparse.ArgumentParser(
        prog='amptk-process_ion.py',
        usage="%(prog)s [options] -i file.fastq\n%(prog)s -h for help menu",
        description=
        '''Script finds barcodes, strips forward and reverse primers, relabels, and then trim/pads reads to a set length''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)

    parser.add_argument('-i',
                        '--fastq',
                        '--sff',
                        '--fasta',
                        '--bam',
                        dest='fastq',
                        required=True,
                        help='BAM/FASTQ/SFF/FASTA file')
    parser.add_argument('-q', '--qual', help='QUAL file (if -i is FASTA)')
    parser.add_argument('-o',
                        '--out',
                        dest="out",
                        default='ion',
                        help='Base name for output')
    parser.add_argument('-f',
                        '--fwd_primer',
                        dest="F_primer",
                        default='fITS7-ion',
                        help='Forward Primer')
    parser.add_argument('-r',
                        '--rev_primer',
                        dest="R_primer",
                        default='ITS4',
                        help='Reverse Primer')
    parser.add_argument(
        '-m',
        '--mapping_file',
        help='Mapping file: QIIME format can have extra meta data columns')
    parser.add_argument('-p',
                        '--pad',
                        default='off',
                        choices=['on', 'off'],
                        help='Pad with Ns to a set length')
    parser.add_argument('--primer_mismatch',
                        default=2,
                        type=int,
                        help='Number of mis-matches in primer')
    parser.add_argument('--barcode_mismatch',
                        default=0,
                        type=int,
                        help='Number of mis-matches in barcode')
    parser.add_argument(
        '--barcode_fasta',
        default='ionxpress',
        help='FASTA file containing Barcodes (Names & Sequences)')
    parser.add_argument('--reverse_barcode',
                        help='FASTA file containing 3 prime Barocdes')
    parser.add_argument('-b',
                        '--list_barcodes',
                        dest="barcodes",
                        default='all',
                        help='Enter Barcodes used separated by commas')
    parser.add_argument('--min_len',
                        default=100,
                        type=int,
                        help='Minimum read length to keep')
    parser.add_argument('-l',
                        '--trim_len',
                        default=300,
                        type=int,
                        help='Trim length for reads')
    parser.add_argument(
        '--full_length',
        action='store_true',
        help='Keep only full length reads (no trimming/padding)')
    parser.add_argument('--mult_samples',
                        dest="multi",
                        default='False',
                        help='Combine multiple samples (i.e. FACE1)')
    parser.add_argument('--ion',
                        action='store_true',
                        help='Input data is Ion Torrent')
    parser.add_argument('--454', action='store_true', help='Input data is 454')
    parser.add_argument('--cpus',
                        type=int,
                        help="Number of CPUs. Default: auto")
    parser.add_argument('-u',
                        '--usearch',
                        dest="usearch",
                        default='usearch9',
                        help='USEARCH EXE')
    args = parser.parse_args(args)

    # keep only word characters in the output base name
    args.out = re.sub(r'\W+', '', args.out)

    log_name = args.out + '.amptk-demux.log'
    if os.path.isfile(log_name):
        os.remove(log_name)
    amptklib.setupLogging(log_name)
    cmd_args = " ".join(sys.argv) + '\n'
    amptklib.log.debug(cmd_args)
    print("-------------------------------------------------------")

    #initialize script, log system info and usearch version
    amptklib.SystemInfo()
    #Do a version check
    usearch = args.usearch
    amptklib.versionDependencyChecks(usearch)

    #get number of CPUs to use
    cpus = args.cpus if args.cpus else multiprocessing.cpu_count()

    def _write_prefixed_barcodes(src_fasta, dst_fasta):
        # Copy a barcode FASTA, prefixing every record id with
        # "<--mult_samples value>." so samples from several runs stay distinct.
        with open(dst_fasta, 'w') as barcodeout:
            with open(src_fasta, 'r') as handle:
                for rec in SeqIO.parse(handle, 'fasta'):
                    outname = args.multi + '.' + rec.id
                    barcodeout.write(">%s\n%s\n" % (outname, rec.seq))

    #parse a mapping file or a barcode fasta file, primers, etc get setup
    #dealing with Barcodes, get ion barcodes or parse the barcode_fasta argument
    barcode_file = args.out + ".barcodes_used.fa"
    rev_barcode_file = args.out + '.revbarcodes_used.fa'
    amptklib.SafeRemove(barcode_file)
    amptklib.SafeRemove(rev_barcode_file)

    #check if mapping file passed, use this if present, otherwise use command line arguments
    SampleData = {}
    Barcodes = {}
    RevBarcodes = {}
    if args.mapping_file:
        if not os.path.isfile(args.mapping_file):
            amptklib.log.error("Mapping file not found: %s" %
                               args.mapping_file)
            sys.exit(1)
        SampleData, Barcodes, RevBarcodes, FwdPrimer, RevPrimer = amptklib.parseMappingFileNEW(
            args.mapping_file)
    else:  #no mapping file, so create dictionaries from barcode fasta files
        # barcode sets that ship with the package, keyed by --barcode_fasta value
        bundled_sets = {
            'ionxpress': 'ionxpress_barcodes.fa',
            'ioncode': 'ioncode_barcodes.fa',
        }
        if args.barcode_fasta in bundled_sets:
            pgm_barcodes = os.path.join(os.path.dirname(amptklib.__file__),
                                        'DB',
                                        bundled_sets[args.barcode_fasta])
            if args.barcodes == "all":
                if args.multi == 'False':
                    shutil.copyfile(pgm_barcodes, barcode_file)
                else:
                    _write_prefixed_barcodes(pgm_barcodes, barcode_file)
            else:
                # only a user-selected subset of the bundled barcodes
                bc_list = args.barcodes.split(",")
                # plain "r": the legacy "rU" mode is rejected by Python 3.11+
                with open(pgm_barcodes, "r") as barcode_handle:
                    SeqRecords = SeqIO.to_dict(
                        SeqIO.parse(barcode_handle, "fasta"))
                # open the output once; barcode_file was removed above so
                # appending starts from an empty file
                with open(barcode_file, "a") as outputSeqFile:
                    for rec in bc_list:
                        name = "BC." + rec
                        if name not in SeqRecords:
                            amptklib.log.error(
                                "Barcode %s not found in %s" %
                                (name, pgm_barcodes))
                            sys.exit(1)
                        seq = SeqRecords[name].seq
                        if args.multi != 'False':
                            outname = args.multi + '.' + name
                        else:
                            outname = name
                        outputSeqFile.write(">%s\n%s\n" % (outname, seq))
        else:
            #check for multi_samples and add if necessary
            if args.multi == 'False':
                shutil.copyfile(args.barcode_fasta, barcode_file)
                if args.reverse_barcode:
                    shutil.copyfile(args.reverse_barcode, rev_barcode_file)
            else:
                _write_prefixed_barcodes(args.barcode_fasta, barcode_file)
                if args.reverse_barcode:
                    _write_prefixed_barcodes(args.reverse_barcode,
                                             rev_barcode_file)

        #parse primers here so doesn't conflict with mapping primers
        #look up primer db otherwise default to entry
        if args.F_primer in amptklib.primer_db:
            FwdPrimer = amptklib.primer_db.get(args.F_primer)
            amptklib.log.info(
                "{:} fwd primer found in AMPtk primer db, setting to: {:}".
                format(args.F_primer, FwdPrimer))
        else:
            FwdPrimer = args.F_primer
            amptklib.log.info(
                "{:} fwd primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.F_primer))
        if args.R_primer in amptklib.primer_db:
            RevPrimer = amptklib.primer_db.get(args.R_primer)
            amptklib.log.info(
                "{:} rev primer found in AMPtk primer db, setting to: {:}".
                format(args.R_primer, RevPrimer))
        else:
            RevPrimer = args.R_primer
            amptklib.log.info(
                "{:} rev primer not found in AMPtk primer db, assuming it is actual primer sequence."
                .format(args.R_primer))

    #check if input is compressed
    gz_suffix = '.gz'
    gzip_list = []
    if args.fastq.endswith(gz_suffix):
        gzip_list.append(os.path.abspath(args.fastq))
    if gzip_list:
        amptklib.log.info("Gzipped input files detected, uncompressing")
        for gz_path in gzip_list:
            # strip only the trailing suffix; str.replace would also mangle
            # any '.gz' appearing earlier in the path
            amptklib.Funzip(gz_path, gz_path[:-len(gz_suffix)], cpus)
        args.fastq = args.fastq[:-len(gz_suffix)]

    #if SFF file passed, convert to FASTQ with biopython
    if args.fastq.endswith(".sff"):
        if args.barcode_fasta == 'ionxpress' and not args.mapping_file:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
            )
            sys.exit(1)
        amptklib.log.info("SFF input detected, converting to FASTQ")
        SeqIn = args.out + '.sff.extract.fastq'
        SeqIO.convert(args.fastq, "sff-trim", SeqIn, "fastq")
    elif args.fastq.endswith((".fas", ".fasta", ".fa")):
        if not args.qual:
            amptklib.log.error(
                "FASTA input detected, however no QUAL file was given.  You must have FASTA + QUAL files"
            )
            sys.exit(1)
        if args.barcode_fasta == 'ionxpress' and not args.mapping_file:
            amptklib.log.error(
                "You did not specify a --barcode_fasta or --mapping_file, one is required for 454 data"
            )
            sys.exit(1)
        SeqIn = args.out + '.fastq'
        amptklib.log.info("FASTA + QUAL detected, converting to FASTQ")
        amptklib.faqual2fastq(args.fastq, args.qual, SeqIn)
    elif args.fastq.endswith('.bam'):
        #so we can convert natively with pybam, however it is 10X slower than bedtools/samtools
        #since samtools is fastest, lets use that if exists, if not then bedtools, else default to pybam
        amptklib.log.info("Converting Ion Torrent BAM file to FASTQ")
        SeqIn = args.out + '.fastq'
        if amptklib.which('samtools'):
            cmd = ['samtools', 'fastq', '-@', str(cpus), args.fastq]
            amptklib.runSubprocess2(cmd, amptklib.log, SeqIn)
        elif amptklib.which('bedtools'):
            cmd = ['bedtools', 'bamtofastq', '-i', args.fastq, '-fq', SeqIn]
            amptklib.runSubprocess(cmd, amptklib.log)
        else:  #default to pybam
            amptklib.bam2fastq(args.fastq, SeqIn)
    else:
        SeqIn = args.fastq

    #start here to process the reads, first reverse complement the reverse primer
    catDemux = args.out + '.demux.fq'
    # keep the un-complemented primer for the generated mapping file
    origRevPrimer = RevPrimer
    RevPrimer = amptklib.RevComp(RevPrimer)
    amptklib.log.info("Foward primer: %s,  Rev comp'd rev primer: %s" %
                      (FwdPrimer, RevPrimer))

    #then setup barcode dictionary
    if len(Barcodes) < 1:
        Barcodes = amptklib.fasta2barcodes(barcode_file, False)

    #setup for looking for reverse barcode
    if len(RevBarcodes) < 1 and args.reverse_barcode:
        if not os.path.isfile(args.reverse_barcode):
            amptklib.log.info("Reverse barcode is not a valid file, exiting")
            sys.exit(1)
        # NOTE(review): this copy overwrites any --mult_samples-prefixed
        # reverse barcode file written earlier -- confirm that is intended.
        shutil.copyfile(args.reverse_barcode, rev_barcode_file)
        RevBarcodes = amptklib.fasta2barcodes(rev_barcode_file, True)

    #Count FASTQ records
    amptklib.log.info("Loading FASTQ Records")
    orig_total = amptklib.countfastq(SeqIn)
    size = amptklib.checkfastqsize(SeqIn)
    readablesize = amptklib.convertSize(size)
    amptklib.log.info('{0:,}'.format(orig_total) + ' reads (' + readablesize +
                      ')')

    #create tmpdir and split input into n cpus
    tmpdir = args.out.split('.')[0] + '_' + str(os.getpid())
    if not os.path.exists(tmpdir):
        os.makedirs(tmpdir)

    amptklib.log.info(
        'Dropping reads less than {:} bp and setting lossless trimming to {:} bp.'
        .format(args.min_len, args.trim_len))

    if cpus > 1:
        #split fastq file
        amptklib.log.info("Splitting FASTQ files over {:} cpus".format(cpus))
        amptklib.split_fastq(SeqIn, orig_total, tmpdir, cpus * 2)
        #now get file list from tmp folder
        file_list = [
            os.path.join(tmpdir, chunk) for chunk in os.listdir(tmpdir)
            if chunk.endswith(".fq")
        ]
        #finally process reads over number of cpus
        amptklib.runMultiProgress(processRead, file_list, cpus, args=args)
    else:
        single_chunk = os.path.join(tmpdir, 'chunk.fq')
        shutil.copyfile(SeqIn, single_chunk)
        processRead(single_chunk, args=args)

    print("-------------------------------------------------------")
    #Now concatenate all of the demuxed files together
    amptklib.log.info("Concatenating Demuxed Files")

    tmpDemux = args.out + '.tmp.demux.fq'
    with open(tmpDemux, 'w') as outfile:
        for filename in glob.glob(os.path.join(tmpdir, '*.demux.fq')):
            if filename == tmpDemux:
                continue
            with open(filename, 'r') as readfile:
                shutil.copyfileobj(readfile, outfile)
    #parse the stats: each *.stats file holds one comma-separated line of
    #integer counters, summed here position by position
    #(what each position means is inferred from the log messages below)
    finalstats = [0, 0, 0, 0, 0, 0, 0]
    for stats_name in os.listdir(tmpdir):
        if stats_name.endswith('.stats'):
            with open(os.path.join(tmpdir, stats_name), 'r') as statsfile:
                line = statsfile.readline()
                line = line.rstrip()
                newstats = [int(i) for i in line.split(',')]
                for x, num in enumerate(newstats):
                    finalstats[x] += num

    #clean up tmp folder
    shutil.rmtree(tmpdir)

    #last thing is to re-number of reads as it is possible they could have same name from multitprocessor split
    amptklib.fastqreindex(tmpDemux, catDemux)
    os.remove(tmpDemux)

    amptklib.log.info('{0:,}'.format(finalstats[0]) + ' total reads')
    if args.reverse_barcode:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2] - finalstats[4]) +
                          ' valid Fwd and Rev Barcodes')
    else:
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1]) +
                          ' valid Barcode')
        amptklib.log.info('{0:,}'.format(finalstats[0] - finalstats[1] -
                                         finalstats[2]) +
                          ' Fwd Primer found, {0:,}'.format(finalstats[3]) +
                          ' Rev Primer found')
    amptklib.log.info('{0:,}'.format(finalstats[5]) +
                      ' discarded too short (< %i bp)' % args.min_len)
    amptklib.log.info('{0:,}'.format(finalstats[6]) + ' valid output reads')

    #now loop through data and find barcoded samples, counting each.....
    BarcodeCount = {}
    with open(catDemux, 'r') as demuxed:
        # every 4th line starting at the first is a FASTQ header; the sample
        # ID is the text after the first '=' and before the next ';'
        for header in itertools.islice(demuxed, 0, None, 4):
            ID = header.split("=", 1)[-1].split(";")[0]
            BarcodeCount[ID] = BarcodeCount.get(ID, 0) + 1

    #now let's count the barcodes found and count the number of times they are found.
    barcode_counts = "%22s:  %s" % ('Sample', 'Count')
    for k, v in natsorted(list(BarcodeCount.items()),
                          key=lambda k_v: k_v[1],
                          reverse=True):
        barcode_counts += "\n%22s:  %s" % (k, str(v))
    amptklib.log.info("Found %i barcoded samples\n%s" %
                      (len(BarcodeCount), barcode_counts))

    #create a generic mappingfile for downstream processes
    genericmapfile = args.out + '.mapping_file.txt'
    if not args.mapping_file:
        amptklib.CreateGenericMappingFile(Barcodes, RevBarcodes, FwdPrimer,
                                          origRevPrimer, genericmapfile,
                                          BarcodeCount)
    else:
        amptklib.updateMappingFile(args.mapping_file, BarcodeCount,
                                   genericmapfile)

    #compress the output to save space
    FinalDemux = catDemux + '.gz'
    amptklib.Fzip(catDemux, FinalDemux, cpus)
    amptklib.removefile(catDemux)
    # remove the uncompressed copies of any gzipped inputs made above
    for gz_path in gzip_list:
        amptklib.removefile(gz_path[:-len(gz_suffix)])

    #get file size
    filesize = os.path.getsize(FinalDemux)
    readablesize = amptklib.convertSize(filesize)
    amptklib.log.info("Output file:  %s (%s)" % (FinalDemux, readablesize))
    amptklib.log.info("Mapping file: %s" % genericmapfile)

    print("-------------------------------------------------------")
    if 'darwin' in sys.platform:
        print(col.WARN + "\nExample of next cmd: " + col.END +
              "amptk cluster -i %s -o out\n" % (FinalDemux))
    else:
        print("\nExample of next cmd: amptk cluster -i %s -o out\n" %
              (FinalDemux))
Ejemplo n.º 8
0
def main(args):
    """Report barcode counts for a demuxed FASTQ, optionally quality trimming.

    Prints barcode counts and read-length information for the input via
    ``countBarcodes`` and ``getSeqLength``. With ``--quality_trim`` the input
    is split into chunks, each chunk is passed to ``worker`` in a process
    pool, the resulting ``*.filter.fq`` files are concatenated into
    ``--out``, and the barcodes are counted again on that output.

    Args:
        args: list of command-line argument strings handed to argparse.

    Side effects:
        Exits with status 1 when ``--quality_trim`` is given without
        ``-o/--out``. A gzipped input is decompressed to ``amptk_show.tmp``
        and that file is removed again before returning, even on failure.
    """
    parser = argparse.ArgumentParser(
        prog='amptk-get_barcode_counts.py',
        description=
        '''Script loops through demuxed fastq file counting occurances of barcodes, can optionally quality trim and recount.''',
        epilog="""Written by Jon Palmer (2015) [email protected]""",
        formatter_class=MyFormatter)
    parser.add_argument('-i',
                        '--input',
                        required=True,
                        help='Input demuxed FASTQ')
    parser.add_argument('--quality_trim',
                        action='store_true',
                        help='Quality trim data')
    parser.add_argument('-e',
                        '--maxee',
                        default=1.0,
                        type=float,
                        help='MaxEE Q-trim threshold')
    parser.add_argument('-l',
                        '--trunclen',
                        default=250,
                        type=int,
                        help='Read truncation length')
    parser.add_argument('-o', '--out', help='Output for quality trimmed data')
    args = parser.parse_args(args)

    if args.quality_trim and not args.out:
        # the flag is spelled --out (there is no --output option)
        print("Error, to run quality trimming you must provide -o, --out")
        sys.exit(1)

    #main start here
    cpus = multiprocessing.cpu_count()
    print("----------------------------------")
    is_gzipped = args.input.endswith('.gz')
    tmpinput = 'amptk_show.tmp' if is_gzipped else args.input
    try:
        if is_gzipped:
            amptklib.Funzip(args.input, tmpinput, cpus)
        countBarcodes(tmpinput)
        print("----------------------------------")
        getSeqLength(tmpinput)
        print("----------------------------------")
        if args.quality_trim:
            #split the input FASTQ file into chunks to process
            SeqCount = amptklib.countfastq(tmpinput)
            folder = 'amptk_tmp_' + str(os.getpid())
            try:
                amptklib.split_fastq(tmpinput, SeqCount, folder, cpus * 2)
                #now get file list from tmp folder
                file_list = [
                    os.path.join(folder, chunk)
                    for chunk in os.listdir(folder) if chunk.endswith(".fq")
                ]

                pool = multiprocessing.Pool(cpus)
                pending = [
                    pool.apply_async(worker, [chunk]) for chunk in file_list
                ]
                pool.close()
                pool.join()
                # .get() re-raises any exception from a worker; without it a
                # failed chunk would silently be missing from the output
                for result in pending:
                    result.get()

                #get filtered results
                catDemux = args.out
                with open(catDemux, 'w') as outfile:
                    for filename in glob.glob(
                            os.path.join(folder, '*.filter.fq')):
                        if filename == catDemux:
                            continue
                        with open(filename, 'r') as readfile:
                            shutil.copyfileobj(readfile, outfile)
                if catDemux.endswith('.gz'):
                    amptklib.Fzip_inplace(catDemux)
            finally:
                # always drop the chunk folder, including on failure
                if os.path.isdir(folder):
                    shutil.rmtree(folder)
            print("----------------------------------")
            # NOTE(review): when --out ends in .gz this runs on whatever
            # Fzip_inplace left at that path -- confirm countBarcodes can
            # read it.
            countBarcodes(args.out)
            print("----------------------------------")
            print("Script finished, output in %s" % args.out)
    finally:
        # remove the decompressed copy of a gzipped input, even on error
        if is_gzipped and os.path.isfile(tmpinput):
            amptklib.removefile(tmpinput)